canon 0.2.11 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +12 -22
  3. data/Rakefile +5 -2
  4. data/lib/canon/cache.rb +3 -1
  5. data/lib/canon/cli.rb +0 -3
  6. data/lib/canon/commands/diff_command.rb +0 -6
  7. data/lib/canon/commands/format_command.rb +0 -4
  8. data/lib/canon/commands.rb +9 -0
  9. data/lib/canon/comparison/child_realignment.rb +0 -2
  10. data/lib/canon/comparison/compare_profile.rb +30 -36
  11. data/lib/canon/comparison/comparison_result.rb +0 -2
  12. data/lib/canon/comparison/diff_node_builder.rb +353 -0
  13. data/lib/canon/comparison/dimensions/dimension.rb +51 -0
  14. data/lib/canon/comparison/dimensions/dimension_set.rb +49 -0
  15. data/lib/canon/comparison/dimensions/registry.rb +101 -60
  16. data/lib/canon/comparison/dimensions.rb +15 -46
  17. data/lib/canon/comparison/html_comparator.rb +18 -141
  18. data/lib/canon/comparison/html_compare_profile.rb +15 -18
  19. data/lib/canon/comparison/json_comparator.rb +4 -165
  20. data/lib/canon/comparison/json_parser.rb +0 -2
  21. data/lib/canon/comparison/markup_comparator.rb +14 -210
  22. data/lib/canon/comparison/match_options/base_resolver.rb +18 -29
  23. data/lib/canon/comparison/match_options/json_resolver.rb +4 -28
  24. data/lib/canon/comparison/match_options/xml_resolver.rb +4 -45
  25. data/lib/canon/comparison/match_options/yaml_resolver.rb +4 -30
  26. data/lib/canon/comparison/match_options.rb +13 -88
  27. data/lib/canon/comparison/pipeline.rb +269 -0
  28. data/lib/canon/comparison/profile_definition.rb +0 -2
  29. data/lib/canon/comparison/ruby_object_comparator.rb +1 -1
  30. data/lib/canon/comparison/strategies/match_strategy_factory.rb +9 -58
  31. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +4 -11
  32. data/lib/canon/comparison/strategies.rb +16 -0
  33. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +0 -3
  34. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +0 -3
  35. data/lib/canon/comparison/xml_comparator/child_comparison.rb +0 -6
  36. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +1 -6
  37. data/lib/canon/comparison/xml_comparator/node_parser.rb +0 -4
  38. data/lib/canon/comparison/xml_comparator.rb +4 -492
  39. data/lib/canon/comparison/xml_comparator_helpers.rb +21 -0
  40. data/lib/canon/comparison/xml_node_comparison.rb +4 -119
  41. data/lib/canon/comparison/yaml_comparator.rb +0 -3
  42. data/lib/canon/comparison.rb +143 -266
  43. data/lib/canon/config/config_dsl.rb +159 -0
  44. data/lib/canon/config/env_provider.rb +0 -3
  45. data/lib/canon/config/env_schema.rb +48 -58
  46. data/lib/canon/config/profile_loader.rb +0 -1
  47. data/lib/canon/config.rb +116 -468
  48. data/lib/canon/diff/diff_block_builder.rb +0 -2
  49. data/lib/canon/diff/diff_classifier.rb +0 -5
  50. data/lib/canon/diff/diff_context.rb +0 -2
  51. data/lib/canon/diff/diff_context_builder.rb +0 -2
  52. data/lib/canon/diff/diff_line_builder.rb +0 -3
  53. data/lib/canon/diff/diff_node_enricher.rb +0 -4
  54. data/lib/canon/diff/diff_node_mapper.rb +0 -4
  55. data/lib/canon/diff/diff_report_builder.rb +0 -4
  56. data/lib/canon/diff/formatting_detector.rb +0 -1
  57. data/lib/canon/diff/node_serializer.rb +0 -7
  58. data/lib/canon/diff.rb +39 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +4 -17
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +7 -19
  61. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -3
  62. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -3
  63. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +7 -26
  64. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -3
  65. data/lib/canon/diff_formatter/by_object/base_formatter.rb +8 -15
  66. data/lib/canon/diff_formatter/by_object/json_formatter.rb +0 -2
  67. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +0 -2
  68. data/lib/canon/diff_formatter/by_object/yaml_formatter.rb +0 -2
  69. data/lib/canon/diff_formatter/debug_output.rb +0 -2
  70. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +24 -58
  71. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +0 -2
  72. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +1 -2
  73. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +1 -7
  74. data/lib/canon/diff_formatter/diff_detail_formatter.rb +0 -7
  75. data/lib/canon/diff_formatter/diff_detail_formatter_helpers.rb +23 -0
  76. data/lib/canon/diff_formatter.rb +11 -9
  77. data/lib/canon/formatters/html4_formatter.rb +0 -2
  78. data/lib/canon/formatters/html5_formatter.rb +0 -2
  79. data/lib/canon/formatters/html_formatter.rb +0 -3
  80. data/lib/canon/formatters/json_formatter.rb +0 -1
  81. data/lib/canon/formatters/xml_formatter.rb +0 -4
  82. data/lib/canon/formatters/yaml_formatter.rb +0 -1
  83. data/lib/canon/formatters.rb +16 -0
  84. data/lib/canon/html/data_model.rb +0 -10
  85. data/lib/canon/html.rb +4 -3
  86. data/lib/canon/options/cli_generator.rb +0 -2
  87. data/lib/canon/options/registry.rb +0 -2
  88. data/lib/canon/options.rb +9 -0
  89. data/lib/canon/pretty_printer/html.rb +0 -1
  90. data/lib/canon/pretty_printer/xml_normalized.rb +0 -2
  91. data/lib/canon/pretty_printer.rb +12 -0
  92. data/lib/canon/tree_diff/adapters/html_adapter.rb +1 -1
  93. data/lib/canon/tree_diff/adapters.rb +14 -0
  94. data/lib/canon/tree_diff/core/attribute_comparator.rb +0 -6
  95. data/lib/canon/tree_diff/core/node_signature.rb +1 -1
  96. data/lib/canon/tree_diff/core/tree_node.rb +12 -5
  97. data/lib/canon/tree_diff/core.rb +17 -0
  98. data/lib/canon/tree_diff/matchers/hash_matcher.rb +0 -7
  99. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +1 -5
  100. data/lib/canon/tree_diff/matchers/structural_propagator.rb +1 -5
  101. data/lib/canon/tree_diff/matchers.rb +15 -0
  102. data/lib/canon/tree_diff/operation_converter.rb +0 -8
  103. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +2 -12
  104. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +13 -7
  105. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +2 -2
  106. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +4 -6
  107. data/lib/canon/tree_diff/operation_converter_helpers.rb +18 -0
  108. data/lib/canon/tree_diff/operations/operation_detector.rb +2 -5
  109. data/lib/canon/tree_diff/operations.rb +13 -0
  110. data/lib/canon/tree_diff.rb +26 -27
  111. data/lib/canon/validators/base_validator.rb +0 -2
  112. data/lib/canon/validators/html_validator.rb +0 -1
  113. data/lib/canon/validators/json_validator.rb +0 -1
  114. data/lib/canon/validators/xml_validator.rb +0 -1
  115. data/lib/canon/validators/yaml_validator.rb +0 -1
  116. data/lib/canon/validators.rb +12 -0
  117. data/lib/canon/version.rb +1 -1
  118. data/lib/canon/xml/c14n.rb +0 -4
  119. data/lib/canon/xml/data_model.rb +0 -10
  120. data/lib/canon/xml/line_range_mapper.rb +0 -2
  121. data/lib/canon/xml/nodes/attribute_node.rb +0 -2
  122. data/lib/canon/xml/nodes/comment_node.rb +0 -2
  123. data/lib/canon/xml/nodes/element_node.rb +0 -2
  124. data/lib/canon/xml/nodes/namespace_node.rb +0 -2
  125. data/lib/canon/xml/nodes/processing_instruction_node.rb +0 -2
  126. data/lib/canon/xml/nodes/root_node.rb +0 -2
  127. data/lib/canon/xml/nodes/text_node.rb +0 -2
  128. data/lib/canon/xml/nodes.rb +19 -0
  129. data/lib/canon/xml/processor.rb +0 -5
  130. data/lib/canon/xml/sax_builder.rb +0 -7
  131. data/lib/canon/xml.rb +33 -0
  132. data/lib/canon/xml_backend.rb +50 -14
  133. data/lib/canon/xml_parsing.rb +4 -2
  134. data/lib/canon.rb +25 -15
  135. data/lib/tasks/performance.rake +0 -58
  136. data/lib/tasks/performance_comparator.rb +132 -65
  137. data/lib/tasks/performance_helpers.rb +4 -249
  138. data/lib/tasks/performance_report.rb +309 -0
  139. metadata +24 -11
  140. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +0 -64
  141. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +0 -64
  142. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +0 -167
  143. data/lib/canon/comparison/dimensions/base_dimension.rb +0 -107
  144. data/lib/canon/comparison/dimensions/comments_dimension.rb +0 -117
  145. data/lib/canon/comparison/dimensions/element_position_dimension.rb +0 -86
  146. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +0 -115
  147. data/lib/canon/comparison/dimensions/text_content_dimension.rb +0 -102
  148. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +0 -300
@@ -2,21 +2,6 @@
2
2
 
3
3
  require "moxml"
4
4
  require "nokogiri" if Canon::XmlBackend.nokogiri?
5
- require_relative "xml/whitespace_normalizer"
6
- require_relative "comparison/xml_comparator"
7
- require_relative "comparison/html_comparator"
8
- require_relative "comparison/json_comparator"
9
- require_relative "comparison/yaml_comparator"
10
- require_relative "errors"
11
- require_relative "comparison/profile_definition"
12
- require_relative "comparison/format_detector"
13
- require_relative "comparison/html_parser"
14
- require_relative "diff/diff_node_mapper"
15
- require_relative "diff/diff_line"
16
- require_relative "diff/diff_block_builder"
17
- require_relative "diff/diff_context_builder"
18
- require_relative "diff/diff_report_builder"
19
- require_relative "cache"
20
5
 
21
6
  module Canon
22
7
  # Comparison module for XML, HTML, JSON, and YAML documents
@@ -104,7 +89,31 @@ module Canon
104
89
  # - diff_code: Type of difference
105
90
  #
106
91
  module Comparison
92
+ autoload :BaseComparator, "canon/comparison/base_comparator"
107
93
  autoload :ChildRealignment, "canon/comparison/child_realignment"
94
+ autoload :CompareProfile, "canon/comparison/compare_profile"
95
+ autoload :ComparisonResult, "canon/comparison/comparison_result"
96
+ autoload :DiffNodeBuilder, "canon/comparison/diff_node_builder"
97
+ autoload :Dimensions, "canon/comparison/dimensions"
98
+ autoload :FormatDetector, "canon/comparison/format_detector"
99
+ autoload :HtmlComparator, "canon/comparison/html_comparator"
100
+ autoload :HtmlCompareProfile, "canon/comparison/html_compare_profile"
101
+ autoload :HtmlParser, "canon/comparison/html_parser"
102
+ autoload :JsonComparator, "canon/comparison/json_comparator"
103
+ autoload :JsonParser, "canon/comparison/json_parser"
104
+ autoload :MarkupComparator, "canon/comparison/markup_comparator"
105
+ autoload :MatchOptions, "canon/comparison/match_options"
106
+ autoload :NodeInspector, "canon/comparison/node_inspector"
107
+ autoload :Pipeline, "canon/comparison/pipeline"
108
+ autoload :ProfileDefinition, "canon/comparison/profile_definition"
109
+ autoload :RubyObjectComparator, "canon/comparison/ruby_object_comparator"
110
+ autoload :Strategies, "canon/comparison/strategies"
111
+ autoload :WhitespaceSensitivity, "canon/comparison/whitespace_sensitivity"
112
+ autoload :XmlComparator, "canon/comparison/xml_comparator"
113
+ autoload :XmlComparatorHelpers, "canon/comparison/xml_comparator_helpers"
114
+ autoload :XmlNodeComparison, "canon/comparison/xml_node_comparison"
115
+ autoload :XmlParser, "canon/comparison/xml_parser"
116
+ autoload :YamlComparator, "canon/comparison/yaml_comparator"
108
117
 
109
118
  # Comparison result constants
110
119
  EQUIVALENT = 1
@@ -124,6 +133,32 @@ module Canon
124
133
  UNEQUAL_TYPES = 15
125
134
  UNEQUAL_PRIMITIVES = 16
126
135
 
136
+ # Keys that OperationConverter and SemanticTreeMatchStrategy accept.
137
+ # Used to strip diff-only keys (e.g. +max_node_count+) from the
138
+ # fully-resolved match options hash before passing it to components
139
+ # that expect match options only.
140
+ MATCH_OPTION_KEYS = %i[
141
+ match_profile
142
+ match
143
+ preprocessing
144
+ text_content
145
+ structural_whitespace
146
+ attribute_presence
147
+ attribute_order
148
+ attribute_values
149
+ element_position
150
+ comments
151
+ format
152
+ similarity_threshold
153
+ hash_matching
154
+ similarity_matching
155
+ propagation
156
+ preserve_whitespace_elements
157
+ collapse_whitespace_elements
158
+ strip_whitespace_elements
159
+ respect_xml_space
160
+ ].freeze
161
+
127
162
  # Human-readable labels for the integer comparison-result constants
128
163
  # above. Used by the diff reason builders so user-facing reason text
129
164
  # never leaks raw numeric codes (e.g. "7 vs 7" — see lutaml/canon#127).
@@ -195,13 +230,17 @@ module Canon
195
230
  # - :verbose - Return detailed diff array (default: false)
196
231
  # @return [Boolean, Array] true if equivalent, or array of diffs if verbose
197
232
  def equivalent?(obj1, obj2, opts = {})
198
- # Check if semantic tree diff is requested
199
- # Support both :semantic and :semantic_tree for backward compatibility
233
+ # Normalize: match: { semantic_diff: true } → diff_algorithm: :semantic
234
+ if opts.dig(:match, :semantic_diff) || opts.dig(:match, :semantic_tree)
235
+ opts = opts.merge(diff_algorithm: :semantic)
236
+ opts = opts.merge(match: opts[:match].except(:semantic_diff,
237
+ :semantic_tree))
238
+ end
239
+
200
240
  if %i[semantic semantic_tree].include?(opts[:diff_algorithm])
201
241
  return semantic_diff(obj1, obj2, opts)
202
242
  end
203
243
 
204
- # Otherwise use DOM-based comparison (default)
205
244
  dom_diff(obj1, obj2, opts)
206
245
  end
207
246
 
@@ -288,113 +327,90 @@ module Canon
288
327
 
289
328
  # Perform semantic tree diff comparison
290
329
  def semantic_diff(obj1, obj2, opts = {})
291
- require_relative "tree_diff"
330
+ resolved = opts.dup
331
+ format_hint = resolved[:format]
292
332
 
293
- # Capture original strings BEFORE any parsing/transformation
294
- # These are used for display to preserve original formatting
295
- format_hint = opts[:format]
296
- original_str1 = extract_original_string(obj1, format_hint)
297
- original_str2 = extract_original_string(obj2, format_hint)
333
+ # Capture original strings BEFORE any parsing/transformation.
334
+ # These are used for display to preserve original formatting.
335
+ original_str1, original_str2 = Pipeline.capture_originals(obj1, obj2)
298
336
 
299
- # Detect format for both objects
300
- format1 = opts[:format] || FormatDetector.detect(obj1)
301
- format2 = opts[:format] || FormatDetector.detect(obj2)
337
+ # Detect format for both objects.
338
+ format1, format2 = Pipeline.detect_formats(obj1, obj2, format_hint)
302
339
 
303
- # Handle string format (plain text comparison) - semantic tree doesn't support it
340
+ # Semantic tree doesn't support plain-string comparison.
304
341
  if format1 == :string
305
- if opts[:verbose]
342
+ if resolved[:verbose]
306
343
  return obj1.to_s == obj2.to_s ? [] : [:different]
307
344
  else
308
345
  return obj1.to_s == obj2.to_s
309
346
  end
310
347
  end
311
348
 
312
- # Ensure formats match
313
- unless format1 == format2
314
- raise Canon::CompareFormatMismatchError.new(format1, format2)
315
- end
349
+ # Semantic requires exact format match (no ruby_object cross-compat).
350
+ Pipeline.validate_compatible!(format1, format2, strict: true)
316
351
 
317
- # Get global config options if not defined in opts
318
- # This is needed because semantic_diff doesn't go through dom_diff's config handling
319
- if !(opts[:match_profile] || opts[:global_options]) && %i[xml html json
320
- yaml string].include?(format1)
321
- format_config = Canon::Config.instance.public_send(format1)
322
- if format_config.match.profile
323
- opts[:match_profile] =
324
- format_config.match.profile
325
- end
326
- if format_config.match.options && !format_config.match.options.empty?
327
- opts[:global_options] =
328
- format_config.match.options
329
- end
330
- end
352
+ # Merge global config-sourced profile and options into opts.
353
+ resolved = Pipeline.resolve_config(format1, resolved)
331
354
 
332
- # Resolve match options for the format
333
- match_opts_hash = resolve_match_options(format1, opts)
355
+ # Resolve match options for the format.
356
+ match_opts_hash = resolve_match_options(format1, resolved)
334
357
 
335
- # Also read diff options from config (e.g., max_node_count for large documents)
336
- # This is independent of match options and needs to be passed to TreeDiffIntegrator
337
- if !match_opts_hash[:max_node_count] && %i[xml html json yaml
338
- string].include?(format1)
358
+ # Also read diff options from config (e.g., max_node_count for
359
+ # large documents). Independent of match options; passed to
360
+ # TreeDiffIntegrator.
361
+ if !match_opts_hash[:max_node_count] &&
362
+ Pipeline::CONFIG_BACKED_FORMATS.include?(format1)
339
363
  diff_max_node = Canon::Config.instance.public_send(format1).diff.max_node_count
340
364
  if diff_max_node > 10_000
341
- match_opts_hash[:max_node_count] =
342
- diff_max_node
365
+ match_opts_hash[:max_node_count] = diff_max_node
343
366
  end
344
367
  end
345
368
 
346
- # Delegate parsing to comparators (reuses existing preprocessing logic)
347
- doc1, doc2 = parse_with_comparator(obj1, obj2, format1, match_opts_hash)
369
+ # Delegate parsing to comparators (reuses existing preprocessing).
370
+ doc1, doc2 = Pipeline.parse_pair(obj1, obj2, format1, match_opts_hash)
348
371
 
349
- # Normalize format for TreeDiff (html4/html5 -> html)
372
+ # Normalize format for TreeDiff (html4/html5 -> html).
350
373
  tree_diff_format = normalize_format_for_tree_diff(format1)
351
374
 
352
- # Create TreeDiff integrator for the format
353
- # CRITICAL: Use match_opts_hash (resolved options with profile) not opts[:match]
375
+ # Create TreeDiff integrator for the format.
376
+ # CRITICAL: Use match_opts_hash (resolved options with profile)
377
+ # not opts[:match].
354
378
  integrator = Canon::TreeDiff::TreeDiffIntegrator.new(
355
379
  format: tree_diff_format,
356
380
  options: match_opts_hash,
357
381
  )
358
382
 
359
- # Perform diff
383
+ # Perform diff.
360
384
  tree_diff_result = integrator.diff(doc1, doc2)
361
385
 
362
- # Extract only match-related keys for OperationConverter and SemanticTreeMatchStrategy
363
- # These components expect match options, not diff options like max_node_count
364
- match_only_keys = %i[match_profile match preprocessing
365
- text_content structural_whitespace attribute_presence
366
- attribute_order attribute_values element_position
367
- comments format similarity_threshold hash_matching
368
- similarity_matching propagation
369
- preserve_whitespace_elements
370
- collapse_whitespace_elements
371
- strip_whitespace_elements respect_xml_space]
372
- match_options_only = match_opts_hash.slice(*match_only_keys)
373
-
374
- # Convert operations to DiffNodes for unified pipeline
375
- # CRITICAL: Use match_opts_hash (resolved options with profile) not opts[:match]
386
+ # Extract only match-related keys for OperationConverter and
387
+ # SemanticTreeMatchStrategy. These components expect match
388
+ # options, not diff options like max_node_count.
389
+ match_options_only = match_opts_hash.slice(*MATCH_OPTION_KEYS)
390
+
391
+ # Convert operations to DiffNodes for unified pipeline.
376
392
  converter = Canon::TreeDiff::OperationConverter.new(
377
393
  format: format1,
378
394
  match_options: match_options_only,
379
395
  )
380
396
  diff_nodes = converter.convert(tree_diff_result[:operations])
381
397
 
382
- # CRITICAL: Use strategy's preprocess_for_display to ensure proper line-breaking
383
- # This matches DOM diff preprocessing pattern (xml_comparator.rb:106-109)
384
- require_relative "comparison/strategies/semantic_tree_match_strategy"
398
+ # CRITICAL: Use strategy's preprocess_for_display to ensure proper
399
+ # line-breaking. This matches DOM diff preprocessing pattern
400
+ # (xml_comparator.rb:106-109).
385
401
  strategy = Comparison::Strategies::SemanticTreeMatchStrategy.new(
386
402
  format: format1, match_options: match_options_only,
387
403
  )
388
404
  str1, str2 = strategy.preprocess_for_display(doc1, doc2)
389
405
 
390
- # Store tree diff data in match_options for access via result
406
+ # Store tree diff data in match_options for access via result.
391
407
  enhanced_match_options = match_opts_hash.merge(
392
408
  tree_diff_operations: tree_diff_result[:operations],
393
409
  tree_diff_statistics: tree_diff_result[:statistics],
394
410
  tree_diff_matching: tree_diff_result[:matching],
395
411
  )
396
412
 
397
- # Create ComparisonResult for unified handling
413
+ # Create ComparisonResult for unified handling.
398
414
  result = Canon::Comparison::ComparisonResult.new(
399
415
  differences: diff_nodes,
400
416
  preprocessed_strings: [str1, str2],
@@ -405,8 +421,8 @@ module Canon
405
421
  algorithm: :semantic,
406
422
  )
407
423
 
408
- # Return boolean or ComparisonResult based on verbose flag
409
- if opts[:verbose]
424
+ # Return boolean or ComparisonResult based on verbose flag.
425
+ if resolved[:verbose]
410
426
  result
411
427
  else
412
428
  result.equivalent?
@@ -534,16 +550,7 @@ module Canon
534
550
  # @param format [Symbol] Format type
535
551
  # @return [Array<Symbol>] Valid dimensions for the format
536
552
  def valid_dimensions_for_format(format)
537
- case format
538
- when :xml, :html, :html4, :html5
539
- MatchOptions::Xml::MATCH_DIMENSIONS
540
- when :json
541
- MatchOptions::Json::MATCH_DIMENSIONS
542
- when :yaml
543
- MatchOptions::Yaml::MATCH_DIMENSIONS
544
- else
545
- []
546
- end
553
+ Dimensions::Registry.for(format).names
547
554
  end
548
555
 
549
556
  # Helper to extract format from opts for validation
@@ -554,76 +561,6 @@ module Canon
554
561
  opts[:format] || :xml
555
562
  end
556
563
 
557
- # Parse documents using comparator's parse logic (reuses preprocessing)
558
- #
559
- # @param obj1 [Object] First object
560
- # @param obj2 [Object] Second object
561
- # @param format [Symbol] Format type
562
- # @param match_opts_hash [Hash] Resolved match options
563
- # @return [Array<Object, Object>] Parsed documents
564
- def parse_with_comparator(obj1, obj2, format, match_opts_hash)
565
- preprocessing = match_opts_hash[:preprocessing] || :none
566
-
567
- case format
568
- when :xml
569
- # Delegate to XmlComparator's parse - returns Canon::Xml::Node
570
- doc1 = parse_with_cache(obj1, format, preprocessing) do |doc|
571
- XmlComparator.parse(doc, preprocessing)
572
- end
573
- doc2 = parse_with_cache(obj2, format, preprocessing) do |doc|
574
- XmlComparator.parse(doc, preprocessing)
575
- end
576
- [doc1, doc2]
577
- when :html, :html4, :html5
578
- [
579
- parse_with_cache(obj1, format, preprocessing) do |doc|
580
- HtmlComparator.parse(doc, preprocessing)
581
- end,
582
- parse_with_cache(obj2, format, preprocessing) do |doc|
583
- HtmlComparator.parse(doc, preprocessing)
584
- end,
585
- ]
586
- when :json
587
- [
588
- parse_with_cache(obj1, format, :none) do |doc|
589
- JsonComparator.parse(doc)
590
- end,
591
- parse_with_cache(obj2, format, :none) do |doc|
592
- JsonComparator.parse(doc)
593
- end,
594
- ]
595
- when :yaml
596
- [
597
- parse_with_cache(obj1, format, :none) do |doc|
598
- YamlComparator.parse(doc)
599
- end,
600
- parse_with_cache(obj2, format, :none) do |doc|
601
- YamlComparator.parse(doc)
602
- end,
603
- ]
604
- else
605
- [obj1, obj2]
606
- end
607
- end
608
-
609
- # Parse a document with caching
610
- #
611
- # @param doc [Object] Document to parse (string or already parsed)
612
- # @param format [Symbol] Document format
613
- # @param preprocessing [Symbol] Preprocessing option
614
- # @yield Block to parse the document if not cached
615
- # @return [Object] Parsed document
616
- def parse_with_cache(doc, format, preprocessing)
617
- # If already a parsed node, return as-is
618
- return doc unless doc.is_a?(String)
619
-
620
- # Use cache for string documents
621
- Cache.fetch(:document_parse,
622
- Cache.key_for_document(doc, format, preprocessing)) do # rubocop:disable Lint/UselessDefaultValueArgument
623
- yield doc
624
- end
625
- end
626
-
627
564
  # Normalize format for TreeDiff (html4/html5 -> html)
628
565
  #
629
566
  # @param format [Symbol] Original format
@@ -637,28 +574,6 @@ module Canon
637
574
  end
638
575
  end
639
576
 
640
- # Extract original string from various input types
641
- # This preserves the original formatting without minification
642
- #
643
- # @param obj [String, Nokogiri::Node, Canon::Xml::Node, Object] Input object
644
- # @param format [Symbol] Format type for context
645
- # @return [String] Original string representation
646
- def extract_original_string(obj, _format = nil)
647
- case obj
648
- when String
649
- obj
650
- when Nokogiri::XML::Document, Nokogiri::HTML::Document,
651
- Nokogiri::XML::DocumentFragment, Nokogiri::HTML::DocumentFragment
652
- obj.to_html
653
- else
654
- if Canon::XmlParsing.xml_node?(obj) || obj.is_a?(Canon::Xml::Node)
655
- Canon::XmlParsing.serialize(obj)
656
- else
657
- obj.to_s
658
- end
659
- end
660
- end
661
-
662
577
  # Serialize document back to string
663
578
  def serialize_document(doc, format)
664
579
  case format
@@ -683,108 +598,70 @@ module Canon
683
598
 
684
599
  # Perform DOM-based comparison (original behavior)
685
600
  def dom_diff(obj1, obj2, opts = {})
686
- # Use format hint if provided
687
- if opts[:format]
688
- format1 = format2 = opts[:format]
689
- # Parse HTML strings if format is html/html4/html5
690
- if %i[html html4 html5].include?(opts[:format])
691
- # Preserve original strings for display (HTML fragment
692
- # parsers can mutate the DOM).
693
- opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
694
- opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
695
- # Parse all HTML formats (:html, :html4, :html5) with
696
- # Nokogiri::HTML5 so that html4 and html5 share HTML's
697
- # whitespace-sensitivity semantics (issue #118).
698
- #
699
- # The previous html/html4 branch used Nokogiri::XML.fragment
700
- # to dodge Nokogiri::HTML4.fragment's destructive DOM
701
- # mutations. That avoided one problem but introduced a
702
- # bigger one: XML whitespace rules were being applied to
703
- # HTML content. HTML's content model — identical between
704
- # HTML4 and HTML5 — treats whitespace-only text between
705
- # block-level children as insignificant; XML treats every
706
- # whitespace text node as significant. Routing html4 input
707
- # through an XML parser therefore made
708
- # be_html4_equivalent_to reject inputs that
709
- # be_html5_equivalent_to (correctly) accepts.
710
- # Nokogiri::HTML5.fragment is non-destructive (the original
711
- # HTML4.fragment concern does not apply to it) and applies
712
- # HTML's content model uniformly.
713
- obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
714
- obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
715
- end
716
- else
717
- format1 = FormatDetector.detect(obj1)
718
- format2 = FormatDetector.detect(obj2)
601
+ resolved = opts.dup
602
+ format_hint = resolved[:format]
603
+
604
+ # Detect formats (with explicit hint) and pre-parse HTML strings
605
+ # through Nokogiri::HTML5 so html4 and html5 share HTML's
606
+ # whitespace-sensitivity semantics (issue #118). Pre-parsing
607
+ # also lets us snapshot the original strings before the HTML
608
+ # fragment parser mutates the DOM.
609
+ format1, format2 = Pipeline.detect_formats(obj1, obj2, format_hint)
610
+ if %i[html html4 html5].include?(format_hint) && obj1.is_a?(String) &&
611
+ obj2.is_a?(String)
612
+ resolved[:_original_str1] = obj1
613
+ resolved[:_original_str2] = obj2
614
+ obj1, obj2 = Pipeline.preparse_html_pair(obj1, obj2)
719
615
  end
720
616
 
721
- # Handle string format (plain text comparison)
617
+ # Handle string format (plain text comparison).
722
618
  if format1 == :string
723
- if opts[:verbose]
619
+ if resolved[:verbose]
724
620
  return obj1.to_s == obj2.to_s ? [] : [:different]
725
621
  else
726
622
  return obj1.to_s == obj2.to_s
727
623
  end
728
624
  end
729
625
 
730
- # Allow comparing json/yaml strings with ruby objects
731
- # since they parse to the same structure
732
- formats_compatible = format1 == format2 ||
733
- (%i[json ruby_object].include?(format1) &&
734
- %i[json ruby_object].include?(format2)) ||
735
- (%i[yaml ruby_object].include?(format1) &&
736
- %i[yaml ruby_object].include?(format2))
626
+ # DOM allows ruby_object <-> json/yaml cross-compatibility.
627
+ Pipeline.validate_compatible!(format1, format2, strict: false)
737
628
 
738
- unless formats_compatible
739
- raise Canon::CompareFormatMismatchError.new(format1, format2)
740
- end
629
+ # Normalize comparison format (ruby_object -> json by default).
630
+ comparison_format = normalize_comparison_format(format1, format2)
741
631
 
742
- # Normalize format for comparison
743
- comparison_format = case format1
744
- when :ruby_object
745
- # If comparing ruby_object with json/yaml, use that format
746
- %i[json yaml].include?(format2) ? format2 : :json
747
- else
748
- format1
749
- end
750
-
751
- # get match_profile if it is not defined in options
752
- # but defined in config
753
- if %i[xml html json yaml string].include?(comparison_format)
754
- format_config = Canon::Config.instance.public_send(comparison_format)
755
- if opts[:global_profile].nil? && format_config.match.profile
756
- # Config-sourced profile has *global* priority (applied before
757
- # global_options), so that YAML profile_options like
758
- # whitespace_type: :normalize can override the built-in profile
759
- # (e.g. :spec_friendly)'s whitespace_type: :strict. Writing to
760
- # :match_profile here gave the config profile per-call priority,
761
- # which incorrectly overrode the YAML's own overrides.
762
- opts[:global_profile] = format_config.match.profile
763
- end
764
- # Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
765
- # that are stored in MatchConfig's resolver but not exposed via the
766
- # built-in MATCH_PROFILES system. These supplement the built-in profile.
767
- profile_opts = format_config.match.profile_options
768
- if profile_opts.any? && opts[:global_options].nil?
769
- opts[:global_options] = profile_opts
770
- elsif profile_opts.any?
771
- # Merge: global_options already set (e.g., per-call) takes precedence
772
- opts[:global_options] = opts[:global_options].merge(profile_opts)
773
- end
774
- end
632
+ # Merge global config-sourced profile and options into opts.
633
+ resolved = Pipeline.resolve_config(comparison_format, resolved)
775
634
 
776
635
  case comparison_format
777
636
  when :xml
778
- XmlComparator.equivalent?(obj1, obj2, opts)
637
+ XmlComparator.equivalent?(obj1, obj2, resolved)
779
638
  when :html, :html4, :html5
780
- HtmlComparator.equivalent?(obj1, obj2, opts)
639
+ HtmlComparator.equivalent?(obj1, obj2, resolved)
781
640
  when :json
782
- JsonComparator.equivalent?(obj1, obj2, opts)
641
+ JsonComparator.equivalent?(obj1, obj2, resolved)
783
642
  when :yaml
784
- YamlComparator.equivalent?(obj1, obj2, opts)
643
+ YamlComparator.equivalent?(obj1, obj2, resolved)
785
644
  end
786
645
  end
787
646
 
647
+ # Pick the format used for actual comparison.
648
+ #
649
+ # When comparing ruby_object with json/yaml, use the json/yaml side
650
+ # so both inputs parse to the same Ruby structure. When both sides
651
+ # are ruby_object (or the other side is not json/yaml), default to
652
+ # JSON since ruby_object has no comparator of its own.
653
+ #
654
+ # @param format1 [Symbol]
655
+ # @param format2 [Symbol]
656
+ # @return [Symbol]
657
+ def normalize_comparison_format(format1, format2)
658
+ return format2 if format1 == :ruby_object &&
659
+ %i[json yaml].include?(format2)
660
+ return :json if format1 == :ruby_object
661
+
662
+ format1
663
+ end
664
+
788
665
  # Strip XML declarations and DOCTYPE preambles from an HTML string
789
666
  # so it can be safely parsed with Nokogiri::XML.fragment without
790
667
  # generating processing-instruction nodes.