canon 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +25 -135
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/advanced/extending-canon.adoc +193 -0
  6. data/docs/internals/diffnode-enrichment.adoc +611 -0
  7. data/docs/internals/index.adoc +251 -0
  8. data/docs/lychee.toml +13 -6
  9. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
  10. data/docs/understanding/architecture.adoc +749 -33
  11. data/docs/understanding/comparison-pipeline.adoc +122 -0
  12. data/false_positive_analysis.txt +0 -0
  13. data/file1.html +1 -0
  14. data/file2.html +1 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +86 -0
  27. data/lib/canon/comparison/html_comparator.rb +51 -18
  28. data/lib/canon/comparison/html_parser.rb +80 -0
  29. data/lib/canon/comparison/json_comparator.rb +12 -0
  30. data/lib/canon/comparison/json_parser.rb +19 -0
  31. data/lib/canon/comparison/markup_comparator.rb +293 -0
  32. data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
  33. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  34. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  35. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  36. data/lib/canon/comparison/match_options.rb +68 -463
  37. data/lib/canon/comparison/profile_definition.rb +149 -0
  38. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  39. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  40. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  41. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  42. data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
  43. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  44. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  45. data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
  46. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
  47. data/lib/canon/comparison/xml_comparator.rb +52 -664
  48. data/lib/canon/comparison/xml_node_comparison.rb +297 -0
  49. data/lib/canon/comparison/xml_parser.rb +19 -0
  50. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  51. data/lib/canon/comparison.rb +265 -110
  52. data/lib/canon/diff/diff_node.rb +32 -2
  53. data/lib/canon/diff/node_serializer.rb +191 -0
  54. data/lib/canon/diff/path_builder.rb +143 -0
  55. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  56. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  57. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  58. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  59. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  60. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  61. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  64. data/lib/canon/diff_formatter.rb +1 -1
  65. data/lib/canon/rspec_matchers.rb +1 -1
  66. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  67. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  68. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  69. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  70. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  71. data/lib/canon/version.rb +1 -1
  72. data/old-docs/ADVANCED_TOPICS.adoc +20 -0
  73. data/old-docs/BASIC_USAGE.adoc +16 -0
  74. data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
  75. data/old-docs/CLI.adoc +497 -0
  76. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  77. data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
  78. data/old-docs/DIFF_FORMATTING.adoc +540 -0
  79. data/old-docs/DIFF_PARAMETERS.adoc +261 -0
  80. data/old-docs/DOM_DIFF.adoc +1017 -0
  81. data/old-docs/ENV_CONFIG.adoc +876 -0
  82. data/old-docs/FORMATS.adoc +867 -0
  83. data/old-docs/INPUT_VALIDATION.adoc +477 -0
  84. data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
  85. data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
  86. data/old-docs/MATCH_OPTIONS.adoc +912 -0
  87. data/old-docs/MODES.adoc +432 -0
  88. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  89. data/old-docs/OPTIONS.adoc +1387 -0
  90. data/old-docs/PREPROCESSING.adoc +491 -0
  91. data/old-docs/README.old.adoc +2831 -0
  92. data/old-docs/RSPEC.adoc +814 -0
  93. data/old-docs/RUBY_API.adoc +485 -0
  94. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
  95. data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
  96. data/old-docs/STRING_COMPARE.adoc +345 -0
  97. data/old-docs/TMP.adoc +3384 -0
  98. data/old-docs/TREE_DIFF.adoc +1080 -0
  99. data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
  100. data/old-docs/VERBOSE.adoc +482 -0
  101. data/old-docs/VISUALIZATION_MAP.adoc +625 -0
  102. data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
  103. data/scripts/analyze_current_state.rb +85 -0
  104. data/scripts/analyze_false_positives.rb +114 -0
  105. data/scripts/analyze_remaining_failures.rb +105 -0
  106. data/scripts/compare_current_failures.rb +95 -0
  107. data/scripts/compare_dom_tree_diff.rb +158 -0
  108. data/scripts/compare_failures.rb +151 -0
  109. data/scripts/debug_attribute_extraction.rb +66 -0
  110. data/scripts/debug_blocks_839.rb +115 -0
  111. data/scripts/debug_meta_matching.rb +52 -0
  112. data/scripts/debug_p_matching.rb +192 -0
  113. data/scripts/debug_signature_matching.rb +118 -0
  114. data/scripts/debug_sourcecode_124.rb +32 -0
  115. data/scripts/debug_whitespace_sensitive.rb +192 -0
  116. data/scripts/extract_false_positives.rb +138 -0
  117. data/scripts/find_actual_false_positives.rb +125 -0
  118. data/scripts/investigate_all_false_positives.rb +161 -0
  119. data/scripts/investigate_batch1.rb +127 -0
  120. data/scripts/investigate_classification.rb +150 -0
  121. data/scripts/investigate_classification_detailed.rb +190 -0
  122. data/scripts/investigate_common_failures.rb +342 -0
  123. data/scripts/investigate_false_negative.rb +80 -0
  124. data/scripts/investigate_false_positive.rb +83 -0
  125. data/scripts/investigate_false_positives.rb +227 -0
  126. data/scripts/investigate_false_positives_batch.rb +163 -0
  127. data/scripts/investigate_mixed_content.rb +125 -0
  128. data/scripts/investigate_remaining_16.rb +214 -0
  129. data/scripts/run_single_test.rb +29 -0
  130. data/scripts/test_all_false_positives.rb +95 -0
  131. data/scripts/test_attribute_details.rb +61 -0
  132. data/scripts/test_both_algorithms.rb +49 -0
  133. data/scripts/test_both_simple.rb +49 -0
  134. data/scripts/test_enhanced_semantic_output.rb +125 -0
  135. data/scripts/test_readme_examples.rb +131 -0
  136. data/scripts/test_semantic_tree_diff.rb +99 -0
  137. data/scripts/test_semantic_ux_improvements.rb +135 -0
  138. data/scripts/test_single_false_positive.rb +119 -0
  139. data/scripts/test_size_limits.rb +99 -0
  140. data/test_html_1.html +21 -0
  141. data/test_html_2.html +21 -0
  142. data/test_nokogiri.rb +33 -0
  143. data/test_normalize.rb +45 -0
  144. metadata +123 -2
@@ -1,18 +1,31 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "../xml/c14n"
4
+ require_relative "markup_comparator"
4
5
  require_relative "match_options"
5
6
  require_relative "../diff/diff_node"
6
7
  require_relative "../diff/diff_classifier"
8
+ require_relative "../diff/path_builder"
9
+ require_relative "../diff/node_serializer"
7
10
  require_relative "comparison_result"
8
11
  require_relative "../tree_diff"
9
12
  require_relative "strategies/match_strategy_factory"
13
+ # XmlComparator modules
14
+ require_relative "xml_comparator/node_parser"
15
+ require_relative "xml_comparator/attribute_filter"
16
+ require_relative "xml_comparator/attribute_comparator"
17
+ require_relative "xml_comparator/namespace_comparator"
18
+ require_relative "xml_comparator/node_type_comparator"
19
+ require_relative "xml_comparator/child_comparison"
20
+ require_relative "xml_comparator/diff_node_builder"
10
21
 
11
22
  module Canon
12
23
  module Comparison
13
24
  # XML comparison class
14
25
  # Handles comparison of XML nodes with various options
15
- class XmlComparator
26
+ #
27
+ # Inherits shared comparison functionality from MarkupComparator.
28
+ class XmlComparator < MarkupComparator
16
29
  # Default comparison options for XML
17
30
  DEFAULT_OPTS = {
18
31
  # Structural filtering options
@@ -111,8 +124,8 @@ module Canon
111
124
  # Serialize parsed nodes for consistent formatting
112
125
  # This ensures both sides formatted identically, showing only real differences
113
126
  preprocessed = [
114
- serialize_node_to_xml(node1).gsub(/></, ">\n<"),
115
- serialize_node_to_xml(node2).gsub(/></, ">\n<"),
127
+ serialize_node(node1).gsub(/></, ">\n<"),
128
+ serialize_node(node2).gsub(/></, ">\n<"),
116
129
  ]
117
130
 
118
131
  ComparisonResult.new(
@@ -195,43 +208,9 @@ module Canon
195
208
 
196
209
  # Parse a node from string or return as-is
197
210
  # Applies preprocessing transformation before parsing if specified
211
+ # Delegates to NodeParser module
198
212
  def parse_node(node, preprocessing = :none)
199
- # If already a Canon::Xml::Node, return as-is
200
- return node if node.is_a?(Canon::Xml::Node)
201
-
202
- # If it's a Nokogiri or Moxml node, convert to DataModel
203
- unless node.is_a?(String)
204
- # Convert to XML string then parse through DataModel
205
- xml_str = if node.respond_to?(:to_xml)
206
- node.to_xml
207
- elsif node.respond_to?(:to_s)
208
- node.to_s
209
- else
210
- raise Canon::Error,
211
- "Unable to convert node to string: #{node.class}"
212
- end
213
- return Canon::Xml::DataModel.from_xml(xml_str)
214
- end
215
-
216
- # Apply preprocessing to XML string before parsing
217
- xml_string = case preprocessing
218
- when :normalize
219
- # Normalize whitespace: collapse runs, trim lines
220
- node.lines.map(&:strip).reject(&:empty?).join("\n")
221
- when :c14n
222
- # Canonicalize the XML
223
- Canon::Xml::C14n.canonicalize(node,
224
- with_comments: false)
225
- when :format
226
- # Pretty format the XML
227
- Canon.format(node, :xml)
228
- else
229
- # :none or unrecognized - use as-is
230
- node
231
- end
232
-
233
- # Use Canon::Xml::DataModel for parsing to get Canon::Xml::Node instances
234
- Canon::Xml::DataModel.from_xml(xml_string)
213
+ XmlComparatorHelpers::NodeParser.parse(node, preprocessing)
235
214
  end
236
215
 
237
216
  # Main comparison dispatcher
@@ -279,51 +258,15 @@ module Canon
279
258
  return Comparison::UNEQUAL_NODES_TYPES
280
259
  end
281
260
 
282
- # Dispatch based on node type
283
- # Canon::Xml::Node types use .node_type method that returns symbols
284
- # Nokogiri also has .node_type but returns integers, so check for Symbol
285
- if n1.respond_to?(:node_type) && n2.respond_to?(:node_type) &&
286
- n1.node_type.is_a?(Symbol) && n2.node_type.is_a?(Symbol)
287
- case n1.node_type
288
- when :root
289
- compare_children(n1, n2, opts, child_opts, diff_children,
290
- differences)
291
- when :element
292
- compare_element_nodes(n1, n2, opts, child_opts, diff_children,
293
- differences)
294
- when :text
295
- compare_text_nodes(n1, n2, opts, differences)
296
- when :comment
297
- compare_comment_nodes(n1, n2, opts, differences)
298
- when :cdata
299
- compare_text_nodes(n1, n2, opts, differences)
300
- when :processing_instruction
301
- compare_processing_instruction_nodes(n1, n2, opts, differences)
302
- else
303
- Comparison::EQUIVALENT
304
- end
305
- # Moxml/Nokogiri types use .element?, .text?, etc. methods
306
- elsif n1.respond_to?(:element?) && n1.element?
307
- compare_element_nodes(n1, n2, opts, child_opts, diff_children,
308
- differences)
309
- elsif n1.respond_to?(:text?) && n1.text?
310
- compare_text_nodes(n1, n2, opts, differences)
311
- elsif n1.respond_to?(:comment?) && n1.comment?
312
- compare_comment_nodes(n1, n2, opts, differences)
313
- elsif n1.respond_to?(:cdata?) && n1.cdata?
314
- compare_text_nodes(n1, n2, opts, differences)
315
- elsif n1.respond_to?(:processing_instruction?) &&
316
- n1.processing_instruction?
317
- compare_processing_instruction_nodes(n1, n2, opts, differences)
318
- elsif n1.respond_to?(:root)
319
- # Document node (Moxml/Nokogiri - legacy path)
320
- compare_document_nodes(n1, n2, opts, child_opts, diff_children,
321
- differences)
322
- else
323
- Comparison::EQUIVALENT
324
- end
261
+ # Dispatch based on node type using NodeTypeComparator strategy
262
+ XmlComparatorHelpers::NodeTypeComparator.compare(
263
+ n1, n2, self, opts, child_opts, diff_children, differences
264
+ )
325
265
  end
326
266
 
267
+ # Public comparison methods - exposed for XmlNodeComparison module
268
+ public
269
+
327
270
  # Compare two element nodes
328
271
  def compare_element_nodes(n1, n2, opts, child_opts, diff_children,
329
272
  differences)
@@ -369,153 +312,10 @@ module Canon
369
312
  end
370
313
 
371
314
  # Compare attribute sets
315
+ # Delegates to XmlComparatorHelpers::AttributeComparator
372
316
  def compare_attribute_sets(n1, n2, opts, differences)
373
- # Get attributes using the appropriate method for each node type
374
- raw_attrs1 = n1.respond_to?(:attribute_nodes) ? n1.attribute_nodes : n1.attributes
375
- raw_attrs2 = n2.respond_to?(:attribute_nodes) ? n2.attribute_nodes : n2.attributes
376
-
377
- attrs1 = filter_attributes(raw_attrs1, opts)
378
- attrs2 = filter_attributes(raw_attrs2, opts)
379
-
380
- match_opts = opts[:match_opts]
381
- attribute_order_behavior = match_opts[:attribute_order] || :strict
382
-
383
- # Check attribute order if not ignored
384
- keys1 = attrs1.keys.map(&:to_s)
385
- keys2 = attrs2.keys.map(&:to_s)
386
- if attribute_order_behavior == :strict
387
- # Strict mode: attribute order matters
388
- # Check if keys are in same order
389
-
390
- if keys1 != keys2
391
- # Keys are different or in different order
392
- # First check if it's just ordering (same keys, different order)
393
- if keys1.sort == keys2.sort
394
- # Same keys, different order - this is an attribute_order difference
395
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
396
- Comparison::UNEQUAL_ATTRIBUTES,
397
- :attribute_order, opts, differences)
398
- return Comparison::UNEQUAL_ATTRIBUTES
399
- else
400
- # Different keys - this is attribute_presence difference
401
- add_difference(n1, n2, Comparison::MISSING_ATTRIBUTE,
402
- Comparison::MISSING_ATTRIBUTE,
403
- :attribute_presence, opts, differences)
404
- return Comparison::MISSING_ATTRIBUTE
405
- end
406
- end
407
-
408
- # Order matches, now check values in order
409
- else
410
- # Ignore/normalize mode: attribute order doesn't affect equivalence
411
- # But in verbose mode, we should still track order differences as informative
412
-
413
- # Check if order differs (but keys are the same)
414
- if keys1 != keys2 && keys1.sort == keys2.sort && opts[:verbose]
415
- # Same keys, different order - create informative DiffNode
416
- # This allows line diffs to be properly classified as informative
417
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
418
- Comparison::UNEQUAL_ATTRIBUTES,
419
- :attribute_order, opts, differences)
420
- end
421
-
422
- # Sort attributes so order doesn't matter for comparison
423
- attrs1 = attrs1.sort_by { |k, _v| k.to_s }.to_h
424
- attrs2 = attrs2.sort_by { |k, _v| k.to_s }.to_h
425
-
426
- unless attrs1.keys.map(&:to_s).sort == attrs2.keys.map(&:to_s).sort
427
- add_difference(n1, n2, Comparison::MISSING_ATTRIBUTE,
428
- Comparison::MISSING_ATTRIBUTE,
429
- :attribute_presence, opts, differences)
430
- return Comparison::MISSING_ATTRIBUTE
431
- end
432
-
433
- end
434
- attrs1.each do |name, value|
435
- unless attrs2[name] == value
436
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
437
- Comparison::UNEQUAL_ATTRIBUTES,
438
- :attribute_values, opts, differences)
439
- return Comparison::UNEQUAL_ATTRIBUTES
440
- end
441
- end
442
-
443
- Comparison::EQUIVALENT
444
- end
445
-
446
- # Filter attributes based on options
447
- def filter_attributes(attributes, opts)
448
- filtered = {}
449
- match_opts = opts[:match_opts]
450
-
451
- # Handle Canon::Xml::Node attribute format (array of AttributeNode)
452
- if attributes.is_a?(Array)
453
- attributes.each do |attr|
454
- name = attr.name
455
- value = attr.value
456
-
457
- # Skip namespace declarations - they're handled separately
458
- next if is_namespace_declaration?(name)
459
-
460
- # Skip if attribute name should be ignored
461
- next if should_ignore_attr_by_name?(name, opts)
462
-
463
- # Skip if attribute content should be ignored
464
- next if should_ignore_attr_content?(value, opts)
465
-
466
- # Apply match options for attribute values
467
- behavior = match_opts[:attribute_values] || :strict
468
- value = MatchOptions.process_attribute_value(value, behavior)
469
-
470
- filtered[name] = value
471
- end
472
- else
473
- # Handle Nokogiri and Moxml attribute formats (Hash-like):
474
- # - Nokogiri: key is String name, val is Nokogiri::XML::Attr object
475
- # - Moxml: key is Moxml::Attribute object, val is nil
476
- attributes.each do |key, val|
477
- if key.is_a?(String)
478
- # Nokogiri format: key=name (String), val=attr object
479
- name = key
480
- value = val.respond_to?(:value) ? val.value : val.to_s
481
- else
482
- # Moxml format: key=attr object, val=nil
483
- name = key.respond_to?(:name) ? key.name : key.to_s
484
- value = key.respond_to?(:value) ? key.value : key.to_s
485
- end
486
-
487
- # Skip namespace declarations - they're handled separately
488
- next if is_namespace_declaration?(name)
489
-
490
- # Skip if attribute name should be ignored
491
- next if should_ignore_attr_by_name?(name, opts)
492
-
493
- # Skip if attribute content should be ignored
494
- next if should_ignore_attr_content?(value, opts)
495
-
496
- # Apply match options for attribute values
497
- behavior = match_opts[:attribute_values] || :strict
498
- value = MatchOptions.process_attribute_value(value, behavior)
499
-
500
- filtered[name] = value
501
- end
502
- end
503
-
504
- filtered
505
- end
506
-
507
- # Check if attribute should be ignored by name
508
- def should_ignore_attr_by_name?(name, opts)
509
- opts[:ignore_attrs_by_name].any? do |pattern|
510
- name.include?(pattern)
511
- end
512
- end
513
-
514
- # Check if attribute should be ignored by content
515
- def should_ignore_attr_content?(value, opts)
516
- opts[:ignore_attr_content].any? do |pattern|
517
- value.to_s.include?(pattern)
518
- end
317
+ XmlComparatorHelpers::AttributeComparator.compare(n1, n2, opts,
318
+ differences)
519
319
  end
520
320
 
521
321
  # Compare text nodes
@@ -566,19 +366,6 @@ module Canon
566
366
  matches_per_behavior ? Comparison::EQUIVALENT : Comparison::UNEQUAL_TEXT_CONTENTS
567
367
  end
568
368
 
569
- # Check if the difference between two texts is only whitespace-related
570
- # @param text1 [String] First text
571
- # @param text2 [String] Second text
572
- # @return [Boolean] true if difference is only in whitespace
573
- def whitespace_only_difference?(text1, text2)
574
- # Normalize both texts (collapse/trim whitespace)
575
- norm1 = MatchOptions.normalize_text(text1)
576
- norm2 = MatchOptions.normalize_text(text2)
577
-
578
- # If normalized texts are the same, the difference was only whitespace
579
- norm1 == norm2
580
- end
581
-
582
369
  # Check if whitespace should be preserved strictly for these text nodes
583
370
  # This applies to HTML elements like pre, code, textarea, script, style
584
371
  def should_preserve_whitespace_strictly?(n1, n2)
@@ -680,258 +467,12 @@ module Canon
680
467
 
681
468
  # Compare children of two nodes using semantic matching
682
469
  #
683
- # Uses ElementMatcher to pair children semantically (by identity attributes
684
- # or position), then compares matched pairs and detects position changes.
685
- def compare_children(n1, n2, opts, child_opts, diff_children,
686
- differences)
687
- children1 = filter_children(n1.children, opts)
688
- children2 = filter_children(n2.children, opts)
689
-
690
- # Quick check: if both have no children, they're equivalent
691
- return Comparison::EQUIVALENT if children1.empty? && children2.empty?
692
-
693
- # Check if we can use ElementMatcher (requires Canon::Xml::DataModel nodes)
694
- # ElementMatcher expects nodes with .node_type method that returns symbols
695
- # and only works with element nodes (filters out text, comment, etc.)
696
- can_use_matcher = children1.all? do |c|
697
- c.is_a?(Canon::Xml::Node) && c.node_type == :element
698
- end &&
699
- children2.all? { |c| c.is_a?(Canon::Xml::Node) && c.node_type == :element }
700
-
701
- if can_use_matcher && !children1.empty? && !children2.empty?
702
- # Use ElementMatcher for semantic matching with position tracking
703
- use_element_matcher_comparison(children1, children2, n1, opts,
704
- child_opts, diff_children, differences)
705
- else
706
- # Fall back to simple positional comparison for Moxml/Nokogiri nodes
707
- # Length check
708
- unless children1.length == children2.length
709
- # Determine dimension based on type of first differing child
710
- # When lengths differ, find which child is missing/extra
711
- dimension = :text_content # default
712
-
713
- # Compare position by position to find first difference
714
- max_len = [children1.length, children2.length].max
715
- (0...max_len).each do |i|
716
- if i >= children1.length
717
- # Extra child in children2
718
- dimension = determine_node_dimension(children2[i])
719
- break
720
- elsif i >= children2.length
721
- # Extra child in children1
722
- dimension = determine_node_dimension(children1[i])
723
- break
724
- elsif !same_node_type?(children1[i], children2[i])
725
- # Different node types at same position
726
- dimension = determine_node_dimension(children1[i])
727
- break
728
- end
729
- end
730
-
731
- add_difference(n1, n2, Comparison::MISSING_NODE,
732
- Comparison::MISSING_NODE, dimension, opts,
733
- differences)
734
- return Comparison::MISSING_NODE
735
- end
736
-
737
- # Compare children pairwise by position
738
- result = Comparison::EQUIVALENT
739
- children1.zip(children2).each do |child1, child2|
740
- child_result = compare_nodes(child1, child2, child_opts, child_opts,
741
- diff_children, differences)
742
- result = child_result unless child_result == Comparison::EQUIVALENT
743
- end
744
-
745
- result
746
- end
747
- end
748
-
749
- # Use ElementMatcher for semantic comparison (Canon::Xml::DataModel nodes)
750
- def use_element_matcher_comparison(children1, children2, parent_node,
751
- opts, child_opts, diff_children,
752
- differences)
753
- require_relative "../xml/element_matcher"
754
-
755
- # Create temporary RootNode wrappers to use ElementMatcher
756
- # Don't modify parent pointers - just set @children directly
757
- require_relative "../xml/nodes/root_node"
758
-
759
- temp_root1 = Canon::Xml::Nodes::RootNode.new
760
- temp_root1.instance_variable_set(:@children, children1.dup)
761
-
762
- temp_root2 = Canon::Xml::Nodes::RootNode.new
763
- temp_root2.instance_variable_set(:@children, children2.dup)
764
-
765
- matcher = Canon::Xml::ElementMatcher.new
766
- matches = matcher.match_trees(temp_root1, temp_root2)
767
-
768
- # Filter matches to only include direct children
769
- # match_trees returns ALL descendants, but we only want direct children
770
- matches = matches.select do |m|
771
- (m.elem1.nil? || children1.include?(m.elem1)) &&
772
- (m.elem2.nil? || children2.include?(m.elem2))
773
- end
774
-
775
- # If no matches and children exist, they're all different
776
- if matches.empty? && (!children1.empty? || !children2.empty?)
777
- add_difference(parent_node, parent_node, Comparison::MISSING_NODE,
778
- Comparison::MISSING_NODE, :text_content, opts,
779
- differences)
780
- return Comparison::UNEQUAL_ELEMENTS
781
- end
782
-
783
- all_equivalent = true
784
-
785
- matches.each do |match|
786
- case match.status
787
- when :matched
788
- # Check if element position changed
789
- if match.position_changed?
790
- match_opts = opts[:match_opts]
791
- position_behavior = match_opts[:element_position] || :strict
792
-
793
- # Only create DiffNode if element_position is not :ignore
794
- if position_behavior != :ignore
795
- add_difference(
796
- match.elem1,
797
- match.elem2,
798
- "position #{match.pos1}",
799
- "position #{match.pos2}",
800
- :element_position,
801
- opts,
802
- differences,
803
- )
804
- all_equivalent = false if position_behavior == :strict
805
- end
806
- end
807
-
808
- # Compare the matched elements for content/attribute differences
809
- result = compare_nodes(match.elem1, match.elem2, child_opts,
810
- child_opts, diff_children, differences)
811
- all_equivalent = false unless result == Comparison::EQUIVALENT
812
-
813
- when :deleted
814
- # Element present in first tree but not second
815
- add_difference(match.elem1, nil, Comparison::MISSING_NODE,
816
- Comparison::MISSING_NODE, :element_structure, opts,
817
- differences)
818
- all_equivalent = false
819
-
820
- when :inserted
821
- # Element present in second tree but not first
822
- add_difference(nil, match.elem2, Comparison::MISSING_NODE,
823
- Comparison::MISSING_NODE, :element_structure, opts,
824
- differences)
825
- all_equivalent = false
826
- end
827
- end
828
-
829
- all_equivalent ? Comparison::EQUIVALENT : Comparison::UNEQUAL_ELEMENTS
830
- end
831
-
832
- # Filter children based on options
833
- def filter_children(children, opts)
834
- children.reject do |child|
835
- node_excluded?(child, opts)
836
- end
837
- end
838
-
839
- # Check if node should be excluded
840
- def node_excluded?(node, opts)
841
- match_opts = opts[:match_opts]
842
-
843
- # Determine node type
844
- # Canon::Xml::Node uses node_type that returns Symbol
845
- # Nokogiri uses node_type that returns Integer, so check for Symbol first
846
- if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
847
- node.node_type == :comment
848
- else
849
- node.respond_to?(:comment?) && node.comment?
850
- end
851
-
852
- is_text = if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
853
- node.node_type == :text
854
- else
855
- node.respond_to?(:text?) && node.text?
856
- end
857
-
858
- # Ignore text nodes if specified
859
- return true if opts[:ignore_text_nodes] && is_text
860
-
861
- # Ignore whitespace-only text nodes based on structural_whitespace
862
- # Both :ignore and :normalize should filter out whitespace-only nodes
863
- if %i[ignore
864
- normalize].include?(match_opts[:structural_whitespace]) && is_text
865
- text = node_text(node)
866
- return true if MatchOptions.normalize_text(text).empty?
867
- end
868
-
869
- false
870
- end
871
-
872
- # Determine the appropriate dimension for a node type
873
- # @param node [Object] The node to check
874
- # @return [Symbol] The dimension symbol
875
- def determine_node_dimension(node)
876
- # Canon::Xml::Node types
877
- if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
878
- case node.node_type
879
- when :comment then :comments
880
- when :text, :cdata then :text_content
881
- when :processing_instruction then :processing_instructions
882
- else :text_content
883
- end
884
- # Moxml/Nokogiri types
885
- elsif node.respond_to?(:comment?) && node.comment?
886
- :comments
887
- elsif node.respond_to?(:text?) && node.text?
888
- :text_content
889
- elsif node.respond_to?(:cdata?) && node.cdata?
890
- :text_content
891
- elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
892
- :processing_instructions
893
- else
894
- :text_content
895
- end
896
- end
897
-
898
- # Check if two nodes are the same type
899
- def same_node_type?(n1, n2)
900
- # Canon::Xml::Node types - check node_type method
901
- if n1.respond_to?(:node_type) && n2.respond_to?(:node_type)
902
- return n1.node_type == n2.node_type
903
- end
904
-
905
- # Moxml/Nokogiri types - check individual type methods
906
- return true if n1.respond_to?(:element?) && n1.element? &&
907
- n2.respond_to?(:element?) && n2.element?
908
- return true if n1.respond_to?(:text?) && n1.text? &&
909
- n2.respond_to?(:text?) && n2.text?
910
- return true if n1.respond_to?(:comment?) && n1.comment? &&
911
- n2.respond_to?(:comment?) && n2.comment?
912
- return true if n1.respond_to?(:cdata?) && n1.cdata? &&
913
- n2.respond_to?(:cdata?) && n2.cdata?
914
- return true if n1.respond_to?(:processing_instruction?) &&
915
- n1.processing_instruction? &&
916
- n2.respond_to?(:processing_instruction?) &&
917
- n2.processing_instruction?
918
- return true if n1.respond_to?(:root) && n2.respond_to?(:root)
919
-
920
- false
921
- end
922
-
923
- # Get text content from a node
924
- def node_text(node)
925
- # Canon::Xml::Node TextNode uses .value
926
- if node.respond_to?(:value)
927
- node.value.to_s
928
- elsif node.respond_to?(:content)
929
- node.content.to_s
930
- elsif node.respond_to?(:text)
931
- node.text.to_s
932
- else
933
- ""
934
- end
470
+ # Delegates to ChildComparison module which handles both ElementMatcher
471
+ # (semantic matching) and simple positional comparison.
472
+ def compare_children(n1, n2, opts, child_opts, diff_children, differences)
473
+ XmlComparatorHelpers::ChildComparison.compare(
474
+ n1, n2, self, opts, child_opts, diff_children, differences
475
+ )
935
476
  end
936
477
 
937
478
  # Extract element path for context (best effort)
@@ -960,67 +501,24 @@ module Canon
960
501
  path
961
502
  end
962
503
 
963
- # Serialize a node to XML string
964
- # @param node [Canon::Xml::Node, Object] Node to serialize
965
- # @return [String] XML string representation
966
- def serialize_node_to_xml(node)
967
- if node.is_a?(Canon::Xml::Nodes::RootNode)
968
- # Serialize all children of root
969
- node.children.map { |child| serialize_node_to_xml(child) }.join
970
- elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
971
- # Serialize element with attributes and children
972
- attrs = node.attribute_nodes.map do |a|
973
- " #{a.name}=\"#{a.value}\""
974
- end.join
975
- children_xml = node.children.map do |c|
976
- serialize_node_to_xml(c)
977
- end.join
978
-
979
- if children_xml.empty?
980
- "<#{node.name}#{attrs}/>"
981
- else
982
- "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
983
- end
984
- elsif node.is_a?(Canon::Xml::Nodes::TextNode)
985
- node.value
986
- elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
987
- "<!--#{node.value}-->"
988
- elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
989
- "<?#{node.target} #{node.data}?>"
990
- elsif node.respond_to?(:to_xml)
991
- node.to_xml
992
- else
993
- node.to_s
994
- end
995
- end
504
+ # Serialize a node to string for display
505
+ #
506
+ # @param node [Object, nil] Node to serialize
507
+ # @return [String, nil] Serialized content
508
+ def serialize_node(node)
509
+ return nil if node.nil?
996
510
 
997
- # Add a difference to the differences array
998
- # @param node1 [Object] First node
999
- # @param node2 [Object] Second node
1000
- # @param diff1 [String] Difference type for node1
1001
- # @param diff2 [String] Difference type for node2
1002
- # @param dimension [Symbol] The match dimension causing this difference
1003
- # @param opts [Hash] Options
1004
- # @param differences [Array] Array to append difference to
1005
- def add_difference(node1, node2, diff1, diff2, dimension, _opts,
1006
- differences)
1007
- # All differences must be DiffNode objects (OO architecture)
1008
- if dimension.nil?
1009
- raise ArgumentError,
1010
- "dimension required for DiffNode"
1011
- end
511
+ Canon::Diff::NodeSerializer.serialize(node)
512
+ end
1012
513
 
1013
- # Build informative reason message
1014
- reason = build_difference_reason(node1, node2, diff1, diff2,
1015
- dimension)
514
+ # Extract attributes from a node as a normalized hash
515
+ #
516
+ # @param node [Object, nil] Node to extract attributes from
517
+ # @return [Hash, nil] Normalized attributes hash
518
+ def extract_attributes(node)
519
+ return nil if node.nil?
1016
520
 
1017
- diff_node = Canon::Diff::DiffNode.new(
1018
- node1: node1,
1019
- node2: node2,
1020
- dimension: dimension,
1021
- reason: reason,
1022
- )
1023
- differences << diff_node
521
+ Canon::Diff::NodeSerializer.extract_attributes(node)
1024
522
  end
1025
523
 
1026
524
  # Build a human-readable reason for a difference
@@ -1049,120 +547,10 @@ module Canon
1049
547
  end
1050
548
 
1051
549
  # Compare namespace declarations (xmlns and xmlns:* attributes)
1052
- # @param n1 [Object] First node
1053
- # @param n2 [Object] Second node
1054
- # @param opts [Hash] Options
1055
- # @param differences [Array] Array to append differences to
1056
- # @return [Symbol] Comparison result
550
+ # Delegates to XmlComparatorHelpers::NamespaceComparator
1057
551
  def compare_namespace_declarations(n1, n2, opts, differences)
1058
- ns_decls1 = extract_namespace_declarations(n1)
1059
- ns_decls2 = extract_namespace_declarations(n2)
1060
-
1061
- # Find missing, extra, and changed namespace declarations
1062
- missing = ns_decls1.keys - ns_decls2.keys # In n1 but not n2
1063
- extra = ns_decls2.keys - ns_decls1.keys # In n2 but not n1
1064
- changed = ns_decls1.select do |prefix, uri|
1065
- ns_decls2[prefix] && ns_decls2[prefix] != uri
1066
- end.keys
1067
-
1068
- # If there are any differences, create a DiffNode
1069
- if missing.any? || extra.any? || changed.any?
1070
- # Build a descriptive reason
1071
- reasons = []
1072
- if missing.any?
1073
- reasons << "removed: #{missing.map do |p|
1074
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1075
- end.join(', ')}"
1076
- end
1077
- if extra.any?
1078
- reasons << "added: #{extra.map do |p|
1079
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1080
- end.join(', ')}"
1081
- end
1082
- if changed.any?
1083
- reasons << "changed: #{changed.map do |p|
1084
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1085
- end.join(', ')}"
1086
- end
1087
-
1088
- add_difference(
1089
- n1,
1090
- n2,
1091
- Comparison::UNEQUAL_ATTRIBUTES,
1092
- Comparison::UNEQUAL_ATTRIBUTES,
1093
- :namespace_declarations,
1094
- opts,
1095
- differences,
1096
- )
1097
- return Comparison::UNEQUAL_ATTRIBUTES
1098
- end
1099
-
1100
- Comparison::EQUIVALENT
1101
- end
1102
-
1103
- # Extract namespace declarations from a node
1104
- # @param node [Object] Node to extract namespace declarations from
1105
- # @return [Hash] Hash of prefix => URI mappings
1106
- def extract_namespace_declarations(node)
1107
- declarations = {}
1108
-
1109
- # Handle Canon::Xml::Node (uses namespace_nodes)
1110
- if node.respond_to?(:namespace_nodes)
1111
- node.namespace_nodes.each do |ns|
1112
- # Skip the implicit xml namespace (always present)
1113
- next if ns.prefix == "xml" && ns.uri == "http://www.w3.org/XML/1998/namespace"
1114
-
1115
- prefix = ns.prefix || ""
1116
- declarations[prefix] = ns.uri
1117
- end
1118
- return declarations
1119
- end
1120
-
1121
- # Handle Nokogiri/Moxml nodes (use attributes)
1122
- # Get raw attributes
1123
- raw_attrs = node.respond_to?(:attribute_nodes) ? node.attribute_nodes : node.attributes
1124
-
1125
- # Handle Canon::Xml::Node attribute format (array of AttributeNode)
1126
- if raw_attrs.is_a?(Array)
1127
- raw_attrs.each do |attr|
1128
- name = attr.name
1129
- value = attr.value
1130
-
1131
- if is_namespace_declaration?(name)
1132
- # Extract prefix: "xmlns" -> "", "xmlns:xmi" -> "xmi"
1133
- prefix = name == "xmlns" ? "" : name.split(":", 2)[1]
1134
- declarations[prefix] = value
1135
- end
1136
- end
1137
- else
1138
- # Handle Nokogiri and Moxml attribute formats (Hash-like)
1139
- raw_attrs.each do |key, val|
1140
- if key.is_a?(String)
1141
- # Nokogiri format: key=name (String), val=attr object
1142
- name = key
1143
- value = val.respond_to?(:value) ? val.value : val.to_s
1144
- else
1145
- # Moxml format: key=attr object, val=nil
1146
- name = key.respond_to?(:name) ? key.name : key.to_s
1147
- value = key.respond_to?(:value) ? key.value : key.to_s
1148
- end
1149
-
1150
- if is_namespace_declaration?(name)
1151
- # Extract prefix: "xmlns" -> "", "xmlns:xmi" -> "xmi"
1152
- prefix = name == "xmlns" ? "" : name.split(":", 2)[1]
1153
- declarations[prefix] = value
1154
- end
1155
- end
1156
- end
1157
-
1158
- declarations
1159
- end
1160
-
1161
- # Check if an attribute name is a namespace declaration
1162
- # @param attr_name [String] Attribute name
1163
- # @return [Boolean] true if it's a namespace declaration
1164
- def is_namespace_declaration?(attr_name)
1165
- attr_name == "xmlns" || attr_name.start_with?("xmlns:")
552
+ XmlComparatorHelpers::NamespaceComparator.compare(n1, n2, opts,
553
+ differences)
1166
554
  end
1167
555
  end
1168
556
  end