canon 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +69 -92
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/Gemfile +1 -0
  6. data/docs/_config.yml +90 -1
  7. data/docs/advanced/diff-classification.adoc +82 -2
  8. data/docs/advanced/extending-canon.adoc +193 -0
  9. data/docs/features/match-options/index.adoc +239 -1
  10. data/docs/internals/diffnode-enrichment.adoc +611 -0
  11. data/docs/internals/index.adoc +251 -0
  12. data/docs/lychee.toml +13 -6
  13. data/docs/understanding/architecture.adoc +749 -33
  14. data/docs/understanding/comparison-pipeline.adoc +122 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +87 -0
  27. data/lib/canon/comparison/html_comparator.rb +70 -26
  28. data/lib/canon/comparison/html_compare_profile.rb +8 -2
  29. data/lib/canon/comparison/html_parser.rb +80 -0
  30. data/lib/canon/comparison/json_comparator.rb +12 -0
  31. data/lib/canon/comparison/json_parser.rb +19 -0
  32. data/lib/canon/comparison/markup_comparator.rb +293 -0
  33. data/lib/canon/comparison/match_options/base_resolver.rb +150 -0
  34. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  35. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  36. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  37. data/lib/canon/comparison/match_options.rb +68 -463
  38. data/lib/canon/comparison/profile_definition.rb +149 -0
  39. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  40. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  41. data/lib/canon/comparison/whitespace_sensitivity.rb +208 -0
  42. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  43. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  44. data/lib/canon/comparison/xml_comparator/child_comparison.rb +197 -0
  45. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  46. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  47. data/lib/canon/comparison/xml_comparator/node_parser.rb +79 -0
  48. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +102 -0
  49. data/lib/canon/comparison/xml_comparator.rb +97 -684
  50. data/lib/canon/comparison/xml_node_comparison.rb +319 -0
  51. data/lib/canon/comparison/xml_parser.rb +19 -0
  52. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  53. data/lib/canon/comparison.rb +265 -110
  54. data/lib/canon/diff/diff_classifier.rb +101 -2
  55. data/lib/canon/diff/diff_node.rb +32 -2
  56. data/lib/canon/diff/formatting_detector.rb +1 -1
  57. data/lib/canon/diff/node_serializer.rb +191 -0
  58. data/lib/canon/diff/path_builder.rb +143 -0
  59. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  60. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  61. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  62. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  64. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  65. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  66. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  67. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  68. data/lib/canon/diff_formatter.rb +1 -1
  69. data/lib/canon/rspec_matchers.rb +38 -9
  70. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  71. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  72. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  73. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  74. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +24 -13
  77. metadata +48 -2
@@ -1,18 +1,33 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "../xml/c14n"
4
+ require_relative "markup_comparator"
4
5
  require_relative "match_options"
5
6
  require_relative "../diff/diff_node"
6
7
  require_relative "../diff/diff_classifier"
8
+ require_relative "../diff/path_builder"
9
+ require_relative "../diff/node_serializer"
7
10
  require_relative "comparison_result"
8
11
  require_relative "../tree_diff"
9
12
  require_relative "strategies/match_strategy_factory"
13
+ # XmlComparator modules
14
+ require_relative "xml_comparator/node_parser"
15
+ require_relative "xml_comparator/attribute_filter"
16
+ require_relative "xml_comparator/attribute_comparator"
17
+ require_relative "xml_comparator/namespace_comparator"
18
+ require_relative "xml_comparator/node_type_comparator"
19
+ require_relative "xml_comparator/child_comparison"
20
+ require_relative "xml_comparator/diff_node_builder"
21
+ # Whitespace sensitivity module
22
+ require_relative "whitespace_sensitivity"
10
23
 
11
24
  module Canon
12
25
  module Comparison
13
26
  # XML comparison class
14
27
  # Handles comparison of XML nodes with various options
15
- class XmlComparator
28
+ #
29
+ # Inherits shared comparison functionality from MarkupComparator.
30
+ class XmlComparator < MarkupComparator
16
31
  # Default comparison options for XML
17
32
  DEFAULT_OPTS = {
18
33
  # Structural filtering options
@@ -77,9 +92,15 @@ module Canon
77
92
  # Create child_opts with resolved options
78
93
  child_opts = opts.merge(child_opts)
79
94
 
95
+ # Determine if we should preserve whitespace during parsing
96
+ # When structural_whitespace is :strict, preserve all whitespace-only text nodes
97
+ preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
98
+
80
99
  # Parse nodes if they are strings, applying preprocessing if needed
81
- node1 = parse_node(n1, match_opts_hash[:preprocessing])
82
- node2 = parse_node(n2, match_opts_hash[:preprocessing])
100
+ node1 = parse_node(n1, match_opts_hash[:preprocessing],
101
+ preserve_whitespace: preserve_whitespace)
102
+ node2 = parse_node(n2, match_opts_hash[:preprocessing],
103
+ preserve_whitespace: preserve_whitespace)
83
104
 
84
105
  # Store original strings for line diff display (before preprocessing)
85
106
  original1 = if n1.is_a?(String)
@@ -111,8 +132,8 @@ module Canon
111
132
  # Serialize parsed nodes for consistent formatting
112
133
  # This ensures both sides formatted identically, showing only real differences
113
134
  preprocessed = [
114
- serialize_node_to_xml(node1).gsub(/></, ">\n<"),
115
- serialize_node_to_xml(node2).gsub(/></, ">\n<"),
135
+ serialize_node(node1).gsub(/></, ">\n<"),
136
+ serialize_node(node2).gsub(/></, ">\n<"),
116
137
  ]
117
138
 
118
139
  ComparisonResult.new(
@@ -195,43 +216,10 @@ module Canon
195
216
 
196
217
  # Parse a node from string or return as-is
197
218
  # Applies preprocessing transformation before parsing if specified
198
- def parse_node(node, preprocessing = :none)
199
- # If already a Canon::Xml::Node, return as-is
200
- return node if node.is_a?(Canon::Xml::Node)
201
-
202
- # If it's a Nokogiri or Moxml node, convert to DataModel
203
- unless node.is_a?(String)
204
- # Convert to XML string then parse through DataModel
205
- xml_str = if node.respond_to?(:to_xml)
206
- node.to_xml
207
- elsif node.respond_to?(:to_s)
208
- node.to_s
209
- else
210
- raise Canon::Error,
211
- "Unable to convert node to string: #{node.class}"
212
- end
213
- return Canon::Xml::DataModel.from_xml(xml_str)
214
- end
215
-
216
- # Apply preprocessing to XML string before parsing
217
- xml_string = case preprocessing
218
- when :normalize
219
- # Normalize whitespace: collapse runs, trim lines
220
- node.lines.map(&:strip).reject(&:empty?).join("\n")
221
- when :c14n
222
- # Canonicalize the XML
223
- Canon::Xml::C14n.canonicalize(node,
224
- with_comments: false)
225
- when :format
226
- # Pretty format the XML
227
- Canon.format(node, :xml)
228
- else
229
- # :none or unrecognized - use as-is
230
- node
231
- end
232
-
233
- # Use Canon::Xml::DataModel for parsing to get Canon::Xml::Node instances
234
- Canon::Xml::DataModel.from_xml(xml_string)
219
+ # Delegates to NodeParser module
220
+ def parse_node(node, preprocessing = :none, preserve_whitespace: false)
221
+ XmlComparatorHelpers::NodeParser.parse(node, preprocessing,
222
+ preserve_whitespace: preserve_whitespace)
235
223
  end
236
224
 
237
225
  # Main comparison dispatcher
@@ -279,51 +267,15 @@ module Canon
279
267
  return Comparison::UNEQUAL_NODES_TYPES
280
268
  end
281
269
 
282
- # Dispatch based on node type
283
- # Canon::Xml::Node types use .node_type method that returns symbols
284
- # Nokogiri also has .node_type but returns integers, so check for Symbol
285
- if n1.respond_to?(:node_type) && n2.respond_to?(:node_type) &&
286
- n1.node_type.is_a?(Symbol) && n2.node_type.is_a?(Symbol)
287
- case n1.node_type
288
- when :root
289
- compare_children(n1, n2, opts, child_opts, diff_children,
290
- differences)
291
- when :element
292
- compare_element_nodes(n1, n2, opts, child_opts, diff_children,
293
- differences)
294
- when :text
295
- compare_text_nodes(n1, n2, opts, differences)
296
- when :comment
297
- compare_comment_nodes(n1, n2, opts, differences)
298
- when :cdata
299
- compare_text_nodes(n1, n2, opts, differences)
300
- when :processing_instruction
301
- compare_processing_instruction_nodes(n1, n2, opts, differences)
302
- else
303
- Comparison::EQUIVALENT
304
- end
305
- # Moxml/Nokogiri types use .element?, .text?, etc. methods
306
- elsif n1.respond_to?(:element?) && n1.element?
307
- compare_element_nodes(n1, n2, opts, child_opts, diff_children,
308
- differences)
309
- elsif n1.respond_to?(:text?) && n1.text?
310
- compare_text_nodes(n1, n2, opts, differences)
311
- elsif n1.respond_to?(:comment?) && n1.comment?
312
- compare_comment_nodes(n1, n2, opts, differences)
313
- elsif n1.respond_to?(:cdata?) && n1.cdata?
314
- compare_text_nodes(n1, n2, opts, differences)
315
- elsif n1.respond_to?(:processing_instruction?) &&
316
- n1.processing_instruction?
317
- compare_processing_instruction_nodes(n1, n2, opts, differences)
318
- elsif n1.respond_to?(:root)
319
- # Document node (Moxml/Nokogiri - legacy path)
320
- compare_document_nodes(n1, n2, opts, child_opts, diff_children,
321
- differences)
322
- else
323
- Comparison::EQUIVALENT
324
- end
270
+ # Dispatch based on node type using NodeTypeComparator strategy
271
+ XmlComparatorHelpers::NodeTypeComparator.compare(
272
+ n1, n2, self, opts, child_opts, diff_children, differences
273
+ )
325
274
  end
326
275
 
276
+ # Public comparison methods - exposed for XmlNodeComparison module
277
+ public
278
+
327
279
  # Compare two element nodes
328
280
  def compare_element_nodes(n1, n2, opts, child_opts, diff_children,
329
281
  differences)
@@ -369,153 +321,10 @@ module Canon
369
321
  end
370
322
 
371
323
  # Compare attribute sets
324
+ # Delegates to XmlComparatorHelpers::AttributeComparator
372
325
  def compare_attribute_sets(n1, n2, opts, differences)
373
- # Get attributes using the appropriate method for each node type
374
- raw_attrs1 = n1.respond_to?(:attribute_nodes) ? n1.attribute_nodes : n1.attributes
375
- raw_attrs2 = n2.respond_to?(:attribute_nodes) ? n2.attribute_nodes : n2.attributes
376
-
377
- attrs1 = filter_attributes(raw_attrs1, opts)
378
- attrs2 = filter_attributes(raw_attrs2, opts)
379
-
380
- match_opts = opts[:match_opts]
381
- attribute_order_behavior = match_opts[:attribute_order] || :strict
382
-
383
- # Check attribute order if not ignored
384
- keys1 = attrs1.keys.map(&:to_s)
385
- keys2 = attrs2.keys.map(&:to_s)
386
- if attribute_order_behavior == :strict
387
- # Strict mode: attribute order matters
388
- # Check if keys are in same order
389
-
390
- if keys1 != keys2
391
- # Keys are different or in different order
392
- # First check if it's just ordering (same keys, different order)
393
- if keys1.sort == keys2.sort
394
- # Same keys, different order - this is an attribute_order difference
395
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
396
- Comparison::UNEQUAL_ATTRIBUTES,
397
- :attribute_order, opts, differences)
398
- return Comparison::UNEQUAL_ATTRIBUTES
399
- else
400
- # Different keys - this is attribute_presence difference
401
- add_difference(n1, n2, Comparison::MISSING_ATTRIBUTE,
402
- Comparison::MISSING_ATTRIBUTE,
403
- :attribute_presence, opts, differences)
404
- return Comparison::MISSING_ATTRIBUTE
405
- end
406
- end
407
-
408
- # Order matches, now check values in order
409
- else
410
- # Ignore/normalize mode: attribute order doesn't affect equivalence
411
- # But in verbose mode, we should still track order differences as informative
412
-
413
- # Check if order differs (but keys are the same)
414
- if keys1 != keys2 && keys1.sort == keys2.sort && opts[:verbose]
415
- # Same keys, different order - create informative DiffNode
416
- # This allows line diffs to be properly classified as informative
417
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
418
- Comparison::UNEQUAL_ATTRIBUTES,
419
- :attribute_order, opts, differences)
420
- end
421
-
422
- # Sort attributes so order doesn't matter for comparison
423
- attrs1 = attrs1.sort_by { |k, _v| k.to_s }.to_h
424
- attrs2 = attrs2.sort_by { |k, _v| k.to_s }.to_h
425
-
426
- unless attrs1.keys.map(&:to_s).sort == attrs2.keys.map(&:to_s).sort
427
- add_difference(n1, n2, Comparison::MISSING_ATTRIBUTE,
428
- Comparison::MISSING_ATTRIBUTE,
429
- :attribute_presence, opts, differences)
430
- return Comparison::MISSING_ATTRIBUTE
431
- end
432
-
433
- end
434
- attrs1.each do |name, value|
435
- unless attrs2[name] == value
436
- add_difference(n1, n2, Comparison::UNEQUAL_ATTRIBUTES,
437
- Comparison::UNEQUAL_ATTRIBUTES,
438
- :attribute_values, opts, differences)
439
- return Comparison::UNEQUAL_ATTRIBUTES
440
- end
441
- end
442
-
443
- Comparison::EQUIVALENT
444
- end
445
-
446
- # Filter attributes based on options
447
- def filter_attributes(attributes, opts)
448
- filtered = {}
449
- match_opts = opts[:match_opts]
450
-
451
- # Handle Canon::Xml::Node attribute format (array of AttributeNode)
452
- if attributes.is_a?(Array)
453
- attributes.each do |attr|
454
- name = attr.name
455
- value = attr.value
456
-
457
- # Skip namespace declarations - they're handled separately
458
- next if is_namespace_declaration?(name)
459
-
460
- # Skip if attribute name should be ignored
461
- next if should_ignore_attr_by_name?(name, opts)
462
-
463
- # Skip if attribute content should be ignored
464
- next if should_ignore_attr_content?(value, opts)
465
-
466
- # Apply match options for attribute values
467
- behavior = match_opts[:attribute_values] || :strict
468
- value = MatchOptions.process_attribute_value(value, behavior)
469
-
470
- filtered[name] = value
471
- end
472
- else
473
- # Handle Nokogiri and Moxml attribute formats (Hash-like):
474
- # - Nokogiri: key is String name, val is Nokogiri::XML::Attr object
475
- # - Moxml: key is Moxml::Attribute object, val is nil
476
- attributes.each do |key, val|
477
- if key.is_a?(String)
478
- # Nokogiri format: key=name (String), val=attr object
479
- name = key
480
- value = val.respond_to?(:value) ? val.value : val.to_s
481
- else
482
- # Moxml format: key=attr object, val=nil
483
- name = key.respond_to?(:name) ? key.name : key.to_s
484
- value = key.respond_to?(:value) ? key.value : key.to_s
485
- end
486
-
487
- # Skip namespace declarations - they're handled separately
488
- next if is_namespace_declaration?(name)
489
-
490
- # Skip if attribute name should be ignored
491
- next if should_ignore_attr_by_name?(name, opts)
492
-
493
- # Skip if attribute content should be ignored
494
- next if should_ignore_attr_content?(value, opts)
495
-
496
- # Apply match options for attribute values
497
- behavior = match_opts[:attribute_values] || :strict
498
- value = MatchOptions.process_attribute_value(value, behavior)
499
-
500
- filtered[name] = value
501
- end
502
- end
503
-
504
- filtered
505
- end
506
-
507
- # Check if attribute should be ignored by name
508
- def should_ignore_attr_by_name?(name, opts)
509
- opts[:ignore_attrs_by_name].any? do |pattern|
510
- name.include?(pattern)
511
- end
512
- end
513
-
514
- # Check if attribute should be ignored by content
515
- def should_ignore_attr_content?(value, opts)
516
- opts[:ignore_attr_content].any? do |pattern|
517
- value.to_s.include?(pattern)
518
- end
326
+ XmlComparatorHelpers::AttributeComparator.compare(n1, n2, opts,
327
+ differences)
519
328
  end
520
329
 
521
330
  # Compare text nodes
@@ -531,7 +340,8 @@ module Canon
531
340
 
532
341
  # For HTML, check if text node is inside whitespace-preserving element
533
342
  # If so, always use strict comparison regardless of text_content setting
534
- if should_preserve_whitespace_strictly?(n1, n2)
343
+ sensitive_element = should_preserve_whitespace_strictly?(n1, n2, opts)
344
+ if sensitive_element
535
345
  behavior = :strict
536
346
  end
537
347
 
@@ -544,15 +354,23 @@ module Canon
544
354
 
545
355
  # Determine the correct dimension for this difference
546
356
  # - If text_content is :strict, ALL differences use :text_content dimension
547
- # - If text_content is :normalize, whitespace-only diffs use :structural_whitespace
357
+ # - If text_content is :normalize, whitespace-only diffs could use :structural_whitespace
358
+ # but we keep :text_content to ensure correct classification behavior
548
359
  # - Otherwise use :text_content
549
- dimension = if behavior == :normalize && whitespace_only_difference?(
550
- text1, text2
551
- )
552
- :structural_whitespace
553
- else
554
- :text_content
555
- end
360
+ # However, if element is whitespace-sensitive (like <pre> in HTML),
361
+ # always use :text_content dimension regardless of behavior
362
+ #
363
+ # NOTE: We keep the dimension as :text_content even for whitespace-only diffs
364
+ # when text_content: :normalize. This ensures that the classification uses
365
+ # the text_content behavior (:normalize) instead of structural_whitespace
366
+ # behavior (:strict for XML), which would incorrectly mark the diff as normative.
367
+ if sensitive_element
368
+ # Whitespace-sensitive element: always use :text_content dimension
369
+ else
370
+ # Always use :text_content for text differences
371
+ # This ensures correct classification based on text_content behavior
372
+ end
373
+ dimension = :text_content
556
374
 
557
375
  # Create DiffNode in verbose mode when raw content differs
558
376
  # This ensures informative diffs are created even for :ignore/:normalize
@@ -566,32 +384,25 @@ module Canon
566
384
  matches_per_behavior ? Comparison::EQUIVALENT : Comparison::UNEQUAL_TEXT_CONTENTS
567
385
  end
568
386
 
569
- # Check if the difference between two texts is only whitespace-related
570
- # @param text1 [String] First text
571
- # @param text2 [String] Second text
572
- # @return [Boolean] true if difference is only in whitespace
573
- def whitespace_only_difference?(text1, text2)
574
- # Normalize both texts (collapse/trim whitespace)
575
- norm1 = MatchOptions.normalize_text(text1)
576
- norm2 = MatchOptions.normalize_text(text2)
577
-
578
- # If normalized texts are the same, the difference was only whitespace
579
- norm1 == norm2
580
- end
581
-
582
387
  # Check if whitespace should be preserved strictly for these text nodes
583
388
  # This applies to HTML elements like pre, code, textarea, script, style
584
- def should_preserve_whitespace_strictly?(n1, n2)
585
- # Only applies to Nokogiri nodes (HTML)
586
- return false unless n1.respond_to?(:parent) && n2.respond_to?(:parent)
587
- return false unless n1.parent.respond_to?(:name) && n2.parent.respond_to?(:name)
389
+ # and elements with xml:space="preserve" or in user-configured whitelist
390
+ def should_preserve_whitespace_strictly?(n1, n2, opts)
391
+ # Use WhitespaceSensitivity module to check if element is sensitive
392
+ # Check both n1 and n2 - if either is in a sensitive element, preserve strictly
393
+ if n1.respond_to?(:parent)
394
+ sensitivity_opts = { match_opts: opts[:match_opts] }
395
+ return true if WhitespaceSensitivity.element_sensitive?(n1,
396
+ sensitivity_opts)
397
+ end
588
398
 
589
- # Elements where whitespace must be preserved in HTML
590
- preserve_elements = %w[pre code textarea script style]
399
+ if n2.respond_to?(:parent)
400
+ sensitivity_opts = { match_opts: opts[:match_opts] }
401
+ return true if WhitespaceSensitivity.element_sensitive?(n2,
402
+ sensitivity_opts)
403
+ end
591
404
 
592
- # Check if either node is inside a whitespace-preserving element
593
- in_preserve_element?(n1, preserve_elements) ||
594
- in_preserve_element?(n2, preserve_elements)
405
+ false
595
406
  end
596
407
 
597
408
  # Check if a node is inside a whitespace-preserving element
@@ -680,258 +491,13 @@ module Canon
680
491
 
681
492
  # Compare children of two nodes using semantic matching
682
493
  #
683
- # Uses ElementMatcher to pair children semantically (by identity attributes
684
- # or position), then compares matched pairs and detects position changes.
494
+ # Delegates to ChildComparison module which handles both ElementMatcher
495
+ # (semantic matching) and simple positional comparison.
685
496
  def compare_children(n1, n2, opts, child_opts, diff_children,
686
- differences)
687
- children1 = filter_children(n1.children, opts)
688
- children2 = filter_children(n2.children, opts)
689
-
690
- # Quick check: if both have no children, they're equivalent
691
- return Comparison::EQUIVALENT if children1.empty? && children2.empty?
692
-
693
- # Check if we can use ElementMatcher (requires Canon::Xml::DataModel nodes)
694
- # ElementMatcher expects nodes with .node_type method that returns symbols
695
- # and only works with element nodes (filters out text, comment, etc.)
696
- can_use_matcher = children1.all? do |c|
697
- c.is_a?(Canon::Xml::Node) && c.node_type == :element
698
- end &&
699
- children2.all? { |c| c.is_a?(Canon::Xml::Node) && c.node_type == :element }
700
-
701
- if can_use_matcher && !children1.empty? && !children2.empty?
702
- # Use ElementMatcher for semantic matching with position tracking
703
- use_element_matcher_comparison(children1, children2, n1, opts,
704
- child_opts, diff_children, differences)
705
- else
706
- # Fall back to simple positional comparison for Moxml/Nokogiri nodes
707
- # Length check
708
- unless children1.length == children2.length
709
- # Determine dimension based on type of first differing child
710
- # When lengths differ, find which child is missing/extra
711
- dimension = :text_content # default
712
-
713
- # Compare position by position to find first difference
714
- max_len = [children1.length, children2.length].max
715
- (0...max_len).each do |i|
716
- if i >= children1.length
717
- # Extra child in children2
718
- dimension = determine_node_dimension(children2[i])
719
- break
720
- elsif i >= children2.length
721
- # Extra child in children1
722
- dimension = determine_node_dimension(children1[i])
723
- break
724
- elsif !same_node_type?(children1[i], children2[i])
725
- # Different node types at same position
726
- dimension = determine_node_dimension(children1[i])
727
- break
728
- end
729
- end
730
-
731
- add_difference(n1, n2, Comparison::MISSING_NODE,
732
- Comparison::MISSING_NODE, dimension, opts,
733
- differences)
734
- return Comparison::MISSING_NODE
735
- end
736
-
737
- # Compare children pairwise by position
738
- result = Comparison::EQUIVALENT
739
- children1.zip(children2).each do |child1, child2|
740
- child_result = compare_nodes(child1, child2, child_opts, child_opts,
741
- diff_children, differences)
742
- result = child_result unless child_result == Comparison::EQUIVALENT
743
- end
744
-
745
- result
746
- end
747
- end
748
-
749
- # Use ElementMatcher for semantic comparison (Canon::Xml::DataModel nodes)
750
- def use_element_matcher_comparison(children1, children2, parent_node,
751
- opts, child_opts, diff_children,
752
- differences)
753
- require_relative "../xml/element_matcher"
754
-
755
- # Create temporary RootNode wrappers to use ElementMatcher
756
- # Don't modify parent pointers - just set @children directly
757
- require_relative "../xml/nodes/root_node"
758
-
759
- temp_root1 = Canon::Xml::Nodes::RootNode.new
760
- temp_root1.instance_variable_set(:@children, children1.dup)
761
-
762
- temp_root2 = Canon::Xml::Nodes::RootNode.new
763
- temp_root2.instance_variable_set(:@children, children2.dup)
764
-
765
- matcher = Canon::Xml::ElementMatcher.new
766
- matches = matcher.match_trees(temp_root1, temp_root2)
767
-
768
- # Filter matches to only include direct children
769
- # match_trees returns ALL descendants, but we only want direct children
770
- matches = matches.select do |m|
771
- (m.elem1.nil? || children1.include?(m.elem1)) &&
772
- (m.elem2.nil? || children2.include?(m.elem2))
773
- end
774
-
775
- # If no matches and children exist, they're all different
776
- if matches.empty? && (!children1.empty? || !children2.empty?)
777
- add_difference(parent_node, parent_node, Comparison::MISSING_NODE,
778
- Comparison::MISSING_NODE, :text_content, opts,
779
- differences)
780
- return Comparison::UNEQUAL_ELEMENTS
781
- end
782
-
783
- all_equivalent = true
784
-
785
- matches.each do |match|
786
- case match.status
787
- when :matched
788
- # Check if element position changed
789
- if match.position_changed?
790
- match_opts = opts[:match_opts]
791
- position_behavior = match_opts[:element_position] || :strict
792
-
793
- # Only create DiffNode if element_position is not :ignore
794
- if position_behavior != :ignore
795
- add_difference(
796
- match.elem1,
797
- match.elem2,
798
- "position #{match.pos1}",
799
- "position #{match.pos2}",
800
- :element_position,
801
- opts,
802
- differences,
803
- )
804
- all_equivalent = false if position_behavior == :strict
805
- end
806
- end
807
-
808
- # Compare the matched elements for content/attribute differences
809
- result = compare_nodes(match.elem1, match.elem2, child_opts,
810
- child_opts, diff_children, differences)
811
- all_equivalent = false unless result == Comparison::EQUIVALENT
812
-
813
- when :deleted
814
- # Element present in first tree but not second
815
- add_difference(match.elem1, nil, Comparison::MISSING_NODE,
816
- Comparison::MISSING_NODE, :element_structure, opts,
817
- differences)
818
- all_equivalent = false
819
-
820
- when :inserted
821
- # Element present in second tree but not first
822
- add_difference(nil, match.elem2, Comparison::MISSING_NODE,
823
- Comparison::MISSING_NODE, :element_structure, opts,
824
- differences)
825
- all_equivalent = false
826
- end
827
- end
828
-
829
- all_equivalent ? Comparison::EQUIVALENT : Comparison::UNEQUAL_ELEMENTS
830
- end
831
-
832
- # Filter children based on options
833
- def filter_children(children, opts)
834
- children.reject do |child|
835
- node_excluded?(child, opts)
836
- end
837
- end
838
-
839
- # Check if node should be excluded
840
- def node_excluded?(node, opts)
841
- match_opts = opts[:match_opts]
842
-
843
- # Determine node type
844
- # Canon::Xml::Node uses node_type that returns Symbol
845
- # Nokogiri uses node_type that returns Integer, so check for Symbol first
846
- if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
847
- node.node_type == :comment
848
- else
849
- node.respond_to?(:comment?) && node.comment?
850
- end
851
-
852
- is_text = if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
853
- node.node_type == :text
854
- else
855
- node.respond_to?(:text?) && node.text?
856
- end
857
-
858
- # Ignore text nodes if specified
859
- return true if opts[:ignore_text_nodes] && is_text
860
-
861
- # Ignore whitespace-only text nodes based on structural_whitespace
862
- # Both :ignore and :normalize should filter out whitespace-only nodes
863
- if %i[ignore
864
- normalize].include?(match_opts[:structural_whitespace]) && is_text
865
- text = node_text(node)
866
- return true if MatchOptions.normalize_text(text).empty?
867
- end
868
-
869
- false
870
- end
871
-
872
- # Determine the appropriate dimension for a node type
873
- # @param node [Object] The node to check
874
- # @return [Symbol] The dimension symbol
875
- def determine_node_dimension(node)
876
- # Canon::Xml::Node types
877
- if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
878
- case node.node_type
879
- when :comment then :comments
880
- when :text, :cdata then :text_content
881
- when :processing_instruction then :processing_instructions
882
- else :text_content
883
- end
884
- # Moxml/Nokogiri types
885
- elsif node.respond_to?(:comment?) && node.comment?
886
- :comments
887
- elsif node.respond_to?(:text?) && node.text?
888
- :text_content
889
- elsif node.respond_to?(:cdata?) && node.cdata?
890
- :text_content
891
- elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
892
- :processing_instructions
893
- else
894
- :text_content
895
- end
896
- end
897
-
898
- # Check if two nodes are the same type
899
- def same_node_type?(n1, n2)
900
- # Canon::Xml::Node types - check node_type method
901
- if n1.respond_to?(:node_type) && n2.respond_to?(:node_type)
902
- return n1.node_type == n2.node_type
903
- end
904
-
905
- # Moxml/Nokogiri types - check individual type methods
906
- return true if n1.respond_to?(:element?) && n1.element? &&
907
- n2.respond_to?(:element?) && n2.element?
908
- return true if n1.respond_to?(:text?) && n1.text? &&
909
- n2.respond_to?(:text?) && n2.text?
910
- return true if n1.respond_to?(:comment?) && n1.comment? &&
911
- n2.respond_to?(:comment?) && n2.comment?
912
- return true if n1.respond_to?(:cdata?) && n1.cdata? &&
913
- n2.respond_to?(:cdata?) && n2.cdata?
914
- return true if n1.respond_to?(:processing_instruction?) &&
915
- n1.processing_instruction? &&
916
- n2.respond_to?(:processing_instruction?) &&
917
- n2.processing_instruction?
918
- return true if n1.respond_to?(:root) && n2.respond_to?(:root)
919
-
920
- false
921
- end
922
-
923
- # Get text content from a node
924
- def node_text(node)
925
- # Canon::Xml::Node TextNode uses .value
926
- if node.respond_to?(:value)
927
- node.value.to_s
928
- elsif node.respond_to?(:content)
929
- node.content.to_s
930
- elsif node.respond_to?(:text)
931
- node.text.to_s
932
- else
933
- ""
934
- end
497
+ differences)
498
+ XmlComparatorHelpers::ChildComparison.compare(
499
+ n1, n2, self, opts, child_opts, diff_children, differences
500
+ )
935
501
  end
936
502
 
937
503
  # Extract element path for context (best effort)
@@ -960,67 +526,24 @@ module Canon
960
526
  path
961
527
  end
962
528
 
963
- # Serialize a node to XML string
964
- # @param node [Canon::Xml::Node, Object] Node to serialize
965
- # @return [String] XML string representation
966
- def serialize_node_to_xml(node)
967
- if node.is_a?(Canon::Xml::Nodes::RootNode)
968
- # Serialize all children of root
969
- node.children.map { |child| serialize_node_to_xml(child) }.join
970
- elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
971
- # Serialize element with attributes and children
972
- attrs = node.attribute_nodes.map do |a|
973
- " #{a.name}=\"#{a.value}\""
974
- end.join
975
- children_xml = node.children.map do |c|
976
- serialize_node_to_xml(c)
977
- end.join
978
-
979
- if children_xml.empty?
980
- "<#{node.name}#{attrs}/>"
981
- else
982
- "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
983
- end
984
- elsif node.is_a?(Canon::Xml::Nodes::TextNode)
985
- node.value
986
- elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
987
- "<!--#{node.value}-->"
988
- elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
989
- "<?#{node.target} #{node.data}?>"
990
- elsif node.respond_to?(:to_xml)
991
- node.to_xml
992
- else
993
- node.to_s
994
- end
995
- end
529
+ # Serialize a node to string for display
530
+ #
531
+ # @param node [Object, nil] Node to serialize
532
+ # @return [String, nil] Serialized content
533
+ def serialize_node(node)
534
+ return nil if node.nil?
996
535
 
997
- # Add a difference to the differences array
998
- # @param node1 [Object] First node
999
- # @param node2 [Object] Second node
1000
- # @param diff1 [String] Difference type for node1
1001
- # @param diff2 [String] Difference type for node2
1002
- # @param dimension [Symbol] The match dimension causing this difference
1003
- # @param opts [Hash] Options
1004
- # @param differences [Array] Array to append difference to
1005
- def add_difference(node1, node2, diff1, diff2, dimension, _opts,
1006
- differences)
1007
- # All differences must be DiffNode objects (OO architecture)
1008
- if dimension.nil?
1009
- raise ArgumentError,
1010
- "dimension required for DiffNode"
1011
- end
536
+ Canon::Diff::NodeSerializer.serialize(node)
537
+ end
1012
538
 
1013
- # Build informative reason message
1014
- reason = build_difference_reason(node1, node2, diff1, diff2,
1015
- dimension)
539
+ # Extract attributes from a node as a normalized hash
540
+ #
541
+ # @param node [Object, nil] Node to extract attributes from
542
+ # @return [Hash, nil] Normalized attributes hash
543
+ def extract_attributes(node)
544
+ return nil if node.nil?
1016
545
 
1017
- diff_node = Canon::Diff::DiffNode.new(
1018
- node1: node1,
1019
- node2: node2,
1020
- dimension: dimension,
1021
- reason: reason,
1022
- )
1023
- differences << diff_node
546
+ Canon::Diff::NodeSerializer.extract_attributes(node)
1024
547
  end
1025
548
 
1026
549
  # Build a human-readable reason for a difference
@@ -1049,120 +572,10 @@ module Canon
1049
572
  end
1050
573
 
1051
574
  # Compare namespace declarations (xmlns and xmlns:* attributes)
1052
- # @param n1 [Object] First node
1053
- # @param n2 [Object] Second node
1054
- # @param opts [Hash] Options
1055
- # @param differences [Array] Array to append differences to
1056
- # @return [Symbol] Comparison result
575
+ # Delegates to XmlComparatorHelpers::NamespaceComparator
1057
576
  def compare_namespace_declarations(n1, n2, opts, differences)
1058
- ns_decls1 = extract_namespace_declarations(n1)
1059
- ns_decls2 = extract_namespace_declarations(n2)
1060
-
1061
- # Find missing, extra, and changed namespace declarations
1062
- missing = ns_decls1.keys - ns_decls2.keys # In n1 but not n2
1063
- extra = ns_decls2.keys - ns_decls1.keys # In n2 but not n1
1064
- changed = ns_decls1.select do |prefix, uri|
1065
- ns_decls2[prefix] && ns_decls2[prefix] != uri
1066
- end.keys
1067
-
1068
- # If there are any differences, create a DiffNode
1069
- if missing.any? || extra.any? || changed.any?
1070
- # Build a descriptive reason
1071
- reasons = []
1072
- if missing.any?
1073
- reasons << "removed: #{missing.map do |p|
1074
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1075
- end.join(', ')}"
1076
- end
1077
- if extra.any?
1078
- reasons << "added: #{extra.map do |p|
1079
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1080
- end.join(', ')}"
1081
- end
1082
- if changed.any?
1083
- reasons << "changed: #{changed.map do |p|
1084
- p.empty? ? 'xmlns' : "xmlns:#{p}"
1085
- end.join(', ')}"
1086
- end
1087
-
1088
- add_difference(
1089
- n1,
1090
- n2,
1091
- Comparison::UNEQUAL_ATTRIBUTES,
1092
- Comparison::UNEQUAL_ATTRIBUTES,
1093
- :namespace_declarations,
1094
- opts,
1095
- differences,
1096
- )
1097
- return Comparison::UNEQUAL_ATTRIBUTES
1098
- end
1099
-
1100
- Comparison::EQUIVALENT
1101
- end
1102
-
1103
- # Extract namespace declarations from a node
1104
- # @param node [Object] Node to extract namespace declarations from
1105
- # @return [Hash] Hash of prefix => URI mappings
1106
- def extract_namespace_declarations(node)
1107
- declarations = {}
1108
-
1109
- # Handle Canon::Xml::Node (uses namespace_nodes)
1110
- if node.respond_to?(:namespace_nodes)
1111
- node.namespace_nodes.each do |ns|
1112
- # Skip the implicit xml namespace (always present)
1113
- next if ns.prefix == "xml" && ns.uri == "http://www.w3.org/XML/1998/namespace"
1114
-
1115
- prefix = ns.prefix || ""
1116
- declarations[prefix] = ns.uri
1117
- end
1118
- return declarations
1119
- end
1120
-
1121
- # Handle Nokogiri/Moxml nodes (use attributes)
1122
- # Get raw attributes
1123
- raw_attrs = node.respond_to?(:attribute_nodes) ? node.attribute_nodes : node.attributes
1124
-
1125
- # Handle Canon::Xml::Node attribute format (array of AttributeNode)
1126
- if raw_attrs.is_a?(Array)
1127
- raw_attrs.each do |attr|
1128
- name = attr.name
1129
- value = attr.value
1130
-
1131
- if is_namespace_declaration?(name)
1132
- # Extract prefix: "xmlns" -> "", "xmlns:xmi" -> "xmi"
1133
- prefix = name == "xmlns" ? "" : name.split(":", 2)[1]
1134
- declarations[prefix] = value
1135
- end
1136
- end
1137
- else
1138
- # Handle Nokogiri and Moxml attribute formats (Hash-like)
1139
- raw_attrs.each do |key, val|
1140
- if key.is_a?(String)
1141
- # Nokogiri format: key=name (String), val=attr object
1142
- name = key
1143
- value = val.respond_to?(:value) ? val.value : val.to_s
1144
- else
1145
- # Moxml format: key=attr object, val=nil
1146
- name = key.respond_to?(:name) ? key.name : key.to_s
1147
- value = key.respond_to?(:value) ? key.value : key.to_s
1148
- end
1149
-
1150
- if is_namespace_declaration?(name)
1151
- # Extract prefix: "xmlns" -> "", "xmlns:xmi" -> "xmi"
1152
- prefix = name == "xmlns" ? "" : name.split(":", 2)[1]
1153
- declarations[prefix] = value
1154
- end
1155
- end
1156
- end
1157
-
1158
- declarations
1159
- end
1160
-
1161
- # Check if an attribute name is a namespace declaration
1162
- # @param attr_name [String] Attribute name
1163
- # @return [Boolean] true if it's a namespace declaration
1164
- def is_namespace_declaration?(attr_name)
1165
- attr_name == "xmlns" || attr_name.start_with?("xmlns:")
577
+ XmlComparatorHelpers::NamespaceComparator.compare(n1, n2, opts,
578
+ differences)
1166
579
  end
1167
580
  end
1168
581
  end