canon 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +31 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/match-options/html-policies.adoc +2 -0
  7. data/docs/features/match-options/index.adoc +40 -0
  8. data/docs/guides/choosing-configuration.adoc +12 -1
  9. data/docs/reference/cli-options.adoc +3 -0
  10. data/docs/reference/options-across-interfaces.adoc +7 -1
  11. data/docs/understanding/formats/html.adoc +9 -2
  12. data/lib/canon/cli.rb +4 -0
  13. data/lib/canon/commands/diff_command.rb +1 -0
  14. data/lib/canon/comparison/comparison_result.rb +79 -0
  15. data/lib/canon/comparison/html_comparator.rb +92 -11
  16. data/lib/canon/comparison/markup_comparator.rb +19 -0
  17. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  18. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  19. data/lib/canon/comparison/match_options.rb +23 -2
  20. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  21. data/lib/canon/comparison/xml_comparator/child_comparison.rb +6 -0
  22. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  23. data/lib/canon/comparison/xml_comparator.rb +80 -4
  24. data/lib/canon/comparison/xml_node_comparison.rb +29 -3
  25. data/lib/canon/comparison.rb +84 -22
  26. data/lib/canon/config/env_schema.rb +2 -1
  27. data/lib/canon/config/profiles/metanorma.yml +3 -0
  28. data/lib/canon/config.rb +51 -5
  29. data/lib/canon/diff/diff_classifier.rb +18 -2
  30. data/lib/canon/diff/diff_line_builder.rb +9 -8
  31. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  32. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  33. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  34. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  35. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +65 -17
  36. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +17 -0
  37. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  38. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  39. data/lib/canon/diff_formatter.rb +57 -173
  40. data/lib/canon/html/data_model.rb +10 -4
  41. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  42. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  43. data/lib/canon/version.rb +1 -1
  44. data/lib/canon/xml/c14n.rb +59 -5
  45. data/lib/canon/xml/element_matcher.rb +3 -0
  46. data/lib/canon/xml/node.rb +8 -1
  47. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  48. data/lib/canon/xml/nodes/element_node.rb +4 -0
  49. data/lib/canon/xml/nodes/text_node.rb +4 -0
  50. data/lib/canon/xml/sax_builder.rb +11 -2
  51. data/lib/canon/xml/xpath_engine.rb +238 -0
  52. metadata +6 -2
@@ -13,6 +13,7 @@ require_relative "../diff/diff_classifier"
13
13
  require_relative "strategies/match_strategy_factory"
14
14
  require_relative "../html/data_model"
15
15
  require_relative "xml_node_comparison"
16
+ require_relative "xml_comparator/diff_node_builder"
16
17
  # Whitespace sensitivity module (single source of truth for sensitive elements)
17
18
  require_relative "whitespace_sensitivity"
18
19
 
@@ -172,10 +173,42 @@ module Canon
172
173
  # @param node2 [Object] Second node
173
174
  # @return [Boolean] true if both are document fragments
174
175
  def fragment_nodes?(node1, node2)
175
- (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
176
- node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
177
- (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
178
- node2.is_a?(Nokogiri::XML::DocumentFragment))
176
+ fragment_node?(node1) && fragment_node?(node2)
177
+ end
178
+
179
+ # Check if a single node is a recognised document fragment.
180
+ # All three Nokogiri fragment types (XML, HTML4, HTML5) must be
181
+ # accepted: dom_diff routes html/html4/html5 input through
182
+ # Nokogiri::HTML5.fragment per #118.
183
+ def fragment_node?(node)
184
+ node.is_a?(Nokogiri::XML::DocumentFragment) ||
185
+ node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
186
+ node.is_a?(Nokogiri::HTML5::DocumentFragment)
187
+ end
188
+
189
+ # Record a DiffNode for a fragment-level child-count mismatch.
190
+ # Each surplus child becomes its own MISSING_NODE diff so the
191
+ # downstream report shows what was added or removed.
192
+ def record_fragment_length_mismatch(_node1, _node2, children1,
193
+ children2, differences)
194
+ longer, shorter, side = if children1.length > children2.length
195
+ [children1, children2, :removed]
196
+ else
197
+ [children2, children1, :added]
198
+ end
199
+
200
+ longer[shorter.length...].each do |orphan|
201
+ n1 = side == :removed ? orphan : nil
202
+ n2 = side == :removed ? nil : orphan
203
+ differences <<
204
+ Canon::Comparison::DiffNodeBuilder.build(
205
+ node1: n1,
206
+ node2: n2,
207
+ diff1: Comparison::MISSING_NODE,
208
+ diff2: Comparison::MISSING_NODE,
209
+ dimension: :element_structure,
210
+ )
211
+ end
179
212
  end
180
213
 
181
214
  # Compare children of document fragments
@@ -196,6 +229,13 @@ module Canon
196
229
  children2 = XmlNodeComparison.filter_children(all_children2, opts)
197
230
 
198
231
  if children1.length != children2.length
232
+ # Record the length mismatch as a DiffNode so verbose mode
233
+ # surfaces it. Without this, equivalent? wraps an empty
234
+ # differences array and incorrectly reports the inputs as
235
+ # equivalent.
236
+ record_fragment_length_mismatch(node1, node2,
237
+ children1, children2,
238
+ differences)
199
239
  return Comparison::UNEQUAL_ELEMENTS
200
240
  elsif children1.empty?
201
241
  return Comparison::EQUIVALENT
@@ -291,10 +331,12 @@ module Canon
291
331
  node.to_html
292
332
  end
293
333
 
294
- # Use XML fragment parser to preserve structure without auto-generated elements
295
- # This avoids both HTML4's meta tag insertion and HTML5's tag stripping
296
- # See: https://stackoverflow.com/questions/25998824/stop-nokogiri-from-adding-doctype-and-meta-tags
297
- frag = Nokogiri::XML.fragment(html_string)
334
+ # Use XML fragment parser to preserve structure without auto-generated elements.
335
+ # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since XML
336
+ # parser only understands the five XML entities.
337
+ frag = Nokogiri::XML.fragment(
338
+ decode_html_named_entities(html_string),
339
+ )
298
340
 
299
341
  # Apply preprocessing if needed
300
342
  if preprocessing == :rendered
@@ -448,8 +490,12 @@ module Canon
448
490
  end
449
491
 
450
492
  # Parse as Nokogiri fragment for DOM comparison
451
- # Use XML fragment parser to avoid auto-inserted meta tags
452
- frag = Nokogiri::XML.fragment(html_string)
493
+ # Use XML fragment parser to avoid auto-inserted meta tags.
494
+ # Decode HTML named entities (&nbsp; etc.) to UTF-8 characters since
495
+ # XML parser only understands the five XML entities.
496
+ frag = Nokogiri::XML.fragment(
497
+ decode_html_named_entities(html_string),
498
+ )
453
499
 
454
500
  # Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
455
501
  if %i[normalize format rendered].include?(preprocessing)
@@ -496,6 +542,33 @@ module Canon
496
542
 
497
543
  # Detect HTML version from content
498
544
  #
545
+ # Decode HTML named entities to their UTF-8 character equivalents.
546
+ # This is a targeted replacement that only changes entity references,
547
+ # preserving all tag structure. Needed because Nokogiri::XML.fragment
548
+ # only understands the five XML entities (&amp; &lt; &gt; &quot; &apos;).
549
+ #
550
+ # @param str [String] HTML string possibly containing named entities
551
+ # @return [String] String with named entities replaced by UTF-8 chars
552
+ def decode_html_named_entities(str)
553
+ return str unless str.include?("&")
554
+
555
+ str.gsub(/&nbsp;/i, "\u00A0")
556
+ .gsub(/&ensp;/i, "\u2002")
557
+ .gsub(/&emsp;/i, "\u2003")
558
+ .gsub(/&thinsp;/i, "\u2009")
559
+ .gsub(/&copy;/i, "\u00A9")
560
+ .gsub(/&reg;/i, "\u00AE")
561
+ .gsub(/&trade;/i, "\u2122")
562
+ .gsub(/&mdash;/i, "\u2014")
563
+ .gsub(/&ndash;/i, "\u2013")
564
+ .gsub(/&lsquo;/i, "\u2018")
565
+ .gsub(/&rsquo;/i, "\u2019")
566
+ .gsub(/&ldquo;/i, "\u201C")
567
+ .gsub(/&rdquo;/i, "\u201D")
568
+ .gsub(/&bull;/i, "\u2022")
569
+ .gsub(/&hellip;/i, "\u2026")
570
+ end
571
+
499
572
  # @param content [String] HTML content
500
573
  # @return [Symbol] :html5 or :html4
501
574
  def detect_html_version(content)
@@ -721,8 +794,16 @@ compare_profile = nil)
721
794
  parent = text_node.parent
722
795
  next if ancestor_preserves_whitespace?(parent, preserve_whitespace)
723
796
 
797
+ content = text_node.content
798
+
799
+ # NBSP (U+00A0) is never insignificant — don't remove
800
+ next if content.include?("\u00A0")
801
+
802
+ # Whitespace between inline siblings is significant — don't remove
803
+ next if WhitespaceSensitivity.inline_whitespace_significant?(text_node)
804
+
724
805
  # Remove if the text is only whitespace (after normalization)
725
- if text_node.content.strip.empty?
806
+ if content.strip.empty?
726
807
  text_node.remove
727
808
  end
728
809
  end
@@ -182,6 +182,25 @@ module Canon
182
182
  return false unless text_node?(node) && node.parent
183
183
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
184
184
 
185
+ # NBSP (U+00A0) is never insignificant whitespace —
186
+ # it always renders as a visible non-breaking space.
187
+ # For HTML: always preserve NBSP nodes.
188
+ # For XML with whitespace_type: :strict: preserve NBSP nodes so
189
+ # different Unicode whitespace types remain distinguishable.
190
+ format = opts[:format] || match_opts[:format]
191
+ whitespace_type = match_opts[:whitespace_type] || :strict
192
+ if (%i[html html4
193
+ html5].include?(format) || whitespace_type == :strict) && WhitespaceSensitivity.contains_nbsp?(node_text(node))
194
+ return false
195
+ end
196
+
197
+ if %i[html html4
198
+ html5].include?(format) && WhitespaceSensitivity.inline_whitespace_significant?(node)
199
+ # Whitespace between inline element siblings is semantically
200
+ # significant (renders as a visible gap) and must not be stripped.
201
+ return false
202
+ end
203
+
185
204
  return true unless WhitespaceSensitivity.whitespace_preserved?(
186
205
  node.parent, match_opts
187
206
  )
@@ -146,6 +146,7 @@ module Canon
146
146
  respect_xml_space
147
147
  pretty_printed_expected
148
148
  pretty_printed_received
149
+ whitespace_type
149
150
  ]
150
151
 
151
152
  match_options.each do |dimension, behavior|
@@ -24,6 +24,7 @@ module Canon
24
24
  attribute_values: :strict,
25
25
  element_position: :ignore,
26
26
  comments: :ignore,
27
+ whitespace_type: :strict,
27
28
  },
28
29
  xml: {
29
30
  preprocessing: :none,
@@ -34,6 +35,7 @@ module Canon
34
35
  attribute_values: :strict,
35
36
  element_position: :strict,
36
37
  comments: :strict,
38
+ whitespace_type: :strict,
37
39
  },
38
40
  }.freeze
39
41
 
@@ -51,6 +53,7 @@ module Canon
51
53
  attribute_values: :strict,
52
54
  element_position: :strict,
53
55
  comments: :strict,
56
+ whitespace_type: :strict,
54
57
  },
55
58
 
56
59
  # Rendered: Match rendered output (HTML default)
@@ -64,6 +67,7 @@ module Canon
64
67
  attribute_values: :strict,
65
68
  element_position: :strict,
66
69
  comments: :ignore,
70
+ whitespace_type: :strict,
67
71
  },
68
72
 
69
73
  # HTML4: Match HTML4 rendered output
@@ -77,6 +81,7 @@ module Canon
77
81
  attribute_values: :normalize,
78
82
  element_position: :ignore,
79
83
  comments: :ignore,
84
+ whitespace_type: :strict,
80
85
  },
81
86
 
82
87
  # HTML5: Match HTML5 rendered output (same as rendered)
@@ -89,6 +94,7 @@ module Canon
89
94
  attribute_values: :strict,
90
95
  element_position: :ignore,
91
96
  comments: :ignore,
97
+ whitespace_type: :strict,
92
98
  },
93
99
 
94
100
  # Spec-friendly: Formatting doesn't matter
@@ -102,6 +108,7 @@ module Canon
102
108
  attribute_values: :normalize,
103
109
  element_position: :ignore,
104
110
  comments: :ignore,
111
+ whitespace_type: :strict,
105
112
  },
106
113
 
107
114
  # Content-only: Only content matters
@@ -114,6 +121,7 @@ module Canon
114
121
  attribute_values: :normalize,
115
122
  element_position: :ignore,
116
123
  comments: :ignore,
124
+ whitespace_type: :strict,
117
125
  },
118
126
  }.freeze
119
127
 
@@ -69,13 +69,18 @@ module Canon
69
69
  # @param text1 [String] First text
70
70
  # @param text2 [String] Second text
71
71
  # @param behavior [Symbol] Match behavior (:strict, :normalize, :ignore)
72
+ # @param whitespace_type [Symbol] Whitespace type handling (:strict, :normalize)
72
73
  # @return [Boolean] true if texts match according to behavior
73
- def match_text?(text1, text2, behavior)
74
+ def match_text?(text1, text2, behavior, whitespace_type: :strict)
74
75
  case behavior
75
76
  when :strict
76
77
  text1 == text2
77
78
  when :normalize
78
- normalize_text(text1) == normalize_text(text2)
79
+ if whitespace_type == :normalize
80
+ normalize_text(text1) == normalize_text(text2)
81
+ else
82
+ normalize_text_preserving_type(text1) == normalize_text_preserving_type(text2)
83
+ end
79
84
  when :ignore
80
85
  true
81
86
  else
@@ -101,6 +106,22 @@ module Canon
101
106
  .strip # Remove leading/trailing whitespace
102
107
  end
103
108
 
109
+ # Normalize text preserving Unicode whitespace type distinctions.
110
+ #
111
+ # Only ASCII whitespace (space, tab, newline, etc.) is collapsed.
112
+ # Unicode whitespace (NBSP, ideographic space, etc.) is preserved,
113
+ # so different whitespace types remain distinguishable.
114
+ #
115
+ # @param text [String] Text to normalize
116
+ # @return [String] Normalized text with preserved whitespace types
117
+ def normalize_text_preserving_type(text)
118
+ return "" if text.nil?
119
+
120
+ text.to_s
121
+ .gsub(/[ \t\r\n\f\v]+/, " ") # Collapse only ASCII whitespace
122
+ .strip
123
+ end
124
+
104
125
  # Process attribute value according to match behavior
105
126
  #
106
127
  # @param value [String] Attribute value to process
@@ -50,6 +50,15 @@ module Canon
50
50
  # HTML elements where every whitespace character is significant.
51
51
  HTML_PRESERVE_ELEMENTS = %w[pre code textarea script style].freeze
52
52
 
53
+ # HTML inline elements — whitespace between these is semantically
54
+ # significant (renders as a visible space). Whitespace-only text
55
+ # nodes that sit between two inline siblings must not be stripped.
56
+ INLINE_ELEMENTS = %w[
57
+ a abbr acronym b bdo big br button cite code dfn em i img input kbd
58
+ label map object output q s samp select small span strong sub sup
59
+ time tt u var wbr
60
+ ].freeze
61
+
53
62
  class << self
54
63
  # Classify the whitespace behaviour for an element using ancestor walk.
55
64
  #
@@ -213,6 +222,69 @@ module Canon
213
222
  .include?(element_name.to_sym)
214
223
  end
215
224
 
225
+ # Check if a whitespace-only text node sits between two inline element
226
+ # siblings, making the whitespace semantically significant.
227
+ #
228
+ # In HTML rendering, a space between <span>A</span> <span>B</span>
229
+ # produces visible output. Stripping such nodes produces false
230
+ # equivalence.
231
+ #
232
+ # Works with any parent type (element, DocumentFragment, RootNode)
233
+ # since the check is about sibling context, not parent type.
234
+ #
235
+ # @param text_node [Object] Text node (Nokogiri or Canon::Xml::Node)
236
+ # @return [Boolean] true if whitespace is between inline siblings
237
+ def inline_whitespace_significant?(text_node)
238
+ return false unless text_node.respond_to?(:parent)
239
+
240
+ parent = text_node.parent
241
+ return false unless parent
242
+ return false unless parent.respond_to?(:children)
243
+
244
+ siblings = parent.children
245
+ idx = siblings.index(text_node)
246
+ return false unless idx
247
+
248
+ # Look at the IMMEDIATE non-whitespace-text neighbour on each
249
+ # side. Whitespace at a block boundary is collapsed per CSS,
250
+ # so both immediate neighbours must be inline for the
251
+ # whitespace to be significant. Walking all siblings (the
252
+ # earlier behaviour) misclassified whitespace at a block
253
+ # boundary as significant whenever any inline element existed
254
+ # elsewhere among the siblings.
255
+ prev_neighbour = nearest_non_whitespace_sibling(siblings, idx, -1)
256
+ next_neighbour = nearest_non_whitespace_sibling(siblings, idx, 1)
257
+
258
+ inline_element?(prev_neighbour) && inline_element?(next_neighbour)
259
+ end
260
+
261
+ # Walk outward from +idx+ in +direction+ (+1 forward, -1 back),
262
+ # skipping whitespace-only text nodes, and return the first
263
+ # non-whitespace sibling found. Returns nil if none.
264
+ def nearest_non_whitespace_sibling(siblings, idx, direction)
265
+ i = idx + direction
266
+ while i >= 0 && i < siblings.length
267
+ s = siblings[i]
268
+ unless s.respond_to?(:text?) && s.text? &&
269
+ s.respond_to?(:content) && s.content.to_s.strip.empty?
270
+ return s
271
+ end
272
+
273
+ i += direction
274
+ end
275
+ nil
276
+ end
277
+
278
+ # Check if text content contains a non-breaking space (U+00A0).
279
+ # NBSP is NOT collapsible whitespace in HTML — it always renders as
280
+ # a visible space and must never be stripped.
281
+ #
282
+ # @param text [String] Text content to check
283
+ # @return [Boolean] true if text contains U+00A0
284
+ def contains_nbsp?(text)
285
+ text.to_s.include?("\u00A0")
286
+ end
287
+
216
288
  private
217
289
 
218
290
  # Build the Set of preserve whitespace element names (strings).
@@ -336,6 +408,30 @@ module Canon
336
408
  # Nokogiri compatibility
337
409
  parent.respond_to?(:node_type) && parent.node_type == :element
338
410
  end
411
+
412
+ # Get the parent element of a text node, or nil.
413
+ # Works with both Nokogiri and Canon::Xml::Node types.
414
+ def parent_element_of(text_node)
415
+ return nil unless text_node.respond_to?(:parent)
416
+
417
+ parent = text_node.parent
418
+ return nil unless parent
419
+
420
+ if parent.is_a?(Canon::Xml::Nodes::ElementNode)
421
+ parent
422
+ elsif parent.respond_to?(:element?) && parent.element?
423
+ parent
424
+ elsif parent.respond_to?(:node_type) && parent.node_type == :element
425
+ parent
426
+ end
427
+ end
428
+
429
+ # Check if a node is an HTML inline element.
430
+ def inline_element?(node)
431
+ return false unless node.respond_to?(:name)
432
+
433
+ INLINE_ELEMENTS.include?(node.name.to_s.downcase)
434
+ end
339
435
  end
340
436
  end
341
437
  end
@@ -28,6 +28,9 @@ module Canon
28
28
  # @return [Integer] Comparison result code
29
29
  def compare(node1, node2, comparator, opts, child_opts,
30
30
  diff_children, differences)
31
+ # FAST PATH: Object identity - same object means equivalent children
32
+ return Comparison::EQUIVALENT if node1.equal?(node2)
33
+
31
34
  # Apply side-specific pretty-print heuristic when either flag is set:
32
35
  # pretty_printed_expected → drop \n-starting whitespace nodes from node1
33
36
  # pretty_printed_received → drop \n-starting whitespace nodes from node2
@@ -43,6 +46,9 @@ diff_children, differences)
43
46
  # Quick check: if both have no children, they're equivalent
44
47
  return Comparison::EQUIVALENT if children1.empty? && children2.empty?
45
48
 
49
+ # FAST PATH: Identical children arrays mean equivalent subtrees
50
+ return Comparison::EQUIVALENT if children1.equal?(children2)
51
+
46
52
  # Check if we can use ElementMatcher (requires Canon::Xml::DataModel nodes)
47
53
  if can_use_element_matcher?(children1, children2)
48
54
  use_element_matcher_comparison(children1, children2, node1, comparator,
@@ -14,15 +14,18 @@ module Canon
14
14
  # @param node [String, Object] Node to parse
15
15
  # @param preprocessing [Symbol] Preprocessing mode (:none, :normalize, :c14n, :format)
16
16
  # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
17
+ # @param parser [Symbol] Parser backend (:sax or :dom, default from config)
17
18
  # @return [Canon::Xml::Node] Parsed node
18
- def self.parse(node, preprocessing = :none, preserve_whitespace: false)
19
+ def self.parse(node, preprocessing = :none, preserve_whitespace: false,
20
+ parser: nil)
19
21
  # If already a Canon::Xml::Node, return as-is
20
22
  return node if node.is_a?(Canon::Xml::Node)
21
23
 
22
24
  # If it's a Nokogiri or Moxml node, convert to DataModel
23
25
  unless node.is_a?(String)
24
26
  return convert_from_node(node,
25
- preserve_whitespace: preserve_whitespace)
27
+ preserve_whitespace: preserve_whitespace,
28
+ parser: parser)
26
29
  end
27
30
 
28
31
  # Normalize encoding before preprocessing (UTF-16 strings can't use strip, etc.)
@@ -31,9 +34,17 @@ module Canon
31
34
  # Apply preprocessing to XML string before parsing
32
35
  xml_string = apply_preprocessing(node, preprocessing).strip
33
36
 
34
- # Use Canon::Xml::DataModel for parsing to get Canon::Xml::Node instances
35
- Canon::Xml::DataModel.from_xml(xml_string,
37
+ # Select parser backend
38
+ resolved_parser = parser || resolve_parser_config
39
+
40
+ if resolved_parser == :sax
41
+ require_relative "../../xml/sax_builder"
42
+ Canon::Xml::SaxBuilder.parse(xml_string,
36
43
  preserve_whitespace: preserve_whitespace)
44
+ else
45
+ Canon::Xml::DataModel.from_xml(xml_string,
46
+ preserve_whitespace: preserve_whitespace)
47
+ end
37
48
  end
38
49
 
39
50
  # Apply preprocessing transformation to XML string
@@ -62,9 +73,18 @@ module Canon
62
73
  #
63
74
  # @param node [Object] Nokogiri or Moxml node
64
75
  # @param preserve_whitespace [Boolean] Whether to preserve whitespace-only text nodes
76
+ # @param parser [Symbol, nil] Parser backend override
65
77
  # @return [Canon::Xml::Node] Converted node
66
- def self.convert_from_node(node, preserve_whitespace: false)
67
- # Convert to XML string then parse through DataModel
78
+ def self.convert_from_node(node, preserve_whitespace: false,
79
+ parser: nil)
80
+ # FAST PATH: Convert Nokogiri/Moxml nodes directly without string round-trip
81
+ if defined?(Nokogiri::XML::Node) && node.is_a?(Nokogiri::XML::Node)
82
+ return Canon::Xml::DataModel.build_from_nokogiri(
83
+ node, preserve_whitespace: preserve_whitespace
84
+ )
85
+ end
86
+
87
+ # SLOW PATH: Fallback to string serialization for unknown node types
68
88
  xml_str = if node.respond_to?(:to_xml)
69
89
  node.to_xml
70
90
  elsif node.respond_to?(:to_s)
@@ -73,8 +93,26 @@ module Canon
73
93
  raise Canon::Error,
74
94
  "Unable to convert node to string: #{node.class}"
75
95
  end
76
- Canon::Xml::DataModel.from_xml(xml_str,
96
+
97
+ resolved_parser = parser || resolve_parser_config
98
+
99
+ if resolved_parser == :sax
100
+ require_relative "../../xml/sax_builder"
101
+ Canon::Xml::SaxBuilder.parse(xml_str,
77
102
  preserve_whitespace: preserve_whitespace)
103
+ else
104
+ Canon::Xml::DataModel.from_xml(xml_str,
105
+ preserve_whitespace: preserve_whitespace)
106
+ end
107
+ end
108
+
109
+ # Resolve parser config from global config
110
+ #
111
+ # @return [Symbol] :sax or :dom
112
+ def self.resolve_parser_config
113
+ Canon::Config.instance.xml.diff.parser
114
+ rescue StandardError
115
+ :sax
78
116
  end
79
117
  end
80
118
  end
@@ -63,6 +63,18 @@ module Canon
63
63
  # @return [Boolean, Array] true if equivalent, or array of diffs if
64
64
  # verbose
65
65
  def equivalent?(n1, n2, opts = {}, child_opts = {})
66
+ # FAST PATH: Object identity - same object is always equivalent
67
+ # Skip when semantic_diff is requested (caller needs tree diff metadata)
68
+ if n1.equal?(n2) && !opts.dig(:match, :semantic_diff)
69
+ return build_trivial_equivalent_result(n1, n2, opts)
70
+ end
71
+
72
+ # FAST PATH: String content equality - identical strings are equivalent
73
+ # Skip in verbose mode since caller may need full metadata (e.g. tree_diff statistics)
74
+ if !opts[:verbose] && n1.is_a?(String) && n2.is_a?(String) && n1 == n2
75
+ return true
76
+ end
77
+
66
78
  opts = DEFAULT_OPTS.merge(opts)
67
79
 
68
80
  # Resolve match options with format-specific defaults
@@ -92,8 +104,15 @@ module Canon
92
104
  # Create child_opts with resolved options
93
105
  child_opts = opts.merge(child_opts)
94
106
 
95
- # Determine if we should preserve whitespace during parsing
96
- # When structural_whitespace is :strict, preserve all whitespace-only text nodes
107
+ # Determine if we should preserve whitespace during parsing.
108
+ # Only structural_whitespace: :strict forces whitespace-only text
109
+ # nodes to survive parsing. whitespace_type is about distinguishing
110
+ # Unicode whitespace *types* in surviving text-node content, and
111
+ # does NOT require indent text nodes to be kept — libxml's NOBLANKS
112
+ # only strips pure-ASCII whitespace-only nodes, so NBSP-only nodes
113
+ # survive regardless. Coupling whitespace_type: :strict to
114
+ # parsing-time preservation made pretty-printed fixtures produce
115
+ # spurious element-position diffs (issue #112).
97
116
  preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
98
117
 
99
118
  # Parse nodes if they are strings, applying preprocessing if needed
@@ -218,8 +237,57 @@ module Canon
218
237
  preserve_whitespace: preserve_whitespace)
219
238
  end
220
239
 
240
+ # Build result for trivially equivalent inputs (same object or identical strings)
241
+ #
242
+ # Returns plain `true` in non-verbose mode, or a ComparisonResult in verbose mode.
243
+ #
244
+ # @param n1 [Object] First input
245
+ # @param n2 [Object] Second input
246
+ # @param opts [Hash] Raw options (before merge with DEFAULT_OPTS)
247
+ # @return [Boolean, ComparisonResult]
248
+ def build_trivial_equivalent_result(n1, n2, opts)
249
+ return true unless opts[:verbose]
250
+
251
+ # Parse nodes for verbose display
252
+ preserve_whitespace = true
253
+ node1 = parse_node(n1, :none,
254
+ preserve_whitespace: preserve_whitespace)
255
+ node2 = parse_node(n2, :none,
256
+ preserve_whitespace: preserve_whitespace)
257
+ preprocessed = [
258
+ serialize_node(node1).gsub("><", ">\n<"),
259
+ serialize_node(node2).gsub("><", ">\n<"),
260
+ ]
261
+ original1 = if n1.is_a?(String)
262
+ n1
263
+ elsif n1.respond_to?(:to_xml)
264
+ n1.to_xml
265
+ else
266
+ n1.to_s
267
+ end
268
+ original2 = if n2.is_a?(String)
269
+ n2
270
+ elsif n2.respond_to?(:to_xml)
271
+ n2.to_xml
272
+ else
273
+ n2.to_s
274
+ end
275
+
276
+ ComparisonResult.new(
277
+ differences: [],
278
+ preprocessed_strings: preprocessed,
279
+ original_strings: [original1, original2],
280
+ format: :xml,
281
+ match_options: {},
282
+ algorithm: :dom,
283
+ )
284
+ end
285
+
221
286
  # Main comparison dispatcher
222
287
  def compare_nodes(n1, n2, opts, child_opts, diff_children, differences)
288
+ # FAST PATH: Object identity - same object is always equivalent
289
+ return Comparison::EQUIVALENT if n1.equal?(n2)
290
+
223
291
  # Handle DocumentFragment nodes - compare their children instead
224
292
  if n1.is_a?(Nokogiri::XML::DocumentFragment) &&
225
293
  n2.is_a?(Nokogiri::XML::DocumentFragment)
@@ -380,8 +448,10 @@ module Canon
380
448
  raw_differs = text1 != text2
381
449
 
382
450
  # Check if matches according to behavior
451
+ whitespace_type = match_opts[:whitespace_type] || :strict
383
452
  matches_per_behavior = MatchOptions.match_text?(text1, text2,
384
- behavior)
453
+ behavior,
454
+ whitespace_type: whitespace_type)
385
455
 
386
456
  # Determine the correct dimension for this difference
387
457
  # - If text_content is :strict, ALL differences use :text_content dimension
@@ -599,7 +669,13 @@ differences)
599
669
  end
600
670
  return "element '#{node.name}'#{ns_info}: #{diff1} vs #{diff2}"
601
671
  elsif node.respond_to?(:name) && !node.respond_to?(:namespace_uri)
602
- return "element missing: #{node}"
672
+ # TextNode and other nodes without namespace_uri
673
+ display = if node.respond_to?(:value) && node.node_type == :text
674
+ "\"#{truncate_text(node.value)}\""
675
+ else
676
+ node.name.to_s
677
+ end
678
+ return "element missing: #{display}"
603
679
  end
604
680
  end
605
681