canon 0.2.3 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +31 -149
  3. data/README.adoc +9 -0
  4. data/docs/advanced/semantic-diff-report.adoc +96 -0
  5. data/docs/features/configuration-profiles.adoc +4 -2
  6. data/docs/features/diff-formatting/index.adoc +3 -0
  7. data/docs/features/diff-formatting/whitespace-adjacency.adoc +140 -0
  8. data/docs/features/match-options/html-policies.adoc +2 -0
  9. data/docs/features/match-options/index.adoc +40 -0
  10. data/docs/guides/choosing-configuration.adoc +12 -1
  11. data/docs/reference/cli-options.adoc +3 -0
  12. data/docs/reference/environment-variables.adoc +3 -1
  13. data/docs/reference/options-across-interfaces.adoc +7 -1
  14. data/docs/understanding/formats/html.adoc +9 -2
  15. data/lib/canon/cli.rb +4 -0
  16. data/lib/canon/commands/diff_command.rb +1 -0
  17. data/lib/canon/comparison/comparison_result.rb +95 -2
  18. data/lib/canon/comparison/html_comparator.rb +96 -11
  19. data/lib/canon/comparison/markup_comparator.rb +68 -71
  20. data/lib/canon/comparison/match_options/base_resolver.rb +1 -0
  21. data/lib/canon/comparison/match_options/xml_resolver.rb +8 -0
  22. data/lib/canon/comparison/match_options.rb +23 -2
  23. data/lib/canon/comparison/node_inspector.rb +103 -0
  24. data/lib/canon/comparison/whitespace_sensitivity.rb +96 -0
  25. data/lib/canon/comparison/xml_comparator/child_comparison.rb +133 -55
  26. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +24 -23
  27. data/lib/canon/comparison/xml_comparator/node_parser.rb +45 -7
  28. data/lib/canon/comparison/xml_comparator.rb +174 -7
  29. data/lib/canon/comparison/xml_node_comparison.rb +48 -66
  30. data/lib/canon/comparison.rb +143 -22
  31. data/lib/canon/config/env_schema.rb +2 -1
  32. data/lib/canon/config/profiles/metanorma.yml +3 -0
  33. data/lib/canon/config.rb +51 -5
  34. data/lib/canon/diff/diff_classifier.rb +55 -41
  35. data/lib/canon/diff/diff_line_builder.rb +9 -8
  36. data/lib/canon/diff/xml_serialization_formatter.rb +27 -42
  37. data/lib/canon/diff_formatter/by_line/base_formatter.rb +39 -4
  38. data/lib/canon/diff_formatter/by_line/html_formatter.rb +5 -2
  39. data/lib/canon/diff_formatter/by_line_formatter.rb +84 -0
  40. data/lib/canon/diff_formatter/by_object_formatter.rb +53 -0
  41. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +184 -26
  42. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +92 -4
  43. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +29 -0
  44. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +109 -0
  45. data/lib/canon/diff_formatter.rb +128 -175
  46. data/lib/canon/html/data_model.rb +10 -4
  47. data/lib/canon/pretty_printer/html.rb +76 -14
  48. data/lib/canon/pretty_printer/html_void_elements.rb +20 -0
  49. data/lib/canon/pretty_printer/xml_normalized.rb +10 -3
  50. data/lib/canon/tree_diff/adapters/html_adapter.rb +55 -2
  51. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  52. data/lib/canon/version.rb +1 -1
  53. data/lib/canon/xml/c14n.rb +59 -5
  54. data/lib/canon/xml/data_model.rb +13 -1
  55. data/lib/canon/xml/element_matcher.rb +3 -0
  56. data/lib/canon/xml/node.rb +23 -1
  57. data/lib/canon/xml/nodes/comment_node.rb +4 -0
  58. data/lib/canon/xml/nodes/element_node.rb +4 -0
  59. data/lib/canon/xml/nodes/text_node.rb +4 -0
  60. data/lib/canon/xml/sax_builder.rb +29 -2
  61. data/lib/canon/xml/xpath_engine.rb +238 -0
  62. metadata +9 -2
@@ -63,6 +63,18 @@ module Canon
63
63
  # @return [Boolean, Array] true if equivalent, or array of diffs if
64
64
  # verbose
65
65
  def equivalent?(n1, n2, opts = {}, child_opts = {})
66
+ # FAST PATH: Object identity - same object is always equivalent
67
+ # Skip when semantic_diff is requested (caller needs tree diff metadata)
68
+ if n1.equal?(n2) && !opts.dig(:match, :semantic_diff)
69
+ return build_trivial_equivalent_result(n1, n2, opts)
70
+ end
71
+
72
+ # FAST PATH: String content equality - identical strings are equivalent
73
+ # Skip in verbose mode since caller may need full metadata (e.g. tree_diff statistics)
74
+ if !opts[:verbose] && n1.is_a?(String) && n2.is_a?(String) && n1 == n2
75
+ return true
76
+ end
77
+
66
78
  opts = DEFAULT_OPTS.merge(opts)
67
79
 
68
80
  # Resolve match options with format-specific defaults
@@ -92,8 +104,15 @@ module Canon
92
104
  # Create child_opts with resolved options
93
105
  child_opts = opts.merge(child_opts)
94
106
 
95
- # Determine if we should preserve whitespace during parsing
96
- # When structural_whitespace is :strict, preserve all whitespace-only text nodes
107
+ # Determine if we should preserve whitespace during parsing.
108
+ # Only structural_whitespace: :strict forces whitespace-only text
109
+ # nodes to survive parsing. whitespace_type is about distinguishing
110
+ # Unicode whitespace *types* in surviving text-node content, and
111
+ # does NOT require indent text nodes to be kept — libxml's NOBLANKS
112
+ # only strips pure-ASCII whitespace-only nodes, so NBSP-only nodes
113
+ # survive regardless. Coupling whitespace_type: :strict to
114
+ # parsing-time preservation made pretty-printed fixtures produce
115
+ # spurious element-position diffs (issue #112).
97
116
  preserve_whitespace = match_opts_hash[:structural_whitespace] == :strict
98
117
 
99
118
  # Parse nodes if they are strings, applying preprocessing if needed
@@ -141,6 +160,8 @@ module Canon
141
160
  format: :xml,
142
161
  match_options: match_opts_hash,
143
162
  algorithm: :dom,
163
+ parse_errors_expected: Comparison.parse_errors_for(node1),
164
+ parse_errors_received: Comparison.parse_errors_for(node2),
144
165
  )
145
166
  elsif result != Comparison::EQUIVALENT && !differences.empty?
146
167
  # Non-verbose mode: check equivalence
@@ -203,6 +224,8 @@ module Canon
203
224
  format: :xml,
204
225
  match_options: match_opts_hash.merge(strategy.metadata),
205
226
  algorithm: :semantic,
227
+ parse_errors_expected: Comparison.parse_errors_for(node1),
228
+ parse_errors_received: Comparison.parse_errors_for(node2),
206
229
  )
207
230
  else
208
231
  # Simple boolean result - equivalent if no normative differences
@@ -218,8 +241,59 @@ module Canon
218
241
  preserve_whitespace: preserve_whitespace)
219
242
  end
220
243
 
244
+ # Build result for trivially equivalent inputs (same object or identical strings)
245
+ #
246
+ # Returns plain `true` in non-verbose mode, or a ComparisonResult in verbose mode.
247
+ #
248
+ # @param n1 [Object] First input
249
+ # @param n2 [Object] Second input
250
+ # @param opts [Hash] Raw options (before merge with DEFAULT_OPTS)
251
+ # @return [Boolean, ComparisonResult]
252
+ def build_trivial_equivalent_result(n1, n2, opts)
253
+ return true unless opts[:verbose]
254
+
255
+ # Parse nodes for verbose display
256
+ preserve_whitespace = true
257
+ node1 = parse_node(n1, :none,
258
+ preserve_whitespace: preserve_whitespace)
259
+ node2 = parse_node(n2, :none,
260
+ preserve_whitespace: preserve_whitespace)
261
+ preprocessed = [
262
+ serialize_node(node1).gsub("><", ">\n<"),
263
+ serialize_node(node2).gsub("><", ">\n<"),
264
+ ]
265
+ original1 = if n1.is_a?(String)
266
+ n1
267
+ elsif n1.respond_to?(:to_xml)
268
+ n1.to_xml
269
+ else
270
+ n1.to_s
271
+ end
272
+ original2 = if n2.is_a?(String)
273
+ n2
274
+ elsif n2.respond_to?(:to_xml)
275
+ n2.to_xml
276
+ else
277
+ n2.to_s
278
+ end
279
+
280
+ ComparisonResult.new(
281
+ differences: [],
282
+ preprocessed_strings: preprocessed,
283
+ original_strings: [original1, original2],
284
+ format: :xml,
285
+ match_options: {},
286
+ algorithm: :dom,
287
+ )
288
+ end
289
+
290
+ public
291
+
221
292
  # Main comparison dispatcher
222
293
  def compare_nodes(n1, n2, opts, child_opts, diff_children, differences)
294
+ # FAST PATH: Object identity - same object is always equivalent
295
+ return Comparison::EQUIVALENT if n1.equal?(n2)
296
+
223
297
  # Handle DocumentFragment nodes - compare their children instead
224
298
  if n1.is_a?(Nokogiri::XML::DocumentFragment) &&
225
299
  n2.is_a?(Nokogiri::XML::DocumentFragment)
@@ -305,7 +379,6 @@ module Canon
305
379
  end
306
380
 
307
381
  # Public comparison methods - exposed for XmlNodeComparison module
308
- public
309
382
 
310
383
  # Compare two element nodes
311
384
  def compare_element_nodes(n1, n2, opts, child_opts, diff_children,
@@ -380,8 +453,10 @@ module Canon
380
453
  raw_differs = text1 != text2
381
454
 
382
455
  # Check if matches according to behavior
456
+ whitespace_type = match_opts[:whitespace_type] || :strict
383
457
  matches_per_behavior = MatchOptions.match_text?(text1, text2,
384
- behavior)
458
+ behavior,
459
+ whitespace_type: whitespace_type)
385
460
 
386
461
  # Determine the correct dimension for this difference
387
462
  # - If text_content is :strict, ALL differences use :text_content dimension
@@ -597,9 +672,16 @@ differences)
597
672
  else
598
673
  " (namespace: #{ns})"
599
674
  end
600
- return "element '#{node.name}'#{ns_info}: #{diff1} vs #{diff2}"
675
+ label = Canon::Comparison.code_pair_label(diff1, diff2)
676
+ return "element '#{node.name}'#{ns_info}: #{label}"
601
677
  elsif node.respond_to?(:name) && !node.respond_to?(:namespace_uri)
602
- return "element missing: #{node}"
678
+ # TextNode and other nodes without namespace_uri
679
+ display = if node.respond_to?(:value) && node.node_type == :text
680
+ "\"#{truncate_text(node.value)}\""
681
+ else
682
+ node.name.to_s
683
+ end
684
+ return "element missing: #{display}"
603
685
  end
604
686
  end
605
687
 
@@ -617,6 +699,10 @@ differences)
617
699
  return build_text_diff_reason(text1, text2)
618
700
  end
619
701
 
702
+ if dimension == :whitespace_adjacency
703
+ return build_whitespace_adjacency_reason(node1, node2)
704
+ end
705
+
620
706
  # For attribute values differences, show the actual values
621
707
  if dimension == :attribute_values
622
708
  attrs1 = extract_attributes(node1)
@@ -633,8 +719,17 @@ differences)
633
719
 
634
720
  if diff1 == Canon::Comparison::MISSING_NODE && diff2 == Canon::Comparison::MISSING_NODE
635
721
  "element structure mismatch (children differ)"
722
+ elsif dimension == :element_structure &&
723
+ diff1 == Canon::Comparison::UNEQUAL_ELEMENTS &&
724
+ diff2 == Canon::Comparison::UNEQUAL_ELEMENTS &&
725
+ (node1.is_a?(Canon::Xml::Node) || node1.is_a?(Nokogiri::XML::Node)) &&
726
+ (node2.is_a?(Canon::Xml::Node) || node2.is_a?(Nokogiri::XML::Node)) &&
727
+ node1.name && node2.name && node1.name != node2.name
728
+ # Most common case: differing element names. Surface the
729
+ # actual names rather than a generic "elements differ".
730
+ "different element name (<#{node1.name}> vs <#{node2.name}>)"
636
731
  else
637
- "#{diff1} vs #{diff2}"
732
+ Canon::Comparison.code_pair_label(diff1, diff2)
638
733
  end
639
734
  end
640
735
 
@@ -748,6 +843,78 @@ differences)
748
843
  "Text: \"#{vis1}\" vs \"#{vis2}\""
749
844
  end
750
845
 
846
+ # Build a Reason line for a +:whitespace_adjacency+ diff (#137).
847
+ # Names which side carries the whitespace, the adjacency position
848
+ # relative to content neighbours, and surfaces the whitespace
849
+ # with visible markers.
850
+ def build_whitespace_adjacency_reason(node1, node2)
851
+ text1 = extract_text_from_node(node1)
852
+ text2 = extract_text_from_node(node2)
853
+
854
+ ni = NodeInspector
855
+ ws_on_first = ni.whitespace_only_text?(node1) &&
856
+ !ni.whitespace_only_text?(node2)
857
+ ws_on_second = ni.whitespace_only_text?(node2) &&
858
+ !ni.whitespace_only_text?(node1)
859
+
860
+ if ws_on_first
861
+ ws_text = text1
862
+ content_text = text2
863
+ present_side = "EXPECTED"
864
+ absent_side = "ACTUAL"
865
+ ws_node = node1
866
+ elsif ws_on_second
867
+ ws_text = text2
868
+ content_text = text1
869
+ present_side = "ACTUAL"
870
+ absent_side = "EXPECTED"
871
+ ws_node = node2
872
+ else
873
+ return build_text_diff_reason(text1, text2)
874
+ end
875
+
876
+ position = whitespace_adjacency_position(ws_node)
877
+ ws_vis = visualize_whitespace(ws_text)
878
+ content_vis = content_text ? visualize_whitespace(truncate_text(content_text)) : "(none)"
879
+
880
+ "Whitespace #{position} \"#{content_vis}\": " \
881
+ "present on #{present_side} (\"#{ws_vis}\"), absent on #{absent_side}"
882
+ end
883
+
884
+ def whitespace_adjacency_position(ws_node)
885
+ return :isolated unless ws_node.is_a?(Canon::Xml::Node) ||
886
+ ws_node.is_a?(Nokogiri::XML::Node)
887
+
888
+ parent = ws_node.parent
889
+ return :isolated if parent.nil?
890
+
891
+ siblings = parent.children
892
+ idx = siblings.index(ws_node)
893
+ return :isolated unless idx
894
+
895
+ before = sibling_with_content?(siblings, idx, -1)
896
+ after = sibling_with_content?(siblings, idx, 1)
897
+
898
+ if before && after then :surrounding
899
+ elsif before then :following
900
+ elsif after then :preceding
901
+ else :isolated
902
+ end
903
+ end
904
+
905
+ def sibling_with_content?(siblings, idx, direction)
906
+ i = idx + direction
907
+ while i >= 0 && i < siblings.length
908
+ s = siblings[i]
909
+ is_ws_text = NodeInspector.text_node?(s) &&
910
+ NodeInspector.text_content(s).strip.empty?
911
+ return true unless is_ws_text
912
+
913
+ i += direction
914
+ end
915
+ false
916
+ end
917
+
751
918
  # Check if text is only whitespace
752
919
  #
753
920
  # @param text [String] Text to check
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "node_inspector"
4
+
3
5
  module Canon
4
6
  module Comparison
5
7
  # XML Node Comparison Utilities
@@ -180,13 +182,9 @@ differences)
180
182
  # @return [Symbol] Comparison result constant
181
183
  def self.dispatch_by_node_type(node1, node2, opts, child_opts,
182
184
  diff_children, differences)
183
- # Canon::Xml::Node types use .node_type method that returns symbols
184
- # Nokogiri also has .node_type but returns integers, so check for Symbol
185
- if node1.respond_to?(:node_type) && node2.respond_to?(:node_type) &&
186
- node1.node_type.is_a?(Symbol) && node2.node_type.is_a?(Symbol)
185
+ if node1.is_a?(Canon::Xml::Node) && node2.is_a?(Canon::Xml::Node)
187
186
  dispatch_canon_node_type(node1, node2, opts, child_opts,
188
187
  diff_children, differences)
189
- # Moxml/Nokogiri types use .element?, .text?, etc. methods
190
188
  else
191
189
  dispatch_legacy_node_type(node1, node2, opts, child_opts,
192
190
  diff_children, differences)
@@ -232,6 +230,17 @@ diff_children, differences)
232
230
  return false unless text_node?(node) && node.parent
233
231
  return false unless MatchOptions.normalize_text(node_text(node)).empty?
234
232
 
233
+ # HTML-specific: NBSP (U+00A0) is never insignificant whitespace —
234
+ # it always renders as a visible non-breaking space.
235
+ format = opts[:format] || match_opts[:format]
236
+ if %i[html html4 html5].include?(format)
237
+ return false if WhitespaceSensitivity.contains_nbsp?(node_text(node))
238
+
239
+ # Whitespace between inline element siblings is semantically
240
+ # significant (renders as a visible gap) and must not be stripped.
241
+ return false if WhitespaceSensitivity.inline_whitespace_significant?(node)
242
+ end
243
+
235
244
  return true unless WhitespaceSensitivity.whitespace_preserved?(
236
245
  node.parent, match_opts
237
246
  )
@@ -275,8 +284,8 @@ diff_children, differences)
275
284
  def self.same_node_type?(node1, node2)
276
285
  return false if node1.class != node2.class
277
286
 
278
- # For Nokogiri/Canon::Xml nodes, check node type
279
- if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
287
+ case node1
288
+ when Canon::Xml::Node, Nokogiri::XML::Node
280
289
  node1.node_type == node2.node_type
281
290
  else
282
291
  true
@@ -294,34 +303,13 @@ diff_children, differences)
294
303
  # @param check_children [Boolean] Whether to check child nodes
295
304
  # @return [Boolean] true if node is a comment
296
305
  def self.comment_node?(node, check_children: false)
297
- result = false
298
- return true if node.respond_to?(:comment?) && node.comment?
299
- return true if node.respond_to?(:node_type) && node.node_type == :comment
300
-
301
- if node.is_a?(Nokogiri::XML::Element) && !node.children.empty? && check_children
302
- node.children.each do |child|
303
- # Recursively check child nodes for comments
304
- # limit depth to avoid infinite recursion
305
- # in case of circular structures (if any)
306
- if comment_node?(child, check_children: false)
307
- result = true
308
- break
309
- end
310
- end
311
- end
312
- return true if result
313
-
314
- # HTML comments are parsed as TEXT nodes by Nokogiri
315
- # Check if this is a text node with HTML comment content
316
- if text_node?(node)
317
- text = node_text(node)
318
- # Strip whitespace and backslashes for comparison
319
- # Nokogiri escapes HTML comments as "<\\!-- comment -->" in full documents
320
- text_stripped = text.to_s.strip.gsub("\\", "")
321
- return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
322
- end
306
+ return true if NodeInspector.comment_node?(node)
323
307
 
324
- result
308
+ if check_children && node.is_a?(Nokogiri::XML::Element) && !node.children.empty?
309
+ node.children.any? { |child| NodeInspector.comment_node?(child) }
310
+ else
311
+ false
312
+ end
325
313
  end
326
314
 
327
315
  # Check if a node is a text node
@@ -329,9 +317,7 @@ diff_children, differences)
329
317
  # @param node [Object] Node to check
330
318
  # @return [Boolean] true if node is a text node
331
319
  def self.text_node?(node)
332
- (node.respond_to?(:text?) && node.text? &&
333
- !node.respond_to?(:element?)) ||
334
- (node.respond_to?(:node_type) && node.node_type == :text)
320
+ NodeInspector.text_node?(node)
335
321
  end
336
322
 
337
323
  # Extract text content from a node
@@ -341,15 +327,7 @@ diff_children, differences)
341
327
  def self.node_text(node)
342
328
  return "" unless node
343
329
 
344
- if node.respond_to?(:content)
345
- node.content.to_s
346
- elsif node.respond_to?(:text)
347
- node.text.to_s
348
- elsif node.respond_to?(:value)
349
- node.value.to_s
350
- else
351
- ""
352
- end
330
+ NodeInspector.text_content(node)
353
331
  end
354
332
 
355
333
  # Dispatch by Canon::Xml::Node type
@@ -385,21 +363,26 @@ diff_children, differences)
385
363
  # Import XmlComparator to use its comparison methods
386
364
  require_relative "xml_comparator"
387
365
 
388
- if node1.respond_to?(:element?) && node1.element?
389
- XmlComparator.compare_element_nodes(node1, node2, opts, child_opts,
390
- diff_children, differences)
391
- elsif node1.respond_to?(:text?) && node1.text?
392
- XmlComparator.compare_text_nodes(node1, node2, opts, differences)
393
- elsif node1.respond_to?(:comment?) && node1.comment?
394
- XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
395
- elsif node1.respond_to?(:cdata?) && node1.cdata?
396
- XmlComparator.compare_text_nodes(node1, node2, opts, differences)
397
- elsif node1.respond_to?(:processing_instruction?) && node1.processing_instruction?
398
- XmlComparator.compare_processing_instruction_nodes(node1, node2,
399
- opts, differences)
400
- elsif node1.respond_to?(:root)
366
+ case node1
367
+ when Nokogiri::XML::Document
401
368
  XmlComparator.compare_document_nodes(node1, node2, opts, child_opts,
402
369
  diff_children, differences)
370
+ when Nokogiri::XML::Node
371
+ if node1.element?
372
+ XmlComparator.compare_element_nodes(node1, node2, opts, child_opts,
373
+ diff_children, differences)
374
+ elsif node1.text?
375
+ XmlComparator.compare_text_nodes(node1, node2, opts, differences)
376
+ elsif node1.comment?
377
+ XmlComparator.compare_comment_nodes(node1, node2, opts, differences)
378
+ elsif node1.cdata?
379
+ XmlComparator.compare_text_nodes(node1, node2, opts, differences)
380
+ elsif node1.processing_instruction?
381
+ XmlComparator.compare_processing_instruction_nodes(node1, node2,
382
+ opts, differences)
383
+ else
384
+ Comparison::EQUIVALENT
385
+ end
403
386
  else
404
387
  Comparison::EQUIVALENT
405
388
  end
@@ -431,10 +414,11 @@ differences)
431
414
  # @param node [Canon::Xml::Node, Object] Node to serialize
432
415
  # @return [String] XML string representation
433
416
  def self.serialize_node_to_xml(node)
434
- if node.is_a?(Canon::Xml::Nodes::RootNode)
417
+ case node
418
+ when Canon::Xml::Nodes::RootNode
435
419
  # Serialize all children of root
436
420
  node.children.map { |child| serialize_node_to_xml(child) }.join
437
- elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
421
+ when Canon::Xml::Nodes::ElementNode
438
422
  # Serialize element with attributes and children
439
423
  attrs = node.attribute_nodes.map do |a|
440
424
  " #{a.name}=\"#{a.value}\""
@@ -448,14 +432,12 @@ differences)
448
432
  else
449
433
  "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
450
434
  end
451
- elsif node.is_a?(Canon::Xml::Nodes::TextNode)
435
+ when Canon::Xml::Nodes::TextNode
452
436
  node.value
453
- elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
437
+ when Canon::Xml::Nodes::CommentNode
454
438
  "<!--#{node.value}-->"
455
- elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
439
+ when Canon::Xml::Nodes::ProcessingInstructionNode
456
440
  "<?#{node.target} #{node.data}?>"
457
- elsif node.respond_to?(:to_xml)
458
- node.to_xml
459
441
  else
460
442
  node.to_s
461
443
  end
@@ -122,6 +122,65 @@ module Canon
122
122
  UNEQUAL_TYPES = 15
123
123
  UNEQUAL_PRIMITIVES = 16
124
124
 
125
+ # Human-readable labels for the integer comparison-result constants
126
+ # above. Used by the diff reason builders so user-facing reason text
127
+ # never leaks raw numeric codes (e.g. "7 vs 7" — see lutaml/canon#127).
128
+ # String diff codes (e.g. "position 3" emitted by ChildComparison)
129
+ # pass through +code_label+ unchanged.
130
+ CODE_LABELS = {
131
+ EQUIVALENT => "equivalent",
132
+ MISSING_ATTRIBUTE => "missing attribute",
133
+ MISSING_NODE => "missing",
134
+ UNEQUAL_ATTRIBUTES => "attributes differ",
135
+ UNEQUAL_COMMENTS => "comments differ",
136
+ UNEQUAL_DOCUMENTS => "documents differ",
137
+ UNEQUAL_ELEMENTS => "elements differ",
138
+ UNEQUAL_NODES_TYPES => "node types differ",
139
+ UNEQUAL_TEXT_CONTENTS => "text content differs",
140
+ MISSING_HASH_KEY => "missing hash key",
141
+ UNEQUAL_HASH_VALUES => "hash values differ",
142
+ UNEQUAL_HASH_KEY_ORDER => "hash key order differs",
143
+ UNEQUAL_ARRAY_LENGTHS => "array lengths differ",
144
+ UNEQUAL_ARRAY_ELEMENTS => "array elements differ",
145
+ UNEQUAL_TYPES => "types differ",
146
+ UNEQUAL_PRIMITIVES => "primitives differ",
147
+ }.freeze
148
+
149
+ # Translate a comparison result code (Integer constant or String label
150
+ # like "position 3") into a human-readable reason fragment. Unknown
151
+ # values pass through via +to_s+ as a defensive fallback.
152
+ #
153
+ # @param code [Integer, String] Comparison result code
154
+ # @return [String] Human-readable label
155
+ def self.code_label(code)
156
+ return code if code.is_a?(String)
157
+
158
+ CODE_LABELS[code] || code.to_s
159
+ end
160
+
161
+ # Build a "diff1 [vs diff2]" reason fragment that never leaks raw
162
+ # integer constants. When both codes are equal, returns the single
163
+ # label (e.g. "elements differ") rather than "elements differ vs
164
+ # elements differ". See lutaml/canon#127.
165
+ #
166
+ # @param diff1 [Integer, String] First diff code
167
+ # @param diff2 [Integer, String] Second diff code
168
+ # @return [String] Reason fragment
169
+ def self.code_pair_label(diff1, diff2)
170
+ return code_label(diff1) if diff1 == diff2
171
+
172
+ "#{code_label(diff1)} vs #{code_label(diff2)}"
173
+ end
174
+
175
+ # Extract parse-time errors from a parsed-tree or Nokogiri fragment.
176
+ # Delegates to NodeInspector for cross-backend type dispatch.
177
+ #
178
+ # @param node [Object, nil] Parsed node
179
+ # @return [Array<String>] Parse errors as strings (empty by default)
180
+ def self.parse_errors_for(node)
181
+ NodeInspector.parse_errors(node)
182
+ end
183
+
125
184
  class << self
126
185
  # Auto-detect format and compare two objects
127
186
  #
@@ -144,6 +203,35 @@ module Canon
144
203
  dom_diff(obj1, obj2, opts)
145
204
  end
146
205
 
206
+ # Summarize the first difference between two documents.
207
+ #
208
+ # Returns a human-readable string describing the first difference
209
+ # when documents differ, or "Equivalent" when they match.
210
+ # This is a lightweight alternative to +equivalent?+ with +verbose: true+.
211
+ #
212
+ # @param obj1 [Object] First object to compare
213
+ # @param obj2 [Object] Second object to compare
214
+ # @param opts [Hash] Comparison options (same as +equivalent?+)
215
+ # @return [String] Summary string
216
+ #
217
+ # @example
218
+ # Canon::Comparison.summarize("<p>Hello</p>", "<p>World</p>")
219
+ # # => "Not equivalent: text content differs at /p[1] (Hello vs World)"
220
+ #
221
+ # Canon::Comparison.summarize("<p>Hello</p>", "<p>Hello</p>")
222
+ # # => "Equivalent"
223
+ def summarize(obj1, obj2, opts = {})
224
+ result = equivalent?(obj1, obj2, opts.merge(verbose: true))
225
+
226
+ if result.is_a?(ComparisonResult)
227
+ result.summary
228
+ elsif result == true
229
+ "Equivalent"
230
+ else
231
+ "Not equivalent"
232
+ end
233
+ end
234
+
147
235
  # Define a custom comparison profile with DSL syntax
148
236
  #
149
237
  # @param name [Symbol] Profile name
@@ -602,26 +690,26 @@ module Canon
602
690
  # parsers can mutate the DOM).
603
691
  opts[:_original_str1] = obj1.dup if obj1.is_a?(String)
604
692
  opts[:_original_str2] = obj2.dup if obj2.is_a?(String)
605
- if opts[:format] == :html5
606
- # HTML5 fragment parsing is safe — it normalizes without
607
- # destructive content-model mutations.
608
- obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
609
- obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
610
- else
611
- # HTML4 fragment parsing mutates the DOM (strips <body>
612
- # attributes, re-parents <h1> content, etc.). Use XML
613
- # fragment parsing which preserves structure faithfully.
614
- if obj1.is_a?(String)
615
- obj1 = Nokogiri::XML.fragment(
616
- strip_xml_preamble(obj1),
617
- )
618
- end
619
- if obj2.is_a?(String)
620
- obj2 = Nokogiri::XML.fragment(
621
- strip_xml_preamble(obj2),
622
- )
623
- end
624
- end
693
+ # Parse all HTML formats (:html, :html4, :html5) with
694
+ # Nokogiri::HTML5 so that html4 and html5 share HTML's
695
+ # whitespace-sensitivity semantics (issue #118).
696
+ #
697
+ # The previous html/html4 branch used Nokogiri::XML.fragment
698
+ # to dodge Nokogiri::HTML4.fragment's destructive DOM
699
+ # mutations. That avoided one problem but introduced a
700
+ # bigger one: XML whitespace rules were being applied to
701
+ # HTML content. HTML's content model — identical between
702
+ # HTML4 and HTML5 — treats whitespace-only text between
703
+ # block-level children as insignificant; XML treats every
704
+ # whitespace text node as significant. Routing html4 input
705
+ # through an XML parser therefore made
706
+ # be_html4_equivalent_to reject inputs that
707
+ # be_html5_equivalent_to (correctly) accepts.
708
+ # Nokogiri::HTML5.fragment is non-destructive (the original
709
+ # HTML4.fragment concern does not apply to it) and applies
710
+ # HTML's content model uniformly.
711
+ obj1 = HtmlParser.parse(obj1, :html5) if obj1.is_a?(String)
712
+ obj2 = HtmlParser.parse(obj2, :html5) if obj2.is_a?(String)
625
713
  end
626
714
  else
627
715
  format1 = FormatDetector.detect(obj1)
@@ -662,8 +750,14 @@ module Canon
662
750
  # but defined in config
663
751
  if Canon::Config.instance.respond_to?(comparison_format)
664
752
  format_config = Canon::Config.instance.public_send(comparison_format)
665
- if opts[:match_profile].nil? && format_config.match.profile
666
- opts[:match_profile] = format_config.match.profile
753
+ if opts[:global_profile].nil? && format_config.match.profile
754
+ # Config-sourced profile has *global* priority (applied before
755
+ # global_options), so that YAML profile_options like
756
+ # whitespace_type: :normalize can override the built-in profile
757
+ # (e.g. :spec_friendly)'s whitespace_type: :strict. Writing to
758
+ # :match_profile here gave the config profile per-call priority,
759
+ # which incorrectly overrode the YAML's own overrides.
760
+ opts[:global_profile] = format_config.match.profile
667
761
  end
668
762
  # Pass YAML profile's extra match options (e.g., preserve_whitespace_elements)
669
763
  # that are stored in MatchConfig's resolver but not exposed via the
@@ -701,6 +795,33 @@ module Canon
701
795
  str
702
796
  end
703
797
 
798
+ # Decode HTML named entities (&nbsp; etc.) to their numeric
799
+ # character reference equivalents so that Nokogiri::XML.fragment
800
+ # (which only understands the five XML entities) preserves them
801
+ # as text nodes instead of silently dropping them.
802
+ #
803
+ # Uses Nokogiri's HTML4 parser to resolve the entities — the
804
+ # text is extracted from a fragment so no structural tags are added.
805
+ #
806
+ # @param str [String] HTML string potentially containing named entities
807
+ # @return [String] String with named entities replaced by characters
808
+ def decode_html_entities(str)
809
+ # Fast path: skip if no ampersands present
810
+ return str unless str.include?("&")
811
+
812
+ # Parse as HTML fragment to resolve named entities, then
813
+ # re-serialize as text. This converts &nbsp; → U+00A0, etc.
814
+ doc = Nokogiri::HTML4.fragment(str)
815
+
816
+ # Serialize back, preserving the resolved characters.
817
+ # to_html re-encodes characters, so use inner_html which
818
+ # keeps the character form.
819
+ doc.inner_html
820
+
821
+ # If the serialization re-encoded characters as entities,
822
+ # that's fine — the XML parser understands numeric refs like &#160;
823
+ end
824
+
704
825
  # Detect the format of an object (delegates to FormatDetector)
705
826
  #
706
827
  # @param obj [Object] Object to detect format of
@@ -14,6 +14,7 @@ module Canon
14
14
  show_diffs: :symbol,
15
15
  verbose_diff: :boolean,
16
16
  algorithm: :symbol,
17
+ parser: :symbol,
17
18
  show_raw_inputs: :boolean,
18
19
  show_raw_expected: :boolean,
19
20
  show_raw_received: :boolean,
@@ -66,7 +67,7 @@ module Canon
66
67
 
67
68
  def all_diff_attributes
68
69
  %i[mode use_color context_lines grouping_lines show_diffs
69
- verbose_diff algorithm show_raw_inputs show_raw_expected show_raw_received
70
+ verbose_diff algorithm parser show_raw_inputs show_raw_expected show_raw_received
70
71
  show_preprocessed_inputs show_preprocessed_expected show_preprocessed_received
71
72
  show_prettyprint_inputs show_prettyprint_expected show_prettyprint_received
72
73
  show_line_numbered_inputs character_visualization
@@ -28,6 +28,9 @@ formats:
28
28
  xml:
29
29
  match:
30
30
  profile: spec_friendly
31
+ # Treat different Unicode whitespace types (space, NBSP, ideographic space, etc.)
32
+ # as equivalent — useful for spec comparisons where whitespace type doesn't matter
33
+ whitespace_type: :normalize
31
34
  # Elements where whitespace is PRESERVED exactly (no manipulation)
32
35
  # All whitespace characters are significant in these elements
33
36
  preserve_whitespace_elements: