canon 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec-opal +7 -0
  3. data/.rubocop_todo.yml +16 -61
  4. data/README.adoc +5 -0
  5. data/Rakefile +17 -0
  6. data/docs/features/diff-formatting/comment-asymmetry.adoc +160 -0
  7. data/lib/canon/cli.rb +1 -1
  8. data/lib/canon/color_detector.rb +3 -5
  9. data/lib/canon/comparison/child_realignment.rb +140 -0
  10. data/lib/canon/comparison/compare_profile.rb +1 -4
  11. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +2 -6
  12. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +2 -6
  13. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +2 -6
  14. data/lib/canon/comparison/dimensions/comments_dimension.rb +2 -6
  15. data/lib/canon/comparison/dimensions/element_position_dimension.rb +2 -6
  16. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +2 -6
  17. data/lib/canon/comparison/dimensions/text_content_dimension.rb +3 -5
  18. data/lib/canon/comparison/format_detector.rb +29 -20
  19. data/lib/canon/comparison/html_comparator.rb +36 -75
  20. data/lib/canon/comparison/html_compare_profile.rb +3 -10
  21. data/lib/canon/comparison/html_parser.rb +1 -1
  22. data/lib/canon/comparison/json_comparator.rb +8 -0
  23. data/lib/canon/comparison/node_inspector.rb +150 -58
  24. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +6 -8
  25. data/lib/canon/comparison/whitespace_sensitivity.rb +55 -193
  26. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +5 -10
  27. data/lib/canon/comparison/xml_comparator/child_comparison.rb +32 -77
  28. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +43 -8
  29. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +14 -28
  30. data/lib/canon/comparison/xml_comparator/node_parser.rb +12 -11
  31. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +30 -58
  32. data/lib/canon/comparison/xml_comparator.rb +89 -83
  33. data/lib/canon/comparison/xml_node_comparison.rb +15 -15
  34. data/lib/canon/comparison/yaml_comparator.rb +8 -0
  35. data/lib/canon/comparison.rb +25 -23
  36. data/lib/canon/config/profile_loader.rb +13 -13
  37. data/lib/canon/config.rb +29 -5
  38. data/lib/canon/diff/diff_classifier.rb +16 -42
  39. data/lib/canon/diff/diff_line.rb +1 -1
  40. data/lib/canon/diff/diff_node_enricher.rb +22 -24
  41. data/lib/canon/diff/node_serializer.rb +23 -30
  42. data/lib/canon/diff/path_builder.rb +24 -37
  43. data/lib/canon/diff/source_locator.rb +0 -3
  44. data/lib/canon/diff/xml_serialization_formatter.rb +8 -81
  45. data/lib/canon/diff_formatter/by_line/base_formatter.rb +7 -7
  46. data/lib/canon/diff_formatter/by_line/json_formatter.rb +1 -1
  47. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +1 -1
  48. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +2 -2
  49. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +1 -1
  50. data/lib/canon/diff_formatter/by_line_formatter.rb +1 -1
  51. data/lib/canon/diff_formatter/by_object/base_formatter.rb +11 -15
  52. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +8 -10
  53. data/lib/canon/diff_formatter/by_object_formatter.rb +1 -1
  54. data/lib/canon/diff_formatter/debug_output.rb +12 -24
  55. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +2 -2
  56. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +146 -318
  57. data/lib/canon/diff_formatter/diff_detail_formatter.rb +28 -20
  58. data/lib/canon/diff_formatter/legend.rb +2 -2
  59. data/lib/canon/diff_formatter/pretty_diff_formatter.rb +2 -2
  60. data/lib/canon/diff_formatter/theme.rb +4 -4
  61. data/lib/canon/diff_formatter.rb +2 -2
  62. data/lib/canon/formatters/html_formatter.rb +1 -1
  63. data/lib/canon/formatters/html_formatter_base.rb +1 -1
  64. data/lib/canon/formatters/xml_formatter.rb +7 -32
  65. data/lib/canon/html/data_model.rb +1 -1
  66. data/lib/canon/pretty_printer/html.rb +1 -1
  67. data/lib/canon/pretty_printer/xml.rb +16 -7
  68. data/lib/canon/pretty_printer/xml_normalized.rb +9 -3
  69. data/lib/canon/rspec_matchers.rb +2 -2
  70. data/lib/canon/tree_diff/adapters/html_adapter.rb +1 -1
  71. data/lib/canon/tree_diff/adapters/xml_adapter.rb +1 -1
  72. data/lib/canon/tree_diff/core/tree_node.rb +1 -3
  73. data/lib/canon/validators/html_validator.rb +1 -1
  74. data/lib/canon/validators/xml_validator.rb +1 -1
  75. data/lib/canon/version.rb +1 -1
  76. data/lib/canon/xml/data_model.rb +131 -137
  77. data/lib/canon/xml/namespace_helper.rb +5 -0
  78. data/lib/canon/xml/node.rb +2 -1
  79. data/lib/canon/xml/nodes/root_node.rb +4 -0
  80. data/lib/canon/xml/nodes/text_node.rb +6 -1
  81. data/lib/canon/xml/sax_builder.rb +4 -6
  82. data/lib/canon/xml_backend.rb +49 -0
  83. data/lib/canon/xml_parsing.rb +271 -0
  84. data/lib/canon.rb +3 -1
  85. data/lib/tasks/benchmark_runner.rb +1 -1
  86. data/lib/tasks/performance_helpers.rb +1 -1
  87. metadata +7 -2
@@ -23,14 +23,10 @@ module Canon
23
23
  def extract_data(node)
24
24
  return 0 unless node
25
25
 
26
- # Handle Moxml nodes
27
- if node.is_a?(Moxml::Node)
28
- extract_from_moxml(node)
29
- # Handle Nokogiri nodes
30
- elsif node.is_a?(Nokogiri::XML::Node)
26
+ if Canon::XmlBackend.nokogiri?
31
27
  extract_from_nokogiri(node)
32
28
  else
33
- 0
29
+ extract_from_moxml(node)
34
30
  end
35
31
  end
36
32
 
@@ -25,14 +25,10 @@ module Canon
25
25
  def extract_data(node)
26
26
  return [] unless node
27
27
 
28
- # Handle Moxml nodes
29
- if node.is_a?(Moxml::Node)
30
- extract_from_moxml(node)
31
- # Handle Nokogiri nodes
32
- elsif node.is_a?(Nokogiri::XML::Node)
28
+ if Canon::XmlBackend.nokogiri?
33
29
  extract_from_nokogiri(node)
34
30
  else
35
- []
31
+ extract_from_moxml(node)
36
32
  end
37
33
  end
38
34
 
@@ -23,12 +23,10 @@ module Canon
23
23
  def extract_data(node)
24
24
  return nil unless node
25
25
 
26
- # Handle Moxml nodes
27
- if node.is_a?(Moxml::Node)
28
- extract_from_moxml(node)
29
- # Handle Nokogiri nodes
30
- elsif node.is_a?(Nokogiri::XML::Node)
26
+ if Canon::XmlBackend.nokogiri?
31
27
  extract_from_nokogiri(node)
28
+ else
29
+ extract_from_moxml(node)
32
30
  end
33
31
  end
34
32
 
@@ -22,27 +22,36 @@ module Canon
22
22
  # @param obj [Object] Object to detect format of
23
23
  # @return [Symbol] Format type (:xml, :html, :json, :yaml, :ruby_object, :string)
24
24
  def detect(obj)
25
- case obj
26
- when Moxml::Node, Moxml::Document
27
- :xml
28
- when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
29
- # HTML DocumentFragments
30
- :html
31
- when Nokogiri::XML::DocumentFragment
32
- # XML DocumentFragments - check if it's actually HTML
33
- obj.document&.html? ? :html : :xml
34
- when Nokogiri::XML::Document, Nokogiri::XML::Node
35
- # Check if it's HTML by looking at the document type
36
- obj.html? ? :html : :xml
37
- when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
38
- :html
39
- when String
40
- detect_string(obj)
41
- when Hash, Array
42
- # Raw Ruby objects (from parsed JSON/YAML)
43
- :ruby_object
25
+ if XmlBackend.moxml?
26
+ case obj
27
+ when Moxml::Node, Moxml::Document
28
+ :xml
29
+ when String
30
+ detect_string(obj)
31
+ when Hash, Array
32
+ :ruby_object
33
+ else
34
+ raise Canon::Error, "Unknown format for object: #{obj.class}"
35
+ end
44
36
  else
45
- raise Canon::Error, "Unknown format for object: #{obj.class}"
37
+ case obj
38
+ when Moxml::Node, Moxml::Document
39
+ :xml
40
+ when Nokogiri::HTML::DocumentFragment, Nokogiri::HTML5::DocumentFragment
41
+ :html
42
+ when Nokogiri::XML::DocumentFragment
43
+ obj.document&.html? ? :html : :xml
44
+ when Nokogiri::XML::Document, Nokogiri::XML::Node
45
+ obj.html? ? :html : :xml
46
+ when Nokogiri::HTML::Document, Nokogiri::HTML5::Document
47
+ :html
48
+ when String
49
+ detect_string(obj)
50
+ when Hash, Array
51
+ :ruby_object
52
+ else
53
+ raise Canon::Error, "Unknown format for object: #{obj.class}"
54
+ end
46
55
  end
47
56
  end
48
57
 
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "nokogiri"
3
+ require "nokogiri" unless RUBY_ENGINE == "opal"
4
4
  require_relative "../comparison" # Load base module with constants first
5
5
  require_relative "markup_comparator"
6
6
  require_relative "xml_comparator"
@@ -167,6 +167,11 @@ module Canon
167
167
  end
168
168
  end
169
169
 
170
+ # Public parsing API for external callers
171
+ def parse(html, preprocessing = :none)
172
+ parse_node_for_semantic(html, preprocessing)
173
+ end
174
+
170
175
  private
171
176
 
172
177
  # Check if both nodes are document fragments
@@ -188,32 +193,9 @@ module Canon
188
193
  node.is_a?(Nokogiri::HTML5::DocumentFragment)
189
194
  end
190
195
 
191
- # Record a DiffNode for a fragment-level child-count mismatch.
192
- # Each surplus child becomes its own MISSING_NODE diff so the
193
- # downstream report shows what was added or removed.
194
- def record_fragment_length_mismatch(_node1, _node2, children1,
195
- children2, differences)
196
- longer, shorter, side = if children1.length > children2.length
197
- [children1, children2, :removed]
198
- else
199
- [children2, children1, :added]
200
- end
201
-
202
- longer[shorter.length...].each do |orphan|
203
- n1 = side == :removed ? orphan : nil
204
- n2 = side == :removed ? nil : orphan
205
- differences <<
206
- Canon::Comparison::DiffNodeBuilder.build(
207
- node1: n1,
208
- node2: n2,
209
- diff1: Comparison::MISSING_NODE,
210
- diff2: Comparison::MISSING_NODE,
211
- dimension: :element_structure,
212
- )
213
- end
214
- end
215
-
216
- # Compare children of document fragments
196
+ # Compare children of document fragments using the shared
197
+ # +ChildRealignment+ walk. Structural orphans are emitted here
198
+ # (the HTML fragment path has no separate length-mismatch step).
217
199
  #
218
200
  # @param node1 [Nokogiri::DocumentFragment] First fragment
219
201
  # @param node2 [Nokogiri::DocumentFragment] Second fragment
@@ -230,29 +212,24 @@ module Canon
230
212
  children1 = XmlNodeComparison.filter_children(all_children1, opts)
231
213
  children2 = XmlNodeComparison.filter_children(all_children2, opts)
232
214
 
233
- if children1.length != children2.length
234
- # Record the length mismatch as a DiffNode so verbose mode
235
- # surfaces it. Without this, equivalent? wraps an empty
236
- # differences array and incorrectly reports the inputs as
237
- # equivalent.
238
- record_fragment_length_mismatch(node1, node2,
239
- children1, children2,
240
- differences)
241
- return Comparison::UNEQUAL_ELEMENTS
242
- elsif children1.empty?
243
- return Comparison::EQUIVALENT
244
- end
215
+ return Comparison::EQUIVALENT if children1.empty? && children2.empty?
245
216
 
246
- # Compare each pair of children
247
- children1.zip(children2).each do |child1, child2|
248
- child_result = XmlNodeComparison.compare_nodes(child1, child2,
249
- opts, child_opts,
250
- diff_children,
251
- differences)
252
- return child_result if child_result != Comparison::EQUIVALENT
217
+ emitter = html_diff_emitter(differences)
218
+ ChildRealignment.walk(children1, children2, emitter,
219
+ emit_structural_orphans: true) do |c1, c2|
220
+ XmlNodeComparison.compare_nodes(c1, c2, opts, child_opts,
221
+ diff_children, differences)
253
222
  end
223
+ end
254
224
 
255
- Comparison::EQUIVALENT
225
+ # Build a diff emitter for the HTML comparator path that
226
+ # creates DiffNode objects via DiffNodeBuilder.
227
+ def html_diff_emitter(differences)
228
+ proc do |n1, n2, d1, d2, dim|
229
+ differences << Canon::Comparison::DiffNodeBuilder.build(
230
+ node1: n1, node2: n2, diff1: d1, diff2: d2, dimension: dim,
231
+ )
232
+ end
256
233
  end
257
234
 
258
235
  # Perform semantic tree diff using SemanticTreeMatchStrategy
@@ -365,13 +342,10 @@ module Canon
365
342
  # Convert to string if needed
366
343
  html_string = if html.is_a?(String)
367
344
  html
368
- elsif html.respond_to?(:to_html)
345
+ elsif Canon::XmlParsing.xml_node?(html)
369
346
  html.to_html
370
- elsif html.respond_to?(:to_s)
371
- html.to_s
372
347
  else
373
- raise Canon::Error,
374
- "Unable to convert HTML to string: #{html.class}"
348
+ html.to_s
375
349
  end
376
350
 
377
351
  # Strip DOCTYPE for consistent parsing
@@ -520,22 +494,18 @@ module Canon
520
494
  end
521
495
 
522
496
  def find_and_normalize_style_script(node)
523
- return unless node.respond_to?(:children)
497
+ return unless node.is_a?(Canon::Xml::Node)
524
498
 
525
499
  node.children.each do |child|
526
500
  next unless child.is_a?(Canon::Xml::Nodes::ElementNode)
527
501
 
528
502
  # If this is a style or script element, normalize its text content
529
503
  if %w[style script].include?(child.name.downcase)
530
- # Get text children and remove HTML comments from them
531
504
  child.children.each do |text_child|
532
505
  next unless text_child.is_a?(Canon::Xml::Nodes::TextNode)
533
506
 
534
- # Remove HTML comments from text content without using regex
535
- # to avoid ReDoS/incomplete sanitization vulnerabilities
536
507
  normalized = remove_html_comments(text_child.value)
537
- # Update the text value
538
- text_child.instance_variable_set(:@value, normalized)
508
+ text_child.value = normalized
539
509
  end
540
510
  end
541
511
 
@@ -612,13 +582,10 @@ module Canon
612
582
  # @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
613
583
  # @return [String] Serialized HTML string
614
584
  def serialize_for_display(node)
615
- # Use XmlNodeComparison's serializer for Canon::Xml::Node
616
585
  if node.is_a?(Canon::Xml::Node)
617
586
  XmlNodeComparison.serialize_node_to_xml(node)
618
- elsif node.respond_to?(:to_html)
619
- node.to_html
620
- elsif node.respond_to?(:to_xml)
621
- node.to_xml
587
+ elsif Canon::XmlParsing.xml_node?(node)
588
+ Canon::XmlBackend.nokogiri? ? node.to_html : Canon::XmlParsing.serialize(node)
622
589
  else
623
590
  node.to_s
624
591
  end
@@ -633,16 +600,11 @@ module Canon
633
600
  if html.is_a?(String)
634
601
  html
635
602
  elsif html.is_a?(Canon::Xml::Node)
636
- # Serialize Canon nodes to string
637
603
  Canon::Xml::DataModel.serialize(html)
638
- elsif html.respond_to?(:to_html)
639
- # Nokogiri nodes - use to_html to preserve formatting
640
- html.to_html
641
- elsif html.respond_to?(:to_s)
642
- html.to_s
604
+ elsif Canon::XmlParsing.xml_node?(html)
605
+ Canon::XmlBackend.nokogiri? ? html.to_html : html.to_s
643
606
  else
644
- raise Canon::Error,
645
- "Unable to extract original string from: #{html.class}"
607
+ html.to_s
646
608
  end
647
609
  end
648
610
 
@@ -755,11 +717,10 @@ compare_profile = nil)
755
717
  # Check if any ancestor of the given node preserves whitespace
756
718
  def ancestor_preserves_whitespace?(node, preserve_list)
757
719
  current = node
758
- while current.respond_to?(:name)
720
+ while current.is_a?(Canon::Xml::Node) || Canon::XmlParsing.xml_node?(current)
759
721
  return true if preserve_list.include?(current.name.downcase)
760
722
 
761
- # Stop at document root - documents don't have parents
762
- break if current.is_a?(Nokogiri::XML::Document)
723
+ break if Canon::XmlParsing.document?(current)
763
724
 
764
725
  current = current.parent
765
726
  end
@@ -839,7 +800,7 @@ compare_profile = nil)
839
800
  end
840
801
 
841
802
  # Check if it's a fragment that contains XML processing instructions
842
- if node.respond_to?(:children) && node.children.any? do |child|
803
+ if (node.is_a?(Canon::Xml::Node) || Canon::XmlParsing.xml_node?(node)) && node.children.any? do |child|
843
804
  child.is_a?(Nokogiri::XML::ProcessingInstruction) &&
844
805
  child.name == "xml"
845
806
  end
@@ -48,9 +48,8 @@ module Canon
48
48
 
49
49
  # If key exists, check if it's :strict
50
50
  return match_options[:comments] == :strict
51
- elsif match_options.respond_to?(:behavior_for)
51
+ elsif match_options.is_a?(ResolvedMatchOptions)
52
52
  behavior = behavior_for(dimension)
53
- # In HTML, only :strict makes comments affect equivalence
54
53
  return behavior == :strict
55
54
  end
56
55
  # Default: comments don't affect equivalence in HTML
@@ -106,14 +105,8 @@ module Canon
106
105
  def has_explicit_option?(dimension)
107
106
  if match_options.is_a?(Hash)
108
107
  match_options.key?(dimension)
109
- elsif match_options.respond_to?(:[])
110
- # For ResolvedMatchOptions, check if key exists
111
- begin
112
- match_options[dimension]
113
- true
114
- rescue StandardError
115
- false
116
- end
108
+ elsif match_options.is_a?(ResolvedMatchOptions)
109
+ !match_options.options[dimension].nil?
117
110
  else
118
111
  false
119
112
  end
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "nokogiri"
3
+ require "nokogiri" unless RUBY_ENGINE == "opal"
4
4
 
5
5
  module Canon
6
6
  module Comparison
@@ -26,6 +26,14 @@ module Canon
26
26
  }.freeze
27
27
 
28
28
  class << self
29
+ # Parse JSON from string or return as-is
30
+ #
31
+ # @param obj [String, Hash, Array] JSON string or parsed object
32
+ # @return [Object] Parsed JSON object
33
+ def parse(obj)
34
+ parse_json(obj)
35
+ end
36
+
29
37
  # Compare two JSON objects for equivalence
30
38
  #
31
39
  # @param json1 [String, Hash, Array] First JSON
@@ -4,42 +4,76 @@ module Canon
4
4
  module Comparison
5
5
  # Single source of truth for cross-backend node type operations.
6
6
  #
7
- # The comparison pipeline handles nodes from two backends:
7
+ # The comparison pipeline handles nodes from multiple sources:
8
8
  # * Canon::Xml::Node (+ RootNode, ElementNode, TextNode, etc.) —
9
9
  # custom DOM built by SAX builder and DataModel.
10
- # * Nokogiri::XML::Node (+ subclasses) native Nokogiri nodes used
11
- # by the HTML comparator and some legacy paths.
10
+ # * Canon::TreeDiff::Core::TreeNode semantic tree diff nodes.
11
+ # * Backend-specific nodes (Nokogiri or Moxml) — live parsed nodes.
12
12
  #
13
- # Every method here dispatches on type via +case/when+ (+is_a?+).
14
- # No +respond_to?+ the types are known at every call site.
13
+ # All type dispatch uses backend-branching (`if XmlBackend.nokogiri?`)
14
+ # rather than `case/when` with constant references. This prevents
15
+ # NameError when Nokogiri constants are undefined under Opal.
16
+ #
17
+ # Every node query in the codebase should go through this module.
18
+ # Do not create private dispatch methods in consumers.
15
19
  module NodeInspector
16
- CANON_TEXT_TYPE = :text
17
20
  NOKOGIRI_TEXT_TYPE = defined?(Nokogiri::XML::Node::TEXT_NODE) ? Nokogiri::XML::Node::TEXT_NODE : 3
18
21
 
19
- # True when +node+ is a text node (whitespace, content, etc.).
22
+ # --- Type predicates ---
23
+
20
24
  def self.text_node?(node)
21
- case node
22
- when Canon::Xml::Node
23
- node.node_type == CANON_TEXT_TYPE
24
- when Nokogiri::XML::Node
25
- node.node_type == NOKOGIRI_TEXT_TYPE
25
+ return false unless node
26
+ return node.node_type == :text if node.is_a?(Canon::Xml::Node)
27
+
28
+ if XmlBackend.nokogiri?
29
+ node.is_a?(Nokogiri::XML::Text) || node.is_a?(Moxml::Text)
26
30
  else
27
- false
31
+ node.is_a?(Moxml::Text)
28
32
  end
29
33
  end
30
34
 
31
- # Extract the text content of +node+ as a String.
32
- def self.text_content(node)
33
- case node
34
- when Canon::Xml::Node
35
- node.value.to_s
36
- when Nokogiri::XML::Node
37
- node.content.to_s
35
+ def self.element_node?(node)
36
+ return false unless node
37
+ return node.node_type == :element if node.is_a?(Canon::Xml::Node)
38
+
39
+ if XmlBackend.nokogiri?
40
+ node.is_a?(Nokogiri::XML::Element) || node.is_a?(Moxml::Element)
41
+ else
42
+ node.is_a?(Moxml::Element)
43
+ end
44
+ end
45
+
46
+ def self.comment_node?(node)
47
+ return false unless node
48
+ return node.node_type == :comment if node.is_a?(Canon::Xml::Node)
49
+
50
+ if XmlBackend.nokogiri?
51
+ return true if node.is_a?(Nokogiri::XML::Node) && node.comment?
52
+
53
+ # HTML comments are parsed as TEXT nodes by Nokogiri
54
+ if node.is_a?(Nokogiri::XML::Node) && node.text?
55
+ text_stripped = text_content(node).to_s.strip.gsub("\\", "")
56
+ return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
57
+ end
58
+ false
38
59
  else
39
- node.to_s
60
+ node.is_a?(Moxml::Comment)
40
61
  end
41
62
  end
42
63
 
64
+ def self.document?(node)
65
+ return node.node_type == :root if node.is_a?(Canon::Xml::Node)
66
+
67
+ XmlParsing.document?(node)
68
+ end
69
+
70
+ def self.document_fragment?(node)
71
+ return false unless node
72
+ return false unless node.is_a?(Canon::Xml::Nodes::RootNode)
73
+
74
+ node.fragment?
75
+ end
76
+
43
77
  # True when +node+ is a text node whose content is whitespace-only.
44
78
  # Empty-string text nodes return false — those represent genuine
45
79
  # empty-vs-content asymmetry, not pretty-print indentation.
@@ -50,62 +84,120 @@ module Canon
50
84
  !text.empty? && text.strip.empty?
51
85
  end
52
86
 
53
- # True when +node+ is a comment node.
54
- # For HTML, also detects comments that Nokogiri parses as TEXT nodes
55
- # (content like "<!-- comment -->" or escaped "<\\!-- comment -->").
56
- def self.comment_node?(node)
87
+ # --- Noise classification ---
88
+
89
+ def self.noise_dimension_for(node)
90
+ if whitespace_only_text?(node)
91
+ :whitespace_adjacency
92
+ elsif comment_node?(node)
93
+ :comments
94
+ end
95
+ end
96
+
97
+ def self.noise_node?(node)
98
+ !noise_dimension_for(node).nil?
99
+ end
100
+
101
+ # --- Node queries ---
102
+
103
+ # Unified node name extraction across all node types.
104
+ def self.name(node)
105
+ return nil unless node
106
+ return node.name if node.is_a?(Canon::Xml::Node)
107
+ return node.label if node.is_a?(Canon::TreeDiff::Core::TreeNode)
108
+
109
+ XmlParsing.name(node)
110
+ end
111
+
112
+ # Unified parent access across all node types.
113
+ def self.parent(node)
114
+ return nil unless node
115
+ return node.parent if node.is_a?(Canon::Xml::Node)
116
+ return node.parent if node.is_a?(Canon::TreeDiff::Core::TreeNode)
117
+
118
+ XmlParsing.parent(node)
119
+ end
120
+
121
+ # Unified children access across all node types.
122
+ def self.children(node)
123
+ return [] unless node
124
+ return node.children if node.is_a?(Canon::Xml::Node)
125
+ return node.children || [] if node.is_a?(Canon::TreeDiff::Core::TreeNode)
126
+
127
+ XmlParsing.children(node)
128
+ end
129
+
130
+ # Extract the text content of +node+ as a String.
131
+ def self.text_content(node)
57
132
  case node
133
+ when Canon::Xml::Nodes::TextNode
134
+ node.value.to_s
58
135
  when Canon::Xml::Node
59
- node.node_type == :comment
60
- when Nokogiri::XML::Node
61
- return true if node.comment?
136
+ node.text_content.to_s
137
+ when Moxml::Text
138
+ node.content.to_s
139
+ else
140
+ XmlParsing.text_content(node).to_s
141
+ end
142
+ end
62
143
 
63
- # HTML comments are parsed as TEXT nodes by Nokogiri
64
- if node.text?
65
- text_stripped = text_content(node).to_s.strip.gsub("\\", "")
66
- return true if text_stripped.start_with?("<!--") && text_stripped.end_with?("-->")
67
- end
68
- false
144
+ # Unified node type that always returns a symbol.
145
+ # Returns nil for unrecognised nodes.
146
+ def self.node_type(node)
147
+ return nil unless node
148
+ return node.node_type if node.is_a?(Canon::Xml::Node)
149
+
150
+ if node.is_a?(Canon::TreeDiff::Core::TreeNode)
151
+ node.type&.to_sym
69
152
  else
70
- false
153
+ XmlParsing.node_type(node)
71
154
  end
72
155
  end
73
156
 
74
- # True when +node+ is an element node.
75
- def self.element_node?(node)
76
- case node
77
- when Canon::Xml::Node
78
- node.node_type == :element
79
- when Nokogiri::XML::Node
80
- node.element?
157
+ # Unified attribute value access.
158
+ def self.attribute_value(node, attr_name)
159
+ return nil unless node
160
+
161
+ if node.is_a?(Canon::Xml::Nodes::ElementNode)
162
+ attr = node.attribute_nodes.find { |a| a.name == attr_name.to_s }
163
+ attr&.value
164
+ elsif node.is_a?(Canon::Xml::Node)
165
+ nil
81
166
  else
82
- false
167
+ XmlParsing.attribute_value(node, attr_name)
168
+ end
169
+ end
170
+
171
+ # Unified namespace URI access.
172
+ def self.namespace_uri(node)
173
+ return nil unless node
174
+
175
+ if node.is_a?(Canon::Xml::Node)
176
+ node.is_a?(Canon::Xml::Nodes::ElementNode) ? node.namespace_uri : nil
177
+ else
178
+ XmlParsing.namespace_uri(node)
83
179
  end
84
180
  end
85
181
 
86
182
  # Extract parse-time errors carried on a node or its owning document.
87
- # Returns an Array of Strings.
88
183
  def self.parse_errors(node)
89
- case node
90
- when nil
91
- []
92
- when Canon::Xml::Node
93
- errors = node.parse_errors
94
- Array(errors).map(&:to_s)
95
- when Nokogiri::XML::Document, Nokogiri::HTML5::Document
96
- Array(node.errors).map(&:to_s)
184
+ return [] if node.nil?
185
+ return Array(node.parse_errors).map(&:to_s) if node.is_a?(Canon::Xml::Node)
186
+
187
+ if XmlBackend.nokogiri?
188
+ if node.is_a?(Nokogiri::XML::Document) || node.is_a?(Nokogiri::HTML5::Document)
189
+ Array(node.errors).map(&:to_s)
190
+ else
191
+ []
192
+ end
97
193
  else
98
194
  []
99
195
  end
100
196
  end
101
197
 
102
- # Return the parent node of +node+, or nil when +node+ is not a
103
- # recognised DOM backend type or has no parent.
198
+ # Deprecated: use NodeInspector.parent instead.
104
199
  def self.parent_of(node)
105
- case node
106
- when Canon::Xml::Node, Nokogiri::XML::Node
107
- node.parent
108
- end
200
+ parent(node)
109
201
  end
110
202
  end
111
203
  end
@@ -125,20 +125,18 @@ module Canon
125
125
  # @param doc2 [Object] Second XML document
126
126
  # @return [Array<String>] Preprocessed strings
127
127
  def preprocess_xml(doc1, doc2)
128
- # Serialize XML to string
129
- # Use XmlNodeComparison's serializer for Canon::Xml::Node
130
128
  xml1 = if doc1.is_a?(Canon::Xml::Node)
131
129
  XmlNodeComparison.serialize_node_to_xml(doc1)
132
- elsif doc1.respond_to?(:to_xml)
133
- doc1.to_xml
130
+ elsif Canon::XmlParsing.xml_node?(doc1)
131
+ Canon::XmlParsing.serialize(doc1)
134
132
  else
135
133
  doc1.to_s
136
134
  end
137
135
 
138
136
  xml2 = if doc2.is_a?(Canon::Xml::Node)
139
137
  XmlNodeComparison.serialize_node_to_xml(doc2)
140
- elsif doc2.respond_to?(:to_xml)
141
- doc2.to_xml
138
+ elsif Canon::XmlParsing.xml_node?(doc2)
139
+ Canon::XmlParsing.serialize(doc2)
142
140
  else
143
141
  doc2.to_s
144
142
  end
@@ -167,7 +165,7 @@ module Canon
167
165
  XmlNodeComparison.serialize_node_to_xml(doc1)
168
166
  elsif doc1.is_a?(Nokogiri::XML::DocumentFragment)
169
167
  doc1.to_s
170
- elsif doc1.respond_to?(:to_html)
168
+ elsif Canon::XmlParsing.xml_node?(doc1)
171
169
  doc1.to_html
172
170
  else
173
171
  doc1.to_s
@@ -177,7 +175,7 @@ module Canon
177
175
  XmlNodeComparison.serialize_node_to_xml(doc2)
178
176
  elsif doc2.is_a?(Nokogiri::XML::DocumentFragment)
179
177
  doc2.to_s
180
- elsif doc2.respond_to?(:to_html)
178
+ elsif Canon::XmlParsing.xml_node?(doc2)
181
179
  doc2.to_html
182
180
  else
183
181
  doc2.to_s