canon 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +25 -135
  3. data/README.adoc +13 -13
  4. data/docs/.lycheeignore +69 -0
  5. data/docs/advanced/extending-canon.adoc +193 -0
  6. data/docs/internals/diffnode-enrichment.adoc +611 -0
  7. data/docs/internals/index.adoc +251 -0
  8. data/docs/lychee.toml +13 -6
  9. data/docs/plans/2025-01-17-html-parser-selection-fix.adoc +250 -0
  10. data/docs/understanding/architecture.adoc +749 -33
  11. data/docs/understanding/comparison-pipeline.adoc +122 -0
  12. data/false_positive_analysis.txt +0 -0
  13. data/file1.html +1 -0
  14. data/file2.html +1 -0
  15. data/lib/canon/cache.rb +129 -0
  16. data/lib/canon/comparison/dimensions/attribute_order_dimension.rb +68 -0
  17. data/lib/canon/comparison/dimensions/attribute_presence_dimension.rb +68 -0
  18. data/lib/canon/comparison/dimensions/attribute_values_dimension.rb +171 -0
  19. data/lib/canon/comparison/dimensions/base_dimension.rb +107 -0
  20. data/lib/canon/comparison/dimensions/comments_dimension.rb +121 -0
  21. data/lib/canon/comparison/dimensions/element_position_dimension.rb +90 -0
  22. data/lib/canon/comparison/dimensions/registry.rb +77 -0
  23. data/lib/canon/comparison/dimensions/structural_whitespace_dimension.rb +119 -0
  24. data/lib/canon/comparison/dimensions/text_content_dimension.rb +96 -0
  25. data/lib/canon/comparison/dimensions.rb +54 -0
  26. data/lib/canon/comparison/format_detector.rb +86 -0
  27. data/lib/canon/comparison/html_comparator.rb +51 -18
  28. data/lib/canon/comparison/html_parser.rb +80 -0
  29. data/lib/canon/comparison/json_comparator.rb +12 -0
  30. data/lib/canon/comparison/json_parser.rb +19 -0
  31. data/lib/canon/comparison/markup_comparator.rb +293 -0
  32. data/lib/canon/comparison/match_options/base_resolver.rb +143 -0
  33. data/lib/canon/comparison/match_options/json_resolver.rb +82 -0
  34. data/lib/canon/comparison/match_options/xml_resolver.rb +151 -0
  35. data/lib/canon/comparison/match_options/yaml_resolver.rb +87 -0
  36. data/lib/canon/comparison/match_options.rb +68 -463
  37. data/lib/canon/comparison/profile_definition.rb +149 -0
  38. data/lib/canon/comparison/ruby_object_comparator.rb +180 -0
  39. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +7 -10
  40. data/lib/canon/comparison/xml_comparator/attribute_comparator.rb +177 -0
  41. data/lib/canon/comparison/xml_comparator/attribute_filter.rb +136 -0
  42. data/lib/canon/comparison/xml_comparator/child_comparison.rb +189 -0
  43. data/lib/canon/comparison/xml_comparator/diff_node_builder.rb +115 -0
  44. data/lib/canon/comparison/xml_comparator/namespace_comparator.rb +186 -0
  45. data/lib/canon/comparison/xml_comparator/node_parser.rb +74 -0
  46. data/lib/canon/comparison/xml_comparator/node_type_comparator.rb +95 -0
  47. data/lib/canon/comparison/xml_comparator.rb +52 -664
  48. data/lib/canon/comparison/xml_node_comparison.rb +297 -0
  49. data/lib/canon/comparison/xml_parser.rb +19 -0
  50. data/lib/canon/comparison/yaml_comparator.rb +3 -3
  51. data/lib/canon/comparison.rb +265 -110
  52. data/lib/canon/diff/diff_node.rb +32 -2
  53. data/lib/canon/diff/node_serializer.rb +191 -0
  54. data/lib/canon/diff/path_builder.rb +143 -0
  55. data/lib/canon/diff_formatter/by_line/base_formatter.rb +251 -0
  56. data/lib/canon/diff_formatter/by_line/html_formatter.rb +6 -248
  57. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +38 -229
  58. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +30 -0
  59. data/lib/canon/diff_formatter/diff_detail_formatter/dimension_formatter.rb +579 -0
  60. data/lib/canon/diff_formatter/diff_detail_formatter/location_extractor.rb +121 -0
  61. data/lib/canon/diff_formatter/diff_detail_formatter/node_utils.rb +253 -0
  62. data/lib/canon/diff_formatter/diff_detail_formatter/text_utils.rb +61 -0
  63. data/lib/canon/diff_formatter/diff_detail_formatter.rb +31 -1028
  64. data/lib/canon/diff_formatter.rb +1 -1
  65. data/lib/canon/rspec_matchers.rb +1 -1
  66. data/lib/canon/tree_diff/operation_converter.rb +92 -338
  67. data/lib/canon/tree_diff/operation_converter_helpers/metadata_enricher.rb +71 -0
  68. data/lib/canon/tree_diff/operation_converter_helpers/post_processor.rb +103 -0
  69. data/lib/canon/tree_diff/operation_converter_helpers/reason_builder.rb +168 -0
  70. data/lib/canon/tree_diff/operation_converter_helpers/update_change_handler.rb +188 -0
  71. data/lib/canon/version.rb +1 -1
  72. data/old-docs/ADVANCED_TOPICS.adoc +20 -0
  73. data/old-docs/BASIC_USAGE.adoc +16 -0
  74. data/old-docs/CHARACTER_VISUALIZATION.adoc +567 -0
  75. data/old-docs/CLI.adoc +497 -0
  76. data/old-docs/CUSTOMIZING_BEHAVIOR.adoc +19 -0
  77. data/old-docs/DIFF_ARCHITECTURE.adoc +435 -0
  78. data/old-docs/DIFF_FORMATTING.adoc +540 -0
  79. data/old-docs/DIFF_PARAMETERS.adoc +261 -0
  80. data/old-docs/DOM_DIFF.adoc +1017 -0
  81. data/old-docs/ENV_CONFIG.adoc +876 -0
  82. data/old-docs/FORMATS.adoc +867 -0
  83. data/old-docs/INPUT_VALIDATION.adoc +477 -0
  84. data/old-docs/MATCHER_BEHAVIOR.adoc +90 -0
  85. data/old-docs/MATCH_ARCHITECTURE.adoc +463 -0
  86. data/old-docs/MATCH_OPTIONS.adoc +912 -0
  87. data/old-docs/MODES.adoc +432 -0
  88. data/old-docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +219 -0
  89. data/old-docs/OPTIONS.adoc +1387 -0
  90. data/old-docs/PREPROCESSING.adoc +491 -0
  91. data/old-docs/README.old.adoc +2831 -0
  92. data/old-docs/RSPEC.adoc +814 -0
  93. data/old-docs/RUBY_API.adoc +485 -0
  94. data/old-docs/SEMANTIC_DIFF_REPORT.adoc +646 -0
  95. data/old-docs/SEMANTIC_TREE_DIFF.adoc +765 -0
  96. data/old-docs/STRING_COMPARE.adoc +345 -0
  97. data/old-docs/TMP.adoc +3384 -0
  98. data/old-docs/TREE_DIFF.adoc +1080 -0
  99. data/old-docs/UNDERSTANDING_CANON.adoc +17 -0
  100. data/old-docs/VERBOSE.adoc +482 -0
  101. data/old-docs/VISUALIZATION_MAP.adoc +625 -0
  102. data/old-docs/WHITESPACE_TREATMENT.adoc +1155 -0
  103. data/scripts/analyze_current_state.rb +85 -0
  104. data/scripts/analyze_false_positives.rb +114 -0
  105. data/scripts/analyze_remaining_failures.rb +105 -0
  106. data/scripts/compare_current_failures.rb +95 -0
  107. data/scripts/compare_dom_tree_diff.rb +158 -0
  108. data/scripts/compare_failures.rb +151 -0
  109. data/scripts/debug_attribute_extraction.rb +66 -0
  110. data/scripts/debug_blocks_839.rb +115 -0
  111. data/scripts/debug_meta_matching.rb +52 -0
  112. data/scripts/debug_p_matching.rb +192 -0
  113. data/scripts/debug_signature_matching.rb +118 -0
  114. data/scripts/debug_sourcecode_124.rb +32 -0
  115. data/scripts/debug_whitespace_sensitive.rb +192 -0
  116. data/scripts/extract_false_positives.rb +138 -0
  117. data/scripts/find_actual_false_positives.rb +125 -0
  118. data/scripts/investigate_all_false_positives.rb +161 -0
  119. data/scripts/investigate_batch1.rb +127 -0
  120. data/scripts/investigate_classification.rb +150 -0
  121. data/scripts/investigate_classification_detailed.rb +190 -0
  122. data/scripts/investigate_common_failures.rb +342 -0
  123. data/scripts/investigate_false_negative.rb +80 -0
  124. data/scripts/investigate_false_positive.rb +83 -0
  125. data/scripts/investigate_false_positives.rb +227 -0
  126. data/scripts/investigate_false_positives_batch.rb +163 -0
  127. data/scripts/investigate_mixed_content.rb +125 -0
  128. data/scripts/investigate_remaining_16.rb +214 -0
  129. data/scripts/run_single_test.rb +29 -0
  130. data/scripts/test_all_false_positives.rb +95 -0
  131. data/scripts/test_attribute_details.rb +61 -0
  132. data/scripts/test_both_algorithms.rb +49 -0
  133. data/scripts/test_both_simple.rb +49 -0
  134. data/scripts/test_enhanced_semantic_output.rb +125 -0
  135. data/scripts/test_readme_examples.rb +131 -0
  136. data/scripts/test_semantic_tree_diff.rb +99 -0
  137. data/scripts/test_semantic_ux_improvements.rb +135 -0
  138. data/scripts/test_single_false_positive.rb +119 -0
  139. data/scripts/test_size_limits.rb +99 -0
  140. data/test_html_1.html +21 -0
  141. data/test_html_2.html +21 -0
  142. data/test_nokogiri.rb +33 -0
  143. data/test_normalize.rb +45 -0
  144. metadata +123 -2
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "nokogiri"
4
4
  require_relative "../comparison" # Load base module with constants first
5
+ require_relative "markup_comparator"
5
6
  require_relative "xml_comparator"
6
7
  require_relative "match_options"
7
8
  require_relative "comparison_result"
@@ -11,12 +12,15 @@ require_relative "../diff/diff_node"
11
12
  require_relative "../diff/diff_classifier"
12
13
  require_relative "strategies/match_strategy_factory"
13
14
  require_relative "../html/data_model"
15
+ require_relative "xml_node_comparison"
14
16
 
15
17
  module Canon
16
18
  module Comparison
17
19
  # HTML comparison class
18
20
  # Handles comparison of HTML nodes with various options
19
- class HtmlComparator
21
+ #
22
+ # Inherits shared comparison functionality from MarkupComparator.
23
+ class HtmlComparator < MarkupComparator
20
24
  # Default comparison options for HTML
21
25
  DEFAULT_OPTS = {
22
26
  # Structural filtering options
@@ -108,6 +112,9 @@ module Canon
108
112
 
109
113
  # DocumentFragment nodes need special handling - compare their children
110
114
  # instead of the fragment nodes themselves
115
+ # This is a SAFETY CHECK for legacy cases where Nokogiri nodes might still be used
116
+ # The main path (parse_node) now returns Canon::Xml::Nodes::RootNode, so this
117
+ # check should rarely trigger, but we keep it for robustness
111
118
  if (node1.is_a?(Nokogiri::HTML4::DocumentFragment) ||
112
119
  node1.is_a?(Nokogiri::XML::DocumentFragment)) &&
113
120
  (node2.is_a?(Nokogiri::HTML4::DocumentFragment) ||
@@ -117,10 +124,8 @@ module Canon
117
124
  all_children2 = node2.children.to_a
118
125
 
119
126
  # Filter children based on match options (e.g., ignore comments)
120
- children1 = XmlComparator.send(:filter_children, all_children1,
121
- opts)
122
- children2 = XmlComparator.send(:filter_children, all_children2,
123
- opts)
127
+ children1 = XmlNodeComparison.filter_children(all_children1, opts)
128
+ children2 = XmlNodeComparison.filter_children(all_children2, opts)
124
129
 
125
130
  if children1.length != children2.length
126
131
  result = Comparison::UNEQUAL_ELEMENTS
@@ -130,9 +135,10 @@ module Canon
130
135
  # Compare each pair of children
131
136
  result = Comparison::EQUIVALENT
132
137
  children1.zip(children2).each do |child1, child2|
133
- child_result = XmlComparator.send(:compare_nodes, child1, child2,
134
- opts, child_opts, diff_children,
135
- differences)
138
+ child_result = XmlNodeComparison.compare_nodes(child1, child2,
139
+ opts, child_opts,
140
+ diff_children,
141
+ differences)
136
142
  if child_result != Comparison::EQUIVALENT
137
143
  result = child_result
138
144
  break
@@ -140,8 +146,9 @@ module Canon
140
146
  end
141
147
  end
142
148
  else
143
- result = XmlComparator.send(:compare_nodes, node1, node2, opts,
144
- child_opts, diff_children, differences)
149
+ result = XmlNodeComparison.compare_nodes(node1, node2, opts,
150
+ child_opts, diff_children,
151
+ differences)
145
152
  end
146
153
 
147
154
  # Classify DiffNodes as normative/informative if we have verbose output
@@ -287,7 +294,16 @@ module Canon
287
294
  end
288
295
 
289
296
  # Strip DOCTYPE for consistent parsing
290
- html_string = html_string.gsub(/<!DOCTYPE[^>]*>/i, "").strip
297
+ # Avoid the open-ended `[^>]*` regex (ReDoS concern): match only the fixed "<!DOCTYPE" prefix, then find the closing ">" with String#index
298
+ # DOCTYPE declarations end with first > character
299
+ doctype_start = html_string =~ /<!DOCTYPE/i
300
+ if doctype_start
301
+ doctype_end = html_string.index(">", doctype_start)
302
+ html_string = html_string[0...doctype_start] + html_string[(doctype_end + 1)..] if doctype_end
303
+ html_string.strip!
304
+ else
305
+ html_string = html_string.strip
306
+ end
291
307
 
292
308
  # Apply preprocessing to HTML string before parsing
293
309
  processed_html = case preprocessing
@@ -313,8 +329,15 @@ module Canon
313
329
 
314
330
  # Parse a node from string or return as-is
315
331
  # Applies preprocessing transformation before parsing if specified
316
- # For DOM comparison, returns Nokogiri nodes (not Canon::Xml::Node)
332
+ # Returns Nokogiri nodes for DOM comparison (preserves original behavior)
317
333
  def parse_node(node, preprocessing = :none, match_opts = {})
334
+ # If already a Canon::Xml::Node, convert to Nokogiri for DOM path
335
+ if node.is_a?(Canon::Xml::Node)
336
+ # Canon nodes used in semantic diff path, convert to Nokogiri for DOM path
337
+ xml_str = Canon::Xml::DataModel.serialize(node)
338
+ node = xml_str
339
+ end
340
+
318
341
  # If already a Nokogiri node, check for incompatible XML documents
319
342
  unless node.is_a?(String)
320
343
  # Detect if this is an XML document (not HTML)
@@ -357,7 +380,15 @@ module Canon
357
380
 
358
381
  # Strip DOCTYPE declarations from HTML strings
359
382
  # This normalizes parsed HTML (which includes DOCTYPE) with raw HTML strings
360
- node = node.gsub(/<!DOCTYPE[^>]*>/i, "").strip
383
+ # Avoid the open-ended `[^>]*` regex (ReDoS concern): match only the fixed "<!DOCTYPE" prefix, then find the closing ">" with String#index
384
+ doctype_start = node =~ /<!DOCTYPE/i
385
+ if doctype_start
386
+ doctype_end = node.index(">", doctype_start)
387
+ node = node[0...doctype_start] + node[(doctype_end + 1)..] if doctype_end
388
+ node.strip!
389
+ else
390
+ node = node.strip
391
+ end
361
392
 
362
393
  # Apply preprocessing to HTML string before parsing
363
394
  html_string = case preprocessing
@@ -380,10 +411,12 @@ module Canon
380
411
  # Use XML fragment parser to avoid auto-inserted meta tags
381
412
  frag = Nokogiri::XML.fragment(html_string)
382
413
 
383
- # Apply :rendered preprocessing if needed
384
- if preprocessing == :rendered
414
+ # Apply post-parsing filtering for :normalize, :format, and :rendered preprocessing
415
+ if %i[normalize format rendered].include?(preprocessing)
385
416
  normalize_html_style_script_comments(frag)
386
- normalize_rendered_whitespace(frag, match_opts)
417
+ if preprocessing == :rendered
418
+ normalize_rendered_whitespace(frag, match_opts)
419
+ end
387
420
  remove_whitespace_only_text_nodes(frag)
388
421
  end
389
422
 
@@ -461,9 +494,9 @@ module Canon
461
494
  # @param node [Canon::Xml::Node, Nokogiri::HTML::Document] Parsed node
462
495
  # @return [String] Serialized HTML string
463
496
  def serialize_for_display(node)
464
- # Use XmlComparator's serializer for Canon::Xml::Node
497
+ # Use XmlNodeComparison's serializer for Canon::Xml::Node
465
498
  if node.is_a?(Canon::Xml::Node)
466
- XmlComparator.send(:serialize_node_to_xml, node)
499
+ XmlNodeComparison.serialize_node_to_xml(node)
467
500
  elsif node.respond_to?(:to_html)
468
501
  node.to_html
469
502
  elsif node.respond_to?(:to_xml)
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+
5
+ module Canon
6
+ module Comparison
7
+ # HTML parsing service with version detection and fragment support
8
+ #
9
+ # Provides HTML parsing capabilities with automatic HTML4/HTML5 version
10
+ # detection. Handles both full documents and fragments.
11
+ #
12
+ # @example Parse HTML string
13
+ # HtmlParser.parse("<div>content</div>", :html5)
14
+ #
15
+ # @example Auto-detect and parse
16
+ # HtmlParser.detect_and_parse("<!DOCTYPE html><html>...</html>")
17
+ class HtmlParser
18
+ class << self
19
+ # Parse HTML string into Nokogiri document with the correct parser
20
+ #
21
+ # @param content [String, Object] Content to parse (returns as-is if not a string)
22
+ # @param format [Symbol] HTML format (:html, :html4, :html5)
23
+ # @return [Nokogiri::HTML::Document, Nokogiri::HTML5::Document, Nokogiri::HTML::DocumentFragment, Object]
24
+ def parse(content, format)
25
+ return content unless content.is_a?(String)
26
+ return content if already_parsed?(content)
27
+
28
+ begin
29
+ case format
30
+ when :html5
31
+ Nokogiri::HTML5.fragment(content)
32
+ when :html4
33
+ Nokogiri::HTML4.fragment(content)
34
+ when :html
35
+ detect_and_parse(content)
36
+ else
37
+ content
38
+ end
39
+ rescue StandardError
40
+ # Fallback to raw string if parsing fails (maintains backward compatibility)
41
+ content
42
+ end
43
+ end
44
+
45
+ # Check if content is already a parsed HTML document/fragment
46
+ #
47
+ # @param content [Object] Content to check
48
+ # @return [Boolean] true if already parsed
49
+ def already_parsed?(content)
50
+ content.is_a?(Nokogiri::HTML::Document) ||
51
+ content.is_a?(Nokogiri::HTML5::Document) ||
52
+ content.is_a?(Nokogiri::HTML::DocumentFragment) ||
53
+ content.is_a?(Nokogiri::HTML5::DocumentFragment)
54
+ end
55
+
56
+ # Detect HTML version from content and parse with appropriate parser
57
+ #
58
+ # @param content [String] HTML content to parse
59
+ # @return [Nokogiri::HTML::DocumentFragment] Parsed fragment
60
+ def detect_and_parse(content)
61
+ version = detect_version(content)
62
+ if version == :html5
63
+ Nokogiri::HTML5.fragment(content)
64
+ else
65
+ Nokogiri::HTML4.fragment(content)
66
+ end
67
+ end
68
+
69
+ # Detect HTML version from content string
70
+ #
71
+ # @param content [String] HTML content
72
+ # @return [Symbol] :html5 or :html4
73
+ def detect_version(content)
74
+ # Check for the HTML5 DOCTYPE (exact, case-sensitive match on "<!DOCTYPE html>"; e.g. "<!doctype html>" is treated as :html4)
75
+ content.include?("<!DOCTYPE html>") ? :html5 : :html4
76
+ end
77
+ end
78
+ end
79
+ end
80
+ end
@@ -125,6 +125,18 @@ module Canon
125
125
  if match_opts[:key_order] != :strict
126
126
  keys1 = keys1.sort_by(&:to_s)
127
127
  keys2 = keys2.sort_by(&:to_s)
128
+ elsif keys1 != keys2
129
+ # Strict mode: key order matters
130
+ # Check if keys are in same order
131
+ # Keys are different or in different order
132
+ # First check if it's just ordering (same keys, different order)
133
+ if keys1.sort_by(&:to_s) == keys2.sort_by(&:to_s)
134
+ # Same keys, different order - this is a key_order difference
135
+ key_path = path.empty? ? "(key order)" : "#{path}.(key order)"
136
+ add_ruby_difference(key_path, keys1, keys2,
137
+ Comparison::UNEQUAL_HASH_KEY_ORDER, opts, differences)
138
+ return Comparison::UNEQUAL_HASH_KEY_ORDER
139
+ end
128
140
  end
129
141
 
130
142
  # Check for missing keys
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module Comparison
5
+ # Public API for JSON parsing operations
6
+ # Provides access to parsing functionality without using send()
7
+ class JsonParser
8
+ # Parse an object to Ruby object
9
+ #
10
+ # @param obj [String, Hash, Array] Object to parse
11
+ # @return [Hash, Array] Parsed Ruby object
12
+ def self.parse_json(obj)
13
+ # Delegate to JsonComparator.parse_json (required lazily below) so callers have a public entry point instead of send()
14
+ require_relative "json_comparator"
15
+ JsonComparator.parse_json(obj)
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,293 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../comparison" # Load base module with constants
4
+ require_relative "../diff/diff_node"
5
+ require_relative "../diff/path_builder"
6
+
7
+ module Canon
8
+ module Comparison
9
+ # Base class for markup document comparison (XML, HTML)
10
+ #
11
+ # Provides shared comparison functionality for markup documents,
12
+ # including node type checking, text extraction, filtering,
13
+ # and difference creation.
14
+ #
15
+ # Format-specific comparators (XmlComparator, HtmlComparator)
16
+ # inherit from this class and add format-specific behavior.
17
+ class MarkupComparator
18
+ class << self
19
+ # Add a difference to the differences array
20
+ #
21
+ # Creates a DiffNode with enriched metadata including path,
22
+ # serialized content, and attributes for Stage 4 rendering.
23
+ #
24
+ # @param node1 [Object, nil] First node
25
+ # @param node2 [Object, nil] Second node
26
+ # @param diff1 [Symbol] Difference type for node1
27
+ # @param diff2 [Symbol] Difference type for node2
28
+ # @param dimension [Symbol] The match dimension causing this difference
29
+ # @param _opts [Hash] Options (unused but kept for interface compatibility)
30
+ # @param differences [Array] Array to append difference to
31
+ def add_difference(node1, node2, diff1, diff2, dimension, _opts,
32
+ differences)
33
+ # All differences must be DiffNode objects (OO architecture)
34
+ if dimension.nil?
35
+ raise ArgumentError,
36
+ "dimension required for DiffNode"
37
+ end
38
+
39
+ # Build informative reason message
40
+ reason = build_difference_reason(node1, node2, diff1, diff2,
41
+ dimension)
42
+
43
+ # Enrich with path, serialized content, and attributes for Stage 4 rendering
44
+ metadata = enrich_diff_metadata(node1, node2)
45
+
46
+ diff_node = Canon::Diff::DiffNode.new(
47
+ node1: node1,
48
+ node2: node2,
49
+ dimension: dimension,
50
+ reason: reason,
51
+ **metadata,
52
+ )
53
+ differences << diff_node
54
+ end
55
+
56
+ # Enrich DiffNode with canonical path, serialized content, and attributes
57
+ # This extracts presentation-ready metadata from nodes for Stage 4 rendering
58
+ #
59
+ # @param node1 [Object, nil] First node
60
+ # @param node2 [Object, nil] Second node
61
+ # @return [Hash] Enriched metadata hash
62
+ def enrich_diff_metadata(node1, node2)
63
+ {
64
+ path: build_path_for_node(node1 || node2),
65
+ serialized_before: serialize_node(node1),
66
+ serialized_after: serialize_node(node2),
67
+ attributes_before: extract_attributes(node1),
68
+ attributes_after: extract_attributes(node2),
69
+ }
70
+ end
71
+
72
+ # Build canonical path for a node
73
+ #
74
+ # @param node [Object] Node to build path for
75
+ # @return [String, nil] Canonical path with ordinal indices
76
+ def build_path_for_node(node)
77
+ return nil if node.nil?
78
+
79
+ Canon::Diff::PathBuilder.build(node, format: :document)
80
+ end
81
+
82
+ # Serialize a node to string for display
83
+ #
84
+ # @param node [Object, nil] Node to serialize
85
+ # @return [String, nil] Serialized content
86
+ def serialize_node(node)
87
+ return nil if node.nil?
88
+
89
+ # Canon::Xml::Node types
90
+ if node.is_a?(Canon::Xml::Nodes::RootNode)
91
+ # Serialize all children of root
92
+ node.children.map { |child| serialize_node(child) }.join
93
+ elsif node.is_a?(Canon::Xml::Nodes::ElementNode)
94
+ serialize_element_node(node)
95
+ elsif node.is_a?(Canon::Xml::Nodes::TextNode)
96
+ node.value
97
+ elsif node.is_a?(Canon::Xml::Nodes::CommentNode)
98
+ "<!--#{node.value}-->"
99
+ elsif node.is_a?(Canon::Xml::Nodes::ProcessingInstructionNode)
100
+ "<?#{node.target} #{node.data}?>"
101
+ elsif node.respond_to?(:to_xml)
102
+ node.to_xml
103
+ elsif node.respond_to?(:to_html)
104
+ node.to_html
105
+ else
106
+ node.to_s
107
+ end
108
+ end
109
+
110
+ # Extract attributes from a node
111
+ #
112
+ # @param node [Object, nil] Node to extract attributes from
113
+ # @return [Hash, nil] Hash of attribute name => value pairs
114
+ def extract_attributes(node)
115
+ return nil if node.nil?
116
+
117
+ # Canon::Xml::Node ElementNode
118
+ if node.is_a?(Canon::Xml::Nodes::ElementNode)
119
+ node.attribute_nodes.each_with_object({}) do |attr, hash|
120
+ hash[attr.name] = attr.value
121
+ end
122
+ # Nokogiri nodes
123
+ elsif node.respond_to?(:attributes)
124
+ node.attributes.each_with_object({}) do |(_, attr), hash|
125
+ hash[attr.name] = attr.value
126
+ end
127
+ else
128
+ {}
129
+ end
130
+ end
131
+
132
+ # Filter children based on options
133
+ #
134
+ # Removes nodes that should be excluded from comparison based on
135
+ # options like :ignore_nodes, :ignore_comments, etc.
136
+ #
137
+ # @param children [Array] Array of child nodes
138
+ # @param opts [Hash] Comparison options
139
+ # @return [Array] Filtered array of children
140
+ def filter_children(children, opts)
141
+ children.reject do |child|
142
+ node_excluded?(child, opts)
143
+ end
144
+ end
145
+
146
+ # Check if node should be excluded from comparison
147
+ #
148
+ # @param node [Object] Node to check
149
+ # @param opts [Hash] Comparison options
150
+ # @return [Boolean] true if node should be excluded
151
+ def node_excluded?(node, opts)
152
+ return false if node.nil?
153
+ return true if opts[:ignore_nodes]&.include?(node)
154
+ return true if opts[:ignore_comments] && comment_node?(node)
155
+ return true if opts[:ignore_text_nodes] && text_node?(node)
156
+
157
+ # Check structural_whitespace match option
158
+ match_opts = opts[:match_opts]
159
+ # Filter out whitespace-only text nodes
160
+ if match_opts && %i[ignore
161
+ normalize].include?(match_opts[:structural_whitespace]) && text_node?(node)
162
+ text = node_text(node)
163
+ return true if MatchOptions.normalize_text(text).empty?
164
+ end
165
+
166
+ false
167
+ end
168
+
169
+ # Check if two nodes are the same type
170
+ #
171
+ # @param node1 [Object] First node
172
+ # @param node2 [Object] Second node
173
+ # @return [Boolean] true if nodes are same type
174
+ def same_node_type?(node1, node2)
175
+ return false if node1.class != node2.class
176
+
177
+ # For Nokogiri/Canon::Xml nodes, check node type
178
+ if node1.respond_to?(:node_type) && node2.respond_to?(:node_type)
179
+ node1.node_type == node2.node_type
180
+ else
181
+ true
182
+ end
183
+ end
184
+
185
+ # Check if a node is a comment node
186
+ #
187
+ # @param node [Object] Node to check
188
+ # @return [Boolean] true if node is a comment
189
+ def comment_node?(node)
190
+ node.respond_to?(:comment?) && node.comment? ||
191
+ node.respond_to?(:node_type) && node.node_type == :comment
192
+ end
193
+
194
+ # Check if a node is a text node
195
+ #
196
+ # @param node [Object] Node to check
197
+ # @return [Boolean] true if node is a text node
198
+ def text_node?(node)
199
+ node.respond_to?(:text?) && node.text? &&
200
+ !node.respond_to?(:element?) ||
201
+ node.respond_to?(:node_type) && node.node_type == :text
202
+ end
203
+
204
+ # Get text content from a node
205
+ #
206
+ # @param node [Object] Node to get text from
207
+ # @return [String] Text content
208
+ def node_text(node)
209
+ # Canon::Xml::Node TextNode uses .value
210
+ if node.respond_to?(:value)
211
+ node.value.to_s
212
+ # Nokogiri nodes use .content
213
+ elsif node.respond_to?(:content)
214
+ node.content.to_s
215
+ else
216
+ node.to_s
217
+ end
218
+ end
219
+
220
+ # Check if difference between two texts is only whitespace
221
+ #
222
+ # @param text1 [String] First text
223
+ # @param text2 [String] Second text
224
+ # @return [Boolean] true if difference is only in whitespace
225
+ def whitespace_only_difference?(text1, text2)
226
+ # Normalize both texts (collapse/trim whitespace)
227
+ norm1 = MatchOptions.normalize_text(text1)
228
+ norm2 = MatchOptions.normalize_text(text2)
229
+
230
+ # If normalized texts are the same, the difference was only whitespace
231
+ norm1 == norm2
232
+ end
233
+
234
+ # Build a human-readable reason for a difference
235
+ #
236
+ # @param node1 [Object, nil] First node
237
+ # @param node2 [Object, nil] Second node
238
+ # @param diff1 [Symbol] Difference type for node1
239
+ # @param diff2 [Symbol] Difference type for node2
240
+ # @param dimension [Symbol] The dimension of the difference
241
+ # @return [String] Human-readable reason
242
+ def build_difference_reason(_node1, _node2, diff1, diff2, dimension)
243
+ # Default reason - can be overridden in subclasses
244
+ "Difference in #{dimension}: #{diff1} vs #{diff2}"
245
+ end
246
+
247
+ # Serialize an element node to string
248
+ #
249
+ # @param node [Canon::Xml::Nodes::ElementNode] Element node
250
+ # @return [String] Serialized element
251
+ def serialize_element_node(node)
252
+ attrs = node.attribute_nodes.map do |a|
253
+ " #{a.name}=\"#{a.value}\""
254
+ end.join
255
+ children_xml = node.children.map { |c| serialize_node(c) }.join
256
+
257
+ if children_xml.empty?
258
+ "<#{node.name}#{attrs}/>"
259
+ else
260
+ "<#{node.name}#{attrs}>#{children_xml}</#{node.name}>"
261
+ end
262
+ end
263
+
264
+ # Determine the appropriate dimension for a node type
265
+ #
266
+ # @param node [Object] The node to check
267
+ # @return [Symbol] The dimension symbol
268
+ def determine_node_dimension(node)
269
+ # Canon::Xml::Node types
270
+ if node.respond_to?(:node_type) && node.node_type.is_a?(Symbol)
271
+ case node.node_type
272
+ when :comment then :comments
273
+ when :text, :cdata then :text_content
274
+ when :processing_instruction then :processing_instructions
275
+ else :text_content
276
+ end
277
+ # Moxml/Nokogiri types
278
+ elsif node.respond_to?(:comment?) && node.comment?
279
+ :comments
280
+ elsif node.respond_to?(:text?) && node.text?
281
+ :text_content
282
+ elsif node.respond_to?(:cdata?) && node.cdata?
283
+ :text_content
284
+ elsif node.respond_to?(:processing_instruction?) && node.processing_instruction?
285
+ :processing_instructions
286
+ else
287
+ :text_content
288
+ end
289
+ end
290
+ end
291
+ end
292
+ end
293
+ end