canon 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module TreeDiff
    module Core
      # AttributeComparator compares node attribute hashes while honouring
      # the +attribute_order+ match option.
      #
      # Key responsibilities:
      # - Compare attributes with configurable order sensitivity
      # - Provide a normalized hash for hash-based matching algorithms
      # - Support both strict and normalized comparison modes
      #
      # @example Order-insensitive comparison
      #   comparator = AttributeComparator.new(attribute_order: :ignore)
      #   attrs1 = {class: "TOC", id: "_"}
      #   attrs2 = {id: "_", class: "TOC"}
      #   comparator.equal?(attrs1, attrs2) # => true
      #
      # @example Strict comparison
      #   comparator = AttributeComparator.new(attribute_order: :strict)
      #   comparator.equal?({a: 1, b: 2}, {b: 2, a: 1}) # => false
      class AttributeComparator
        attr_reader :attribute_order

        # Initialize comparator with match options
        #
        # @param attribute_order [Symbol] :strict makes attribute order
        #   significant; any other value (e.g. :ignore, :normalize) makes
        #   the comparison order-independent
        def initialize(attribute_order: :strict)
          @attribute_order = attribute_order
        end

        # Compare two attribute hashes for equality
        #
        # Note that this two-argument method shadows the one-argument
        # identity check Object#equal? on instances of this class.
        #
        # @param attrs1 [Hash, nil] First attribute hash
        # @param attrs2 [Hash, nil] Second attribute hash
        # @return [Boolean] True if attributes are considered equal
        def equal?(attrs1, attrs2)
          # Two missing attribute sets are equal; one missing is not
          return true if attrs1.nil? && attrs2.nil?
          return false if attrs1.nil? || attrs2.nil?

          attrs1 = attrs1.to_h if attrs1.respond_to?(:to_h)
          attrs2 = attrs2.to_h if attrs2.respond_to?(:to_h)

          if attribute_order == :strict
            # Hash#== ignores insertion order, so the key sequence has to be
            # compared explicitly for order to matter
            attrs1 == attrs2 && same_key_order?(attrs1, attrs2)
          else
            normalize_for_comparison(attrs1) == normalize_for_comparison(attrs2)
          end
        end

        # Generate a comparison hash for attribute matching
        #
        # This is used by hash-based matchers to ensure nodes with
        # equivalent attributes (according to match options) get the
        # same hash value.
        #
        # @param attrs [Hash, nil] Attribute hash
        # @return [Hash] +attrs+ itself in strict mode, a key-sorted copy
        #   otherwise, or an empty hash when +attrs+ is nil or empty
        def comparison_hash(attrs)
          return {} if attrs.nil? || attrs.empty?

          if attribute_order == :strict
            attrs
          else
            normalize_for_comparison(attrs)
          end
        end

        private

        # Check that both attribute sets list their keys in the same order
        #
        # Objects that do not expose +keys+ cannot carry an ordering, so
        # they are treated as trivially ordered.
        #
        # @param attrs1 [Object] First attribute collection
        # @param attrs2 [Object] Second attribute collection
        # @return [Boolean]
        def same_key_order?(attrs1, attrs2)
          return true unless attrs1.respond_to?(:keys) && attrs2.respond_to?(:keys)

          attrs1.keys == attrs2.keys
        end

        # Normalize attributes for order-independent comparison
        #
        # Keys are ordered by their string form so that hashes mixing
        # Symbol and String keys do not raise; the class name is a
        # tie-breaker that keeps the order deterministic when :a and "a"
        # are both present.
        #
        # @param attrs [Hash] Attribute hash
        # @return [Hash] Attribute hash with keys in sorted order
        def normalize_for_comparison(attrs)
          attrs.sort_by { |key, _value| [key.to_s, key.class.name.to_s] }.to_h
        end
      end
    end
  end
end
@@ -0,0 +1,241 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module TreeDiff
    module Core
      # Matching stores and manages node pair matches
      #
      # A matching is a set of pairs (n1, n2) where:
      # 1. One-to-one: Each node appears in at most one pair
      # 2. Prefix closure: If (n1, n2) matched, ancestors can match
      #
      # Features:
      # - Lookup: hash-backed check of whether a node is matched
      # - Validation: Ensures constraints are maintained
      # - Iteration: Enumerable over all pairs via #each
      #
      # Nodes only need to respond to +ancestors+ (and +label+ for #inspect).
      class Matching
        include Enumerable

        attr_reader :pairs

        # Initialize empty matching
        def initialize
          @pairs = []
          @tree1_map = {} # tree 1 node => matched tree 2 node
          @tree2_map = {} # tree 2 node => matched tree 1 node
        end

        # Add a matched pair
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean] true if added, false if violates constraints
        def add(node1, node2)
          return false unless valid_pair?(node1, node2)

          @pairs << [node1, node2]
          @tree1_map[node1] = node2
          @tree2_map[node2] = node1

          true
        end

        # Remove a matched pair
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean] true if removed, false if not found
        def remove(node1, node2)
          removed = @pairs.delete([node1, node2])
          return false unless removed

          @tree1_map.delete(node1)
          @tree2_map.delete(node2)

          true
        end

        # Check if a node from tree 1 is matched
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean]
        def matched1?(node)
          @tree1_map.key?(node)
        end

        # Check if a node from tree 2 is matched
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean]
        def matched2?(node)
          @tree2_map.key?(node)
        end

        # Get the match for a node from tree 1
        #
        # @param node [TreeNode] Node from tree 1
        # @return [TreeNode, nil] Matched node from tree 2, or nil
        def match_for1(node)
          @tree1_map[node]
        end

        # Get the match for a node from tree 2
        #
        # @param node [TreeNode] Node from tree 2
        # @return [TreeNode, nil] Matched node from tree 1, or nil
        def match_for2(node)
          @tree2_map[node]
        end

        # Get all unmatched nodes from tree 1
        #
        # @param nodes [Array<TreeNode>] All nodes from tree 1
        # @return [Array<TreeNode>]
        def unmatched1(nodes)
          nodes.reject { |node| matched1?(node) }
        end

        # Get all unmatched nodes from tree 2
        #
        # @param nodes [Array<TreeNode>] All nodes from tree 2
        # @return [Array<TreeNode>]
        def unmatched2(nodes)
          nodes.reject { |node| matched2?(node) }
        end

        # Get number of matched pairs
        #
        # @return [Integer]
        def size
          @pairs.size
        end

        # Check if matching is empty
        #
        # @return [Boolean]
        def empty?
          @pairs.empty?
        end

        # Iterate over all pairs
        #
        # This is also the method Enumerable builds on.
        #
        # @yield [node1, node2]
        def each(&block)
          @pairs.each(&block)
        end

        # Check if matching satisfies all constraints
        #
        # @return [Boolean]
        def valid?
          one_to_one? && prefix_closure?
        end

        # Check one-to-one constraint
        #
        # Each node appears in at most one pair
        #
        # @return [Boolean]
        def one_to_one?
          # Neither map may send two keys to the same partner
          return false unless distinct_values?(@tree1_map)
          return false unless distinct_values?(@tree2_map)

          # Both maps must describe the same pairing
          @tree1_map.all? { |n1, n2| @tree2_map[n2] == n1 }
        end

        # Check prefix closure constraint
        #
        # For every pair (n1, n2), ancestors are compared position by
        # position. If the ancestor on either side is matched, it must be
        # matched to the ancestor at the same position on the other side.
        # Both directions are checked, mirroring what #add enforces, because
        # a pair added later can invalidate descendants matched earlier.
        #
        # @return [Boolean]
        def prefix_closure?
          @pairs.all? do |node1, node2|
            # Fetch once per pair rather than once per ancestor
            ancestors2 = node2.ancestors

            node1.ancestors.each_with_index.all? do |anc1, idx|
              anc2 = ancestors2[idx] # nil when node1 has more ancestors than node2

              next false if matched1?(anc1) && match_for1(anc1) != anc2
              next false if !anc2.nil? && matched2?(anc2) && match_for2(anc2) != anc1

              true
            end
          end
        end

        # Convert to array of pairs
        #
        # @return [Array<Array<TreeNode, TreeNode>>] A shallow copy, so
        #   callers cannot mutate the internal pair list through it
        def to_a
          @pairs.dup
        end

        # String representation
        #
        # @return [String]
        def to_s
          "#<Matching #{size} pairs>"
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          pairs_str = @pairs.map do |n1, n2|
            "(#{n1.label} ↔ #{n2.label})"
          end.join(", ")

          "#<Matching [#{pairs_str}]>"
        end

        private

        # Check that no two keys of a map share the same value
        #
        # @param map [Hash] One of the two lookup maps
        # @return [Boolean]
        def distinct_values?(map)
          values = map.values
          values.size == values.uniq.size
        end

        # Check if a pair can be added without violating constraints
        #
        # NOTE(review): a pair is rejected whenever node1 has more ancestors
        # than node2, but not the other way round — confirm this asymmetry
        # is intended before relying on it.
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean]
        def valid_pair?(node1, node2)
          # One-to-one: neither node may already be part of a pair
          return false if matched1?(node1) || matched2?(node2)

          # Prefix closure: matched ancestors must be matched to each other.
          # The tree 2 ancestor list is loop-invariant, so fetch it once.
          ancestors2 = node2.ancestors

          node1.ancestors.each_with_index do |anc1, idx|
            return false if idx >= ancestors2.size

            anc2 = ancestors2[idx]

            # If anc1 is matched, it must be matched to anc2
            return false if matched1?(anc1) && match_for1(anc1) != anc2

            # If anc2 is matched, it must be matched to anc1
            return false if matched2?(anc2) && match_for2(anc2) != anc1
          end

          true
        end
      end
    end
  end
end
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module TreeDiff
    module Core
      # NodeSignature computes path-based signatures for tree nodes
      #
      # Based on XDiff (2002, U. Wisconsin) approach:
      # - Signature is the path from root to node
      # - Format: /ancestor1/ancestor2/.../node
      # - Used for fast exact matching via hash lookup
      #
      # Features:
      # - Deterministic: Same path always produces same signature
      # - Hierarchical: Parent-child relationships encoded
      # - Type-aware: Text nodes are rendered as "#text"
      #
      # Nodes need to respond to +label+, +attributes+, +value+ and
      # +ancestors+; NodeSignature.for additionally uses +signature+ and
      # +signature=+ for caching.
      class NodeSignature
        # HTML elements where whitespace is semantically significant
        WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

        # Path component used for every text node
        TEXT_COMPONENT = "#text"

        # Down-cased labels that identify a text node (a nil label
        # stringifies to "")
        TEXT_LABELS = ["", "#text", "text"].freeze

        private_constant :WHITESPACE_SENSITIVE_TAGS, :TEXT_COMPONENT, :TEXT_LABELS

        attr_reader :path, :signature_string

        # Initialize signature for a node
        #
        # @param node [TreeNode] Node to compute signature for
        # @param include_attributes [Boolean] Whether to include attributes
        #   (and, for whitespace-sensitive elements, the text value)
        def initialize(node, include_attributes: true)
          @node = node
          @include_attributes = include_attributes
          @path = compute_path
          @signature_string = compute_signature_string
        end

        # Compute and cache signature for a node
        #
        # Only the attribute-including signature is cached on the node;
        # loose signatures are rebuilt on every call.
        #
        # @param node [TreeNode] Node to compute signature for
        # @param include_attributes [Boolean] Whether to include attributes in signature
        # @return [NodeSignature]
        def self.for(node, include_attributes: true)
          if include_attributes
            node.signature ||= new(node, include_attributes: true)
          else
            # Don't cache loose signatures
            new(node, include_attributes: false)
          end
        end

        # Check if two signatures are equal
        #
        # @param other [NodeSignature] Signature to compare with
        # @return [Boolean]
        def ==(other)
          return false unless other.is_a?(NodeSignature)

          signature_string == other.signature_string
        end

        alias eql? ==

        # Hash value for use in Hash/Set
        #
        # @return [Integer]
        def hash
          signature_string.hash
        end

        # String representation
        #
        # @return [String]
        def to_s
          signature_string
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          "#<NodeSignature #{signature_string.inspect}>"
        end

        private

        # Compute the path from root to this node
        #
        # The ancestor list is reversed before use, so it is presumably
        # ordered nearest-ancestor-first — confirm against TreeNode.
        #
        # @return [Array<String>] Path components, outermost first
        def compute_path
          ancestor_components = @node.ancestors.reverse_each.map do |ancestor|
            path_component(ancestor)
          end

          ancestor_components << path_component(@node)
        end

        # Get path component for a node
        #
        # Text nodes (no label, or label "text"/"#text" in any case) always
        # become "#text". Element nodes use their label, followed by their
        # sorted attributes and, for whitespace-sensitive elements, their
        # text value — both only when attributes were requested.
        #
        # @param node [TreeNode] Node to get component for
        # @return [String]
        def path_component(node)
          label = node.label.to_s
          return TEXT_COMPONENT if TEXT_LABELS.include?(label.downcase)
          return label unless @include_attributes

          "#{label}#{attributes_suffix(node)}#{text_suffix(node)}"
        end

        # Render a node's attributes as "{k1=v1,k2=v2}" in key order
        #
        # Sorting makes the signature independent of attribute order. Keys
        # are ordered by their string form so a mix of Symbol and String
        # keys does not raise; the class name is a deterministic
        # tie-breaker.
        #
        # @param node [TreeNode] Node whose attributes are rendered
        # @return [String] Empty when the node has no attributes
        def attributes_suffix(node)
          attributes = node.attributes
          return "" if attributes.nil? || attributes.empty?

          ordered = attributes.sort_by { |key, _value| [key.to_s, key.class.name.to_s] }
          rendered = ordered.map { |key, value| "#{key}=#{value}" }.join(",")

          "{#{rendered}}"
        end

        # Render the text value of a whitespace-sensitive element
        #
        # The value is passed through #inspect so that whitespace and
        # special characters are visible in the signature, which keeps
        # nodes that differ only in whitespace from matching.
        #
        # @param node [TreeNode] Node whose value is rendered
        # @return [String] Empty unless the node is whitespace-sensitive
        #   and has a value
        def text_suffix(node)
          return "" unless whitespace_sensitive?(node) && node.value

          "[text=#{node.value.inspect}]"
        end

        # Check if a node is in a whitespace-sensitive context
        #
        # HTML elements where whitespace is significant: <pre>, <code>, <textarea>, <script>, <style>
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean] True if node is whitespace-sensitive
        def whitespace_sensitive?(node)
          # nil and false do not respond to :label, so this also covers them
          return false unless node.respond_to?(:label)

          WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
        end

        # Compute signature string from path
        #
        # @return [String]
        def compute_signature_string
          "/#{path.join('/')}"
        end
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module TreeDiff
    module Core
      # NodeWeight computes weights for tree nodes
      #
      # Based on XyDiff/Cobena (2002, INRIA) approach:
      # - Weight reflects subtree size/importance
      # - Element nodes: 1 + Σ(child_weights)
      # - Text nodes: 1 + ln(text_length + 1), or 1 for empty text
      # - Used to prioritize matching (heaviest first)
      #
      # Features:
      # - Hierarchical: Parent weight includes all descendants
      # - Text-aware: Longer text has higher weight
      # - Cached: NodeWeight.for stores the result on the node
      # - Ordered: Comparable by value (<, >, between?, clamp, sorting)
      #
      # Nodes need to respond to +text?+, +value+ and +children+;
      # NodeWeight.for additionally uses +weight+ and +weight=+.
      class NodeWeight
        include Comparable

        attr_reader :value

        # Initialize weight for a node
        #
        # @param node [TreeNode] Node to compute weight for
        def initialize(node)
          @node = node
          @value = compute_weight
        end

        # Compute and cache weight for a node
        #
        # @param node [TreeNode] Node to compute weight for
        # @return [NodeWeight]
        def self.for(node)
          node.weight ||= new(node)
        end

        # Compare weights (for sorting and Comparable)
        #
        # @param other [NodeWeight] Weight to compare with
        # @return [Integer, nil] -1, 0, or 1; nil when +other+ is not a
        #   NodeWeight
        def <=>(other)
          return nil unless other.is_a?(NodeWeight)

          value <=> other.value
        end

        # Check if equal
        #
        # Defined explicitly so that comparing with a non-NodeWeight is
        # simply false.
        #
        # @param other [NodeWeight] Weight to compare with
        # @return [Boolean]
        def ==(other)
          return false unless other.is_a?(NodeWeight)

          value == other.value
        end

        # Numeric value for calculations
        #
        # @return [Float]
        def to_f
          value.to_f
        end

        # Integer value for calculations
        #
        # @return [Integer] Weight truncated to an integer
        def to_i
          value.to_i
        end

        # String representation
        #
        # @return [String]
        def to_s
          value.to_s
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          "#<NodeWeight #{value}>"
        end

        private

        # Compute weight based on node type and structure
        #
        # @return [Float]
        def compute_weight
          @node.text? ? compute_text_weight : compute_element_weight
        end

        # Compute weight for text nodes
        #
        # Formula: 1 + ln(text_length + 1)
        # - Minimum weight is 1.0 (empty text)
        # - Grows logarithmically with text length
        # - Prevents very long text from dominating
        #
        # @return [Float]
        def compute_text_weight
          length = @node.value.to_s.length
          return 1.0 if length.zero?

          # Natural logarithm; the +1 matches the documented formula
          1.0 + Math.log(length + 1)
        end

        # Compute weight for element nodes
        #
        # Formula: 1 + Σ(child_weights)
        # - Each node has base weight of 1
        # - Parent weight includes all descendants
        # - Recursive computation; child weights go through NodeWeight.for
        #   and are therefore cached on the child nodes
        #
        # @return [Float]
        def compute_element_weight
          # An element without children sums to 0 and weighs exactly 1.0
          1.0 + @node.children.sum { |child| self.class.for(child).value }
        end
      end
    end
  end
end