canon 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,450 @@
1
+ # frozen_string_literal: true
2
+
3
module Canon
  module TreeDiff
    module Core
      # TreeNode represents a node in a semantic tree structure.
      #
      # This is the fundamental data structure for tree-based diffing,
      # supporting both XML and JSON trees in a format-agnostic way.
      #
      # Key features:
      # - Label: Node name/key (e.g., element name, object key)
      # - Value: Leaf node content (text, number, boolean, etc.)
      # - Children: Ordered list of child nodes
      # - Parent: Reference to parent node (nil for root)
      # - Attributes: Key-value metadata (e.g., XML attributes)
      # - Signature: Computed path-based identifier (XDiff-style)
      # - Weight: Subtree size metric (XyDiff-style)
      # - XID: External identifier for matching (e.g., XML id attribute)
      #
      # This class only stores +signature+ and +weight+ and resets them to nil
      # when the subtree changes through add_child / remove_child /
      # replace_child; it never computes them itself.
      # NOTE(review): assigning +children=+ or +parent=+ directly bypasses
      # both the parent back-reference bookkeeping and the cache reset.
      class TreeNode
        attr_accessor :label, :value, :children, :parent, :attributes,
                      :signature, :weight, :xid, :source_node
        attr_reader :metadata

        # Initialize a new TreeNode
        #
        # Every node passed in +children+ gets its +parent+ pointed at the
        # new node.
        #
        # @param label [String] Node name/key
        # @param value [String, Numeric, Boolean, nil] Leaf value
        # @param children [Array<TreeNode>] Child nodes
        # @param parent [TreeNode, nil] Parent node
        # @param attributes [Hash] Node attributes
        # @param xid [String, nil] External identifier
        # @param source_node [Object, nil] Original source node (e.g., Nokogiri node)
        def initialize(label:, value: nil, children: [], parent: nil,
                       attributes: {}, xid: nil, source_node: nil)
          @label = label
          @value = value
          @children = children
          @parent = parent
          @attributes = attributes
          @xid = xid
          @source_node = source_node
          @metadata = {}

          # Set this node as parent for all children
          @children.each { |child| child.parent = self }

          # Cached values; filled in from outside, cleared by invalidate_cache
          @signature = nil
          @weight = nil
        end

        # Check if this is a leaf node (no children)
        #
        # @return [Boolean]
        def leaf?
          children.empty?
        end

        # Check if this is a text node (leaf with a non-nil value)
        #
        # @return [Boolean]
        def text?
          leaf? && !value.nil?
        end

        # Check if this is an element node (has children or attributes)
        #
        # @return [Boolean]
        def element?
          !leaf? || !attributes.empty?
        end

        # Get the root node of this tree
        #
        # @return [TreeNode]
        def root
          node = self
          node = node.parent while node.parent
          node
        end

        # Get all ancestor nodes, ordered from the direct parent up to the root
        #
        # @return [Array<TreeNode>]
        def ancestors
          result = []
          node = parent
          while node
            result << node
            node = node.parent
          end
          result
        end

        # Get all descendant nodes (depth-first, pre-order)
        #
        # @return [Array<TreeNode>]
        def descendants
          result = []
          children.each do |child|
            result << child
            result.concat(child.descendants)
          end
          result
        end

        # Get sibling nodes (nodes with same parent, excluding this node)
        #
        # @return [Array<TreeNode>]
        def siblings
          return [] unless parent

          parent.children.reject { |child| child == self }
        end

        # Get left siblings (siblings before this node)
        #
        # @return [Array<TreeNode>] Empty if there is no parent or this node
        #   is not found among the parent's children
        def left_siblings
          return [] unless parent

          index = parent.children.index(self)
          return [] unless index

          parent.children[0...index]
        end

        # Get right siblings (siblings after this node)
        #
        # @return [Array<TreeNode>] Empty if there is no parent or this node
        #   is not found among the parent's children
        def right_siblings
          return [] unless parent

          index = parent.children.index(self)
          return [] unless index

          parent.children[(index + 1)..]
        end

        # Get the position of this node among its siblings
        #
        # @return [Integer, nil] 0-based index, or nil if no parent
        def position
          return nil unless parent

          parent.children.index(self)
        end

        # Get depth of this node (distance from root; the root has depth 0)
        #
        # @return [Integer]
        def depth
          ancestors.size
        end

        # Get height of this node (max distance to any leaf; a leaf has height 0)
        #
        # @return [Integer]
        def height
          return 0 if leaf?

          1 + children.map(&:height).max
        end

        # Get the size of subtree rooted at this node (this node included)
        #
        # @return [Integer]
        def size
          1 + children.sum(&:size)
        end

        # Add a child node
        #
        # The child's +parent+ is set to this node. The child is not removed
        # from any previous parent's children list.
        #
        # @param child [TreeNode] Child to add
        # @param position [Integer, nil] Optional position to insert at
        # @return [TreeNode] The added child
        def add_child(child, position: nil)
          child.parent = self

          if position
            children.insert(position, child)
          else
            children << child
          end

          # Invalidate cached computations
          invalidate_cache

          child
        end

        # Remove a child node
        #
        # @param child [TreeNode] Child to remove
        # @return [TreeNode, nil] The removed child, or nil if not found
        def remove_child(child)
          removed = children.delete(child)
          removed&.parent = nil

          # Invalidate cached computations
          invalidate_cache if removed

          removed
        end

        # Replace a child node with another
        #
        # @param old_child [TreeNode] Child to replace
        # @param new_child [TreeNode] New child
        # @return [TreeNode, nil] The replaced child, or nil if not found
        def replace_child(old_child, new_child)
          index = children.index(old_child)
          return nil unless index

          old_child.parent = nil
          new_child.parent = self
          children[index] = new_child

          # Invalidate cached computations
          invalidate_cache

          old_child
        end

        # Check if two nodes match exactly
        #
        # Exact match requires:
        # - Same label
        # - Same value (for text nodes)
        # - Same attributes (key-value pairs)
        # - Same number of children with same labels
        #
        # Only the direct children's labels are compared; deeper levels of
        # the subtrees are not inspected.
        #
        # @param other [TreeNode] Node to compare with
        # @return [Boolean]
        def matches?(other)
          return false unless other.is_a?(TreeNode)
          return false unless label == other.label
          return false unless value == other.value
          return false unless attributes == other.attributes
          return false unless children.size == other.children.size

          # Check children have same labels
          children.map(&:label) == other.children.map(&:label)
        end

        # Calculate similarity score with another node
        #
        # Uses Jaccard index on combined content:
        # - Label
        # - Value
        # - Attribute keys and values
        # - Child labels
        #
        # @param other [TreeNode] Node to compare with
        # @return [Float] Similarity score 0.0 to 1.0
        def similarity_to(other)
          return 0.0 unless other.is_a?(TreeNode)

          # Extract comparable elements
          set1 = content_set
          set2 = other.content_set

          # Jaccard index: |intersection| / |union|
          # (guard against 0/0 when neither node has any content)
          return 0.0 if set1.empty? && set2.empty?

          intersection = (set1 & set2).size.to_f
          union = (set1 | set2).size.to_f

          intersection / union
        end

        # Calculate semantic distance to another node
        #
        # Semantic distance considers:
        # - Depth difference (structural distance), weight 0.3
        # - Content dissimilarity (1 - similarity_to), weight 0.5
        # - Attribute differences, weight 0.2
        #
        # @param other [TreeNode] Node to compare with
        # @return [Float] Distance metric (0 = identical); Float::INFINITY
        #   when +other+ is not a TreeNode
        def semantic_distance_to(other)
          return Float::INFINITY unless other.is_a?(TreeNode)

          # Component 1: Depth difference (structural)
          depth_diff = (depth - other.depth).abs.to_f

          # Component 2: Content dissimilarity
          content_diff = 1.0 - similarity_to(other)

          # Component 3: Attribute differences
          attr_diff = attribute_difference(other)

          # Weighted combination
          depth_diff * 0.3 + content_diff * 0.5 + attr_diff * 0.2
        end

        # Get content as a set for similarity calculation
        #
        # @return [Set<String>]
        def content_set
          result = Set.new

          # Add label
          result << "label:#{label}" if label

          # Add value. Compare against nil explicitly so that a literal
          # `false` leaf value still contributes, consistent with #text?.
          result << "value:#{value}" unless value.nil?

          # Add attributes
          attributes.each do |key, val|
            result << "attr:#{key}=#{val}"
          end

          # Add child labels
          children.each do |child|
            result << "child:#{child.label}"
          end

          result
        end

        # Calculate attribute difference with another node
        #
        # The score is the fraction of keys (from the union of both nodes'
        # attribute keys) whose values differ. A key missing on one side
        # reads as nil on that side.
        #
        # @param other [TreeNode] Node to compare with
        # @return [Float] Difference score 0.0 to 1.0
        def attribute_difference(other)
          keys1 = Set.new(attributes.keys)
          keys2 = Set.new(other.attributes.keys)

          all_keys = keys1 | keys2
          return 0.0 if all_keys.empty?

          diff_count = all_keys.count do |key|
            attributes[key] != other.attributes[key]
          end

          diff_count.to_f / all_keys.size
        end

        # Get XPath for this node
        #
        # @return [String] XPath expression
        def xpath
          # If we have a source node that responds to #path, delegate to it
          return @source_node.path if @source_node.respond_to?(:path)

          # Otherwise construct path from tree structure
          construct_path
        end

        # Construct path from tree structure
        #
        # Every non-root segment carries a 1-based index counted among the
        # siblings that share the same label, e.g. "/root/item[2]".
        #
        # @return [String] Path expression
        def construct_path
          segments = []
          node = self

          while node
            if node.parent
              # Get position among siblings with same label
              same_label = node.parent.children.select do |c|
                c.label == node.label
              end
              ordinal = same_label.index(node) + 1 # 1-based indexing for XPath

              # Always include index for clarity and precision
              segments.unshift("#{node.label}[#{ordinal}]")
            else
              segments.unshift(node.label)
            end

            node = node.parent
          end

          "/#{segments.join('/')}"
        end

        # Deep clone this node and its subtree
        #
        # The clone has no parent, a shallow copy of +attributes+, and the
        # same +source_node+ reference. +metadata+, +signature+ and +weight+
        # are not carried over (the clone starts with fresh defaults).
        #
        # @return [TreeNode]
        def deep_clone
          cloned_children = children.map(&:deep_clone)

          TreeNode.new(
            label: label,
            value: value,
            children: cloned_children,
            parent: nil,
            attributes: attributes.dup,
            xid: xid,
            source_node: source_node, # Preserve source node reference
          )
        end

        # Convert to hash representation
        #
        # +signature+ and +weight+ are only included when they are set.
        #
        # @return [Hash]
        def to_h
          result = {
            label: label,
            value: value,
            attributes: attributes,
            xid: xid,
            children: children.map(&:to_h),
          }

          result[:signature] = signature if signature
          result[:weight] = weight if weight

          result
        end

        # String representation for debugging
        #
        # @return [String]
        def inspect
          attrs = []
          attrs << "label=#{label.inspect}"
          # nil check (not truthiness) so a `false` value is still shown
          attrs << "value=#{value.inspect}" unless value.nil?
          attrs << "xid=#{xid.inspect}" if xid
          attrs << "children=#{children.size}" unless children.empty?
          attrs << "attributes=#{attributes.size}" unless attributes.empty?

          "#<TreeNode #{attrs.join(' ')}>"
        end

        alias to_s inspect

        private

        # Invalidate cached computations on this node and every ancestor
        def invalidate_cache
          @signature = nil
          @weight = nil

          # Propagate upward (send: the method is private on the parent too)
          parent&.send(:invalidate_cache)
        end
      end
    end
  end
end
@@ -0,0 +1,258 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../core/tree_node"
4
+ require_relative "../core/node_signature"
5
+ require_relative "../core/node_weight"
6
+ require_relative "../core/matching"
7
+ require_relative "../core/attribute_comparator"
8
+
9
module Canon
  module TreeDiff
    module Matchers
      # HashMatcher performs fast exact subtree matching
      #
      # Based on XyDiff/Cobena (2002, INRIA) BULD algorithm:
      # - Build signature map for tree1
      # - Process nodes by weight (heaviest first)
      # - Match identical subtrees via signature lookup
      # - Propagate matches to ancestors
      #
      # Design target (per the original note): O(n log n) where n is the
      # number of nodes.
      # NOTE(review): each candidate is additionally verified with a
      # recursive subtree comparison, so the real bound depends on how many
      # nodes share a signature.
      #
      # Features:
      # - Hash-based candidate lookup by signature
      # - Weight-based prioritization (largest subtrees first)
      # - Automatic ancestor propagation
      # - Handles both element and text nodes
      class HashMatcher
        attr_reader :tree1, :tree2, :matching, :match_options

        # Initialize matcher with two trees
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param options [Hash] Match options (includes text_content, attribute_order, etc.)
        def initialize(tree1, tree2, options = {})
          @tree1 = tree1
          @tree2 = tree2
          @matching = Core::Matching.new
          @signature_map = {}
          @matched_tree1 = Set.new
          @matched_tree2 = Set.new
          @options = options
          @match_options = options # Store full match options for text comparison
          @attribute_comparator = Core::AttributeComparator.new(
            attribute_order: options[:attribute_order] || :ignore,
          )
        end

        # Perform hash-based matching
        #
        # @return [Core::Matching] The resulting matching
        def match
          # Step 1: Build signature map for tree1
          build_signature_map

          # Step 2: Get all nodes from tree2 sorted by weight (heaviest first)
          tree2_nodes = collect_nodes(tree2).sort_by do |node|
            -Core::NodeWeight.for(node).value
          end

          # Step 3: Match nodes from tree2 to tree1 via signatures.
          # A node may already have been matched through ancestor propagation.
          tree2_nodes.each do |node2|
            next if @matched_tree2.include?(node2)

            match_node(node2)
          end

          @matching
        end

        private

        # Build signature map for tree1
        #
        # Maps signatures to arrays of nodes (multiple nodes can share signature)
        def build_signature_map
          collect_nodes(tree1).each do |node|
            sig = Core::NodeSignature.for(node)
            (@signature_map[sig] ||= []) << node
          end
        end

        # Collect all nodes from a tree (root first, then its descendants)
        #
        # @param root [TreeNode] Root of tree
        # @return [Array<TreeNode>]
        def collect_nodes(root)
          [root, *root.descendants]
        end

        # Try to match a node from tree2 to tree1
        #
        # On success the pair is recorded in the matching and the match is
        # propagated to the nodes' ancestors.
        #
        # @param node2 [TreeNode] Node from tree2
        def match_node(node2)
          sig2 = Core::NodeSignature.for(node2)

          # Find candidate nodes in tree1 with same signature
          candidates = @signature_map[sig2] || []

          # Filter to unmatched candidates
          candidates = candidates.reject { |n| @matched_tree1.include?(n) }
          return if candidates.empty?

          # Find best match among candidates
          best_match = find_best_match(node2, candidates)
          return unless best_match

          # Add match if it satisfies constraints
          return unless @matching.add(best_match, node2)

          @matched_tree1 << best_match
          @matched_tree2 << node2

          # Try to propagate match to ancestors
          propagate_to_ancestors(best_match, node2)
        end

        # Find best match among candidates
        #
        # For exact matching, we need:
        # 1. Same signature (already filtered)
        # 2. Matching subtrees (same structure and values)
        #
        # @param node2 [TreeNode] Node from tree2
        # @param candidates [Array<TreeNode>] Candidate nodes from tree1
        # @return [TreeNode, nil] First candidate whose subtree matches
        def find_best_match(node2, candidates)
          candidates.find { |node1| subtrees_match?(node1, node2) }
        end

        # Check if two subtrees match exactly
        #
        # Children are compared pairwise in order.
        #
        # @param node1 [TreeNode] Node from tree1
        # @param node2 [TreeNode] Node from tree2
        # @return [Boolean]
        def subtrees_match?(node1, node2)
          # Check root nodes match
          return false unless nodes_match?(node1, node2)

          # Check children count
          return false unless node1.children.size == node2.children.size

          # Check each child subtree matches
          node1.children.zip(node2.children).all? do |child1, child2|
            subtrees_match?(child1, child2)
          end
        end

        # Check if two nodes match (not including subtrees)
        #
        # Uses normalized text comparison based on match_options.
        #
        # @param node1 [TreeNode] Node from tree1
        # @param node2 [TreeNode] Node from tree2
        # @return [Boolean]
        def nodes_match?(node1, node2)
          return false unless node1.label == node2.label

          # Text is compared through text_equivalent?, not raw ==
          return false unless text_equivalent?(node1, node2)

          @attribute_comparator.equal?(node1.attributes, node2.attributes)
        end

        # Check if text values are equivalent according to match options
        #
        # Rules, in order:
        # 1. Both values blank (nil or empty) => equivalent
        # 2. Exactly one value blank => not equivalent
        # 3. Both values whitespace-only => equivalent (in every mode)
        # 4. :strict mode compares the raw values; any other mode compares
        #    the whitespace-normalized text
        #
        # Values are not assumed to be Strings: TreeNode values may be
        # numbers or booleans, which do not respond to #empty? / #gsub.
        #
        # @param node1 [TreeNode] First node
        # @param node2 [TreeNode] Second node
        # @return [Boolean] True if text values are equivalent
        def text_equivalent?(node1, node2)
          text1 = node1.value
          text2 = node2.value

          blank1 = blank_text?(text1)
          blank2 = blank_text?(text2)
          return true if blank1 && blank2
          return false if blank1 || blank2

          # If both normalize to empty (whitespace-only), treat as equivalent
          norm1 = normalize_text(text1)
          norm2 = normalize_text(text2)
          return true if norm1.empty? && norm2.empty?

          # Apply normalization based on match_options
          text_content_mode = @match_options[:text_content] || :normalize

          # Every mode other than :strict falls back to normalized comparison
          text_content_mode == :strict ? text1 == text2 : norm1 == norm2
        end

        # Check whether a node value counts as "no text"
        #
        # @param text [Object, nil] Node value
        # @return [Boolean] True for nil and for values that report empty?
        def blank_text?(text)
          text.nil? || (text.respond_to?(:empty?) && text.empty?)
        end

        # Normalize text for comparison
        #
        # Collapses every whitespace run to a single space and strips the
        # ends. Non-String values are converted with #to_s first.
        #
        # @param text [String, Numeric, Boolean, nil] Text to normalize
        # @return [String] Normalized text
        def normalize_text(text)
          return "" if blank_text?(text)

          text.to_s.gsub(/\s+/, " ").strip
        end

        # Propagate match to ancestors if possible
        #
        # The parents are matched when all of the following hold:
        # - Both nodes have a parent
        # - Neither parent is matched yet
        # - Parents have the same signature
        # - Parents match as single nodes (label, text, attributes)
        # - The matching accepts the pair
        # Propagation then continues upward from the parents.
        #
        # @param node1 [TreeNode] Matched node from tree1
        # @param node2 [TreeNode] Matched node from tree2
        def propagate_to_ancestors(node1, node2)
          parent1 = node1.parent
          parent2 = node2.parent

          return unless parent1 && parent2
          return if @matched_tree1.include?(parent1)
          return if @matched_tree2.include?(parent2)

          # Check if parents have same signature
          sig1 = Core::NodeSignature.for(parent1)
          sig2 = Core::NodeSignature.for(parent2)
          return unless sig1 == sig2

          # Check if parents match as nodes
          return unless nodes_match?(parent1, parent2)

          # Try to match parents
          return unless @matching.add(parent1, parent2)

          @matched_tree1 << parent1
          @matched_tree2 << parent2

          # Recursively propagate upward
          propagate_to_ancestors(parent1, parent2)
        end
      end
    end
  end
end