canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,450 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module TreeDiff
|
|
5
|
+
module Core
|
|
6
|
+
# TreeNode represents a node in a semantic tree structure
|
|
7
|
+
#
|
|
8
|
+
# This is the fundamental data structure for tree-based diffing,
|
|
9
|
+
# supporting both XML and JSON trees in a format-agnostic way.
|
|
10
|
+
#
|
|
11
|
+
# Key features:
|
|
12
|
+
# - Label: Node name/key (e.g., element name, object key)
|
|
13
|
+
# - Value: Leaf node content (text, number, boolean, etc.)
|
|
14
|
+
# - Children: Ordered list of child nodes
|
|
15
|
+
# - Parent: Reference to parent node (nil for root)
|
|
16
|
+
# - Attributes: Key-value metadata (e.g., XML attributes)
|
|
17
|
+
# - Signature: Computed path-based identifier (XDiff-style)
|
|
18
|
+
# - Weight: Subtree size metric (XyDiff-style)
|
|
19
|
+
# - XID: External identifier for matching (e.g., XML id attribute)
|
|
20
|
+
class TreeNode
  attr_accessor :label, :value, :children, :parent, :attributes,
                :signature, :weight, :xid, :source_node
  attr_reader :metadata

  # Build a node and adopt the given children.
  #
  # Every element of +children+ has its +parent+ set to the new node.
  # +signature+ and +weight+ start out as nil; this class only ever resets
  # them (see #invalidate_cache), it never computes them itself.
  #
  # @param label [String] Node name/key
  # @param value [String, Numeric, Boolean, nil] Leaf value
  # @param children [Array<TreeNode>] Child nodes
  # @param parent [TreeNode, nil] Parent node
  # @param attributes [Hash] Node attributes
  # @param xid [String, nil] External identifier
  # @param source_node [Object, nil] Original source node (e.g., Nokogiri node)
  def initialize(label:, value: nil, children: [], parent: nil,
                 attributes: {}, xid: nil, source_node: nil)
    @label = label
    @value = value
    @children = children
    @parent = parent
    @attributes = attributes
    @xid = xid
    @source_node = source_node
    @metadata = {}

    # Set this node as parent for all children
    @children.each { |child| child.parent = self }

    # Filled in externally; cleared whenever the child list changes
    @signature = nil
    @weight = nil
  end

  # Check if this is a leaf node (no children)
  #
  # @return [Boolean]
  def leaf?
    children.empty?
  end

  # Check if this is a text node (leaf with a non-nil value)
  #
  # @return [Boolean]
  def text?
    leaf? && !value.nil?
  end

  # Check if this is an element node (has children or attributes)
  #
  # @return [Boolean]
  def element?
    !leaf? || !attributes.empty?
  end

  # Get the root node of this tree by following parent links
  #
  # @return [TreeNode]
  def root
    node = self
    node = node.parent while node.parent
    node
  end

  # Get all ancestor nodes, ordered from the direct parent up to the root
  #
  # @return [Array<TreeNode>]
  def ancestors
    result = []
    node = parent
    while node
      result << node
      node = node.parent
    end
    result
  end

  # Get all descendant nodes in depth-first pre-order
  #
  # Uses an explicit stack and a single result array instead of building
  # and concatenating one intermediate array per tree level.
  #
  # @return [Array<TreeNode>]
  def descendants
    result = []
    # Children are pushed reversed so the first child is popped first.
    pending = children.reverse

    until pending.empty?
      node = pending.pop
      result << node
      pending.concat(node.children.reverse)
    end

    result
  end

  # Get sibling nodes (the parent's other children)
  #
  # @return [Array<TreeNode>] Empty when there is no parent
  def siblings
    return [] unless parent

    parent.children.reject { |child| child == self }
  end

  # Get left siblings (siblings before this node)
  #
  # @return [Array<TreeNode>] Empty when there is no parent or this node
  #   is not found in the parent's child list
  def left_siblings
    return [] unless parent

    index = parent.children.index(self)
    return [] unless index

    parent.children[0...index]
  end

  # Get right siblings (siblings after this node)
  #
  # @return [Array<TreeNode>] Empty when there is no parent or this node
  #   is not found in the parent's child list
  def right_siblings
    return [] unless parent

    index = parent.children.index(self)
    return [] unless index

    parent.children[(index + 1)..]
  end

  # Get the position of this node among its siblings
  #
  # @return [Integer, nil] 0-based index, or nil if no parent
  def position
    return nil unless parent

    parent.children.index(self)
  end

  # Get depth of this node (number of ancestors; 0 for a root)
  #
  # @return [Integer]
  def depth
    ancestors.size
  end

  # Get height of this node (max distance to any leaf; 0 for a leaf)
  #
  # @return [Integer]
  def height
    return 0 if leaf?

    1 + children.map(&:height).max
  end

  # Get the size of the subtree rooted at this node (this node included)
  #
  # @return [Integer]
  def size
    1 + children.sum(&:size)
  end

  # Add a child node
  #
  # Sets the child's parent to this node and clears the cached
  # signature/weight on this node and all of its ancestors.
  #
  # @param child [TreeNode] Child to add
  # @param position [Integer, nil] Optional position to insert at
  # @return [TreeNode] The added child
  def add_child(child, position: nil)
    child.parent = self

    if position
      children.insert(position, child)
    else
      children << child
    end

    invalidate_cache

    child
  end

  # Remove a child node
  #
  # @param child [TreeNode] Child to remove
  # @return [TreeNode, nil] The removed child, or nil if not found
  def remove_child(child)
    removed = children.delete(child)
    removed&.parent = nil

    # Nothing changed when the child was not present
    invalidate_cache if removed

    removed
  end

  # Replace a child node with another, keeping its position
  #
  # @param old_child [TreeNode] Child to replace
  # @param new_child [TreeNode] New child
  # @return [TreeNode, nil] The replaced child, or nil if not found
  def replace_child(old_child, new_child)
    index = children.index(old_child)
    return nil unless index

    old_child.parent = nil
    new_child.parent = self
    children[index] = new_child

    invalidate_cache

    old_child
  end

  # Check if two nodes match exactly
  #
  # Exact match requires:
  # - Same label
  # - Same value
  # - Same attributes (key-value pairs)
  # - Same number of children with the same labels, in the same order
  #
  # Only the children's labels are compared, not their subtrees.
  #
  # @param other [TreeNode] Node to compare with
  # @return [Boolean]
  def matches?(other)
    return false unless other.is_a?(TreeNode)
    return false unless label == other.label
    return false unless value == other.value
    return false unless attributes == other.attributes
    return false unless children.size == other.children.size

    children.map(&:label) == other.children.map(&:label)
  end

  # Calculate similarity score with another node
  #
  # Jaccard index (|intersection| / |union|) over #content_set, i.e. over
  # label, value, attribute key/value pairs and child labels.
  #
  # @param other [TreeNode] Node to compare with
  # @return [Float] Similarity score 0.0 to 1.0; 0.0 when +other+ is not a
  #   TreeNode or when both content sets are empty
  def similarity_to(other)
    return 0.0 unless other.is_a?(TreeNode)

    set1 = content_set
    set2 = other.content_set

    # Avoid 0/0 when neither node carries any content
    return 0.0 if set1.empty? && set2.empty?

    intersection = (set1 & set2).size.to_f
    union = (set1 | set2).size.to_f

    intersection / union
  end

  # Calculate semantic distance to another node
  #
  # Weighted sum of:
  # - Depth difference (weight 0.3)
  # - Content dissimilarity, 1 - #similarity_to (weight 0.5)
  # - Attribute difference, #attribute_difference (weight 0.2)
  #
  # @param other [TreeNode] Node to compare with
  # @return [Float] Distance metric; Float::INFINITY when +other+ is not
  #   a TreeNode
  def semantic_distance_to(other)
    return Float::INFINITY unless other.is_a?(TreeNode)

    depth_diff = (depth - other.depth).abs.to_f
    content_diff = 1.0 - similarity_to(other)
    attr_diff = attribute_difference(other)

    depth_diff * 0.3 + content_diff * 0.5 + attr_diff * 0.2
  end

  # Get content as a set for similarity calculation
  #
  # Entries are prefixed strings: "label:…", "value:…", "attr:key=val"
  # and "child:…" (one per distinct child label).
  #
  # @return [Set<String>]
  def content_set
    result = Set.new

    result << "label:#{label}" if label

    # Explicit nil check so a +false+ leaf value still counts as content,
    # consistent with #text?.
    result << "value:#{value}" unless value.nil?

    attributes.each do |key, val|
      result << "attr:#{key}=#{val}"
    end

    children.each do |child|
      result << "child:#{child.label}"
    end

    result
  end

  # Calculate attribute difference with another node
  #
  # Fraction of attribute keys (over the union of both nodes' keys) whose
  # values differ. A key missing on one side reads as nil there.
  #
  # @param other [TreeNode] Node to compare with
  # @return [Float] Difference score 0.0 to 1.0 (0.0 when neither node
  #   has attributes)
  def attribute_difference(other)
    all_keys = attributes.keys | other.attributes.keys
    return 0.0 if all_keys.empty?

    diff_count = all_keys.count do |key|
      attributes[key] != other.attributes[key]
    end

    diff_count.to_f / all_keys.size
  end

  # Get XPath for this node
  #
  # Delegates to the source node's +path+ when it responds to it;
  # otherwise builds a path from the tree structure.
  #
  # @return [String] XPath expression
  def xpath
    return @source_node.path if @source_node.respond_to?(:path)

    construct_path
  end

  # Construct path from tree structure
  #
  # Each non-root segment is "label[n]" where n is the 1-based position
  # among the parent's children that share the same label; the root
  # segment is the bare label.
  #
  # @return [String] Path expression
  def construct_path
    segments = []
    node = self

    while node
      if node.parent
        same_label = node.parent.children.select do |c|
          c.label == node.label
        end
        ordinal = same_label.index(node) + 1 # 1-based indexing for XPath

        # Always include index for clarity and precision
        segments.unshift("#{node.label}[#{ordinal}]")
      else
        segments.unshift(node.label)
      end

      node = node.parent
    end

    "/#{segments.join('/')}"
  end

  # Deep clone this node and its subtree
  #
  # The clone has no parent, a shallow copy of +attributes+, and the same
  # +source_node+ reference. It is built through TreeNode.new, so its
  # metadata, signature and weight start fresh.
  #
  # @return [TreeNode]
  def deep_clone
    cloned_children = children.map(&:deep_clone)

    TreeNode.new(
      label: label,
      value: value,
      children: cloned_children,
      parent: nil,
      attributes: attributes.dup,
      xid: xid,
      source_node: source_node, # Preserve source node reference
    )
  end

  # Convert to hash representation
  #
  # +:signature+ and +:weight+ are included only when set.
  #
  # @return [Hash]
  def to_h
    result = {
      label: label,
      value: value,
      attributes: attributes,
      xid: xid,
      children: children.map(&:to_h),
    }

    result[:signature] = signature if signature
    result[:weight] = weight if weight

    result
  end

  # String representation for debugging
  #
  # @return [String]
  def inspect
    attrs = []
    attrs << "label=#{label.inspect}"
    # nil check (not truthiness) so a +false+ value is still shown
    attrs << "value=#{value.inspect}" unless value.nil?
    attrs << "xid=#{xid.inspect}" if xid
    attrs << "children=#{children.size}" unless children.empty?
    attrs << "attributes=#{attributes.size}" unless attributes.empty?

    "#<TreeNode #{attrs.join(' ')}>"
  end

  alias to_s inspect

  private

  # Clear cached signature/weight on this node and every ancestor
  def invalidate_cache
    @signature = nil
    @weight = nil

    # Propagate upward (method is private, hence send)
    parent&.send(:invalidate_cache)
  end
end
|
|
448
|
+
end
|
|
449
|
+
end
|
|
450
|
+
end
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../core/tree_node"
|
|
4
|
+
require_relative "../core/node_signature"
|
|
5
|
+
require_relative "../core/node_weight"
|
|
6
|
+
require_relative "../core/matching"
|
|
7
|
+
require_relative "../core/attribute_comparator"
|
|
8
|
+
|
|
9
|
+
module Canon
|
|
10
|
+
module TreeDiff
|
|
11
|
+
module Matchers
|
|
12
|
+
# HashMatcher performs fast exact subtree matching
|
|
13
|
+
#
|
|
14
|
+
# Based on XyDiff/Cobena (2002, INRIA) BULD algorithm:
|
|
15
|
+
# - Build signature map for tree1
|
|
16
|
+
# - Process nodes by weight (heaviest first)
|
|
17
|
+
# - Match identical subtrees via signature lookup
|
|
18
|
+
# - Propagate matches to ancestors
|
|
19
|
+
#
|
|
20
|
+
# Complexity: O(n log n) where n is number of nodes
|
|
21
|
+
#
|
|
22
|
+
# Features:
|
|
23
|
+
# - Hash-based exact matching (O(1) lookup)
|
|
24
|
+
# - Weight-based prioritization (largest subtrees first)
|
|
25
|
+
# - Automatic ancestor propagation
|
|
26
|
+
# - Handles both element and text nodes
|
|
27
|
+
class HashMatcher
  attr_reader :tree1, :tree2, :matching, :match_options

  # Initialize matcher with two trees
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param options [Hash] Match options; this class reads
  #   +:attribute_order+ (default +:ignore+) and +:text_content+
  #   (default +:normalize+)
  def initialize(tree1, tree2, options = {})
    @tree1 = tree1
    @tree2 = tree2
    @matching = Core::Matching.new
    @signature_map = {}
    @matched_tree1 = Set.new
    @matched_tree2 = Set.new
    @options = options
    @match_options = options # Store full match options for text comparison
    @attribute_comparator = Core::AttributeComparator.new(
      attribute_order: options[:attribute_order] || :ignore,
    )
  end

  # Perform hash-based matching
  #
  # 1. Index every tree1 node by its signature.
  # 2. Visit tree2 nodes in descending weight order.
  # 3. For each still-unmatched tree2 node, look for an unmatched tree1
  #    node with the same signature and an equal subtree, record the
  #    pair, and try to extend the match to the ancestors.
  #
  # @return [Core::Matching] The resulting matching
  def match
    build_signature_map

    tree2_nodes = collect_nodes(tree2).sort_by do |node|
      -Core::NodeWeight.for(node).value
    end

    tree2_nodes.each do |node2|
      # May already have been matched through ancestor propagation
      next if @matched_tree2.include?(node2)

      match_node(node2)
    end

    @matching
  end

  private

  # Build signature map for tree1
  #
  # Maps each signature to the array of tree1 nodes carrying it (several
  # nodes can share a signature). The map is rebuilt from scratch on each
  # call so repeated #match calls do not accumulate duplicate candidates.
  def build_signature_map
    @signature_map = collect_nodes(tree1).group_by do |node|
      Core::NodeSignature.for(node)
    end
  end

  # Collect all nodes from a tree: the root followed by its descendants
  #
  # @param root [TreeNode] Root of tree
  # @return [Array<TreeNode>]
  def collect_nodes(root)
    [root].concat(root.descendants)
  end

  # Try to match a node from tree2 to tree1
  #
  # @param node2 [TreeNode] Node from tree2
  def match_node(node2)
    sig2 = Core::NodeSignature.for(node2)

    # Unmatched tree1 nodes sharing the signature
    candidates = @signature_map.fetch(sig2, []).reject do |n|
      @matched_tree1.include?(n)
    end
    return if candidates.empty?

    best_match = find_best_match(node2, candidates)
    return unless best_match

    # The matching itself may refuse the pair
    return unless @matching.add(best_match, node2)

    @matched_tree1 << best_match
    @matched_tree2 << node2

    propagate_to_ancestors(best_match, node2)
  end

  # Find best match among candidates
  #
  # Returns the first candidate whose whole subtree equals node2's
  # subtree according to #subtrees_match?.
  #
  # @param node2 [TreeNode] Node from tree2
  # @param candidates [Array<TreeNode>] Candidate nodes from tree1
  # @return [TreeNode, nil]
  def find_best_match(node2, candidates)
    candidates.find { |node1| subtrees_match?(node1, node2) }
  end

  # Check if two subtrees match exactly
  #
  # The nodes themselves must match (#nodes_match?), they must have the
  # same number of children, and children must match pairwise in order.
  #
  # @param node1 [TreeNode] Node from tree1
  # @param node2 [TreeNode] Node from tree2
  # @return [Boolean]
  def subtrees_match?(node1, node2)
    return false unless nodes_match?(node1, node2)
    return false unless node1.children.size == node2.children.size

    node1.children.zip(node2.children).all? do |child1, child2|
      subtrees_match?(child1, child2)
    end
  end

  # Check if two nodes match (not including subtrees)
  #
  # Compares label, text value (via #text_equivalent?) and attributes
  # (via the configured attribute comparator).
  #
  # @param node1 [TreeNode] Node from tree1
  # @param node2 [TreeNode] Node from tree2
  # @return [Boolean]
  def nodes_match?(node1, node2)
    return false unless node1.label == node2.label
    return false unless text_equivalent?(node1, node2)

    @attribute_comparator.equal?(node1.attributes, node2.attributes) ? true : false
  end

  # Check if text values are equivalent according to match options
  #
  # Rules, in order:
  # - both values blank (nil or empty string) => equivalent
  # - exactly one blank => not equivalent
  # - both whitespace-only after normalization => equivalent
  # - +:text_content+ is +:strict+ => raw values must be equal
  # - any other mode => normalized texts must be equal
  #
  # Non-string values (numbers, booleans) are converted with +to_s+ for
  # the blank check and normalization instead of raising.
  #
  # @param node1 [TreeNode] First node
  # @param node2 [TreeNode] Second node
  # @return [Boolean] True if text values are equivalent
  def text_equivalent?(node1, node2)
    text1 = node1.value
    text2 = node2.value

    blank1 = blank_text?(text1)
    blank2 = blank_text?(text2)
    return true if blank1 && blank2
    return false if blank1 || blank2

    norm1 = normalize_text(text1)
    norm2 = normalize_text(text2)
    return true if norm1.empty? && norm2.empty?

    text_content_mode = @match_options[:text_content] || :normalize

    text_content_mode == :strict ? text1 == text2 : norm1 == norm2
  end

  # Check whether a node value carries no text at all
  #
  # @param text [Object, nil] Node value
  # @return [Boolean] True for nil or a value whose string form is empty
  def blank_text?(text)
    text.nil? || text.to_s.empty?
  end

  # Normalize text for comparison
  #
  # Collapses every whitespace run to a single space and trims both ends.
  #
  # @param text [String, Object, nil] Text to normalize (converted with to_s)
  # @return [String] Normalized text ("" for nil or empty input)
  def normalize_text(text)
    str = text.to_s
    return "" if str.empty?

    str.gsub(/\s+/, " ").strip
  end

  # Propagate match to ancestors if possible
  #
  # The parents are matched too when both exist, neither is matched yet,
  # their signatures are equal, they match as nodes (#nodes_match?) and
  # the matching accepts the pair. Continues upward recursively until
  # one of these conditions fails.
  #
  # @param node1 [TreeNode] Matched node from tree1
  # @param node2 [TreeNode] Matched node from tree2
  def propagate_to_ancestors(node1, node2)
    parent1 = node1.parent
    parent2 = node2.parent

    return unless parent1 && parent2
    return if @matched_tree1.include?(parent1)
    return if @matched_tree2.include?(parent2)

    sig1 = Core::NodeSignature.for(parent1)
    sig2 = Core::NodeSignature.for(parent2)
    return unless sig1 == sig2

    return unless nodes_match?(parent1, parent2)
    return unless @matching.add(parent1, parent2)

    @matched_tree1 << parent1
    @matched_tree2 << parent2

    propagate_to_ancestors(parent1, parent2)
  end
end
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
end
|