canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module TreeDiff
    module Core
      # AttributeComparator compares node attribute collections according to
      # the +attribute_order+ match option.
      #
      # Key responsibilities:
      # - Compare attributes with configurable order sensitivity
      # - Provide a normalized hash for hash-based matching algorithms
      # - Support both strict and order-insensitive comparison modes
      #
      # @example
      #   comparator = AttributeComparator.new(attribute_order: :ignore)
      #   attrs1 = {class: "TOC", id: "_"}
      #   attrs2 = {id: "_", class: "TOC"}
      #   comparator.equal?(attrs1, attrs2) # => true
      #
      class AttributeComparator
        attr_reader :attribute_order

        # Initialize comparator with match options
        #
        # @param attribute_order [Symbol] :strict makes attribute order
        #   significant; any other value (e.g. :ignore, :normalize) makes the
        #   comparison order-independent
        def initialize(attribute_order: :strict)
          @attribute_order = attribute_order
        end

        # Compare two attribute collections for equality.
        #
        # NOTE: this intentionally keeps the name +equal?+ for backward
        # compatibility, which shadows Object#equal? (identity) on instances
        # of this class.
        #
        # @param attrs1 [Hash, #to_h, nil] First attribute collection
        # @param attrs2 [Hash, #to_h, nil] Second attribute collection
        # @return [Boolean] True if attributes are considered equal
        def equal?(attrs1, attrs2)
          # Two missing collections are equal; one missing collection never is
          return true if attrs1.nil? && attrs2.nil?
          return false if attrs1.nil? || attrs2.nil?

          attrs1 = attrs1.to_h if attrs1.respond_to?(:to_h)
          attrs2 = attrs2.to_h if attrs2.respond_to?(:to_h)

          if attribute_order == :strict
            # Strict mode: order matters. Hash#== ignores insertion order, so
            # the ordered key/value pair lists are compared instead.
            ordered_pairs(attrs1) == ordered_pairs(attrs2)
          else
            # Ignore/normalize mode: compare key-sorted copies
            normalize_for_comparison(attrs1) == normalize_for_comparison(attrs2)
          end
        end

        # Generate a comparison hash for attribute matching
        #
        # This is used by hash-based matchers to ensure nodes with
        # equivalent attributes (according to match options) get the
        # same hash value.
        #
        # In strict mode the attributes are returned as given (after +to_h+
        # coercion). Ruby's Hash#hash and Hash#eql? do not depend on insertion
        # order, so the returned Hash alone cannot distinguish attribute
        # order; use {#equal?} when order must be checked.
        #
        # @param attrs [Hash, #to_h, nil] Attribute collection
        # @return [Hash] Hash for comparison ({} for nil or empty input)
        def comparison_hash(attrs)
          return {} if attrs.nil?

          # Same coercion as #equal? so both entry points accept the same input
          attrs = attrs.to_h if attrs.respond_to?(:to_h)
          return {} if attrs.empty?

          attribute_order == :strict ? attrs : normalize_for_comparison(attrs)
        end

        private

        # Ordered representation used for strict comparison
        #
        # @param attrs [Object] Attribute collection (normally a Hash)
        # @return [Array<Array>, Object] key/value pairs in insertion order
        #   for a Hash; the object itself otherwise
        def ordered_pairs(attrs)
          attrs.is_a?(Hash) ? attrs.to_a : attrs
        end

        # Normalize attributes for order-independent comparison
        #
        # Keys are ordered by their string form so that collections mixing
        # Symbol and String keys do not raise during sorting.
        #
        # @param attrs [Hash] Attribute hash
        # @return [Hash] Attribute hash with keys in sorted order
        def normalize_for_comparison(attrs)
          attrs.sort_by { |key, _value| key.to_s }.to_h
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module TreeDiff
    module Core
      # Matching stores and manages node pair matches
      #
      # A matching is a set of pairs (n1, n2) where:
      # 1. One-to-one: Each node appears in at most one pair
      # 2. Prefix closure: If (n1, n2) matched, ancestors can match
      #
      # Features:
      # - Efficient lookup: hash-based check of whether a node is matched
      # - Validation: Constraints are checked when a pair is added
      # - Iteration: Supports enumeration of all pairs
      #
      # Nodes are used as Hash keys and must respond to +ancestors+, which is
      # indexed positionally here (the i-th ancestor of a tree 1 node is
      # compared with the i-th ancestor of its tree 2 partner).
      class Matching
        attr_reader :pairs

        # Initialize empty matching
        def initialize
          @pairs = []
          @tree1_map = {} # tree 1 node => matched tree 2 node
          @tree2_map = {} # tree 2 node => matched tree 1 node
        end

        # Add a matched pair
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean] true if added, false if violates constraints
        def add(node1, node2)
          return false unless valid_pair?(node1, node2)

          @pairs << [node1, node2]
          @tree1_map[node1] = node2
          @tree2_map[node2] = node1

          true
        end

        # Remove a matched pair
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean] true if removed, false if not found
        def remove(node1, node2)
          # Array#delete returns nil when the pair is not present
          return false unless @pairs.delete([node1, node2])

          @tree1_map.delete(node1)
          @tree2_map.delete(node2)

          true
        end

        # Check if a node from tree 1 is matched
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean]
        def matched1?(node)
          @tree1_map.key?(node)
        end

        # Check if a node from tree 2 is matched
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean]
        def matched2?(node)
          @tree2_map.key?(node)
        end

        # Get the match for a node from tree 1
        #
        # @param node [TreeNode] Node from tree 1
        # @return [TreeNode, nil] Matched node from tree 2, or nil
        def match_for1(node)
          @tree1_map[node]
        end

        # Get the match for a node from tree 2
        #
        # @param node [TreeNode] Node from tree 2
        # @return [TreeNode, nil] Matched node from tree 1, or nil
        def match_for2(node)
          @tree2_map[node]
        end

        # Get all unmatched nodes from tree 1
        #
        # @param nodes [Array<TreeNode>] All nodes from tree 1
        # @return [Array<TreeNode>]
        def unmatched1(nodes)
          nodes.reject { |node| matched1?(node) }
        end

        # Get all unmatched nodes from tree 2
        #
        # @param nodes [Array<TreeNode>] All nodes from tree 2
        # @return [Array<TreeNode>]
        def unmatched2(nodes)
          nodes.reject { |node| matched2?(node) }
        end

        # Get number of matched pairs
        #
        # @return [Integer]
        def size
          @pairs.size
        end

        # Check if matching is empty
        #
        # @return [Boolean]
        def empty?
          @pairs.empty?
        end

        # Iterate over all pairs
        #
        # @yield [node1, node2]
        # @return [Array, Enumerator] the result of Array#each on the pairs
        #   (an Enumerator when no block is given)
        def each(&block)
          @pairs.each(&block)
        end

        # Check if matching satisfies all constraints
        #
        # @return [Boolean]
        def valid?
          one_to_one? && prefix_closure?
        end

        # Check one-to-one constraint
        #
        # Each node appears in at most one pair
        #
        # @return [Boolean]
        def one_to_one?
          # No tree 2 node may be the partner of two tree 1 nodes
          tree1_values = @tree1_map.values
          return false unless tree1_values.size == tree1_values.uniq.size

          # No tree 1 node may be the partner of two tree 2 nodes
          tree2_values = @tree2_map.values
          return false unless tree2_values.size == tree2_values.uniq.size

          # Both lookup maps must describe the same pairs
          @tree1_map.all? { |n1, n2| @tree2_map[n2] == n1 }
        end

        # Check prefix closure constraint
        #
        # For every pair (n1, n2): whenever the i-th ancestor of n1 is
        # matched, its partner must be the i-th ancestor of n2.
        #
        # @return [Boolean]
        def prefix_closure?
          @pairs.all? do |node1, node2|
            # Loop-invariant: fetch the tree 2 ancestor chain once per pair
            ancestors2 = node2.ancestors

            node1.ancestors.each_with_index.all? do |anc1, idx|
              !matched1?(anc1) || match_for1(anc1) == ancestors2[idx]
            end
          end
        end

        # Convert to array of pairs
        #
        # @return [Array<Array<TreeNode, TreeNode>>] shallow copy of the pairs
        def to_a
          @pairs.dup
        end

        # String representation
        #
        # @return [String]
        def to_s
          "#<Matching #{size} pairs>"
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          pairs_str = @pairs.map do |n1, n2|
            "(#{n1.label} ↔ #{n2.label})"
          end.join(", ")

          "#<Matching [#{pairs_str}]>"
        end

        private

        # Check if a pair can be added without violating constraints
        #
        # NOTE(review): a pair whose tree 1 node has more ancestors than its
        # tree 2 node is rejected, while the opposite case is accepted and the
        # extra tree 2 ancestors are not examined - confirm this asymmetry is
        # intended.
        #
        # @param node1 [TreeNode] Node from tree 1
        # @param node2 [TreeNode] Node from tree 2
        # @return [Boolean]
        def valid_pair?(node1, node2)
          # One-to-one constraint: neither node may already be matched
          return false if matched1?(node1) || matched2?(node2)

          # Fetch both ancestor chains once instead of once per iteration
          ancestors1 = node1.ancestors
          ancestors2 = node2.ancestors

          # Every ancestor of node1 needs a positional counterpart in tree 2
          return false if ancestors1.size > ancestors2.size

          # Prefix closure constraint:
          # if ancestors are matched, they must be matched to each other
          ancestors1.each_with_index.none? do |anc1, idx|
            anc2 = ancestors2[idx]

            (matched1?(anc1) && match_for1(anc1) != anc2) ||
              (matched2?(anc2) && match_for2(anc2) != anc1)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module TreeDiff
    module Core
      # NodeSignature computes unique signatures for tree nodes
      #
      # Based on XDiff (2002, U. Wisconsin) approach:
      # - Signature is the path from root to node
      # - Format: /ancestor1/ancestor2/.../node
      # - Used for fast exact matching via hash lookup
      #
      # Features:
      # - Deterministic: Same path always produces same signature
      # - Hierarchical: Parent-child relationships encoded
      # - Type-aware: Text nodes are encoded as "#text"
      class NodeSignature
        # HTML elements where whitespace is semantically significant
        WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

        attr_reader :path, :signature_string

        # Initialize signature for a node
        #
        # @param node [TreeNode] Node to compute signature for
        # @param include_attributes [Boolean] Whether to include attributes
        #   (and, for whitespace-sensitive elements, the text value)
        def initialize(node, include_attributes: true)
          @node = node
          @include_attributes = include_attributes
          @path = compute_path
          @signature_string = compute_signature_string
        end

        # Compute and cache signature for a node
        #
        # Only the attribute-including signature is cached on the node (via
        # +node.signature+); loose signatures are rebuilt on every call.
        #
        # @param node [TreeNode] Node to compute signature for
        # @param include_attributes [Boolean] Whether to include attributes in signature
        # @return [NodeSignature]
        def self.for(node, include_attributes: true)
          if include_attributes
            node.signature ||= new(node, include_attributes: true)
          else
            # Don't cache loose signatures
            new(node, include_attributes: false)
          end
        end

        # Check if two signatures are equal
        #
        # @param other [NodeSignature] Signature to compare with
        # @return [Boolean]
        def ==(other)
          return false unless other.is_a?(NodeSignature)

          signature_string == other.signature_string
        end

        alias eql? ==

        # Hash value for use in Hash/Set
        #
        # @return [Integer]
        def hash
          signature_string.hash
        end

        # String representation
        #
        # @return [String]
        def to_s
          signature_string
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          "#<NodeSignature #{signature_string.inspect}>"
        end

        private

        # Compute the path from root to this node
        #
        # +ancestors+ is reversed so that the path runs root-first and ends
        # with the node itself.
        #
        # @return [Array<String>] Path components
        def compute_path
          components = @node.ancestors.reverse.map do |ancestor|
            path_component(ancestor)
          end

          components << path_component(@node)
        end

        # Get path component for a node
        #
        # Text nodes are encoded as "#text". A node counts as text when its
        # label is nil, empty, "#text" or "text" (case-insensitive).
        # Element nodes use their label, optionally followed by
        # "{k=v,...}" with attributes ordered by key, and "[text=...]" for
        # whitespace-sensitive elements.
        #
        # @param node [TreeNode] Node to get component for
        # @return [String]
        def path_component(node)
          label = node.label.to_s
          # nil labels become "" via to_s, so they are covered by empty?
          return "#text" if label.empty? || %w[#text text].include?(label.downcase)

          component = label

          if @include_attributes
            # Attributes distinguish nodes with the same label; sorting makes
            # the signature independent of attribute order
            component += attributes_suffix(node.attributes)

            # CRITICAL: For whitespace-sensitive HTML elements, include the text
            # value so nodes differing only in whitespace do not match.
            # inspect makes whitespace and special characters visible.
            if whitespace_sensitive?(node) && node.value
              component += "[text=#{node.value.inspect}]"
            end
          end

          component
        end

        # Render attributes as a "{k=v,...}" suffix ordered by key
        #
        # Keys are ordered by their string form so mixed Symbol/String keys
        # do not raise while sorting.
        #
        # @param attributes [Hash, nil] Node attributes
        # @return [String] Suffix, or "" when there are no attributes
        def attributes_suffix(attributes)
          return "" if attributes.nil? || attributes.empty?

          rendered = attributes
            .sort_by { |key, _value| key.to_s }
            .map { |key, value| "#{key}=#{value}" }
            .join(",")

          "{#{rendered}}"
        end

        # Check if a node is in a whitespace-sensitive context
        #
        # HTML elements where whitespace is significant: <pre>, <code>, <textarea>, <script>, <style>
        #
        # @param node [TreeNode] Node to check
        # @return [Boolean] True if node is whitespace-sensitive
        def whitespace_sensitive?(node)
          return false unless node
          return false unless node.respond_to?(:label)

          WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
        end

        # Compute signature string from path
        #
        # @return [String]
        def compute_signature_string
          "/#{path.join('/')}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module TreeDiff
    module Core
      # NodeWeight computes weights for tree nodes
      #
      # Based on XyDiff/Cobena (2002, INRIA) approach:
      # - Weight reflects subtree size/importance
      # - Element nodes: 1 + Σ(child_weights)
      # - Text nodes: 1 + log(text_length + 1), or 1 for empty text
      # - Used to prioritize matching (heaviest first)
      #
      # Features:
      # - Hierarchical: Parent weight includes all descendants
      # - Text-aware: Longer text has higher weight
      # - Cached: {NodeWeight.for} stores the weight on the node
      # - Ordered: includes Comparable, so <, <=, >, >=, between? and clamp
      #   work between NodeWeight instances
      class NodeWeight
        include Comparable

        attr_reader :value

        # Initialize weight for a node
        #
        # @param node [TreeNode] Node to compute weight for
        def initialize(node)
          @node = node
          @value = compute_weight
        end

        # Compute and cache weight for a node
        #
        # The result is stored via +node.weight=+ and reused on later calls.
        #
        # @param node [TreeNode] Node to compute weight for
        # @return [NodeWeight]
        def self.for(node)
          node.weight ||= new(node)
        end

        # Compare weights (for sorting)
        #
        # @param other [NodeWeight] Weight to compare with
        # @return [Integer, nil] -1, 0, or 1; nil when +other+ is not a
        #   NodeWeight
        def <=>(other)
          return nil unless other.is_a?(NodeWeight)

          value <=> other.value
        end

        # Check if equal
        #
        # @param other [NodeWeight] Weight to compare with
        # @return [Boolean] false when +other+ is not a NodeWeight
        def ==(other)
          return false unless other.is_a?(NodeWeight)

          value == other.value
        end

        # Numeric value for calculations
        #
        # @return [Float]
        def to_f
          value
        end

        # Integer value for calculations
        #
        # @return [Integer]
        def to_i
          value.to_i
        end

        # String representation
        #
        # @return [String]
        def to_s
          value.to_s
        end

        # Detailed inspection
        #
        # @return [String]
        def inspect
          "#<NodeWeight #{value}>"
        end

        private

        # Compute weight based on node type and structure
        #
        # @return [Float]
        def compute_weight
          @node.text? ? compute_text_weight : compute_element_weight
        end

        # Compute weight for text nodes
        #
        # Formula: 1 + ln(text_length + 1)
        # - Weight is 1.0 for empty (or nil) text
        # - Grows logarithmically with text length
        # - Prevents very long text from dominating
        #
        # @return [Float]
        def compute_text_weight
          text = @node.value.to_s
          return 1.0 if text.empty?

          # Natural logarithm of length + 1
          1.0 + Math.log(text.length + 1)
        end

        # Compute weight for element nodes
        #
        # Formula: 1 + Σ(child_weights)
        # - Each node has base weight of 1
        # - Parent weight includes all descendants
        # - Recursive computation; child weights are cached on the children
        #   through {NodeWeight.for}
        #
        # @return [Float]
        def compute_element_weight
          children = @node.children
          return 1.0 if children.empty?

          1.0 + children.sum { |child| self.class.for(child).value }
        end
      end
    end
  end
end
|