canon 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module TreeDiff
|
|
5
|
+
module Operations
|
|
6
|
+
# Base class for all tree diff operations
#
# Represents a high-level semantic operation detected from tree matching.
# Each operation has a type (one of TYPES) and a free-form metadata hash
# holding whatever keyword arguments were passed besides +type+.
#
# Operations compare by value: two operations are equal when their type and
# metadata are equal. +eql?+ and +hash+ follow the same rule so operations
# behave consistently in Hash keys, Set members and Array#uniq.
#
# @example
#   operation = Operation.new(
#     type: :insert,
#     node: new_node,
#     parent: parent_node,
#     position: 2
#   )
#   operation.type        # => :insert
#   operation[:position]  # => 2
#
class Operation
  # Operation types based on XDiff and JATS-diff research
  TYPES = %i[
    insert
    delete
    update
    move
    merge
    split
    upgrade
    downgrade
  ].freeze

  attr_reader :type, :metadata

  # Initialize a new operation
  #
  # @param type [Symbol] Operation type (must be in TYPES)
  # @param metadata [Hash] Operation-specific metadata
  # @raise [ArgumentError] if +type+ is not listed in TYPES
  def initialize(type:, **metadata)
    unless TYPES.include?(type)
      raise ArgumentError, "Invalid operation type: #{type}"
    end

    @type = type
    @metadata = metadata
  end

  # Check if operation is a specific type
  #
  # @param type [Symbol] Type to check
  # @return [Boolean]
  def type?(type)
    @type == type
  end

  # Get a metadata value
  #
  # @param key [Symbol] Metadata key
  # @return [Object, nil] Metadata value, or nil when the key is absent
  def [](key)
    @metadata[key]
  end

  # Check if two operations are equal (same type and equal metadata)
  #
  # @param other [Object] Object to compare with
  # @return [Boolean]
  def ==(other)
    return false unless other.is_a?(Operation)

    type == other.type && metadata == other.metadata
  end

  # Strict equality used by Hash, Set and Array#uniq.
  #
  # @param other [Object] Object to compare with
  # @return [Boolean]
  def eql?(other)
    other.is_a?(Operation) &&
      type.eql?(other.type) &&
      metadata.eql?(other.metadata)
  end

  # Hash code consistent with #eql?.
  #
  # Uses the Operation constant rather than self.class because #== and
  # #eql? accept any Operation (including subclasses) with the same data.
  #
  # @return [Integer]
  def hash
    [Operation, type, metadata].hash
  end

  # String representation
  #
  # @return [String]
  def to_s
    "Operation(#{type})"
  end

  # Detailed string representation
  #
  # Lists every metadata entry as "key: value.inspect". When there is no
  # metadata the result has no trailing space before the closing ">".
  #
  # @return [String]
  def inspect
    prefix = "#<#{self.class.name} type=#{type}"
    return "#{prefix}>" if @metadata.empty?

    details = @metadata.map { |key, value| "#{key}: #{value.inspect}" }
    "#{prefix} #{details.join(', ')}>"
  end
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
|
|
4
|
+
module TreeDiff
|
|
5
|
+
module Operations
|
|
6
|
+
# OperationDetector analyzes tree matching results to detect high-level
# semantic operations.
#
# Based on research from XDiff, XyDiff, and JATS-diff, this detector
# identifies operations in three levels:
#
# Level 1: Basic operations (INSERT, DELETE, UPDATE)
# Level 2: Structural operations (MOVE)
# Level 3: Semantic operations (MERGE, SPLIT, UPGRADE, DOWNGRADE)
#
# Level 3 operations are assembled from Level 1 operations: the component
# operations are removed from the result and replaced by the semantic one.
# Each Level 1 operation is consumed by at most one semantic operation.
#
# @example
#   detector = OperationDetector.new(tree1, tree2, matching)
#   operations = detector.detect
#   operations.each { |op| puts op.inspect }
#
class OperationDetector
  # Labels of HTML elements where whitespace is semantically significant
  WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

  # Minimum token similarity for MERGE / SPLIT detection
  CONTENT_SIMILARITY_THRESHOLD = 0.8

  # Minimum token similarity for UPGRADE / DOWNGRADE detection
  HIERARCHY_SIMILARITY_THRESHOLD = 0.9

  private_constant :WHITESPACE_SENSITIVE_TAGS,
                   :CONTENT_SIMILARITY_THRESHOLD,
                   :HIERARCHY_SIMILARITY_THRESHOLD

  attr_reader :tree1, :tree2, :matching, :operations, :match_options

  # Initialize a new operation detector
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Matching] Matching between trees
  # @param match_options [Hash, nil] Match options for comparison
  #   (nil is treated as an empty hash)
  def initialize(tree1, tree2, matching, match_options = {})
    @tree1 = tree1
    @tree2 = tree2
    @matching = matching
    @match_options = match_options || {}
    @operations = []
  end

  # Detect all operations
  #
  # Resets any previously detected operations, so it is safe to call
  # more than once.
  #
  # @return [Array<Operation>] Detected operations
  def detect
    @operations = []

    # Level 1: Basic operations
    detect_inserts
    detect_deletes
    detect_updates

    # Level 2: Structural operations
    detect_moves

    # Level 3: Semantic operations
    # These consume Level 1 operations, so they must run last.
    detect_merges
    detect_splits
    detect_upgrades
    detect_downgrades

    @operations
  end

  private

  # Detect INSERT operations (nodes in tree2 not matched in tree1)
  def detect_inserts
    report_unmatched(tree2, :insert) { |node| @matching.matched2?(node) }
  end

  # Detect DELETE operations (nodes in tree1 not matched in tree2)
  def detect_deletes
    report_unmatched(tree1, :delete) { |node| @matching.matched1?(node) }
  end

  # Record one operation of +type+ for every top-most unmatched node
  # below +root+.
  #
  # A node whose parent is also unmatched is skipped: the parent is
  # reported instead, which prevents redundant reporting of descendants.
  #
  # @param root [TreeNode] Root of the tree to scan
  # @param type [Symbol] :insert or :delete
  # @yieldparam node [TreeNode] Node to test
  # @yieldreturn [Boolean] True when the node is matched
  def report_unmatched(root, type)
    collect_all_nodes(root).each do |node|
      next if yield(node)

      parent = node.parent
      next if parent && !yield(parent)

      @operations << Operation.new(
        type: type,
        node: node,
        parent: parent,
        # A parentless (root) node is reported at position 0
        position: parent ? parent.children.index(node) : 0,
        path: node.xpath,
        content: extract_node_content(node),
      )
    end
  end

  # Detect UPDATE operations (matched nodes with different content)
  def detect_updates
    @matching.pairs.each do |node1, node2|
      # Detect what changed (including attribute order)
      changes = detect_changes(node1, node2)

      # Skip if truly identical (no changes detected)
      next if changes.empty?

      @operations << Operation.new(
        type: :update,
        node1: node1,
        node2: node2,
        changes: changes,
        path: node2.xpath,
        old_content: extract_node_content(node1),
        new_content: extract_node_content(node2),
      )
    end
  end

  # Detect MOVE operations (nodes that moved in the tree structure)
  def detect_moves
    @matching.pairs.each do |node1, node2|
      next unless moved?(node1, node2)

      @operations << Operation.new(
        type: :move,
        node1: node1,
        node2: node2,
        old_parent: node1.parent,
        new_parent: node2.parent,
        old_position: node1.parent&.children&.index(node1),
        new_position: node2.parent&.children&.index(node2),
        old_path: node1.xpath,
        new_path: node2.xpath,
      )
    end
  end

  # Check if a node moved between trees
  #
  # A matched pair counts as moved when the match of node1's parent is
  # not node2's parent, or when exactly one of the two has no parent.
  #
  # @param node1 [TreeNode] Node in tree1
  # @param node2 [TreeNode] Node in tree2
  # @return [Boolean]
  def moved?(node1, node2)
    parent1 = node1.parent
    parent2 = node2.parent

    return false if parent1.nil? && parent2.nil?
    return true if parent1.nil? || parent2.nil?

    @matching.match_for1(parent1) != parent2
  end

  # Check if two nodes are identical (same label, value and attributes)
  #
  # @param node1 [TreeNode] First node
  # @param node2 [TreeNode] Second node
  # @return [Boolean]
  def nodes_identical?(node1, node2)
    node1.label == node2.label &&
      node1.value == node2.value &&
      node1.attributes == node2.attributes
  end

  # Detect specific changes between two nodes
  #
  # Possible keys in the result: :label, :value, :attributes and
  # :attribute_order, each mapping to { old: ..., new: ... }.
  #
  # @param node1 [TreeNode] Original node
  # @param node2 [TreeNode] Modified node
  # @return [Hash] Hash of changes (empty when nothing changed)
  def detect_changes(node1, node2)
    changes = {}

    if node1.label != node2.label
      changes[:label] = { old: node1.label, new: node2.label }
    end

    # Text is compared after normalization, as configured in match_options
    unless text_equivalent?(node1, node2)
      changes[:value] = { old: node1.value, new: node2.value }
    end

    attrs1 = node1.attributes
    attrs2 = node2.attributes

    # Hash equality already ignores key order, so this only fires on real
    # differences in attribute names or values.
    if attrs1 != attrs2
      changes[:attributes] = { old: attrs1, new: attrs2 }
    end

    # Order differences are reported independently (they can coexist with
    # value differences), but only when the same attribute names appear in
    # a different order AND attribute_order mode is :strict.
    attribute_order_mode = @match_options[:attribute_order] || :ignore
    if attribute_order_mode == :strict &&
        attrs1.keys.sort == attrs2.keys.sort &&
        attrs1.keys != attrs2.keys
      changes[:attribute_order] = { old: attrs1.keys, new: attrs2.keys }
    end

    changes
  end

  # Check if text values are equivalent according to match options
  #
  # @param node1 [TreeNode] First node
  # @param node2 [TreeNode] Second node
  # @return [Boolean] True if text values are equivalent
  def text_equivalent?(node1, node2)
    text1 = node1.value
    text2 = node2.value

    blank1 = text1.nil? || text1.empty?
    blank2 = text2.nil? || text2.empty?
    # Both nil/empty = equivalent; exactly one nil/empty = different
    return blank1 && blank2 if blank1 || blank2

    # Whitespace-sensitive context on either side forces strict comparison
    if whitespace_sensitive?(node1) || whitespace_sensitive?(node2)
      return text1 == text2
    end

    norm1 = normalize_text(text1)
    norm2 = normalize_text(text2)

    # Whitespace-only text on both sides is equivalent in every mode
    # (this point is only reached outside whitespace-sensitive contexts)
    return true if norm1.empty? && norm2.empty?

    text_content_mode = @match_options[:text_content] || :normalize
    if text_content_mode == :strict
      text1 == text2
    else
      # :normalize, :normalized and any unknown mode compare normalized text
      norm1 == norm2
    end
  end

  # Normalize text for comparison
  #
  # Collapses each run of whitespace (including newlines) into a single
  # space and strips leading/trailing whitespace.
  #
  # @param text [String, nil] Text to normalize
  # @return [String] Normalized text
  def normalize_text(text)
    return "" if text.nil? || text.empty?

    text.gsub(/\s+/, " ").strip
  end

  # Collect all nodes in a tree (depth-first, pre-order)
  #
  # Implemented iteratively so that deep trees do not grow the call stack
  # and no intermediate arrays are copied.
  #
  # @param node [TreeNode] Root node
  # @return [Array<TreeNode>] All nodes, root first
  def collect_all_nodes(node)
    nodes = []
    pending = [node]

    until pending.empty?
      current = pending.pop
      nodes << current
      # Push children right-to-left so the left-most child is visited first
      current.children.reverse_each { |child| pending.push(child) }
    end

    nodes
  end

  # Detect MERGE operations
  # Pattern: Multiple sibling nodes in tree1 combined into one node in tree2
  # (n-1) x DELETE + 1 x UPDATE with content similarity
  #
  # At most one MERGE is produced per group of sibling deletes: the first
  # qualifying UPDATE under the matched parent is the merge target.
  def detect_merges
    deletes = @operations.select { |op| op.type == :delete }
    updates = @operations.select { |op| op.type == :update }

    deletes.group_by { |op| op[:parent] }.each do |parent1, del_ops|
      next if del_ops.size < 2 # Need at least 2 deletes for merge

      parent2 = @matching.match_for1(parent1)
      next unless parent2

      sources = del_ops.map { |op| op[:node] }
      update_op = updates.find do |candidate|
        candidate[:node2].parent == parent2 &&
          content_merged?(sources, candidate[:node1], candidate[:node2])
      end
      next unless update_op

      # Replace the component operations with a single merge operation
      remove_operations(del_ops + [update_op])
      updates.delete_if { |op| op.equal?(update_op) }

      @operations << Operation.new(
        type: :merge,
        source_nodes: sources,
        target_node: update_op[:node2],
        merged_from: sources.map(&:label),
      )
    end
  end

  # Detect SPLIT operations
  # Pattern: One node in tree1 split into multiple nodes in tree2
  # 1 x DELETE + n x INSERT with content similarity
  #
  # Inserts consumed by one split are not offered to later deletes.
  def detect_splits
    deletes = @operations.select { |op| op.type == :delete }
    inserts = @operations.select { |op| op.type == :insert }
    inserts_by_parent = inserts.group_by { |op| op[:parent] }

    deletes.each do |delete_op|
      parent1 = delete_op[:parent]
      parent2 = parent1 && @matching.match_for1(parent1)
      next unless parent2

      # Inserts under the matched parent in tree2
      candidate_inserts = inserts_by_parent[parent2] || []
      next if candidate_inserts.size < 2 # Need at least 2 inserts for split

      source = delete_op[:node]
      targets = candidate_inserts.map { |op| op[:node] }
      next unless content_split?(source, targets)

      # Replace the component operations with a single split operation
      remove_operations([delete_op, *candidate_inserts])
      inserts_by_parent.delete(parent2)

      @operations << Operation.new(
        type: :split,
        source_node: source,
        target_nodes: targets,
        split_into: targets.map(&:label),
      )
    end
  end

  # Detect UPGRADE operations
  # Pattern: Node moved to shallower depth (promoted in hierarchy)
  # DELETE + INSERT at shallower depth with similar content
  def detect_upgrades
    detect_hierarchy_changes(:upgrade, :promoted_by) do |from_depth, to_depth|
      to_depth < from_depth
    end
  end

  # Detect DOWNGRADE operations
  # Pattern: Node moved to deeper depth (demoted in hierarchy)
  # DELETE + INSERT at deeper depth with similar content
  def detect_downgrades
    detect_hierarchy_changes(:downgrade, :demoted_by) do |from_depth, to_depth|
      to_depth > from_depth
    end
  end

  # Pair each remaining DELETE with the first remaining INSERT that sits at
  # an acceptable depth and has similar content, replacing both with one
  # operation of +type+.
  #
  # @param type [Symbol] :upgrade or :downgrade
  # @param delta_key [Symbol] Metadata key for the depth difference
  # @yieldparam from_depth [Integer] Depth of the deleted node
  # @yieldparam to_depth [Integer] Depth of the inserted node
  # @yieldreturn [Boolean] True when the depth change fits +type+
  def detect_hierarchy_changes(type, delta_key)
    deletes = @operations.select { |op| op.type == :delete }
    inserts = @operations.select { |op| op.type == :insert }

    deletes.each do |delete_op|
      node1 = delete_op[:node]
      depth1 = calculate_depth(node1)

      insert_op = inserts.find do |candidate|
        yield(depth1, calculate_depth(candidate[:node])) &&
          nodes_similar_for_hierarchy_change?(node1, candidate[:node])
      end
      next unless insert_op

      node2 = insert_op[:node]
      depth2 = calculate_depth(node2)

      # Replace the component operations; the insert is no longer available
      remove_operations([delete_op, insert_op])
      inserts.delete_if { |op| op.equal?(insert_op) }

      @operations << Operation.new(
        type: type,
        node1: node1,
        node2: node2,
        from_depth: depth1,
        to_depth: depth2,
        **{ delta_key => (depth1 - depth2).abs }
      )
    end
  end

  # Remove exactly the given operation objects from the result.
  #
  # Compares by object identity, so an unrelated operation that merely
  # has equal type and metadata is never removed by accident.
  #
  # @param consumed [Array<Operation>] Operations to remove
  def remove_operations(consumed)
    doomed = {}.compare_by_identity
    consumed.each { |op| doomed[op] = true }
    @operations.delete_if { |op| doomed.key?(op) }
  end

  # Check if content from multiple nodes was merged into target
  #
  # @param source_nodes [Array<TreeNode>] Source nodes
  # @param original_target [TreeNode] Original target node in tree1
  # @param merged_target [TreeNode] Merged target node in tree2
  # @return [Boolean]
  def content_merged?(source_nodes, original_target, merged_target)
    source_text = source_nodes.map { |n| extract_text_content(n) }
      .join(" ").strip
    original_text = extract_text_content(original_target)
    merged_text = extract_text_content(merged_target)

    # Without any source text there is nothing that could have been merged
    return false if source_text.empty? || merged_text.empty?

    # Merged text should contain both the original and the source content
    similarity = text_similarity("#{source_text} #{original_text}",
                                 merged_text)
    similarity >= CONTENT_SIMILARITY_THRESHOLD
  end

  # Check if content from one node was split into multiple nodes
  #
  # @param source_node [TreeNode] Source node
  # @param target_nodes [Array<TreeNode>] Target nodes
  # @return [Boolean]
  def content_split?(source_node, target_nodes)
    source_text = extract_text_content(source_node)
    target_text = target_nodes.map { |n| extract_text_content(n) }.join(" ")

    return false if source_text.empty? || target_text.empty?

    text_similarity(source_text, target_text) >= CONTENT_SIMILARITY_THRESHOLD
  end

  # Check if two nodes are similar enough for hierarchy change
  #
  # Requires the same label; two textless nodes are considered similar.
  #
  # @param node1 [TreeNode] First node
  # @param node2 [TreeNode] Second node
  # @return [Boolean]
  def nodes_similar_for_hierarchy_change?(node1, node2)
    return false unless node1.label == node2.label

    text1 = extract_text_content(node1)
    text2 = extract_text_content(node2)

    return true if text1.empty? && text2.empty?
    return false if text1.empty? || text2.empty?

    text_similarity(text1, text2) >= HIERARCHY_SIMILARITY_THRESHOLD
  end

  # Extract all text content from a node and its descendants
  #
  # @param node [TreeNode] Node to extract from
  # @return [String] Combined text content, space-separated and stripped
  def extract_text_content(node)
    texts = []
    texts << node.value if node.value && !node.value.empty?
    node.children.each { |child| texts << extract_text_content(child) }

    texts.join(" ").strip
  end

  # Extract node content summary for display
  #
  # Format: <label> [attr="value" ...] "text preview" or "(n children)".
  #
  # @param node [TreeNode] Node to extract from
  # @return [String] Content summary
  def extract_node_content(node)
    parts = ["<#{node.label}>"]

    unless node.attributes.empty?
      attrs = node.attributes.map { |k, v| "#{k}=\"#{v}\"" }.join(" ")
      parts << "[#{attrs}]"
    end

    if node.value && !node.value.empty?
      # Truncate long values
      value_preview = node.value.length > 50 ? "#{node.value[0..47]}..." : node.value
      parts << "\"#{value_preview}\""
    elsif !node.children.empty?
      parts << "(#{node.children.size} children)"
    end

    parts.join(" ")
  end

  # Calculate text similarity using Jaccard index over lower-cased,
  # whitespace-separated tokens
  #
  # @param text1 [String] First text
  # @param text2 [String] Second text
  # @return [Float] Similarity score (0.0 to 1.0); 0.0 when either text
  #   has no tokens
  def text_similarity(text1, text2)
    # String#split without a pattern ignores leading/trailing whitespace,
    # so no empty-string token can distort the score.
    tokens1 = text1.downcase.split
    tokens2 = text2.downcase.split

    return 0.0 if tokens1.empty? || tokens2.empty?

    intersection = (tokens1 & tokens2).size
    union = (tokens1 | tokens2).size

    intersection.to_f / union
  end

  # Calculate depth of a node in the tree
  #
  # @param node [TreeNode] Node to calculate depth for
  # @return [Integer] Depth (0 for root)
  def calculate_depth(node)
    depth = 0
    current = node
    while current.parent
      depth += 1
      current = current.parent
    end
    depth
  end

  # Check if a node is in a whitespace-sensitive context
  #
  # HTML elements where whitespace is significant: <pre>, <code>,
  # <textarea>, <script>, <style>. The node itself and every ancestor
  # reachable through #parent are checked.
  #
  # @param node [TreeNode, nil] Node to check
  # @return [Boolean] True if node is in whitespace-sensitive context
  def whitespace_sensitive?(node)
    current = node
    while current
      if current.respond_to?(:label) &&
          WHITESPACE_SENSITIVE_TAGS.include?(current.label.to_s.downcase)
        return true
      end

      # Stop climbing when the object has no notion of a parent; otherwise
      # the loop would never advance.
      current = current.respond_to?(:parent) ? current.parent : nil
    end

    false
  end
end
|
|
624
|
+
end
|
|
625
|
+
end
|
|
626
|
+
end
|