canon 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +163 -67
- data/README.adoc +400 -7
- data/docs/Gemfile +9 -0
- data/docs/INDEX.adoc +99 -182
- data/docs/_config.yml +100 -0
- data/docs/advanced/diff-classification.adoc +547 -0
- data/docs/advanced/diff-pipeline.adoc +358 -0
- data/docs/advanced/index.adoc +214 -0
- data/docs/advanced/semantic-diff-report.adoc +390 -0
- data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
- data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
- data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
- data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
- data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
- data/docs/features/diff-formatting/display-filtering.adoc +472 -0
- data/docs/features/diff-formatting/index.adoc +140 -0
- data/docs/features/environment-configuration/index.adoc +327 -0
- data/docs/features/environment-configuration/override-system.adoc +436 -0
- data/docs/features/environment-configuration/size-limits.adoc +273 -0
- data/docs/features/index.adoc +173 -0
- data/docs/features/input-validation/index.adoc +521 -0
- data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
- data/docs/features/match-options/html-policies.adoc +312 -0
- data/docs/features/match-options/index.adoc +621 -0
- data/docs/getting-started/index.adoc +83 -0
- data/docs/getting-started/quick-start.adoc +76 -0
- data/docs/guides/choosing-configuration.adoc +689 -0
- data/docs/guides/index.adoc +181 -0
- data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
- data/docs/interfaces/index.adoc +101 -0
- data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
- data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
- data/docs/lychee.toml +65 -0
- data/docs/reference/cli-options.adoc +418 -0
- data/docs/reference/environment-variables.adoc +375 -0
- data/docs/reference/index.adoc +204 -0
- data/docs/reference/options-across-interfaces.adoc +417 -0
- data/docs/understanding/algorithms/dom-diff.adoc +389 -0
- data/docs/understanding/algorithms/index.adoc +314 -0
- data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
- data/docs/understanding/architecture.adoc +447 -0
- data/docs/understanding/comparison-pipeline.adoc +317 -0
- data/docs/understanding/formats/html.adoc +380 -0
- data/docs/understanding/formats/index.adoc +261 -0
- data/docs/understanding/formats/json.adoc +390 -0
- data/docs/understanding/formats/xml.adoc +366 -0
- data/docs/understanding/formats/yaml.adoc +504 -0
- data/docs/understanding/index.adoc +130 -0
- data/lib/canon/cli.rb +42 -1
- data/lib/canon/commands/diff_command.rb +108 -23
- data/lib/canon/comparison/compare_profile.rb +101 -0
- data/lib/canon/comparison/comparison_result.rb +41 -2
- data/lib/canon/comparison/html_comparator.rb +292 -71
- data/lib/canon/comparison/html_compare_profile.rb +117 -0
- data/lib/canon/comparison/match_options.rb +42 -4
- data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
- data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
- data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
- data/lib/canon/comparison/xml_comparator.rb +695 -91
- data/lib/canon/comparison.rb +207 -2
- data/lib/canon/config/env_provider.rb +71 -0
- data/lib/canon/config/env_schema.rb +58 -0
- data/lib/canon/config/override_resolver.rb +55 -0
- data/lib/canon/config/type_converter.rb +59 -0
- data/lib/canon/config.rb +158 -29
- data/lib/canon/data_model.rb +29 -0
- data/lib/canon/diff/diff_classifier.rb +74 -14
- data/lib/canon/diff/diff_context_builder.rb +41 -0
- data/lib/canon/diff/diff_line.rb +18 -2
- data/lib/canon/diff/diff_node.rb +18 -3
- data/lib/canon/diff/diff_node_mapper.rb +71 -12
- data/lib/canon/diff/formatting_detector.rb +53 -0
- data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
- data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
- data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
- data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
- data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
- data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
- data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
- data/lib/canon/diff_formatter/debug_output.rb +7 -1
- data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
- data/lib/canon/diff_formatter/legend.rb +42 -0
- data/lib/canon/diff_formatter.rb +78 -9
- data/lib/canon/errors.rb +56 -0
- data/lib/canon/formatters/html_formatter_base.rb +35 -1
- data/lib/canon/formatters/json_formatter.rb +3 -0
- data/lib/canon/formatters/yaml_formatter.rb +3 -0
- data/lib/canon/html/data_model.rb +229 -0
- data/lib/canon/html.rb +9 -0
- data/lib/canon/options/cli_generator.rb +70 -0
- data/lib/canon/options/registry.rb +234 -0
- data/lib/canon/rspec_matchers.rb +34 -13
- data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
- data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
- data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
- data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
- data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
- data/lib/canon/tree_diff/core/matching.rb +241 -0
- data/lib/canon/tree_diff/core/node_signature.rb +164 -0
- data/lib/canon/tree_diff/core/node_weight.rb +135 -0
- data/lib/canon/tree_diff/core/tree_node.rb +450 -0
- data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
- data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
- data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
- data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
- data/lib/canon/tree_diff/operation_converter.rb +631 -0
- data/lib/canon/tree_diff/operations/operation.rb +92 -0
- data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
- data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
- data/lib/canon/tree_diff.rb +33 -0
- data/lib/canon/validators/json_validator.rb +3 -1
- data/lib/canon/validators/yaml_validator.rb +3 -1
- data/lib/canon/version.rb +1 -1
- data/lib/canon/xml/data_model.rb +22 -23
- data/lib/canon/xml/element_matcher.rb +128 -20
- data/lib/canon/xml/namespace_helper.rb +110 -0
- data/lib/canon.rb +3 -0
- metadata +81 -23
- data/_config.yml +0 -116
- data/docs/ADVANCED_TOPICS.adoc +0 -20
- data/docs/BASIC_USAGE.adoc +0 -16
- data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
- data/docs/DIFF_ARCHITECTURE.adoc +0 -435
- data/docs/DIFF_FORMATTING.adoc +0 -540
- data/docs/FORMATS.adoc +0 -447
- data/docs/INPUT_VALIDATION.adoc +0 -477
- data/docs/MATCH_ARCHITECTURE.adoc +0 -463
- data/docs/MATCH_OPTIONS.adoc +0 -719
- data/docs/MODES.adoc +0 -432
- data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
- data/docs/OPTIONS.adoc +0 -1387
- data/docs/PREPROCESSING.adoc +0 -491
- data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
- data/docs/UNDERSTANDING_CANON.adoc +0 -17
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../core/tree_node"
|
|
4
|
+
require_relative "../core/node_signature"
|
|
5
|
+
require_relative "../core/matching"
|
|
6
|
+
|
|
7
|
+
module Canon
  module TreeDiff
    module Matchers
      # SimilarityMatcher performs similarity-based matching
      #
      # Based on JATS-diff (2022) approach:
      # - Content similarity is delegated to TreeNode#similarity_to
      # - Configurable similarity threshold (default 0.95)
      # - Group candidates by signature for efficiency
      # - Extend matches for unmatched nodes
      #
      # Features:
      # - Handles text-centric documents
      # - Fuzzy matching for similar but not identical nodes
      # - Threshold-based filtering
      # - Efficient signature-based grouping
      class SimilarityMatcher
        # Element labels (compared case-insensitively) whose text must be
        # identical before two nodes may be paired: <pre>, <code>,
        # <textarea>, <script>, <style>.
        WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

        attr_reader :tree1, :tree2, :matching, :threshold

        # Initialize matcher with two trees and existing matching
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param matching [Core::Matching] Existing matching from previous phase
        # @param threshold [Float] Similarity threshold (0.0 to 1.0); a pair
        #   is only accepted when its similarity is strictly greater
        def initialize(tree1, tree2, matching, threshold: 0.95)
          @tree1 = tree1
          @tree2 = tree2
          @matching = matching
          @threshold = threshold
        end

        # Perform similarity-based matching
        #
        # Only nodes that the existing matching reports as unmatched are
        # considered, and nodes are only compared with nodes that share the
        # same loose signature.
        #
        # @return [Core::Matching] The matching passed to the constructor,
        #   updated in place
        def match
          all_nodes1 = collect_nodes(tree1)
          all_nodes2 = collect_nodes(tree2)

          groups1 = group_by_signature(@matching.unmatched1(all_nodes1))
          groups2 = group_by_signature(@matching.unmatched2(all_nodes2))

          groups2.each do |sig, nodes2|
            nodes1 = groups1[sig]
            next if nodes1.nil? || nodes1.empty?

            match_group(nodes1, nodes2)
          end

          @matching
        end

        private

        # Collect all nodes from a tree (root first, then its descendants)
        #
        # @param root [TreeNode] Root of tree
        # @return [Array<TreeNode>]
        def collect_nodes(root)
          [root].concat(root.descendants)
        end

        # Group nodes by signature
        #
        # For similarity matching, we use LOOSE signatures (requested with
        # include_attributes: false) so that nodes with different attributes
        # can still be compared for similarity. This allows matching nodes
        # like:
        #   <note id="A"> vs <note id="A" autonum="1">
        #
        # @param nodes [Array<TreeNode>] Nodes to group
        # @return [Hash<NodeSignature, Array<TreeNode>>]
        def group_by_signature(nodes)
          nodes.group_by do |node|
            Core::NodeSignature.for(node, include_attributes: false)
          end
        end

        # Match nodes within a signature group
        #
        # Every pair whose similarity exceeds the threshold is recorded, then
        # pairs are accepted greedily from most to least similar. Keeping all
        # qualifying pairs (rather than only each node's single best one)
        # lets a node fall back to its next-best partner when its preferred
        # partner is claimed by a stronger pair.
        #
        # @param nodes1 [Array<TreeNode>] Nodes from tree1
        # @param nodes2 [Array<TreeNode>] Nodes from tree2
        # @return [void]
        def match_group(nodes1, nodes2)
          candidates = []

          nodes2.each do |node2|
            next if @matching.matched2?(node2)

            nodes1.each do |node1|
              next if @matching.matched1?(node1)

              # Whitespace-sensitive elements must carry identical text;
              # never fuzzy-match <pre>, <code>, etc. with different content.
              if (whitespace_sensitive?(node1) || whitespace_sensitive?(node2)) &&
                  node1.value != node2.value
                next
              end

              similarity = node1.similarity_to(node2)
              candidates << [node1, node2, similarity] if similarity > @threshold
            end
          end

          # sort_by is not guaranteed to be stable, so the discovery index is
          # part of the key: equal similarities keep tree2/tree1 order.
          ranked = candidates.each_with_index.sort_by do |(_, _, sim), index|
            [-sim, index]
          end

          ranked.each do |(node1, node2, _sim), _index|
            # Either side may have been claimed by a higher-ranked pair.
            next if @matching.matched1?(node1) || @matching.matched2?(node2)

            @matching.add(node1, node2)
          end
        end

        # Check if a node is whitespace-sensitive
        #
        # A node qualifies when it responds to #label and its label,
        # lower-cased, is one of WHITESPACE_SENSITIVE_TAGS.
        #
        # @param node [TreeNode, nil] Node to check
        # @return [Boolean] True if node is whitespace-sensitive
        def whitespace_sensitive?(node)
          return false unless node
          return false unless node.respond_to?(:label)

          WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "../core/tree_node"
|
|
4
|
+
require_relative "../core/node_weight"
|
|
5
|
+
require_relative "../core/matching"
|
|
6
|
+
|
|
7
|
+
module Canon
  module TreeDiff
    module Matchers
      # StructuralPropagator extends matches using structural relationships
      #
      # Based on XyDiff/Cobena (2002, INRIA) propagation strategies:
      # - Bottom-up: Match parents of matched children
      # - Top-down: Match children of matched parents (lazy propagation)
      #
      # Propagation depth formula: 1 + floor(W / W₀)
      # where W = node weight, W₀ = base weight threshold
      #
      # Features:
      # - Conservative propagation (only when safe)
      # - Weight-based depth control
      # - Handles unique child labels
      # - Preserves matching constraints
      class StructuralPropagator
        attr_reader :tree1, :tree2, :matching

        # Base weight threshold for propagation depth
        BASE_WEIGHT_THRESHOLD = 10.0

        # Element labels (compared case-insensitively) whose text must be
        # identical before two nodes may be paired: <pre>, <code>,
        # <textarea>, <script>, <style>.
        WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

        # Initialize propagator with trees and existing matching
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param matching [Core::Matching] Existing matching
        def initialize(tree1, tree2, matching)
          @tree1 = tree1
          @tree2 = tree2
          @matching = matching
        end

        # Perform structural propagation: bottom-up first, then top-down
        #
        # @return [Core::Matching] The matching passed to the constructor,
        #   updated in place
        def propagate
          propagate_bottom_up
          propagate_top_down

          @matching
        end

        private

        # Bottom-up propagation: match parents of matched children
        #
        # Walks the currently matched pairs in reverse order and tries to
        # match each pair's parents.
        #
        # NOTE(review): only the pairs present when this method starts are
        # visited, so a parent matched here is not itself propagated further
        # up in this pass — confirm that single-level propagation is intended.
        def propagate_bottom_up
          # `reverse` returns a copy, so pairs added while iterating do not
          # affect the walk.
          @matching.to_a.reverse.each do |node1, node2|
            propagate_to_parent(node1, node2)
          end
        end

        # Try to match parents of a matched pair
        #
        # Does nothing when either parent is missing, already matched, or
        # the parents are not compatible.
        #
        # @param node1 [TreeNode] Node from tree1
        # @param node2 [TreeNode] Node from tree2
        def propagate_to_parent(node1, node2)
          parent1 = node1.parent
          parent2 = node2.parent

          return unless parent1 && parent2
          return if @matching.matched1?(parent1)
          return if @matching.matched2?(parent2)
          return unless parents_compatible?(parent1, parent2)

          # NOTE(review): with a non-negative weight the depth is always >= 1,
          # so this gate presumably only rejects negative weights — confirm.
          depth = propagation_depth(Core::NodeWeight.for(parent1).value)
          return if depth < 1

          @matching.add(parent1, parent2)
        end

        # Check if two parent nodes are compatible for matching
        #
        # Parents are compatible if:
        # - Same label
        # - Attribute similarity (1.0 - attribute_difference) of at least 0.5
        # - Matched children align properly
        #
        # @param parent1 [TreeNode] Parent from tree1
        # @param parent2 [TreeNode] Parent from tree2
        # @return [Boolean]
        def parents_compatible?(parent1, parent2)
          return false unless parent1.label == parent2.label

          attr_sim = 1.0 - parent1.attribute_difference(parent2)
          return false if attr_sim < 0.5

          matched_children_align?(parent1, parent2)
        end

        # Check if matched children of two parents align
        #
        # True when parent1 has at least one matched child and every matched
        # child of parent1 is matched to a child of parent2.
        #
        # @param parent1 [TreeNode] Parent from tree1
        # @param parent2 [TreeNode] Parent from tree2
        # @return [Boolean]
        def matched_children_align?(parent1, parent2)
          matched1 = parent1.children.select { |c| @matching.matched1?(c) }
          return false if matched1.empty?

          children2 = parent2.children
          matched1.all? do |child1|
            children2.include?(@matching.match_for1(child1))
          end
        end

        # Top-down propagation: match children of matched parents
        #
        # If parents are matched and have unique corresponding children,
        # match those children too
        def propagate_top_down
          @matching.to_a.each do |node1, node2|
            propagate_to_children(node1, node2)
          end
        end

        # Try to match children of a matched pair
        #
        # @param node1 [TreeNode] Node from tree1
        # @param node2 [TreeNode] Node from tree2
        def propagate_to_children(node1, node2)
          unmatched1 = node1.children.reject { |c| @matching.matched1?(c) }
          unmatched2 = node2.children.reject { |c| @matching.matched2?(c) }

          return if unmatched1.empty? || unmatched2.empty?

          find_unique_matches(unmatched1, unmatched2)
        end

        # Find and match children with unique labels
        #
        # If a label appears exactly once in each parent's unmatched children,
        # match those children
        #
        # @param children1 [Array<TreeNode>] Unmatched children from tree1
        # @param children2 [Array<TreeNode>] Unmatched children from tree2
        def find_unique_matches(children1, children2)
          by_label1 = children1.group_by(&:label)
          by_label2 = children2.group_by(&:label)

          by_label1.each do |label, nodes1|
            next unless nodes1.size == 1

            nodes2 = by_label2[label]
            next unless nodes2 && nodes2.size == 1

            child1 = nodes1.first
            child2 = nodes2.first

            # Whitespace-sensitive elements must carry identical text;
            # never auto-match <pre>, <code>, etc. with different content.
            if (whitespace_sensitive?(child1) || whitespace_sensitive?(child2)) &&
                child1.value != child2.value
              next
            end

            depth = propagation_depth(Core::NodeWeight.for(child1).value)
            next if depth < 1

            @matching.add(child1, child2)
          end
        end

        # Check if a node is whitespace-sensitive
        #
        # A node qualifies when it responds to #label and its label,
        # lower-cased, is one of WHITESPACE_SENSITIVE_TAGS.
        #
        # @param node [TreeNode, nil] Node to check
        # @return [Boolean] True if node is whitespace-sensitive
        def whitespace_sensitive?(node)
          return false unless node
          return false unless node.respond_to?(:label)

          WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
        end

        # Calculate propagation depth based on node weight
        #
        # Formula: 1 + floor(W / W₀)
        # where W = node weight, W₀ = BASE_WEIGHT_THRESHOLD
        #
        # @param weight [Float] Node weight
        # @return [Integer] Propagation depth
        def propagation_depth(weight)
          1 + (weight / BASE_WEIGHT_THRESHOLD).floor
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Canon
  module TreeDiff
    module Matchers
      # UniversalMatcher orchestrates the complete matching process by combining
      # hash-based, similarity-based, and structural propagation matching strategies.
      #
      # This is the main entry point for tree matching and follows a multi-phase
      # pipeline approach:
      #
      # Phase 1: Hash Matching (XyDiff BULD)
      #   - Exact signature matching for identical subtrees
      #   - O(n log n) complexity via priority queue
      #   - Processes heaviest nodes first
      #
      # Phase 2: Similarity Matching (JATS-diff)
      #   - Content-based similarity via Jaccard index
      #   - Configurable threshold (default 0.95)
      #   - Groups by signature for efficiency
      #
      # Phase 3: Structural Propagation (XyDiff)
      #   - Bottom-up: match parents of matched children
      #   - Top-down: match children of matched parents
      #   - Adaptive propagation depth based on weight
      #
      # Each phase can be switched off through its enable_* option. After a
      # run, #statistics holds node counts, per-phase match counts, match
      # ratios and the list of phases that were executed.
      #
      # @example Basic usage
      #   matcher = UniversalMatcher.new
      #   matching = matcher.match(tree1, tree2)
      #   puts "Matched #{matching.size} nodes"
      #
      # @example With custom options
      #   matcher = UniversalMatcher.new(
      #     similarity_threshold: 0.9,
      #     enable_propagation: false
      #   )
      #   matching = matcher.match(tree1, tree2)
      #
      class UniversalMatcher
        # Default options for the matching process
        #
        # NOTE(review): :max_propagation_depth and :min_propagation_weight are
        # not read anywhere in this class, and StructuralPropagator is built
        # without them; the whole hash is only handed to HashMatcher — confirm
        # whether these two options have any effect.
        DEFAULT_OPTIONS = {
          # Minimum Jaccard similarity for content matching
          similarity_threshold: 0.95,

          # Enable hash-based exact matching
          enable_hash_matching: true,

          # Enable similarity-based matching
          enable_similarity_matching: true,

          # Enable structural propagation
          enable_propagation: true,

          # Maximum propagation depth (nil = adaptive)
          max_propagation_depth: nil,

          # Minimum weight for propagation
          min_propagation_weight: 2.0,
        }.freeze

        attr_reader :options, :statistics

        # Initialize a new UniversalMatcher
        #
        # Supplied options are merged over DEFAULT_OPTIONS.
        #
        # @param options [Hash] Configuration options
        # @option options [Float] :similarity_threshold (0.95)
        #   Minimum similarity for content matching
        # @option options [Boolean] :enable_hash_matching (true)
        #   Enable hash-based exact matching
        # @option options [Boolean] :enable_similarity_matching (true)
        #   Enable similarity-based matching
        # @option options [Boolean] :enable_propagation (true)
        #   Enable structural propagation
        # @option options [Integer, nil] :max_propagation_depth (nil)
        #   Maximum propagation depth (nil = adaptive)
        # @option options [Float] :min_propagation_weight (2.0)
        #   Minimum weight for propagation
        def initialize(options = {})
          @options = DEFAULT_OPTIONS.merge(options)
          @statistics = {}
        end

        # Match two trees and return a Matching object
        #
        # Statistics are reset at the start of every call, so #statistics
        # always describes the most recent run.
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @return [Matching] Matching object with all matched pairs
        def match(tree1, tree2)
          reset_statistics(tree1, tree2)

          matching = Core::Matching.new

          hash_matching_phase(tree1, tree2, matching) if @options[:enable_hash_matching]
          similarity_matching_phase(tree1, tree2, matching) if @options[:enable_similarity_matching]
          propagation_phase(tree1, tree2, matching) if @options[:enable_propagation]

          finalize_statistics(matching)
          matching
        end

        private

        # Reset statistics for a new matching process
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        def reset_statistics(tree1, tree2)
          @statistics = {
            tree1_nodes: count_nodes(tree1),
            tree2_nodes: count_nodes(tree2),
            hash_matches: 0,
            similarity_matches: 0,
            propagation_matches: 0,
            total_matches: 0,
            match_ratio_tree1: 0.0,
            match_ratio_tree2: 0.0,
            phases_executed: [],
          }
        end

        # Execute hash-based matching phase
        #
        # HashMatcher produces its own matching; its pairs are copied into
        # the main matching one by one.
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param matching [Matching] Matching object to update
        def hash_matching_phase(tree1, tree2, matching)
          @statistics[:phases_executed] << :hash_matching

          before_count = matching.size

          temp_matching = HashMatcher.new(tree1, tree2, @options).match
          temp_matching.pairs.each do |node1, node2|
            matching.add(node1, node2)
          end

          # Count what the main matching actually gained, the same way the
          # other phases do, instead of trusting the temporary matching's size.
          @statistics[:hash_matches] = matching.size - before_count
        end

        # Execute similarity-based matching phase
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param matching [Matching] Matching object to update
        def similarity_matching_phase(tree1, tree2, matching)
          @statistics[:phases_executed] << :similarity_matching

          before_count = matching.size

          SimilarityMatcher.new(
            tree1,
            tree2,
            matching,
            threshold: @options[:similarity_threshold],
          ).match

          @statistics[:similarity_matches] = matching.size - before_count
        end

        # Execute structural propagation phase
        #
        # @param tree1 [TreeNode] First tree root
        # @param tree2 [TreeNode] Second tree root
        # @param matching [Matching] Matching object to update
        def propagation_phase(tree1, tree2, matching)
          @statistics[:phases_executed] << :propagation

          before_count = matching.size

          StructuralPropagator.new(tree1, tree2, matching).propagate

          @statistics[:propagation_matches] = matching.size - before_count
        end

        # Finalize statistics after matching is complete
        #
        # Ratios are left at 0.0 when the corresponding node count is not
        # positive.
        #
        # @param matching [Matching] Final matching object
        def finalize_statistics(matching)
          total = matching.size
          @statistics[:total_matches] = total

          if @statistics[:tree1_nodes].positive?
            @statistics[:match_ratio_tree1] = total.to_f / @statistics[:tree1_nodes]
          end

          if @statistics[:tree2_nodes].positive?
            @statistics[:match_ratio_tree2] = total.to_f / @statistics[:tree2_nodes]
          end
        end

        # Count total nodes in a tree
        #
        # Uses an explicit work list instead of recursion so that very deep
        # trees cannot exhaust the call stack.
        #
        # @param node [TreeNode] Tree root
        # @return [Integer] Total node count (root included)
        def count_nodes(node)
          total = 0
          pending = [node]

          until pending.empty?
            current = pending.pop
            total += 1
            pending.concat(current.children)
          end

          total
        end
      end
    end
  end
end
|