canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,168 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../core/tree_node"
4
+ require_relative "../core/node_signature"
5
+ require_relative "../core/matching"
6
+
7
+ module Canon
8
+ module TreeDiff
9
+ module Matchers
10
+ # SimilarityMatcher performs similarity-based matching
11
+ #
12
+ # Based on JATS-diff (2022) approach:
13
+ # - Use Jaccard index for content similarity
14
+ # - Configurable similarity threshold (default 0.95)
15
+ # - Group candidates by signature for efficiency
16
+ # - Extend matches for unmatched nodes
17
+ #
18
+ # Features:
19
+ # - Handles text-centric documents
20
+ # - Fuzzy matching for similar but not identical nodes
21
+ # - Threshold-based filtering
22
+ # - Efficient signature-based grouping
23
class SimilarityMatcher
  # Lower-cased labels of HTML elements whose whitespace is significant
  # (<pre>, <code>, <textarea>, <script>, <style>). Kept as a frozen
  # constant so the list is not rebuilt for every candidate pair.
  WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

  attr_reader :tree1, :tree2, :matching, :threshold

  # Initialize matcher with two trees and an existing matching
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Core::Matching] Existing matching from previous phase;
  #   it is updated in place by {#match}
  # @param threshold [Float] Similarity threshold (0.0 to 1.0). A candidate
  #   must score strictly above this value to be paired.
  def initialize(tree1, tree2, matching, threshold: 0.95)
    @tree1 = tree1
    @tree2 = tree2
    @matching = matching
    @threshold = threshold
  end

  # Perform similarity-based matching on the nodes that are still
  # unmatched in both trees
  #
  # @return [Core::Matching] The matching passed to the constructor,
  #   extended with any newly found pairs
  def match
    all_nodes1 = collect_nodes(tree1)
    all_nodes2 = collect_nodes(tree2)

    unmatched1 = @matching.unmatched1(all_nodes1)
    unmatched2 = @matching.unmatched2(all_nodes2)

    # Only nodes sharing a (loose) signature are compared with each other
    groups1 = group_by_signature(unmatched1)
    groups2 = group_by_signature(unmatched2)

    groups2.each do |signature, nodes2|
      nodes1 = groups1[signature]
      next if nodes1.nil? || nodes1.empty?

      match_group(nodes1, nodes2)
    end

    @matching
  end

  private

  # Collect the root and all of its descendants
  #
  # @param root [TreeNode] Root of tree
  # @return [Array<TreeNode>] Root first, followed by root.descendants
  def collect_nodes(root)
    [root].concat(root.descendants)
  end

  # Group nodes by signature
  #
  # Signatures are requested with include_attributes: false so that nodes
  # with different attributes can still be compared, e.g.
  #   <note id="A"> vs <note id="A" autonum="1">
  #
  # @param nodes [Array<TreeNode>] Nodes to group
  # @return [Hash<NodeSignature, Array<TreeNode>>]
  def group_by_signature(nodes)
    nodes.group_by do |node|
      Core::NodeSignature.for(node, include_attributes: false)
    end
  end

  # Match nodes within a signature group
  #
  # For every unmatched tree2 node the best tree1 candidate is chosen;
  # the resulting pairs are then committed from highest to lowest
  # similarity, so a tree1 node claimed by a better pair is not reused.
  #
  # @param nodes1 [Array<TreeNode>] Nodes from tree1
  # @param nodes2 [Array<TreeNode>] Nodes from tree2
  def match_group(nodes1, nodes2)
    candidates = nodes2.each_with_object([]) do |node2, found|
      next if @matching.matched2?(node2)

      best_match, best_similarity = best_candidate(node2, nodes1)
      found << [best_match, node2, best_similarity] if best_match
    end

    # Highest similarity first
    candidates.sort_by! { |_, _, similarity| -similarity }

    candidates.each do |node1, node2, _similarity|
      # Either side may have been claimed by a higher-similarity pair
      next if @matching.matched1?(node1)
      next if @matching.matched2?(node2)

      @matching.add(node1, node2)
    end
  end

  # Find the unmatched tree1 node most similar to node2
  #
  # @param node2 [TreeNode] Node from tree2
  # @param nodes1 [Array<TreeNode>] Candidate nodes from tree1
  # @return [Array(TreeNode, Float), Array(nil, Float)] Best candidate and
  #   its similarity; the candidate is nil when nothing scores strictly
  #   above the threshold
  def best_candidate(node2, nodes1)
    best_match = nil
    best_similarity = @threshold
    # Does not depend on the tree1 candidate, so evaluate it once
    exact_text_required = whitespace_sensitive?(node2)

    nodes1.each do |node1|
      next if @matching.matched1?(node1)

      # Whitespace-sensitive elements (<pre>, <code>, ...) are never
      # fuzzy-matched: their values must be identical
      if (exact_text_required || whitespace_sensitive?(node1)) &&
          node1.value != node2.value
        next
      end

      similarity = node1.similarity_to(node2)
      next unless similarity > best_similarity

      best_similarity = similarity
      best_match = node1
    end

    [best_match, best_similarity]
  end

  # Check if a node is whitespace-sensitive
  #
  # @param node [TreeNode, nil] Node to check
  # @return [Boolean] True if the node's lower-cased label is one of
  #   WHITESPACE_SENSITIVE_TAGS; false for nil or label-less objects
  def whitespace_sensitive?(node)
    return false unless node.respond_to?(:label)

    WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
  end
end
166
+ end
167
+ end
168
+ end
@@ -0,0 +1,242 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../core/tree_node"
4
+ require_relative "../core/node_weight"
5
+ require_relative "../core/matching"
6
+
7
+ module Canon
8
+ module TreeDiff
9
+ module Matchers
10
+ # StructuralPropagator extends matches using structural relationships
11
+ #
12
+ # Based on XyDiff/Cobena (2002, INRIA) propagation strategies:
13
+ # - Bottom-up: Match parents of matched children
14
+ # - Top-down: Match children of matched parents (lazy propagation)
15
+ #
16
+ # Propagation depth formula: 1 + floor(W / W₀)
17
+ # where W = node weight, W₀ = base weight threshold
18
+ #
19
+ # Features:
20
+ # - Conservative propagation (only when safe)
21
+ # - Weight-based depth control
22
+ # - Handles unique child labels
23
+ # - Preserves matching constraints
24
class StructuralPropagator
  attr_reader :tree1, :tree2, :matching

  # Base weight threshold (W₀) for propagation depth
  BASE_WEIGHT_THRESHOLD = 10.0

  # Lower-cased labels of HTML elements whose whitespace is significant
  # (<pre>, <code>, <textarea>, <script>, <style>). Frozen constant so the
  # list is not rebuilt on every check.
  WHITESPACE_SENSITIVE_TAGS = %w[pre code textarea script style].freeze

  # Initialize propagator with trees and existing matching
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Core::Matching] Existing matching; updated in place
  def initialize(tree1, tree2, matching)
    @tree1 = tree1
    @tree2 = tree2
    @matching = matching
  end

  # Perform structural propagation: bottom-up first, then top-down
  #
  # @return [Core::Matching] The matching passed to the constructor
  def propagate
    propagate_bottom_up
    propagate_top_down

    @matching
  end

  private

  # Bottom-up propagation: match parents of matched children
  #
  # Works on a snapshot of the pairs taken before the pass starts and
  # walks it in reverse order; pairs added during the pass are not
  # revisited by this pass.
  def propagate_bottom_up
    @matching.to_a.reverse_each do |node1, node2|
      propagate_to_parent(node1, node2)
    end
  end

  # Try to match the parents of a matched pair
  #
  # @param node1 [TreeNode] Node from tree1
  # @param node2 [TreeNode] Node from tree2
  def propagate_to_parent(node1, node2)
    parent1 = node1.parent
    parent2 = node2.parent

    return unless parent1 && parent2
    return if @matching.matched1?(parent1)
    return if @matching.matched2?(parent2)
    return unless parents_compatible?(parent1, parent2)

    # NOTE(review): with depth = 1 + floor(W / W₀) this guard can only
    # trigger for negative weights; the depth is not otherwise used to
    # limit how far propagation climbs.
    depth = propagation_depth(Core::NodeWeight.for(parent1).value)
    return if depth < 1

    @matching.add(parent1, parent2)
  end

  # Check if two parent nodes are compatible for matching
  #
  # Parents are compatible if they have the same label, an attribute
  # similarity (1.0 - attribute_difference) of at least 0.5, and their
  # matched children align.
  #
  # @param parent1 [TreeNode] Parent from tree1
  # @param parent2 [TreeNode] Parent from tree2
  # @return [Boolean]
  def parents_compatible?(parent1, parent2)
    return false unless parent1.label == parent2.label

    attr_sim = 1.0 - parent1.attribute_difference(parent2)
    return false if attr_sim < 0.5

    matched_children_align?(parent1, parent2)
  end

  # Check if matched children of two parents align
  #
  # True when parent1 has at least one matched child and the partner of
  # every matched child of parent1 is a direct child of parent2.
  #
  # @param parent1 [TreeNode] Parent from tree1
  # @param parent2 [TreeNode] Parent from tree2
  # @return [Boolean]
  def matched_children_align?(parent1, parent2)
    matched1 = parent1.children.select { |c| @matching.matched1?(c) }
    return false if matched1.empty?

    matched1.all? do |child1|
      parent2.children.include?(@matching.match_for1(child1))
    end
  end

  # Top-down propagation: match children of matched parents
  #
  # Iterates over the pairs returned by @matching.to_a at the start of
  # the pass.
  def propagate_top_down
    @matching.to_a.each do |node1, node2|
      propagate_to_children(node1, node2)
    end
  end

  # Try to match the still-unmatched children of a matched pair
  #
  # @param node1 [TreeNode] Node from tree1
  # @param node2 [TreeNode] Node from tree2
  def propagate_to_children(node1, node2)
    unmatched1 = node1.children.reject { |c| @matching.matched1?(c) }
    unmatched2 = node2.children.reject { |c| @matching.matched2?(c) }

    return if unmatched1.empty? || unmatched2.empty?

    find_unique_matches(unmatched1, unmatched2)
  end

  # Find and match children with unique labels
  #
  # A label that occurs exactly once among each side's unmatched children
  # identifies a pair to match.
  #
  # @param children1 [Array<TreeNode>] Unmatched children from tree1
  # @param children2 [Array<TreeNode>] Unmatched children from tree2
  def find_unique_matches(children1, children2)
    by_label1 = children1.group_by(&:label)
    by_label2 = children2.group_by(&:label)

    by_label1.each do |label, nodes1|
      next unless nodes1.size == 1

      nodes2 = by_label2[label]
      next unless nodes2 && nodes2.size == 1

      child1 = nodes1.first
      child2 = nodes2.first

      # Whitespace-sensitive elements (<pre>, <code>, ...) are only
      # auto-matched when their values are identical
      if (whitespace_sensitive?(child1) || whitespace_sensitive?(child2)) &&
          child1.value != child2.value
        next
      end

      depth = propagation_depth(Core::NodeWeight.for(child1).value)
      next if depth < 1

      @matching.add(child1, child2)
    end
  end

  # Check if a node is whitespace-sensitive
  #
  # @param node [TreeNode, nil] Node to check
  # @return [Boolean] True if the node's lower-cased label is one of
  #   WHITESPACE_SENSITIVE_TAGS; false for nil or label-less objects
  def whitespace_sensitive?(node)
    return false unless node.respond_to?(:label)

    WHITESPACE_SENSITIVE_TAGS.include?(node.label.to_s.downcase)
  end

  # Calculate propagation depth based on node weight
  #
  # Formula: 1 + floor(W / W₀)
  # where W = node weight, W₀ = BASE_WEIGHT_THRESHOLD
  #
  # @param weight [Float] Node weight
  # @return [Integer] Propagation depth
  def propagation_depth(weight)
    1 + (weight / BASE_WEIGHT_THRESHOLD).floor
  end
end
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,220 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module TreeDiff
5
+ module Matchers
6
+ # UniversalMatcher orchestrates the complete matching process by combining
7
+ # hash-based, similarity-based, and structural propagation matching strategies.
8
+ #
9
+ # This is the main entry point for tree matching and follows a multi-phase
10
+ # pipeline approach:
11
+ #
12
+ # Phase 1: Hash Matching (XyDiff BULD)
13
+ # - Exact signature matching for identical subtrees
14
+ # - O(n log n) complexity via priority queue
15
+ # - Processes heaviest nodes first
16
+ #
17
+ # Phase 2: Similarity Matching (JATS-diff)
18
+ # - Content-based similarity via Jaccard index
19
+ # - Configurable threshold (default 0.95)
20
+ # - Groups by signature for efficiency
21
+ #
22
+ # Phase 3: Structural Propagation (XyDiff)
23
+ # - Bottom-up: match parents of matched children
24
+ # - Top-down: match children of matched parents
25
+ # - Adaptive propagation depth based on weight
26
+ #
27
+ # @example Basic usage
28
+ # matcher = UniversalMatcher.new
29
+ # matching = matcher.match(tree1, tree2)
30
+ # puts "Matched #{matching.size} nodes"
31
+ #
32
+ # @example With custom options
33
+ # matcher = UniversalMatcher.new(
34
+ # similarity_threshold: 0.9,
35
+ # enable_propagation: false
36
+ # )
37
+ # matching = matcher.match(tree1, tree2)
38
+ #
39
class UniversalMatcher
  # Default options for the matching process
  #
  # NOTE(review): :max_propagation_depth and :min_propagation_weight are
  # never read in this class and are not passed to StructuralPropagator;
  # the whole options hash is only handed to HashMatcher. Confirm whether
  # they are consumed elsewhere.
  DEFAULT_OPTIONS = {
    similarity_threshold: 0.95,       # passed to SimilarityMatcher as threshold
    enable_hash_matching: true,       # run phase 1
    enable_similarity_matching: true, # run phase 2
    enable_propagation: true,         # run phase 3
    max_propagation_depth: nil,       # nil = adaptive
    min_propagation_weight: 2.0,
  }.freeze

  attr_reader :options, :statistics

  # Build a matcher; caller-supplied options override DEFAULT_OPTIONS
  #
  # @param options [Hash] Configuration options
  # @option options [Float] :similarity_threshold (0.95)
  # @option options [Boolean] :enable_hash_matching (true)
  # @option options [Boolean] :enable_similarity_matching (true)
  # @option options [Boolean] :enable_propagation (true)
  # @option options [Integer, nil] :max_propagation_depth (nil)
  # @option options [Float] :min_propagation_weight (2.0)
  def initialize(options = {})
    @options = DEFAULT_OPTIONS.merge(options)
    @statistics = {}
  end

  # Run every enabled phase in order (hash, similarity, propagation) and
  # record statistics about the run
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @return [Matching] Matching object with all matched pairs
  def match(tree1, tree2)
    reset_statistics(tree1, tree2)

    result = Core::Matching.new

    hash_matching_phase(tree1, tree2, result) if @options[:enable_hash_matching]
    similarity_matching_phase(tree1, tree2, result) if @options[:enable_similarity_matching]
    propagation_phase(tree1, tree2, result) if @options[:enable_propagation]

    finalize_statistics(result)
    result
  end

  private

  # Start a fresh statistics hash with node counts and zeroed counters
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  def reset_statistics(tree1, tree2)
    size1 = count_nodes(tree1)
    size2 = count_nodes(tree2)

    @statistics = {
      tree1_nodes: size1,
      tree2_nodes: size2,
      hash_matches: 0,
      similarity_matches: 0,
      propagation_matches: 0,
      total_matches: 0,
      match_ratio_tree1: 0.0,
      match_ratio_tree2: 0.0,
      phases_executed: [],
    }
  end

  # Phase 1: run HashMatcher and copy its pairs into the shared matching
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Matching] Matching object to update
  def hash_matching_phase(tree1, tree2, matching)
    @statistics[:phases_executed] << :hash_matching

    exact = HashMatcher.new(tree1, tree2, @options).match
    exact.pairs.each { |left, right| matching.add(left, right) }

    @statistics[:hash_matches] = exact.size
  end

  # Phase 2: let SimilarityMatcher extend the shared matching in place
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Matching] Matching object to update
  def similarity_matching_phase(tree1, tree2, matching)
    @statistics[:phases_executed] << :similarity_matching

    size_before = matching.size
    cutoff = @options[:similarity_threshold]
    SimilarityMatcher.new(tree1, tree2, matching, threshold: cutoff).match

    @statistics[:similarity_matches] = matching.size - size_before
  end

  # Phase 3: let StructuralPropagator extend the shared matching in place
  #
  # @param tree1 [TreeNode] First tree root
  # @param tree2 [TreeNode] Second tree root
  # @param matching [Matching] Matching object to update
  def propagation_phase(tree1, tree2, matching)
    @statistics[:phases_executed] << :propagation

    size_before = matching.size
    StructuralPropagator.new(tree1, tree2, matching).propagate

    @statistics[:propagation_matches] = matching.size - size_before
  end

  # Record the total match count and per-tree match ratios
  #
  # A ratio is left at its initial 0.0 when the tree's node count is not
  # positive.
  #
  # @param matching [Matching] Final matching object
  def finalize_statistics(matching)
    total = matching.size
    @statistics[:total_matches] = total

    { tree1_nodes: :match_ratio_tree1,
      tree2_nodes: :match_ratio_tree2 }.each do |count_key, ratio_key|
      node_count = @statistics[count_key]
      next unless node_count.positive?

      @statistics[ratio_key] = total.to_f / node_count
    end
  end

  # Count total nodes in a tree (the root plus everything reachable
  # through #children), using an explicit stack instead of recursion
  #
  # @param node [TreeNode] Tree root
  # @return [Integer] Total node count
  def count_nodes(node)
    total = 0
    pending = [node]

    until pending.empty?
      current = pending.pop
      total += 1
      current.children.each { |child| pending << child }
    end

    total
  end
end
218
+ end
219
+ end
220
+ end