canon 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +163 -67
  3. data/README.adoc +400 -7
  4. data/docs/Gemfile +9 -0
  5. data/docs/INDEX.adoc +99 -182
  6. data/docs/_config.yml +100 -0
  7. data/docs/advanced/diff-classification.adoc +547 -0
  8. data/docs/advanced/diff-pipeline.adoc +358 -0
  9. data/docs/advanced/index.adoc +214 -0
  10. data/docs/advanced/semantic-diff-report.adoc +390 -0
  11. data/docs/{VERBOSE.adoc → advanced/verbose-mode-architecture.adoc} +51 -53
  12. data/docs/features/diff-formatting/algorithm-specific-output.adoc +533 -0
  13. data/docs/{CHARACTER_VISUALIZATION.adoc → features/diff-formatting/character-visualization.adoc} +23 -62
  14. data/docs/features/diff-formatting/colors-and-symbols.adoc +606 -0
  15. data/docs/features/diff-formatting/context-and-grouping.adoc +490 -0
  16. data/docs/features/diff-formatting/display-filtering.adoc +472 -0
  17. data/docs/features/diff-formatting/index.adoc +140 -0
  18. data/docs/features/environment-configuration/index.adoc +327 -0
  19. data/docs/features/environment-configuration/override-system.adoc +436 -0
  20. data/docs/features/environment-configuration/size-limits.adoc +273 -0
  21. data/docs/features/index.adoc +173 -0
  22. data/docs/features/input-validation/index.adoc +521 -0
  23. data/docs/features/match-options/algorithm-specific-behavior.adoc +365 -0
  24. data/docs/features/match-options/html-policies.adoc +312 -0
  25. data/docs/features/match-options/index.adoc +621 -0
  26. data/docs/getting-started/index.adoc +83 -0
  27. data/docs/getting-started/quick-start.adoc +76 -0
  28. data/docs/guides/choosing-configuration.adoc +689 -0
  29. data/docs/guides/index.adoc +181 -0
  30. data/docs/{CLI.adoc → interfaces/cli/index.adoc} +18 -13
  31. data/docs/interfaces/index.adoc +101 -0
  32. data/docs/{RSPEC.adoc → interfaces/rspec/index.adoc} +242 -31
  33. data/docs/{RUBY_API.adoc → interfaces/ruby-api/index.adoc} +118 -16
  34. data/docs/lychee.toml +65 -0
  35. data/docs/reference/cli-options.adoc +418 -0
  36. data/docs/reference/environment-variables.adoc +375 -0
  37. data/docs/reference/index.adoc +204 -0
  38. data/docs/reference/options-across-interfaces.adoc +417 -0
  39. data/docs/understanding/algorithms/dom-diff.adoc +389 -0
  40. data/docs/understanding/algorithms/index.adoc +314 -0
  41. data/docs/understanding/algorithms/semantic-tree-diff.adoc +533 -0
  42. data/docs/understanding/architecture.adoc +447 -0
  43. data/docs/understanding/comparison-pipeline.adoc +317 -0
  44. data/docs/understanding/formats/html.adoc +380 -0
  45. data/docs/understanding/formats/index.adoc +261 -0
  46. data/docs/understanding/formats/json.adoc +390 -0
  47. data/docs/understanding/formats/xml.adoc +366 -0
  48. data/docs/understanding/formats/yaml.adoc +504 -0
  49. data/docs/understanding/index.adoc +130 -0
  50. data/lib/canon/cli.rb +42 -1
  51. data/lib/canon/commands/diff_command.rb +108 -23
  52. data/lib/canon/comparison/compare_profile.rb +101 -0
  53. data/lib/canon/comparison/comparison_result.rb +41 -2
  54. data/lib/canon/comparison/html_comparator.rb +292 -71
  55. data/lib/canon/comparison/html_compare_profile.rb +117 -0
  56. data/lib/canon/comparison/match_options.rb +42 -4
  57. data/lib/canon/comparison/strategies/base_match_strategy.rb +99 -0
  58. data/lib/canon/comparison/strategies/match_strategy_factory.rb +74 -0
  59. data/lib/canon/comparison/strategies/semantic_tree_match_strategy.rb +220 -0
  60. data/lib/canon/comparison/xml_comparator.rb +695 -91
  61. data/lib/canon/comparison.rb +207 -2
  62. data/lib/canon/config/env_provider.rb +71 -0
  63. data/lib/canon/config/env_schema.rb +58 -0
  64. data/lib/canon/config/override_resolver.rb +55 -0
  65. data/lib/canon/config/type_converter.rb +59 -0
  66. data/lib/canon/config.rb +158 -29
  67. data/lib/canon/data_model.rb +29 -0
  68. data/lib/canon/diff/diff_classifier.rb +74 -14
  69. data/lib/canon/diff/diff_context_builder.rb +41 -0
  70. data/lib/canon/diff/diff_line.rb +18 -2
  71. data/lib/canon/diff/diff_node.rb +18 -3
  72. data/lib/canon/diff/diff_node_mapper.rb +71 -12
  73. data/lib/canon/diff/formatting_detector.rb +53 -0
  74. data/lib/canon/diff_formatter/by_line/base_formatter.rb +60 -5
  75. data/lib/canon/diff_formatter/by_line/html_formatter.rb +68 -16
  76. data/lib/canon/diff_formatter/by_line/json_formatter.rb +0 -37
  77. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +0 -42
  78. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +116 -31
  79. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +0 -37
  80. data/lib/canon/diff_formatter/by_object/base_formatter.rb +126 -19
  81. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +30 -1
  82. data/lib/canon/diff_formatter/debug_output.rb +7 -1
  83. data/lib/canon/diff_formatter/diff_detail_formatter.rb +674 -57
  84. data/lib/canon/diff_formatter/legend.rb +42 -0
  85. data/lib/canon/diff_formatter.rb +78 -9
  86. data/lib/canon/errors.rb +56 -0
  87. data/lib/canon/formatters/html_formatter_base.rb +35 -1
  88. data/lib/canon/formatters/json_formatter.rb +3 -0
  89. data/lib/canon/formatters/yaml_formatter.rb +3 -0
  90. data/lib/canon/html/data_model.rb +229 -0
  91. data/lib/canon/html.rb +9 -0
  92. data/lib/canon/options/cli_generator.rb +70 -0
  93. data/lib/canon/options/registry.rb +234 -0
  94. data/lib/canon/rspec_matchers.rb +34 -13
  95. data/lib/canon/tree_diff/adapters/html_adapter.rb +316 -0
  96. data/lib/canon/tree_diff/adapters/json_adapter.rb +204 -0
  97. data/lib/canon/tree_diff/adapters/xml_adapter.rb +285 -0
  98. data/lib/canon/tree_diff/adapters/yaml_adapter.rb +213 -0
  99. data/lib/canon/tree_diff/core/attribute_comparator.rb +84 -0
  100. data/lib/canon/tree_diff/core/matching.rb +241 -0
  101. data/lib/canon/tree_diff/core/node_signature.rb +164 -0
  102. data/lib/canon/tree_diff/core/node_weight.rb +135 -0
  103. data/lib/canon/tree_diff/core/tree_node.rb +450 -0
  104. data/lib/canon/tree_diff/matchers/hash_matcher.rb +258 -0
  105. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +168 -0
  106. data/lib/canon/tree_diff/matchers/structural_propagator.rb +242 -0
  107. data/lib/canon/tree_diff/matchers/universal_matcher.rb +220 -0
  108. data/lib/canon/tree_diff/operation_converter.rb +631 -0
  109. data/lib/canon/tree_diff/operations/operation.rb +92 -0
  110. data/lib/canon/tree_diff/operations/operation_detector.rb +626 -0
  111. data/lib/canon/tree_diff/tree_diff_integrator.rb +140 -0
  112. data/lib/canon/tree_diff.rb +33 -0
  113. data/lib/canon/validators/json_validator.rb +3 -1
  114. data/lib/canon/validators/yaml_validator.rb +3 -1
  115. data/lib/canon/version.rb +1 -1
  116. data/lib/canon/xml/data_model.rb +22 -23
  117. data/lib/canon/xml/element_matcher.rb +128 -20
  118. data/lib/canon/xml/namespace_helper.rb +110 -0
  119. data/lib/canon.rb +3 -0
  120. metadata +81 -23
  121. data/_config.yml +0 -116
  122. data/docs/ADVANCED_TOPICS.adoc +0 -20
  123. data/docs/BASIC_USAGE.adoc +0 -16
  124. data/docs/CUSTOMIZING_BEHAVIOR.adoc +0 -19
  125. data/docs/DIFF_ARCHITECTURE.adoc +0 -435
  126. data/docs/DIFF_FORMATTING.adoc +0 -540
  127. data/docs/FORMATS.adoc +0 -447
  128. data/docs/INPUT_VALIDATION.adoc +0 -477
  129. data/docs/MATCH_ARCHITECTURE.adoc +0 -463
  130. data/docs/MATCH_OPTIONS.adoc +0 -719
  131. data/docs/MODES.adoc +0 -432
  132. data/docs/NORMATIVE_INFORMATIVE_DIFFS.adoc +0 -219
  133. data/docs/OPTIONS.adoc +0 -1387
  134. data/docs/PREPROCESSING.adoc +0 -491
  135. data/docs/SEMANTIC_DIFF_REPORT.adoc +0 -528
  136. data/docs/UNDERSTANDING_CANON.adoc +0 -17
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module TreeDiff
5
+ module Operations
6
# Base class for all tree diff operations
#
# Represents a high-level semantic operation detected from tree matching.
# Each operation has a type (one of TYPES) and a Hash of free-form
# metadata built from the remaining keyword arguments.
#
# Equality (`==`), `eql?` and `hash` are all derived from the type and
# the metadata, so equal operations also collapse in `uniq`, Hash keys
# and Sets.
#
# @example
#   operation = Operation.new(
#     type: :insert,
#     node: new_node,
#     parent: parent_node,
#     position: 2
#   )
#
class Operation
  # Operation types based on XDiff and JATS-diff research
  TYPES = %i[
    insert
    delete
    update
    move
    merge
    split
    upgrade
    downgrade
  ].freeze

  attr_reader :type, :metadata

  # Initialize a new operation
  #
  # @param type [Symbol] Operation type (must be in TYPES)
  # @param metadata [Hash] Operation-specific metadata
  # @raise [ArgumentError] if type is not listed in TYPES
  def initialize(type:, **metadata)
    unless TYPES.include?(type)
      raise ArgumentError, "Invalid operation type: #{type}"
    end

    @type = type
    @metadata = metadata
  end

  # Check if operation is a specific type
  #
  # @param type [Symbol] Type to check
  # @return [Boolean]
  def type?(type)
    @type == type
  end

  # Get a metadata value
  #
  # @param key [Symbol] Metadata key
  # @return [Object, nil] Metadata value, nil when the key is absent
  def [](key)
    @metadata[key]
  end

  # Check if two operations are equal (same type and `==` metadata)
  #
  # @param other [Operation] Other operation
  # @return [Boolean]
  def ==(other)
    return false unless other.is_a?(Operation)

    type == other.type && metadata == other.metadata
  end

  # Strict equality used by Hash, Set and Array#uniq.
  #
  # Kept in step with #hash: both are computed from type and metadata.
  #
  # @param other [Object] Object to compare with
  # @return [Boolean]
  def eql?(other)
    other.is_a?(Operation) &&
      type.eql?(other.type) &&
      metadata.eql?(other.metadata)
  end

  # Hash code consistent with #eql?
  #
  # @return [Integer]
  def hash
    [Operation, type, metadata].hash
  end

  # String representation
  #
  # @return [String]
  def to_s
    "Operation(#{type})"
  end

  # Detailed string representation
  #
  # @return [String]
  def inspect
    metadata_str = @metadata.map do |k, v|
      "#{k}: #{v.inspect}"
    end.join(", ")
    "#<#{self.class.name} type=#{type} #{metadata_str}>"
  end
end
90
+ end
91
+ end
92
+ end
@@ -0,0 +1,626 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Canon
4
+ module TreeDiff
5
+ module Operations
6
+ # OperationDetector analyzes tree matching results to detect high-level
7
+ # semantic operations.
8
+ #
9
+ # Based on research from XDiff, XyDiff, and JATS-diff, this detector
10
+ # identifies operations in three levels:
11
+ #
12
+ # Level 1: Basic operations (INSERT, DELETE, UPDATE)
13
+ # Level 2: Structural operations (MOVE)
14
+ # Level 3: Semantic operations (MERGE, SPLIT, UPGRADE, DOWNGRADE)
15
+ #
16
+ # @example
17
+ # detector = OperationDetector.new(tree1, tree2, matching)
18
+ # operations = detector.detect
19
+ # operations.each { |op| puts op.inspect }
20
+ #
21
+ class OperationDetector
22
+ attr_reader :tree1, :tree2, :matching, :operations, :match_options
23
+
24
# Set up a detector for two trees and the matching computed between them.
#
# @param tree1 [TreeNode] First tree root
# @param tree2 [TreeNode] Second tree root
# @param matching [Matching] Matching between trees
# @param match_options [Hash] Match options for comparison
def initialize(tree1, tree2, matching, match_options = {})
  @operations = []
  # An explicit nil is treated the same as "no options".
  @match_options = match_options || {}
  @matching = matching
  @tree2 = tree2
  @tree1 = tree1
end
37
+
38
# Run every detection pass and return the accumulated operations.
#
# The list is reset on each call. Passes run in a fixed order because the
# level 3 passes inspect (and replace) operations produced by level 1.
#
# @return [Array<Operation>] Detected operations
def detect
  @operations = []

  # Level 1: basic operations
  %i[detect_inserts detect_deletes detect_updates].each { |pass| send(pass) }

  # Level 2: structural operations
  send(:detect_moves)

  # Level 3: semantic operations, derived from patterns in the above
  %i[
    detect_merges
    detect_splits
    detect_upgrades
    detect_downgrades
  ].each { |pass| send(pass) }

  @operations
end
61
+
62
+ private
63
+
64
# Detect INSERT operations (nodes in tree2 not matched in tree1)
#
# Only the topmost unmatched node of a subtree is reported; its unmatched
# descendants are skipped so they are not reported redundantly.
def detect_inserts
  collect_all_nodes(tree2).each do |added|
    next if @matching.matched2?(added)

    owner = added.parent
    parent_unmatched = owner && !@matching.matched2?(owner)
    next if parent_unmatched

    # A node without a parent is reported at position 0.
    index = owner ? owner.children.index(added) : 0

    @operations << Operation.new(
      type: :insert,
      node: added,
      parent: owner,
      position: index,
      path: added.xpath,
      content: extract_node_content(added),
    )
  end
end
89
+
90
# Detect DELETE operations (nodes in tree1 not matched in tree2)
#
# Only the topmost unmatched node of a subtree is reported; its unmatched
# descendants are skipped so they are not reported redundantly.
def detect_deletes
  collect_all_nodes(tree1).each do |removed|
    next if @matching.matched1?(removed)

    owner = removed.parent
    parent_unmatched = owner && !@matching.matched1?(owner)
    next if parent_unmatched

    # A node without a parent is reported at position 0.
    index = owner ? owner.children.index(removed) : 0

    @operations << Operation.new(
      type: :delete,
      node: removed,
      parent: owner,
      position: index,
      path: removed.xpath,
      content: extract_node_content(removed),
    )
  end
end
115
+
116
# Detect UPDATE operations (matched nodes with different content)
#
# Every matched pair is run through detect_changes; pairs for which it
# reports nothing produce no operation.
def detect_updates
  @matching.pairs.each do |before, after|
    diff = detect_changes(before, after)
    next if diff.empty?

    @operations << Operation.new(
      type: :update,
      node1: before,
      node2: after,
      changes: diff,
      path: after.xpath,
      old_content: extract_node_content(before),
      new_content: extract_node_content(after),
    )
  end
end
136
+
137
# Detect MOVE operations (nodes that moved in the tree structure)
#
# A matched pair counts as moved when moved? says so; the operation
# records both parents, both sibling positions and both paths.
def detect_moves
  @matching.pairs.each do |before, after|
    next unless moved?(before, after)

    source_parent = before.parent
    target_parent = after.parent

    @operations << Operation.new(
      type: :move,
      node1: before,
      node2: after,
      old_parent: source_parent,
      new_parent: target_parent,
      # Positions are nil when the node has no parent.
      old_position: source_parent&.children&.index(before),
      new_position: target_parent&.children&.index(after),
      old_path: before.xpath,
      new_path: after.xpath,
    )
  end
end
155
+
156
# Decide whether a matched node sits under a different parent in tree2.
#
# @param node1 [TreeNode] Node in tree1
# @param node2 [TreeNode] Node in tree2
# @return [Boolean]
def moved?(node1, node2)
  old_parent = node1.parent
  new_parent = node2.parent

  if old_parent.nil? || new_parent.nil?
    # Parentless on both sides: not moved. Parentless on one side only: moved.
    return !(old_parent.nil? && new_parent.nil?)
  end

  # Moved when the node's new parent is not the match of its old parent.
  @matching.match_for1(old_parent) != new_parent
end
173
+
174
+ # Check if two nodes are identical
175
+ #
176
+ # @param node1 [TreeNode] First node
177
+ # @param node2 [TreeNode] Second node
178
+ # @return [Boolean]
179
+ def nodes_identical?(node1, node2)
180
+ node1.label == node2.label &&
181
+ node1.value == node2.value &&
182
+ node1.attributes == node2.attributes
183
+ end
184
+
185
# Detect specific changes between two nodes
#
# The result may contain these keys, each mapping to an { old:, new: } Hash:
#   :label           - labels differ
#   :value           - text is not equivalent per text_equivalent?
#   :attributes      - attribute names/values differ (order ignored)
#   :attribute_order - same attribute names in a different order; only
#                      reported when match option :attribute_order is :strict
#
# @param node1 [TreeNode] Original node
# @param node2 [TreeNode] Modified node
# @return [Hash] Hash of changes (empty when nothing differs)
def detect_changes(node1, node2)
  changes = {}

  if node1.label != node2.label
    changes[:label] = { old: node1.label, new: node2.label }
  end

  # Text is compared through text_equivalent? so match_options apply.
  unless text_equivalent?(node1, node2)
    changes[:value] = { old: node1.value, new: node2.value }
  end

  attrs1 = node1.attributes
  attrs2 = node2.attributes

  # Hash#== ignores insertion order, so no sorting is needed. Sorting the
  # pairs also raised ArgumentError for keys that cannot be compared with
  # each other (e.g. a mix of Strings and Symbols).
  if attrs1 != attrs2
    changes[:attributes] = { old: attrs1, new: attrs2 }
  end

  # Order differences are independent of value differences and only
  # matter in :strict mode. Hash keys are unique, so "same size and no
  # key missing" means both nodes carry the same set of attribute names.
  attribute_order_mode = @match_options[:attribute_order] || :ignore
  if attribute_order_mode == :strict
    keys1 = attrs1.keys
    keys2 = attrs2.keys
    same_names = keys1.size == keys2.size && (keys1 - keys2).empty?

    if same_names && keys1 != keys2
      changes[:attribute_order] = { old: keys1, new: keys2 }
    end
  end

  changes
end
234
+
235
# Decide whether the text values of two nodes count as equal under the
# configured match options.
#
# Rules, in order:
# 1. nil/empty on both sides is equivalent; on one side only it is not.
# 2. If either node is in a whitespace-sensitive context, compare exactly.
# 3. Text that normalizes to empty on both sides is equivalent.
# 4. Otherwise :text_content => :strict compares exactly; any other mode
#    (default :normalize) compares the normalized text.
#
# @param node1 [TreeNode] First node
# @param node2 [TreeNode] Second node
# @return [Boolean] True if text values are equivalent
def text_equivalent?(node1, node2)
  left = node1.value
  right = node2.value

  left_blank = left.nil? || left.empty?
  right_blank = right.nil? || right.empty?
  return left_blank && right_blank if left_blank || right_blank

  # Whitespace is significant here, so no normalization is applied.
  if whitespace_sensitive?(node1) || whitespace_sensitive?(node2)
    return left == right
  end

  left_norm = normalize_text(left)
  right_norm = normalize_text(right)

  # Whitespace-only text on both sides is treated as equivalent.
  return true if left_norm.empty? && right_norm.empty?

  mode = @match_options[:text_content] || :normalize
  mode == :strict ? left == right : left_norm == right_norm
end
278
+
279
# Normalize text for comparison
#
# Every run of whitespace (newlines included) becomes one space and the
# result is stripped at both ends. nil and "" both normalize to "".
#
# @param text [String, nil] Text to normalize
# @return [String] Normalized text
def normalize_text(text)
  blank = text.nil? || text.empty?
  blank ? "" : text.gsub(/\s+/, " ").strip
end
293
+
294
# Flatten a tree into a list, depth-first with each node ahead of its
# descendants.
#
# @param node [TreeNode] Root node
# @return [Array<TreeNode>] All nodes
def collect_all_nodes(node)
  node.children.each_with_object([node]) do |child, collected|
    collected.concat(collect_all_nodes(child))
  end
end
305
+
306
# Detect MERGE operations
# Pattern: Multiple sibling nodes in tree1 combined into one node in tree2
# (n-1) × DELETE + 1 × UPDATE with content similarity
#
# For each parent with at least two deleted children, the first UPDATE
# under the matched parent that content_merged? accepts becomes the merge
# target. The deletes and that update are replaced by a single :merge
# operation. Each group of deletes yields at most one merge.
def detect_merges
  deletes = @operations.select { |op| op.type == :delete }
  updates = @operations.select { |op| op.type == :update }

  deletes.group_by { |op| op[:parent] }.each do |parent1, del_ops|
    next if del_ops.size < 2 # Need at least 2 deletes for merge

    parent2 = @matching.match_for1(parent1)
    next unless parent2

    source_nodes = del_ops.map { |op| op[:node] }

    # Stop at the first acceptable target. Scanning on would emit several
    # merges for the same deletes and swallow unrelated updates.
    target_op = updates.find do |update_op|
      update_op[:node2].parent == parent2 &&
        content_merged?(source_nodes, update_op[:node1], update_op[:node2])
    end
    next unless target_op

    # Remove the component operations
    consumed = del_ops + [target_op]
    @operations.delete_if { |op| consumed.include?(op) }

    @operations << Operation.new(
      type: :merge,
      source_nodes: source_nodes,
      target_node: target_op[:node2],
      merged_from: source_nodes.map(&:label),
    )
  end
end
347
+
348
# Detect SPLIT operations
# Pattern: One node in tree1 split into multiple nodes in tree2
# 1 × DELETE + n × INSERT with content similarity
#
# For every DELETE whose parent has a match in tree2, all INSERTs under
# that matched parent are the candidate split targets. When content_split?
# accepts them, the delete and those inserts are replaced by one :split
# operation. A group of inserts is used for at most one split.
def detect_splits
  deletes = @operations.select { |op| op.type == :delete }
  inserts = @operations.select { |op| op.type == :insert }

  # Group inserts by parent
  inserts_by_parent = inserts.group_by { |op| op[:parent] }

  deletes.each do |delete_op|
    node1 = delete_op[:node]
    parent1 = delete_op[:parent]
    parent2 = parent1 && @matching.match_for1(parent1)
    next unless parent2

    # Find inserts with the same parent in tree2
    candidate_inserts = inserts_by_parent[parent2] || []
    next if candidate_inserts.size < 2 # Need at least 2 inserts for split

    target_nodes = candidate_inserts.map { |op| op[:node] }
    next unless content_split?(node1, target_nodes)

    # Remove the component operations
    @operations.delete(delete_op)
    @operations.delete_if { |op| candidate_inserts.include?(op) }

    # These inserts now belong to this split. Forget them so a later
    # delete under the same parent cannot claim them a second time.
    inserts_by_parent.delete(parent2)

    @operations << Operation.new(
      type: :split,
      source_node: node1,
      target_nodes: target_nodes,
      split_into: target_nodes.map(&:label),
    )
  end
end
385
+
386
# Detect UPGRADE operations
# Pattern: Node moved to shallower depth (promoted in hierarchy)
# DELETE + INSERT at shallower depth with similar content
#
# Each DELETE is paired with the first still-unused INSERT that is
# shallower and accepted by nodes_similar_for_hierarchy_change?. The pair
# is replaced by one :upgrade operation. A delete or an insert takes part
# in at most one upgrade.
def detect_upgrades
  deletes = @operations.select { |op| op.type == :delete }
  # Working list; an insert is removed from it once it has been paired.
  available_inserts = @operations.select { |op| op.type == :insert }

  deletes.each do |delete_op|
    node1 = delete_op[:node]
    depth1 = calculate_depth(node1)

    # Upgrade means shallower depth (smaller number)
    insert_op = available_inserts.find do |candidate|
      calculate_depth(candidate[:node]) < depth1 &&
        nodes_similar_for_hierarchy_change?(node1, candidate[:node])
    end
    next unless insert_op

    node2 = insert_op[:node]
    depth2 = calculate_depth(node2)

    # Remove the component operations
    available_inserts.delete_if { |op| op.equal?(insert_op) }
    @operations.delete(delete_op)
    @operations.delete(insert_op)

    @operations << Operation.new(
      type: :upgrade,
      node1: node1,
      node2: node2,
      from_depth: depth1,
      to_depth: depth2,
      promoted_by: depth1 - depth2,
    )
  end
end
423
+
424
# Detect DOWNGRADE operations
# Pattern: Node moved to deeper depth (demoted in hierarchy)
# DELETE + INSERT at deeper depth with similar content
#
# Each DELETE is paired with the first still-unused INSERT that is deeper
# and accepted by nodes_similar_for_hierarchy_change?. The pair is
# replaced by one :downgrade operation. A delete or an insert takes part
# in at most one downgrade.
def detect_downgrades
  deletes = @operations.select { |op| op.type == :delete }
  # Working list; an insert is removed from it once it has been paired.
  available_inserts = @operations.select { |op| op.type == :insert }

  deletes.each do |delete_op|
    node1 = delete_op[:node]
    depth1 = calculate_depth(node1)

    # Downgrade means deeper depth (larger number)
    insert_op = available_inserts.find do |candidate|
      calculate_depth(candidate[:node]) > depth1 &&
        nodes_similar_for_hierarchy_change?(node1, candidate[:node])
    end
    next unless insert_op

    node2 = insert_op[:node]
    depth2 = calculate_depth(node2)

    # Remove the component operations
    available_inserts.delete_if { |op| op.equal?(insert_op) }
    @operations.delete(delete_op)
    @operations.delete(insert_op)

    @operations << Operation.new(
      type: :downgrade,
      node1: node1,
      node2: node2,
      from_depth: depth1,
      to_depth: depth2,
      demoted_by: depth2 - depth1,
    )
  end
end
461
+
462
# Check if content from multiple nodes was merged into target
#
# The text of the source nodes plus the target's original text is
# compared with the target's new text; a similarity of at least 0.8
# counts as a merge. When the source nodes carry no text, or the merged
# target has none, nothing can have been merged and the result is false.
#
# @param source_nodes [Array<TreeNode>] Source nodes
# @param original_target [TreeNode] Original target node in tree1
# @param merged_target [TreeNode] Merged target node in tree2
# @return [Boolean]
def content_merged?(source_nodes, original_target, merged_target)
  # Collect all text content
  source_text = source_nodes.map do |n|
    extract_text_content(n)
  end.join(" ")
  original_text = extract_text_content(original_target)
  merged_text = extract_text_content(merged_target)

  return false if merged_text.empty?

  # Without source text the comparison degenerates to "original vs
  # merged", which would report any lightly edited sibling as a merge.
  return false if source_text.strip.empty?

  similarity = text_similarity("#{source_text} #{original_text}",
                               merged_text)
  similarity >= 0.8 # 80% similarity threshold for merge detection
end
483
+
484
# Decide whether the text of one node reappears spread over several nodes.
#
# The source text is compared with the space-joined text of the targets;
# a similarity of at least 0.8 counts as a split. Empty text on either
# side never counts.
#
# @param source_node [TreeNode] Source node
# @param target_nodes [Array<TreeNode>] Target nodes
# @return [Boolean]
def content_split?(source_node, target_nodes)
  whole = extract_text_content(source_node)
  pieces = target_nodes.map { |target| extract_text_content(target) }
  combined = pieces.join(" ")

  return false if [whole, combined].any?(&:empty?)

  # 80% similarity threshold for split detection
  text_similarity(whole, combined) >= 0.8
end
500
+
501
# Decide whether two nodes look like the same element at another depth.
#
# Labels must be equal. Then either both nodes have no text, or both have
# text whose similarity is at least 0.9.
#
# @param node1 [TreeNode] First node
# @param node2 [TreeNode] Second node
# @return [Boolean]
def nodes_similar_for_hierarchy_change?(node1, node2)
  return false unless node1.label == node2.label

  first = extract_text_content(node1)
  second = extract_text_content(node2)

  if first.empty? || second.empty?
    # Two empty nodes are similar; empty against non-empty is not.
    return first.empty? && second.empty?
  end

  # 90% similarity for hierarchy changes
  text_similarity(first, second) >= 0.9
end
520
+
521
# Gather the text of a node and, recursively, of all its descendants.
#
# The node's own non-empty value comes first, followed by one entry per
# child (an empty entry for a child without text); entries are joined
# with single spaces and the result is stripped.
#
# @param node [TreeNode] Node to extract from
# @return [String] Combined text content
def extract_text_content(node)
  own_text = node.value
  pieces = own_text && !own_text.empty? ? [own_text] : []
  pieces.concat(node.children.map { |child| extract_text_content(child) })
  pieces.join(" ").strip
end
535
+
536
# Build a short one-line summary of a node for display.
#
# Format: "<label>", then "[k=\"v\" ...]" when attributes exist, then
# either the quoted value (cut to 48 characters plus "..." when longer
# than 50) or "(N children)" when there is no value but there are children.
#
# @param node [TreeNode] Node to extract from
# @return [String] Content summary
def extract_node_content(node)
  summary = ["<#{node.label}>"]

  unless node.attributes.empty?
    pairs = node.attributes.map { |name, val| "#{name}=\"#{val}\"" }
    summary << "[#{pairs.join(' ')}]"
  end

  text = node.value
  if text && !text.empty?
    shown = text
    # Truncate long values
    shown = "#{text[0..47]}..." if text.length > 50
    summary << "\"#{shown}\""
  elsif !node.children.empty?
    summary << "(#{node.children.size} children)"
  end

  summary.join(" ")
end
563
+
564
# Calculate text similarity using Jaccard index
#
# Both texts are lower-cased and split on whitespace; the score is the
# number of distinct shared tokens divided by the number of distinct
# tokens overall. Leading and trailing whitespace is ignored, so it can
# never add an empty token that skews the score.
#
# @param text1 [String] First text
# @param text2 [String] Second text
# @return [Float] Similarity score (0.0 to 1.0); 0.0 when either text has
#   no tokens
def text_similarity(text1, text2)
  # String#split without a pattern skips leading whitespace, unlike
  # split(/\s+/), which yields "" as the first token for " a b".
  tokens1 = text1.downcase.split
  tokens2 = text2.downcase.split

  return 0.0 if tokens1.empty? || tokens2.empty?

  intersection = (tokens1 & tokens2).size
  union = (tokens1 | tokens2).size

  intersection.to_f / union
end
581
+
582
# Count how many ancestors a node has.
#
# @param node [TreeNode] Node to calculate depth for
# @return [Integer] Depth (0 for root)
def calculate_depth(node)
  levels = 0
  ancestor = node.parent
  while ancestor
    levels += 1
    ancestor = ancestor.parent
  end
  levels
end
595
+
596
# Check if a node is in a whitespace-sensitive context
#
# HTML elements where whitespace is significant: <pre>, <code>, <textarea>, <script>, <style>
#
# The node itself and each of its ancestors are checked; labels are
# compared case-insensitively. The walk ends at the first object that has
# no parent or does not respond to `parent`.
#
# @param node [TreeNode, nil] Node to check
# @return [Boolean] True if node is in whitespace-sensitive context
def whitespace_sensitive?(node)
  # List of HTML elements where whitespace is semantically significant
  whitespace_sensitive_tags = %w[pre code textarea script style]

  current = node
  while current
    if current.respond_to?(:label) &&
        whitespace_sensitive_tags.include?(current.label.to_s.downcase)
      return true
    end

    # Without a `parent` method there is nowhere left to climb. Stopping
    # here is required: leaving `current` unchanged would loop forever.
    break unless current.respond_to?(:parent)

    current = current.parent
  end

  false
end
623
+ end
624
+ end
625
+ end
626
+ end