canon 0.1.21 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +43 -43
  3. data/README.adoc +8 -3
  4. data/docs/advanced/diff-pipeline.adoc +36 -9
  5. data/docs/features/diff-formatting/colors-and-symbols.adoc +82 -0
  6. data/docs/features/diff-formatting/index.adoc +12 -0
  7. data/docs/features/diff-formatting/themes.adoc +353 -0
  8. data/docs/features/environment-configuration/index.adoc +23 -0
  9. data/docs/internals/diff-char-range-pipeline.adoc +249 -0
  10. data/docs/internals/diffnode-enrichment.adoc +1 -0
  11. data/docs/internals/index.adoc +52 -4
  12. data/docs/reference/environment-variables.adoc +6 -0
  13. data/docs/understanding/architecture.adoc +5 -0
  14. data/examples/show_themes.rb +217 -0
  15. data/lib/canon/comparison/comparison_result.rb +9 -4
  16. data/lib/canon/config/env_schema.rb +3 -1
  17. data/lib/canon/config.rb +11 -0
  18. data/lib/canon/diff/diff_block.rb +7 -0
  19. data/lib/canon/diff/diff_block_builder.rb +2 -2
  20. data/lib/canon/diff/diff_char_range.rb +140 -0
  21. data/lib/canon/diff/diff_line.rb +42 -4
  22. data/lib/canon/diff/diff_line_builder.rb +907 -0
  23. data/lib/canon/diff/diff_node.rb +5 -1
  24. data/lib/canon/diff/diff_node_enricher.rb +1418 -0
  25. data/lib/canon/diff/diff_node_mapper.rb +54 -0
  26. data/lib/canon/diff/source_locator.rb +105 -0
  27. data/lib/canon/diff/text_decomposer.rb +103 -0
  28. data/lib/canon/diff_formatter/by_line/base_formatter.rb +264 -24
  29. data/lib/canon/diff_formatter/by_line/html_formatter.rb +35 -20
  30. data/lib/canon/diff_formatter/by_line/json_formatter.rb +36 -19
  31. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +33 -19
  32. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +583 -98
  33. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +36 -19
  34. data/lib/canon/diff_formatter/by_object/base_formatter.rb +62 -13
  35. data/lib/canon/diff_formatter/by_object/json_formatter.rb +59 -24
  36. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +74 -34
  37. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +4 -5
  38. data/lib/canon/diff_formatter/diff_detail_formatter.rb +1 -1
  39. data/lib/canon/diff_formatter/legend.rb +4 -2
  40. data/lib/canon/diff_formatter/theme.rb +857 -0
  41. data/lib/canon/diff_formatter.rb +11 -6
  42. data/lib/canon/tree_diff/matchers/hash_matcher.rb +15 -15
  43. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +10 -0
  44. data/lib/canon/tree_diff/operations/operation_detector.rb +5 -1
  45. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  46. data/lib/canon/version.rb +1 -1
  47. metadata +11 -2
@@ -0,0 +1,1418 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "diff_char_range"
4
+ require_relative "text_decomposer"
5
+ require_relative "source_locator"
6
+
7
+ module Canon
8
+ module Diff
9
+ # Enriches DiffNodes with character position data (DiffCharRanges).
10
+ #
11
+ # This is Phase 1 of the two-phase diff pipeline. It runs after comparison
12
+ # and before rendering. It CAN use string operations (including LCS) on
13
+ # serialized content to determine character-level change positions.
14
+ #
15
+ # The output is DiffNodes enriched with:
16
+ # - char_ranges: Array<DiffCharRange> mapping changes to specific line/columns
17
+ # - line_range_before: [start_line, end_line] in text1
18
+ # - line_range_after: [start_line, end_line] in text2
19
+ #
20
+ # Phase 2 (DiffLineBuilder) then assembles DiffLines from these enriched
21
+ # DiffNodes without any further computation.
22
+ class DiffNodeEnricher
23
+ # Enrich DiffNodes with character position data.
24
+ #
25
+ # @param diff_nodes [Array<DiffNode>] The semantic differences
26
+ # @param text1 [String] The first document (preprocessed)
27
+ # @param text2 [String] The second document (preprocessed)
28
+ # @return [Array<DiffNode>] The same DiffNodes, enriched in place
29
def self.build(diff_nodes, text1, text2)
  # Nothing to enrich when there are no nodes or either document is missing;
  # the caller gets its argument back untouched in that case.
  nothing_to_enrich = diff_nodes.nil? || diff_nodes.empty? ||
    text1.nil? || text2.nil?
  return diff_nodes if nothing_to_enrich

  enricher = new(diff_nodes, text1, text2)
  enricher.enrich
end
35
+
36
def initialize(diff_nodes, text1, text2)
  @diff_nodes = diff_nodes
  @text1, @text2 = text1, text2

  # Line offset maps for both documents, consumed by the locate helpers.
  @line_map1, @line_map2 = [text1, text2].map do |text|
    SourceLocator.build_line_map(text)
  end

  # Plain per-line views of both documents.
  @lines1, @lines2 = [text1, text2].map { |text| text.split("\n") }

  # Per-side occurrence counters (default 0) for the text_content dimension,
  # used to find the correct element instance.
  @text_occurrence1 = Hash.new(0)
  @text_occurrence2 = Hash.new(0)
end
48
+
49
def enrich
  # Nodes are enriched in place; the same collection object is handed back.
  @diff_nodes.each { |node| enrich_node(node) }
  @diff_nodes
end
55
+
56
+ private
57
+
58
+ # Enrich a single DiffNode with DiffCharRanges based on its dimension.
59
def enrich_node(diff_node)
  # Dimension -> handler lookup; anything not listed goes to the generic handler.
  handlers = {
    text_content: :enrich_text_content,
    attribute_values: :enrich_attribute_values,
    attribute_presence: :enrich_attribute_presence,
    attribute_order: :enrich_attribute_order,
    comments: :enrich_comments,
    structural_whitespace: :enrich_structural_whitespace,
    element_structure: :enrich_element_structure,
  }

  handler = handlers.fetch(diff_node.dimension, :enrich_generic)
  send(handler, diff_node)
end
79
+
80
+ # Text content change: decompose serialized_before/after into
81
+ # before-text, changed-text, after-text and map to DiffCharRanges.
82
def enrich_text_content(diff_node)
  old_text = diff_node.serialized_before
  new_text = diff_node.serialized_after
  return if old_text.nil? && new_text.nil?

  # Single factory so every range in this method is constructed the same way.
  range_for = lambda do |hit, from, to, side, status, role|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: from,
      end_col: to,
      side: side,
      status: status,
      role: role,
      diff_node: diff_node,
    )
  end

  # Pure insertion: text exists only on the new side.
  if old_text.nil?
    # Locators are tried in order. The last one deliberately searches text1:
    # when the leaf element is self-closing in text2 the tree-based lookups
    # fail, while the original document still has the element with content.
    hit = locate_at_element_index(new_text, @text2, @line_map2, diff_node.path) ||
      locate_via_parent_element(diff_node.path, @text2, @line_map2) ||
      locate_via_node_tree(diff_node.node2, new_text, @text2, @line_map2, :new) ||
      locate_via_parent_element(diff_node.path, @text1, @line_map1)
    return unless hit

    diff_node.char_ranges = [
      range_for.call(hit, hit[:col], hit[:col] + new_text.length,
                     :new, :added, :changed),
    ]
    diff_node.line_range_before = nil
    diff_node.line_range_after = [hit[:line_number], hit[:line_number]]
    return
  end

  # Pure deletion: text exists only on the old side.
  if new_text.nil?
    hit = locate_at_element_index(old_text, @text1, @line_map1, diff_node.path) ||
      locate_via_parent_element(diff_node.path, @text1, @line_map1) ||
      locate_via_node_tree(diff_node.node1, old_text, @text1, @line_map1, :old)
    return unless hit

    diff_node.char_ranges = [
      range_for.call(hit, hit[:col], hit[:col] + old_text.length,
                     :old, :removed, :changed),
    ]
    diff_node.line_range_before = [hit[:line_number], hit[:line_number]]
    diff_node.line_range_after = nil
    return
  end

  # Modification: both sides must be locatable, otherwise the node is left
  # without position data.
  old_hit = locate_at_element_index(old_text, @text1, @line_map1, diff_node.path)
  new_hit = locate_at_element_index(new_text, @text2, @line_map2, diff_node.path)
  return unless old_hit && new_hit

  parts = TextDecomposer.decompose(old_text, new_text)
  prefix = parts[:common_prefix]
  suffix = parts[:common_suffix]
  dropped = parts[:changed_old]
  inserted = parts[:changed_new]

  old_col = old_hit[:col]
  new_col = new_hit[:col]
  ranges = []

  # Unchanged leading text, recorded once per side.
  unless prefix.empty?
    ranges << range_for.call(old_hit, old_col, old_col + prefix.length,
                             :old, :unchanged, :before)
    ranges << range_for.call(new_hit, new_col, new_col + prefix.length,
                             :new, :unchanged, :before)
  end

  # The differing middle; either side may be empty.
  unless dropped.empty?
    from = old_col + prefix.length
    ranges << range_for.call(old_hit, from, from + dropped.length,
                             :old, :changed_old, :changed)
  end
  unless inserted.empty?
    from = new_col + prefix.length
    ranges << range_for.call(new_hit, from, from + inserted.length,
                             :new, :changed_new, :changed)
  end

  # Unchanged trailing text, positioned from the end of each side's content.
  unless suffix.empty?
    old_tail = old_col + old_text.length - suffix.length
    new_tail = new_col + new_text.length - suffix.length
    ranges << range_for.call(old_hit, old_tail, old_tail + suffix.length,
                             :old, :unchanged, :after)
    ranges << range_for.call(new_hit, new_tail, new_tail + suffix.length,
                             :new, :unchanged, :after)
  end

  diff_node.char_ranges = ranges

  # Multi-line content ("abc\ndef") extends the line span by one line per
  # embedded newline.
  old_first = old_hit[:line_number]
  new_first = new_hit[:line_number]
  diff_node.line_range_before = [old_first, old_first + old_text.count("\n")]
  diff_node.line_range_after = [new_first, new_first + new_text.count("\n")]
end
248
+
249
+ # Attribute value change: locate the specific attribute values in the text.
250
def enrich_attribute_values(diff_node)
  old_attrs = diff_node.attributes_before
  new_attrs = diff_node.attributes_after
  return unless old_attrs && new_attrs

  # Attribute names whose value differs between the two sides.
  differing = (old_attrs.keys + new_attrs.keys).uniq.select do |name|
    old_attrs[name] != new_attrs[name]
  end
  return if differing.empty?

  # Locate `key="value"` past the XML declaration (whose version/encoding
  # attributes could otherwise collide). If that fails and the element name
  # is known, retry with `name key="value"` and shift the column back onto
  # the attribute itself.
  find_attr = lambda do |pattern, text, line_map, tag|
    hit = SourceLocator.locate(pattern, text, line_map,
                               start_from: xml_declaration_end_offset(text))
    if hit.nil? && tag
      scoped_hit = SourceLocator.locate("#{tag} #{pattern}", text, line_map)
      hit = scoped_hit.merge(col: scoped_hit[:col] + tag.length + 1) if scoped_hit
    end
    hit
  end

  # Range covering only the value between the quotes.
  value_span = lambda do |hit, key, pattern, side, status|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: hit[:col] + key.length + 2,   # skip key="
      end_col: hit[:col] + pattern.length - 1, # stop before the closing "
      side: side,
      status: status,
      role: :changed,
      diff_node: diff_node,
    )
  end

  ranges = []
  first_old_line = nil
  first_new_line = nil

  differing.each do |key|
    old_value = old_attrs[key]
    new_value = new_attrs[key]

    # Old side: key="old_value" in text1.
    old_tag = diff_node.node1&.name
    if old_value
      pattern = build_attr_pattern(key, old_value)
      hit = find_attr.call(pattern, @text1, @line_map1, old_tag)
      if hit
        first_old_line ||= hit[:line_number]
        ranges << value_span.call(hit, key, pattern, :old, :changed_old)
      end
    end

    # New side: key="new_value" in text2.
    new_tag = diff_node.node2&.name
    if new_value
      pattern = build_attr_pattern(key, new_value)
      hit = find_attr.call(pattern, @text2, @line_map2, new_tag)
      if hit
        first_new_line ||= hit[:line_number]
        ranges << value_span.call(hit, key, pattern, :new, :changed_new)
      end
    end
  end

  diff_node.char_ranges = ranges
  diff_node.line_range_before = first_old_line ? [first_old_line, first_old_line] : nil
  diff_node.line_range_after = first_new_line ? [first_new_line, first_new_line] : nil
end
333
+
334
+ # Attribute presence change: find added/removed attributes.
335
def enrich_attribute_presence(diff_node)
  old_attrs = diff_node.attributes_before || {}
  new_attrs = diff_node.attributes_after || {}

  only_in_new = new_attrs.keys - old_attrs.keys
  only_in_old = old_attrs.keys - new_attrs.keys
  return if only_in_new.empty? && only_in_old.empty?

  ranges = []

  # Appends one range per locatable attribute (the whole `key="value"` span)
  # and returns the line of the first one found, or nil if none was located.
  mark = lambda do |names, attrs, text, line_map, side, status|
    first_line = nil
    names.each do |name|
      pattern = build_attr_pattern(name, attrs[name])
      hit = SourceLocator.locate(pattern, text, line_map,
                                 start_from: xml_declaration_end_offset(text))
      next unless hit

      first_line ||= hit[:line_number]
      ranges << DiffCharRange.new(
        line_number: hit[:line_number],
        start_col: hit[:col],
        end_col: hit[:col] + pattern.length,
        side: side,
        status: status,
        role: :changed,
        diff_node: diff_node,
      )
    end
    first_line
  end

  # Removed attributes live only in text1, added ones only in text2.
  old_line = mark.call(only_in_old, old_attrs, @text1, @line_map1, :old, :removed)
  new_line = mark.call(only_in_new, new_attrs, @text2, @line_map2, :new, :added)

  diff_node.char_ranges = ranges
  diff_node.line_range_before = old_line ? [old_line, old_line] : nil
  diff_node.line_range_after = new_line ? [new_line, new_line] : nil
end
394
+
395
+ # Attribute order change: highlight entire attribute sections as formatting.
396
def enrich_attribute_order(diff_node)
  old_text = diff_node.serialized_before
  new_text = diff_node.serialized_after

  old_hit = old_text ? SourceLocator.locate(old_text, @text1, @line_map1) : nil
  new_hit = new_text ? SourceLocator.locate(new_text, @text2, @line_map2) : nil

  # Each located side gets one range over its full serialized content, with
  # status :unchanged (this dimension is treated as formatting).
  located = [[old_hit, old_text, :old], [new_hit, new_text, :new]].select(&:first)
  diff_node.char_ranges = located.map do |hit, text, side|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: hit[:col],
      end_col: hit[:col] + text.length,
      side: side,
      status: :unchanged,
      role: :changed,
      diff_node: diff_node,
    )
  end

  diff_node.line_range_before = old_hit ? [old_hit[:line_number], old_hit[:line_number]] : nil
  diff_node.line_range_after = new_hit ? [new_hit[:line_number], new_hit[:line_number]] : nil
end
439
+
440
+ # Comment change: locate and decompose comment content.
441
def enrich_comments(diff_node)
  old_text = diff_node.serialized_before
  new_text = diff_node.serialized_after
  return if old_text.nil? && new_text.nil?

  # Shared constructor for every range produced below.
  range_for = lambda do |hit, from, to, side, status, role|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: from,
      end_col: to,
      side: side,
      status: status,
      role: role,
      diff_node: diff_node,
    )
  end

  # Comment added: only the new-side line range is assigned.
  if old_text.nil?
    hit = SourceLocator.locate(new_text, @text2, @line_map2)
    return unless hit

    diff_node.char_ranges = [
      range_for.call(hit, hit[:col], hit[:col] + new_text.length,
                     :new, :added, :changed),
    ]
    diff_node.line_range_after = [hit[:line_number], hit[:line_number]]
    return
  end

  # Comment removed: only the old-side line range is assigned.
  if new_text.nil?
    hit = SourceLocator.locate(old_text, @text1, @line_map1)
    return unless hit

    diff_node.char_ranges = [
      range_for.call(hit, hit[:col], hit[:col] + old_text.length,
                     :old, :removed, :changed),
    ]
    diff_node.line_range_before = [hit[:line_number], hit[:line_number]]
    return
  end

  # Comment modified: both sides are needed for a character-level
  # decomposition; otherwise defer to the generic handler.
  old_hit = SourceLocator.locate(old_text, @text1, @line_map1)
  new_hit = SourceLocator.locate(new_text, @text2, @line_map2)
  unless old_hit && new_hit
    enrich_generic(diff_node)
    return
  end

  parts = TextDecomposer.decompose(old_text, new_text)
  prefix = parts[:common_prefix]
  suffix = parts[:common_suffix]
  dropped = parts[:changed_old]
  inserted = parts[:changed_new]

  old_col = old_hit[:col]
  new_col = new_hit[:col]
  ranges = []

  # Unchanged leading text.
  unless prefix.empty?
    ranges << range_for.call(old_hit, old_col, old_col + prefix.length,
                             :old, :unchanged, :before)
    ranges << range_for.call(new_hit, new_col, new_col + prefix.length,
                             :new, :unchanged, :before)
  end

  # Differing middle; either side may be empty.
  unless dropped.empty?
    from = old_col + prefix.length
    ranges << range_for.call(old_hit, from, from + dropped.length,
                             :old, :changed_old, :changed)
  end
  unless inserted.empty?
    from = new_col + prefix.length
    ranges << range_for.call(new_hit, from, from + inserted.length,
                             :new, :changed_new, :changed)
  end

  # Unchanged trailing text, measured back from the end of each side.
  unless suffix.empty?
    old_tail = old_col + old_text.length - suffix.length
    new_tail = new_col + new_text.length - suffix.length
    ranges << range_for.call(old_hit, old_tail, old_tail + suffix.length,
                             :old, :unchanged, :after)
    ranges << range_for.call(new_hit, new_tail, new_tail + suffix.length,
                             :new, :unchanged, :after)
  end

  diff_node.char_ranges = ranges
  diff_node.line_range_before = [old_hit[:line_number], old_hit[:line_number]]
  diff_node.line_range_after = [new_hit[:line_number], new_hit[:line_number]]
end
559
+
560
+ # Structural whitespace: mark affected lines as formatting-only.
561
def enrich_structural_whitespace(diff_node)
  old_text = diff_node.serialized_before
  new_text = diff_node.serialized_after

  old_hit = (SourceLocator.locate(old_text, @text1, @line_map1) if old_text)
  new_hit = (SourceLocator.locate(new_text, @text2, @line_map2) if new_text)

  # One range per located side spanning the whole serialized content, with
  # status :unchanged so it reads as formatting rather than a content change.
  whole = lambda do |hit, text, side|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: hit[:col],
      end_col: hit[:col] + text.length,
      side: side,
      status: :unchanged,
      role: :changed,
      diff_node: diff_node,
    )
  end

  ranges = []
  ranges << whole.call(old_hit, old_text, :old) if old_hit && old_text
  ranges << whole.call(new_hit, new_text, :new) if new_hit && new_text

  diff_node.char_ranges = ranges
  diff_node.line_range_before = old_hit ? [old_hit[:line_number], old_hit[:line_number]] : nil
  diff_node.line_range_after = new_hit ? [new_hit[:line_number], new_hit[:line_number]] : nil
end
604
+
605
+ # Element structure change: full element deletion/insertion.
606
+ # Locate the entire element (opening tag through closing tag).
607
def enrich_element_structure(diff_node)
  old_markup = diff_node.serialized_before
  new_markup = diff_node.serialized_after
  node_path = diff_node.path
  return if old_markup.nil? && new_markup.nil?

  # Strings shorter than this match too many places in the document for a
  # plain search to be reliable, so the path-based lookup is used instead.
  shortest_searchable = 3

  find = lambda do |markup, text, line_map|
    if markup.length < shortest_searchable && node_path
      locate_via_parent_element(node_path, text, line_map)
    else
      SourceLocator.locate(markup, text, line_map)
    end
  end

  # Range over the full serialized element, starting at the located column.
  span = lambda do |hit, markup, side, status|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: hit[:col],
      end_col: hit[:col] + markup.length,
      side: side,
      status: status,
      role: :changed,
      diff_node: diff_node,
    )
  end

  # Element added: present only in text2.
  if old_markup.nil?
    hit = find.call(new_markup, @text2, @line_map2)
    if hit
      last_line = find_end_line(hit[:line_number], @line_map2, new_markup)
      diff_node.char_ranges = [span.call(hit, new_markup, :new, :added)]
      diff_node.line_range_before = nil
      diff_node.line_range_after = [hit[:line_number], last_line]
    else
      # Exact content not found: fall back to element-name line matching.
      fallback_element_structure_ranges(diff_node, nil, new_markup, :new)
    end
    return
  end

  # Element removed: present only in text1.
  if new_markup.nil?
    # Second attempt anchors on node1's parent element (for text nodes).
    hit = find.call(old_markup, @text1, @line_map1) ||
      locate_textnode_parent(diff_node.node1, old_markup, @text1, @line_map1)
    if hit
      last_line = find_end_line(hit[:line_number], @line_map1, old_markup)
      # NOTE(review): status is :changed_old here, whereas the fallback path
      # and enrich_generic use :removed for a removal — confirm this is intended.
      diff_node.char_ranges = [span.call(hit, old_markup, :old, :changed_old)]
      diff_node.line_range_before = [hit[:line_number], last_line]
      diff_node.line_range_after = nil
    else
      # Exact content not found: fall back to element-name line matching.
      fallback_element_structure_ranges(diff_node, old_markup, nil, :old)
    end
    return
  end

  # Both sides exist: structural change (e.g. element renamed).
  old_hit = find.call(old_markup, @text1, @line_map1)
  new_hit = find.call(new_markup, @text2, @line_map2)

  ranges = []
  ranges << span.call(old_hit, old_markup, :old, :changed_old) if old_hit
  ranges << span.call(new_hit, new_markup, :new, :changed_new) if new_hit

  diff_node.char_ranges = ranges
  # NOTE(review): unlike the add/remove branches, these line ranges are
  # single-line even when the serialized element spans several lines.
  diff_node.line_range_before = old_hit ? [old_hit[:line_number], old_hit[:line_number]] : nil
  diff_node.line_range_after = new_hit ? [new_hit[:line_number], new_hit[:line_number]] : nil
end
748
+
749
+ # Fallback for element_structure when exact location fails.
750
+ # Uses element name matching to find affected lines.
751
def fallback_element_structure_ranges(diff_node, before, after, side)
  tag = extract_element_name(before || after)
  return unless tag

  ranges = []

  # Old side: flag every text1 line that mentions the element as removed.
  if side == :old || side == :both
    hits = find_lines_with_element(tag, @lines1, @text1)
    hits.each do |idx|
      ranges << DiffCharRange.new(
        line_number: idx,
        start_col: 0,
        end_col: @lines1[idx].length,
        side: :old,
        status: :removed,
        role: :changed,
        diff_node: diff_node,
      )
    end
    diff_node.line_range_before = hits.empty? ? nil : hits.minmax
  end

  # New side: flag every text2 line that mentions the element as added.
  if side == :new || side == :both
    hits = find_lines_with_element(tag, @lines2, @text2)
    hits.each do |idx|
      ranges << DiffCharRange.new(
        line_number: idx,
        start_col: 0,
        end_col: @lines2[idx].length,
        side: :new,
        status: :added,
        role: :changed,
        diff_node: diff_node,
      )
    end
    diff_node.line_range_after = hits.empty? ? nil : hits.minmax
  end

  diff_node.char_ranges = ranges
end
793
+
794
# Return the 0-based indices of the lines that contain an opening or closing
# tag for exactly +element_name+.
#
# The tag name must be followed by whitespace, ">", "/" or the end of the
# line, so searching for "item" does not also flag "<items>" lines.
#
# @param element_name [String] element name, possibly namespace-prefixed
# @param lines [Array<String>] document lines (without line terminators)
# @param _text [String] unused; kept for signature compatibility
# @return [Array<Integer>] indices into +lines+
def find_lines_with_element(element_name, lines, _text)
  name = Regexp.escape(element_name.to_s)
  tag = %r{<#{name}(?=[\s>/]|\z)|</#{name}\s*>}
  lines.each_index.select { |idx| lines[idx].match?(tag) }
end

# Extract the name of the first opening or closing tag found in +content+.
#
# Dots are accepted inside the name (e.g. "my.elem"), so such names are no
# longer truncated at the first dot.
#
# @param content [String, nil] serialized markup
# @return [String, nil] the element name, or nil when there is no tag
def extract_element_name(content)
  return nil if content.nil?

  content[%r{</?([a-zA-Z0-9_:-][a-zA-Z0-9_:.-]*)}, 1]
end
812
+
813
+ # Generic fallback: try to locate and decompose serialized content.
814
+ # Does NOT call enrich_text_content to avoid infinite recursion.
815
def enrich_generic(diff_node)
  old_text = diff_node.serialized_before
  new_text = diff_node.serialized_after

  # Range over the entire serialized content at the located position.
  whole = lambda do |hit, text, side, status|
    DiffCharRange.new(
      line_number: hit[:line_number],
      start_col: hit[:col],
      end_col: hit[:col] + text.length,
      side: side,
      status: status,
      role: :changed,
      diff_node: diff_node,
    )
  end

  if old_text && new_text
    # Changed on both sides: record whichever side(s) can be located.
    old_hit = SourceLocator.locate(old_text, @text1, @line_map1)
    new_hit = SourceLocator.locate(new_text, @text2, @line_map2)

    ranges = []
    ranges << whole.call(old_hit, old_text, :old, :changed_old) if old_hit
    ranges << whole.call(new_hit, new_text, :new, :changed_new) if new_hit

    diff_node.char_ranges = ranges
    diff_node.line_range_before = old_hit ? [old_hit[:line_number], old_hit[:line_number]] : nil
    diff_node.line_range_after = new_hit ? [new_hit[:line_number], new_hit[:line_number]] : nil
  elsif old_text
    # Only the old side exists: removal.
    hit = SourceLocator.locate(old_text, @text1, @line_map1)
    return unless hit

    diff_node.char_ranges = [whole.call(hit, old_text, :old, :removed)]
    diff_node.line_range_before = [hit[:line_number], hit[:line_number]]
  elsif new_text
    # Only the new side exists: addition.
    hit = SourceLocator.locate(new_text, @text2, @line_map2)
    return unless hit

    diff_node.char_ranges = [whole.call(hit, new_text, :new, :added)]
    diff_node.line_range_after = [hit[:line_number], hit[:line_number]]
  end
end
890
+
891
+ # Build an attribute pattern string: key="value"
892
def build_attr_pattern(key, value)
  # Literal search string for a double-quoted attribute; no escaping is applied.
  format('%s="%s"', key, value)
end
895
+
896
+ # Return the character offset just past the XML declaration `?>`,
897
+ # or 0 if there is no XML declaration.
898
+ #
899
+ # The XML declaration can contain attributes like version, encoding
900
+ # that may collide with element attributes. Skipping past it prevents
901
+ # false matches when locating attribute patterns.
902
+ #
903
+ # @param text [String] the source text
904
+ # @return [Integer] character offset past the XML declaration, or 0
905
def xml_declaration_end_offset(text)
  # Only a document that begins with "<?xml" has something to skip.
  return 0 unless text.start_with?("<?xml")

  terminator = text.index("?>")
  # An unterminated declaration is treated as if there were none.
  terminator.nil? ? 0 : terminator + 2
end
913
+
914
# Compute the last line touched by content that begins on start_line.
#
# Every "\n" in the content advances the end line by one; the result is
# capped at the final index of line_map.
#
# @param start_line [Integer] 0-based line where content starts
# @param line_map [Array<Hash>] line offset map (only its length is used)
# @param content [String] the serialized content
# @return [Integer] the last line number
def find_end_line(start_line, line_map, content)
  last_mapped_line = line_map.length - 1
  spanned_line = start_line + content.count("\n")
  spanned_line < last_mapped_line ? spanned_line : last_mapped_line
end
925
+
926
# Locate the occurrence of a value that belongs to a specific indexed
# element named in the diff path.
#
# Intended for text_content changes where the same text appears in several
# sibling elements (e.g. "original" inside multiple item elements).
#
# Strategy:
# 1. Split the path into segments. The final segment (the text node) and
#    the first segment are not considered; among the rest, the segment
#    closest to the end that carries a bracket index (e.g. "item[1]") is
#    selected.
# 2. If no such segment can be determined, return the first occurrence of
#    the value as reported by SourceLocator.locate.
# 3. Values shorter than 3 characters yield nil (the original note says the
#    caller is expected to fall back to locate_via_parent_element).
# 4. Otherwise every occurrence is inspected and the first one whose
#    element index (see count_elements_before_position) equals the target
#    index is returned; if none matches, the first occurrence is returned.
#
# @param value [String] the text to find
# @param text [String] the source text
# @param line_map [Array<Hash>] pre-built line offset map
# @param path [String] the diff node path (e.g., "/root[0]/item[1]/unknown[0]")
# @return [Hash, nil] location hash or nil if not found
def locate_at_element_index(value, text, line_map, path)
  first_hit = -> { SourceLocator.locate(value, text, line_map) }

  parts = path.split("/").reject(&:empty?)
  return first_hit.call if parts.length < 2

  # Candidates are segments 1..-2, examined from the end of the path
  # backwards; a segment without a bracket index is skipped.
  indexed_segment = parts[1..-2].reverse_each.find do |part|
    part.include?("[")
  end
  return first_hit.call unless indexed_segment

  parsed = indexed_segment.match(/([a-zA-Z0-9_:-]+)\[(\d+)\]/)
  return first_hit.call unless parsed

  # Short values are not enumerated; signal the caller with nil.
  return nil if value.length < 3

  element_name = parsed[1]
  target_index = parsed[2].to_i

  owned = SourceLocator.locate_all(value, text, line_map).find do |occurrence|
    count_elements_before_position(text, occurrence[:char_offset],
                                   element_name) == target_index
  end
  owned || first_hit.call
end
988
+
989
# Fallback location strategy for text_content: instead of searching for
# the text itself, walk the element hierarchy described by the path and
# return the position just inside the innermost element's opening tag.
#
# All path segments except the last are considered. Segments starting
# with "text()", "comment()" or "unknown", and segments without a bracket
# index, are ignored. The remaining segments are resolved top-down: each
# element is searched (via find_nth_element_in_range) only inside the
# window established by the previous one. The window for children runs
# from just after the element's first ">" to the first "</name>" found
# after it; if no such closing tag exists the window is empty.
#
# @param path [String] the diff node path (e.g., "/root[0]/item[1]/unknown[0]")
# @param text [String] the source text
# @param line_map [Array<Hash>] pre-built line offset map
# @return [Hash, nil] location hash {char_offset, line_number, col},
#   or nil if any step of the walk fails
def locate_via_parent_element(path, text, line_map)
  parts = path.split("/").reject(&:empty?)
  return nil if parts.length < 2

  # Keep indexed element segments in top-down order, dropping the final
  # (text node) segment and any node-test style segments.
  element_parts = parts[0..-2].select do |part|
    !part.start_with?("text()", "comment()", "unknown") && part.include?("[")
  end
  return nil if element_parts.empty?

  window_start = 0
  window_end = text.length

  element_parts.each do |part|
    parsed = part.match(/([a-zA-Z0-9_:-]+)\[(\d+)\]/)
    return nil unless parsed

    name = parsed[1]
    open_at = find_nth_element_in_range(text, name, parsed[2].to_i,
                                        window_start, window_end)
    return nil unless open_at

    tag_close = text.index(">", open_at)
    return nil unless tag_close

    # Children of this element are searched between the end of its
    # opening tag and its closing tag (empty window when none is found).
    window_start = tag_close + 1
    window_end = text.index("</#{name}>", window_start) || window_start
  end

  # window_start now points just inside the innermost element.
  line_idx = SourceLocator.send(:find_line_for_offset, window_start,
                                line_map)
  return nil unless line_idx

  { char_offset: window_start, line_number: line_idx,
    col: window_start - line_map[line_idx][:start_offset] }
end
1057
+
1058
# Find the Nth occurrence of an element within a text range, skipping
# occurrences nested inside another element of the SAME name.
#
# Path indices (e.g., sec[3]) count siblings. Simply counting every <sec>
# tag would also count a <sec> nested inside another <sec>, so a depth
# counter is kept: an opening tag is counted only while depth is 0, every
# non-self-closing opening tag increases depth, and every closing tag
# decreases it. Only tags named element_name take part in this tracking;
# nesting inside differently named elements is not detected.
#
# Opening tags are recognised when the name is followed by whitespace,
# ">" or "/", so compact self-closing tags such as <sec/> are counted as
# siblings too (they do not change depth).
#
# @param text [String] the source text
# @param element_name [String] element name to look for
# @param target_index [Integer] 0-based index of the wanted occurrence
# @param range_start [Integer] offset where scanning begins
# @param range_end [Integer] exclusive upper bound for tag start offsets
# @return [Integer, nil] offset of the "<" of the matching opening tag,
#   or nil when the range holds fewer occurrences
def find_nth_element_in_range(text, element_name, target_index,
                              range_start, range_end)
  escaped_name = Regexp.escape(element_name)
  open_pattern = %r{<#{escaped_name}[\s>/]}
  close_pattern = %r{</#{escaped_name}\s*>}

  offset = range_start
  current_index = 0
  depth = 0

  loop do
    # Next opening / closing tag at any depth; tags starting at or beyond
    # range_end are ignored.
    open_pos = text.index(open_pattern, offset)
    open_pos = nil if open_pos && open_pos >= range_end

    close_pos = text.index(close_pattern, offset)
    close_pos = nil if close_pos && close_pos >= range_end

    break if open_pos.nil? && close_pos.nil?

    if open_pos && (close_pos.nil? || open_pos <= close_pos)
      tag_end = text.index(">", open_pos)
      break unless tag_end

      if depth.zero?
        return open_pos if current_index == target_index

        current_index += 1
      end

      # tag_end is the first ">" after open_pos, so a self-closing tag is
      # exactly one whose text ends in "/>".
      depth += 1 unless text[open_pos..tag_end].end_with?("/>")
      offset = tag_end + 1
    else
      # Closing tag: continue scanning after its ">".
      closing_gt = text.index(">", close_pos)
      offset = closing_gt ? closing_gt + 1 : close_pos + 2
      depth -= 1 if depth.positive?
    end
  end

  nil
end
1116
+
1117
# Locate text by anchoring on an ancestor element that has an "id"
# attribute, for use when path-based lookup fails.
#
# Steps:
# 1. Collect the node and its ancestors (every object in the parent chain
#    that responds to #name), nearest first.
# 2. Pick the nearest one carrying an attribute named "id"; give up (nil)
#    when there is none or the attribute exposes no value.
# 3. Find that element's opening tag in the text by name and id, and the
#    first matching closing tag after it; this delimits the anchor range.
# 4. If the nearest named node differs from the anchor, look inside the
#    anchor range for an opening tag with that node's name whose text
#    contains all of its attributes (as name="value"). A self-closing
#    match returns nil; otherwise the value is searched inside that
#    element first.
# 5. Finally the value is searched anywhere in the anchor's content.
#
# @param node [Canon::Xml::Node] the parsed node (TextNode or ElementNode)
# @param value [String] the text value to locate (e.g., "a")
# @param text [String] the full source text
# @param line_map [Array<Hash>] pre-built line offset map
# @param _side [Symbol] :old or :new; accepted but not used by this method
# @return [Hash, nil] location hash {char_offset, line_number, col} or nil
def locate_via_node_tree(node, value, text, line_map, _side)
  return nil unless node

  # Turns a character offset into a location hash; nil when the offset
  # cannot be mapped to a line.
  to_location = lambda do |char_offset|
    line_idx = SourceLocator.send(:find_line_for_offset, char_offset,
                                  line_map)
    next nil unless line_idx

    { char_offset: char_offset, line_number: line_idx,
      col: char_offset - line_map[line_idx][:start_offset] }
  end

  # Node plus ancestors that expose a name, nearest first.
  named_chain = []
  cursor = node
  while cursor.respond_to?(:parent)
    named_chain << cursor if cursor.respond_to?(:name)
    cursor = cursor.parent
  end

  # Nearest member of the chain that has an "id" attribute.
  anchor = nil
  id_attr = nil
  named_chain.each do |candidate|
    next unless candidate.respond_to?(:attribute_nodes) && candidate.attribute_nodes

    id_attr = candidate.attribute_nodes.find do |attr|
      attr.respond_to?(:name) && attr.name == "id"
    end
    next unless id_attr

    anchor = candidate
    break
  end
  return nil unless anchor

  anchor_name = anchor.name
  anchor_id = id_attr.respond_to?(:value) ? id_attr.value : nil
  return nil unless anchor_id

  # Opening tag of the anchor: <anchor_name ... id="anchor_id" ...>
  anchor_pattern = /<#{Regexp.escape(anchor_name)}\b[^>]*\bid="#{Regexp.escape(anchor_id)}"/
  anchor_pos = text.index(anchor_pattern)
  return nil unless anchor_pos

  anchor_tag_end = text.index(">", anchor_pos)
  return nil unless anchor_tag_end

  content_start = anchor_tag_end + 1
  anchor_close = text.index("</#{anchor_name}>", content_start)
  return nil unless anchor_close

  # First entry of the chain: the node itself if it has a name, otherwise
  # its closest named ancestor.
  leaf = named_chain.first
  if leaf && leaf != anchor
    leaf_name = leaf.name
    leaf_attrs = element_attribute_signature(leaf)
    leaf_pattern = /<#{Regexp.escape(leaf_name)}\b/

    leaf_pos = nil
    leaf_tag_end = nil
    scan_from = content_start
    while (candidate_pos = text.index(leaf_pattern, scan_from)) &&
        candidate_pos < anchor_close
      candidate_end = text.index(">", candidate_pos)
      break unless candidate_end && candidate_end < anchor_close

      opening_tag = text[candidate_pos..candidate_end]
      if leaf_attrs.all? { |k, v| opening_tag.include?("#{k}=\"#{v}\"") }
        leaf_pos = candidate_pos
        leaf_tag_end = candidate_end
        break
      end
      scan_from = candidate_pos + 1
    end

    if leaf_pos
      # A self-closing leaf cannot contain the value in this document;
      # nil lets the caller try elsewhere.
      return nil if text[leaf_pos..leaf_tag_end].include?("/>")

      leaf_close = text.index("</#{leaf_name}>", leaf_tag_end + 1)
      if leaf_close && leaf_close < anchor_close
        inner_pos = text.index(value, leaf_tag_end + 1)
        return to_location.call(inner_pos) if inner_pos && inner_pos < leaf_close
      end
    end
  end

  # Direct search: the value may sit anywhere in the anchor's content.
  direct_pos = text.index(value, content_start)
  return to_location.call(direct_pos) if direct_pos && direct_pos < anchor_close

  nil
end
1247
+
1248
# Locate a text value by first finding the text node's parent element in
# the source text.
#
# Every opening tag with the parent's name is examined in document order.
# A tag qualifies when its text contains each of the parent's attributes
# written as name="value". For a qualifying tag, the value is looked up
# after the tag and accepted if it starts before the first "</name>" that
# follows; otherwise scanning continues with the next tag. A qualifying
# tag with no closing tag after it ends the search with nil.
#
# @param textnode [Canon::Xml::Nodes::TextNode] the TextNode whose parent to use
# @param value [String] the text value to find
# @param text [String] the source text to search in
# @param line_map [Array<Hash>] pre-built line offset map
# @return [Hash, nil] location hash with :char_offset, :line_number, :col or nil
def locate_textnode_parent(textnode, value, text, line_map)
  return nil unless textnode.respond_to?(:parent) && textnode.parent

  owner = textnode.parent
  return nil unless owner.respond_to?(:name) && owner.name

  owner_name = owner.name
  required_attrs = element_attribute_signature(owner)
  opening = /<#{Regexp.escape(owner_name)}\b/

  scan_from = 0
  while (open_at = text.index(opening, scan_from))
    open_end = text.index(">", open_at)
    break unless open_end

    # Any `next` below resumes scanning right after this tag's "<".
    scan_from = open_at + 1

    tag_source = text[open_at..open_end]
    next unless required_attrs.all? { |k, v| tag_source.include?("#{k}=\"#{v}\"") }

    close_at = text.index("</#{owner_name}>", open_end + 1)
    return nil unless close_at

    value_at = text.index(value, open_end + 1)
    next unless value_at && value_at < close_at

    line_idx = SourceLocator.send(:find_line_for_offset, value_at,
                                  line_map)
    return nil unless line_idx

    return { char_offset: value_at, line_number: line_idx,
             col: value_at - line_map[line_idx][:start_offset] }
  end

  nil
end
1303
+
1304
# Locate the parent element of a TextNode in the given text, even when
# that element is empty there.
#
# Opening tags with the parent's name are examined in document order; the
# first one whose text contains every attribute of the parent (written as
# name="value") is used. The reported position is:
# - the offset of "<" when the tag is self-closing, or
# - the offset of the ">" that ends the opening tag otherwise.
# (Per the original note, callers use this to create zero-length ranges.)
#
# @param textnode [Canon::Xml::Nodes::TextNode] the TextNode whose parent to find
# @param text [String] the source text (should be text2)
# @param line_map [Array<Hash>] pre-built line offset map
# @return [Hash, nil] location hash with :char_offset, :line_number, :col or nil
def locate_element_in_text2(textnode, text, line_map)
  return nil unless textnode.respond_to?(:parent) && textnode.parent

  parent = textnode.parent
  return nil unless parent.respond_to?(:name) && parent.name

  parent_name = parent.name
  parent_attrs = element_attribute_signature(parent)
  anchor_pattern = /<#{Regexp.escape(parent_name)}\b/
  offset = 0

  while (anchor_pos = text.index(anchor_pattern, offset))
    tag_end = text.index(">", anchor_pos)
    break unless tag_end

    tag_text = text[anchor_pos..tag_end]
    if parent_attrs.all? { |k, v| tag_text.include?("#{k}=\"#{v}\"") }
      # Self-closing tags are reported at "<", regular tags at the ">"
      # closing the opening tag. The regular branch must use tag_end, the
      # only ">" offset computed in this loop.
      target = tag_text.include?("/>") ? anchor_pos : tag_end

      line_idx = SourceLocator.send(:find_line_for_offset, target,
                                    line_map)
      return nil unless line_idx

      return { char_offset: target, line_number: line_idx,
               col: target - line_map[line_idx][:start_offset] }
    end

    offset = anchor_pos + 1
  end

  nil
end
1367
+
1368
# Collect an element's attributes into a Hash used for matching tags.
#
# Attribute nodes that do not respond to both #name and #value are
# skipped. Objects without attribute nodes produce an empty Hash.
#
# @param element [Object] element-like object, possibly exposing #attribute_nodes
# @return [Hash] attribute name => attribute value
def element_attribute_signature(element)
  return {} unless element.respond_to?(:attribute_nodes) && element.attribute_nodes

  element.attribute_nodes.each_with_object({}) do |attr, signature|
    next unless attr.respond_to?(:name) && attr.respond_to?(:value)

    signature[attr.name] = attr.value
  end
end
1380
+
1381
# Best-effort fallback for short text when tree-based location fails:
# delegates to SourceLocator.locate and returns whatever occurrence it
# reports. Intended for `before.nil?` cases where the content exists in
# text1 but not at the tree-indicated position in text2.
#
# @param value [String, nil] the text to find
# @param _path [String] the diff node path; accepted but not used
# @param text [String] the source text (should be text1/original)
# @param line_map [Array<Hash>] pre-built line offset map
# @return [Hash, nil] location hash, or nil when value is nil/empty or
#   nothing is found
def locate_short_text_in_original(value, _path, text, line_map)
  return nil if !value || value.empty?

  SourceLocator.locate(value, text, line_map) || nil
end
1401
+
1402
# Estimate the 0-based index of the element (of the given name) that
# contains a character position.
#
# Counts the opening tags named element_name that start before
# char_offset and subtracts one, because the count includes the element
# the position is inside. The result never goes below 0. Closing tags and
# nesting are not taken into account, so this is a heuristic.
#
# The name is regex-escaped so characters such as "." match literally,
# and an opening tag is recognised when the name is followed by ">",
# whitespace or "/" (so compact self-closing tags like <item/> count).
#
# @param text [String] the source text
# @param char_offset [Integer] character offset to check before
# @param element_name [String] name of element to count
# @return [Integer] element index (0-based) of the element containing the position
def count_elements_before_position(text, char_offset, element_name)
  opening_tag = %r{<#{Regexp.escape(element_name)}[>\s/]}
  count = text[0...char_offset].scan(opening_tag).length
  # Subtract 1 because the count includes the element we are inside.
  [count - 1, 0].max
end
1416
+ end
1417
+ end
1418
+ end