canon 0.1.21 → 0.1.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +50 -26
  3. data/README.adoc +8 -3
  4. data/docs/advanced/diff-pipeline.adoc +36 -9
  5. data/docs/features/diff-formatting/colors-and-symbols.adoc +82 -0
  6. data/docs/features/diff-formatting/index.adoc +12 -0
  7. data/docs/features/diff-formatting/themes.adoc +353 -0
  8. data/docs/features/environment-configuration/index.adoc +23 -0
  9. data/docs/internals/diff-char-range-pipeline.adoc +249 -0
  10. data/docs/internals/diffnode-enrichment.adoc +1 -0
  11. data/docs/internals/index.adoc +52 -4
  12. data/docs/reference/environment-variables.adoc +6 -0
  13. data/docs/understanding/architecture.adoc +5 -0
  14. data/examples/show_themes.rb +217 -0
  15. data/lib/canon/comparison/comparison_result.rb +9 -4
  16. data/lib/canon/config/env_schema.rb +3 -1
  17. data/lib/canon/config.rb +11 -0
  18. data/lib/canon/diff/diff_block.rb +7 -0
  19. data/lib/canon/diff/diff_block_builder.rb +2 -2
  20. data/lib/canon/diff/diff_char_range.rb +140 -0
  21. data/lib/canon/diff/diff_line.rb +42 -4
  22. data/lib/canon/diff/diff_line_builder.rb +907 -0
  23. data/lib/canon/diff/diff_node.rb +5 -1
  24. data/lib/canon/diff/diff_node_enricher.rb +1418 -0
  25. data/lib/canon/diff/diff_node_mapper.rb +54 -0
  26. data/lib/canon/diff/source_locator.rb +105 -0
  27. data/lib/canon/diff/text_decomposer.rb +103 -0
  28. data/lib/canon/diff_formatter/by_line/base_formatter.rb +264 -24
  29. data/lib/canon/diff_formatter/by_line/html_formatter.rb +35 -20
  30. data/lib/canon/diff_formatter/by_line/json_formatter.rb +36 -19
  31. data/lib/canon/diff_formatter/by_line/simple_formatter.rb +33 -19
  32. data/lib/canon/diff_formatter/by_line/xml_formatter.rb +583 -98
  33. data/lib/canon/diff_formatter/by_line/yaml_formatter.rb +36 -19
  34. data/lib/canon/diff_formatter/by_object/base_formatter.rb +62 -13
  35. data/lib/canon/diff_formatter/by_object/json_formatter.rb +59 -24
  36. data/lib/canon/diff_formatter/by_object/xml_formatter.rb +74 -34
  37. data/lib/canon/diff_formatter/diff_detail_formatter/color_helper.rb +4 -5
  38. data/lib/canon/diff_formatter/diff_detail_formatter.rb +1 -1
  39. data/lib/canon/diff_formatter/legend.rb +4 -2
  40. data/lib/canon/diff_formatter/theme.rb +864 -0
  41. data/lib/canon/diff_formatter.rb +11 -6
  42. data/lib/canon/tree_diff/matchers/hash_matcher.rb +16 -1
  43. data/lib/canon/tree_diff/matchers/similarity_matcher.rb +10 -0
  44. data/lib/canon/tree_diff/operations/operation_detector.rb +5 -1
  45. data/lib/canon/tree_diff/tree_diff_integrator.rb +1 -1
  46. data/lib/canon/version.rb +1 -1
  47. metadata +11 -2
@@ -0,0 +1,907 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "diff_line"
4
+ require_relative "formatting_detector"
5
+
6
+ module Canon
7
+ module Diff
8
+ # Assembles DiffLines from enriched DiffNodes.
9
+ #
10
+ # This is Phase 2 of the two-phase diff pipeline. It runs after
11
+ # DiffNodeEnricher and before DiffBlockBuilder. It does NO computation
12
+ # on the change content — it simply reads pre-computed DiffCharRanges
13
+ # from DiffNodes and assembles them into DiffLines.
14
+ #
15
+ # The DiffLineBuilder handles:
16
+ # - Mapping DiffCharRanges to the correct DiffLines
17
+ # - Filling in unchanged context lines between changes
18
+ # - Detecting reflow (lines that moved between documents)
19
+ # - Computing line correspondence without LCS
20
+ class DiffLineBuilder
21
# Convenience entry point: assemble DiffLines for a pair of documents.
#
# Short-circuits to an empty array, without instantiating the builder,
# when there are no DiffNodes (nil or empty) or either text is nil.
#
# @param diff_nodes [Array<DiffNode>, nil] Enriched DiffNodes with char_ranges
# @param text1 [String, nil] The first document (preprocessed)
# @param text2 [String, nil] The second document (preprocessed)
# @return [Array<DiffLine>] The assembled diff lines
def self.build(diff_nodes, text1, text2)
  missing_input = diff_nodes.nil? || diff_nodes.empty? || text1.nil? || text2.nil?
  return [] if missing_input

  new(diff_nodes, text1, text2).build
end
33
+
34
# Store the inputs and pre-compute per-line views of both documents.
#
# Each text is split on "\n" (String#split drops trailing empty lines), and
# a reverse index (line content => array of line indices) is built per
# document so gap handling can look up where a line occurs in the other text.
#
# @param diff_nodes [Array<DiffNode>] Enriched DiffNodes with char_ranges
# @param text1 [String] The first document (preprocessed)
# @param text2 [String] The second document (preprocessed)
def initialize(diff_nodes, text1, text2)
  @diff_nodes = diff_nodes
  @text1 = text1
  @text2 = text2
  @lines1, @lines2 = [text1, text2].map { |text| text.split("\n") }
  @line_to_indices1, @line_to_indices2 =
    [@lines1, @lines2].map { |lines| build_line_index(lines) }
end
45
+
46
+ # Minimum number of unmatched reflow lines that triggers summary mode.
47
+ # When at least this many lines are unmatched in a reflow gap, a summary
48
+ # line is emitted instead of listing each individual line.
49
+ REFLOW_SUMMARY_THRESHOLD = 2
50
+
51
# Walk the DiffNodes in document order and assemble the full DiffLine list.
#
# Two cursors track how far each document has been consumed. For every
# DiffNode that carries char_ranges, the gap between the cursors and the
# node's start is emitted first, then reflow detection runs over that gap,
# then the node's own changed lines are emitted.
#
# @return [Array<DiffLine>] The assembled diff lines
def build
  # Only nodes with at least one char range can be placed on a line.
  placeable = @diff_nodes.select do |node|
    node.char_ranges && !node.char_ranges.empty?
  end
  ordered = placeable.sort_by { |node| sort_key(node) }

  lines = []
  pos1 = 0 # next unconsumed line in text1
  pos2 = 0 # next unconsumed line in text2

  ordered.each do |node|
    before = node.line_range_before
    after = node.line_range_after

    # A missing range means the node has no position on that side, so it
    # is anchored at the current cursor there.
    start1 = before ? before[0] : pos1
    start2 = after ? after[0] : pos2

    # Skip nodes that start before a cursor that has already moved past
    # them; only sides that actually have a range are checked.
    next if before && pos1 > start1
    next if after && pos2 > start2

    emit_unchanged(lines, pos1, start1, pos2, start2)
    handle_reflow(lines, pos1, start1, pos2, start2, node)
    emit_changed(lines, node)

    # Advance each cursor past the node. A side without a range advances
    # by the size of the gap that the other side just skipped; when
    # neither side has a range both cursors move one line forward.
    next1 = if before
              before[1] + 1
            elsif after
              pos1 + (start2 - pos2)
            else
              start1 + 1
            end
    next2 = if after
              after[1] + 1
            elsif before
              pos2 + (start1 - pos1)
            else
              start2 + 1
            end
    pos1 = next1
    pos2 = next2
  end

  # Whatever remains after the last change is trailing context.
  emit_unchanged(lines, pos1, @lines1.length, pos2, @lines2.length)

  lines
end
118
+
119
+ private
120
+
121
# Ordering key for a DiffNode: the first line of its before-range, falling
# back to the first line of its after-range. Nodes with neither range sort
# last (Float::INFINITY).
def sort_key(diff_node)
  anchor = diff_node.line_range_before || diff_node.line_range_after
  return Float::INFINITY unless anchor

  anchor[0]
end
126
+
127
# Emit the context between two cursor positions.
#
# Dispatches on the sizes of the two gaps:
# - equal sizes: lines are paired one-to-one and emitted as :unchanged
#   (delegated to emit_unchanged_range, which holds the pairing loop);
# - both gaps non-empty but of different size: if some line of either slice
#   is missing from the other document the gap is handed to emit_gap_lines,
#   otherwise to emit_unchanged_with_reflow;
# - exactly one gap non-empty: emit_gap_lines.
# Nothing is emitted when neither gap is positive.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] start line in text1
# @param to1 [Integer] end line (exclusive) in text1
# @param from2 [Integer] start line in text2
# @param to2 [Integer] end line (exclusive) in text2
def emit_unchanged(result, from1, to1, from2, to2)
  span1 = to1 - from1
  span2 = to2 - from2

  if span1 == span2
    # Same number of lines on both sides (zero or negative emits nothing).
    emit_unchanged_range(result, from1, from2, span1) if span1.positive?
  elsif span1.positive? && span2.positive?
    if slice_middle_orphaned?(@lines1[from1...to1], @lines2[from2...to2])
      # Some content exists in only one text: use gap handling.
      emit_gap_lines(result, from1, to1, from2, to2, span1, span2)
    else
      # All content exists in both texts, at different positions: reflow.
      emit_unchanged_with_reflow(result, from1, to1, from2, to2)
    end
  elsif span1.positive? || span2.positive?
    emit_gap_lines(result, from1, to1, from2, to2, span1, span2)
  end
end
177
+
178
# Decide whether a pair of gap slices contains content that exists in only
# one of the documents.
#
# Every line of slice1 is looked up in the text2 line index and every line
# of slice2 in the text1 line index (whole-document lookups, exact string
# match). The gap counts as orphaned as soon as any line is missing from
# the other document.
#
# @param slice1 [Array<String>] gap lines from text1
# @param slice2 [Array<String>] gap lines from text2
# @return [Boolean] true if some line has no counterpart in the other text;
#   false if either slice is empty or every line exists in both texts
def slice_middle_orphaned?(slice1, slice2)
  return false if slice1.empty? || slice2.empty?

  missing_from_text2 = slice1.any? { |line| !@line_to_indices2.key?(line) }
  missing_from_text1 = slice2.any? { |line| !@line_to_indices1.key?(line) }

  missing_from_text2 || missing_from_text1
end
196
+
197
# Handle a gap where the two documents do not line up one-to-one.
#
# Gap lines are never reported as real additions/removals here: a line is
# emitted as :unchanged only when the same content can be found in the other
# document (paired with the occurrence nearest to the other side's gap
# start). Lines with no counterpart are skipped in the small-gap paths, or
# folded into a reflow summary in the large-gap paths.
#
# Branches:
# - count1 > 0, large (>= REFLOW_SUMMARY_THRESHOLD): if every text1 gap line
#   exists in text2, emit them via emit_orphaned_unchanged and let
#   emit_extra_added_lines report surplus text2 lines; otherwise summarise.
# - count1 > 0, small: emit each text1 line that exists in text2.
# - count1 == 0, count2 > 0: emit each text2 line that exists in text1,
#   using the text1 position as line_number.
# - count1 < 0, count2 large: if every text2 gap line exists in text1, emit
#   them via emit_orphaned_unchanged (text2 side); otherwise summarise.
# - count1 < 0, count2 small: emit each text2 line that exists in text1.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] start line in text1
# @param to1 [Integer] end line (exclusive) in text1
# @param from2 [Integer] start line in text2
# @param to2 [Integer] end line (exclusive) in text2
# @param count1 [Integer] to1 - from1
# @param count2 [Integer] to2 - from2
def emit_gap_lines(result, from1, to1, from2, to2, count1, count2)
  if count1.positive?
    if count1 >= REFLOW_SUMMARY_THRESHOLD
      all_exist_in_text2 = (0...count1).all? do |i|
        line_idx = from1 + i
        line_idx < @lines1.length &&
          @line_to_indices2.key?(@lines1[line_idx])
      end
      if all_exist_in_text2
        # Pass the real text2 bound (it is unused on the text1 side, but an
        # index hash does not belong in an Integer parameter).
        emit_orphaned_unchanged(result, from1, to1, from2, to2, true)
        emit_extra_added_lines(result, from1, to1, from2, count1, count2)
      else
        emit_reflow_summary(result, from1, to1, from2, to2)
      end
    else
      each_relocated_gap_line(@lines1, from1, count1, @line_to_indices2,
                              from2) do |line_idx, content, new_pos|
        result << DiffLine.new(
          line_number: line_idx,
          new_position: new_pos,
          content: content,
          type: :unchanged,
        )
      end
    end
  elsif count2.positive?
    if count1.zero?
      # text1 has no gap: report the text2 lines from text1's perspective.
      each_relocated_gap_line(@lines2, from2, count2, @line_to_indices1,
                              from1) do |line_idx, content, text1_pos|
        result << DiffLine.new(
          line_number: text1_pos, # text1 position as primary
          new_position: line_idx, # text2 position as secondary
          content: content,
          type: :unchanged,
        )
      end
    elsif count2 >= REFLOW_SUMMARY_THRESHOLD
      all_exist_in_text1 = (0...count2).all? do |i|
        line_idx = from2 + i
        line_idx < @lines2.length &&
          @line_to_indices1.key?(@lines2[line_idx])
      end
      if all_exist_in_text1
        # The gap lines live in text2, so walk text2 (text1_orphaned =
        # false) and map each line to its nearest text1 occurrence.
        emit_orphaned_unchanged(result, from1, to1, from2, to2, false)
      else
        emit_reflow_summary(result, from1, to1, from2, to2)
      end
    else
      each_relocated_gap_line(@lines2, from2, count2, @line_to_indices1,
                              from1) do |line_idx, content, new_pos|
        result << DiffLine.new(
          line_number: line_idx,
          new_position: new_pos,
          content: content,
          type: :unchanged,
        )
      end
    end
  end
end

# Yield (index, content, nearest index in the other text) for every line of
# lines[start, count] whose content also occurs in the other document.
# Lines past the end of +lines+ or absent from +other_index+ are skipped.
def each_relocated_gap_line(lines, start, count, other_index, anchor)
  count.times do |offset|
    line_idx = start + offset
    next if line_idx >= lines.length

    content = lines[line_idx]
    next unless other_index.key?(content)

    nearest = other_index[content].min_by { |idx| (idx - anchor).abs }
    yield line_idx, content, nearest
  end
end
310
+
311
# Emit surplus text2 gap lines as formatting-only :added lines.
#
# Used for a gap where text2 has more lines than text1. At most
# (count2 - count1) lines are emitted: the first text2 gap lines, in order,
# whose content does not appear among the text1 gap lines. Each emitted line
# carries its own text2 index, so repeated content is reported once per
# occurrence at the correct position.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] start line of the gap in text1
# @param to1 [Integer] end line (exclusive) of the gap in text1
# @param from2 [Integer] start line of the gap in text2
# @param count1 [Integer] number of gap lines in text1
# @param count2 [Integer] number of gap lines in text2
def emit_extra_added_lines(result, from1, to1, from2, count1, count2)
  surplus = count2 - count1
  return unless surplus.positive?

  # Array#[] with a range returns nil when the start is out of bounds.
  text1_gap = @lines1[from1...to1] || []
  text2_gap = @lines2[from2...(from2 + count2)] || []
  in_text1_gap = text1_gap.each_with_object({}) { |line, seen| seen[line] = true }

  emitted = 0
  text2_gap.each_with_index do |content, offset|
    break if emitted == surplus
    next if in_text1_gap.key?(content)

    line_idx = from2 + offset
    result << DiffLine.new(
      line_number: line_idx,
      new_position: line_idx,
      content: content,
      type: :added,
      formatting: true,
    )
    emitted += 1
  end
end
337
+
338
# Emit an unchanged region whose two sides have different line counts.
#
# Identical slices are emitted pairwise as :unchanged. Otherwise the longest
# common prefix and suffix (compared with surrounding whitespace stripped)
# are emitted as :unchanged pairs, and the unmatched middle is treated as
# reflow: when the two middles together reach REFLOW_SUMMARY_THRESHOLD lines
# a summary is emitted, otherwise each middle line is emitted individually
# as a formatting-only :removed (text1) or :added (text2) line.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] start line in text1
# @param to1 [Integer] end line (exclusive) in text1
# @param from2 [Integer] start line in text2
# @param to2 [Integer] end line (exclusive) in text2
def emit_unchanged_with_reflow(result, from1, to1, from2, to2)
  left = @lines1[from1...to1]
  right = @lines2[from2...to2]
  return if left.empty? && right.empty?

  # Fast path: nothing to align.
  if left == right
    emit_unchanged_range(result, from1, from2, left.length)
    return
  end

  shorter = [left.length, right.length].min

  # Length of the common prefix.
  head = 0
  head += 1 while head < shorter &&
                  strip_for_compare(left[head]) == strip_for_compare(right[head])

  # Length of the common suffix; it may not overlap the prefix.
  tail_limit = [left.length - head, right.length - head].min
  tail = 0
  tail += 1 while tail < tail_limit &&
                  strip_for_compare(left[left.length - 1 - tail]) ==
                  strip_for_compare(right[right.length - 1 - tail])

  head.times do |offset|
    result << DiffLine.new(
      line_number: from1 + offset,
      new_position: from2 + offset,
      content: left[offset],
      type: :unchanged,
    )
  end

  # Bounds of the unmatched middle on each side.
  mid_from1 = from1 + head
  mid_to1 = to1 - tail
  mid_from2 = from2 + head
  mid_to2 = to2 - tail
  unmatched = (mid_to1 - mid_from1) + (mid_to2 - mid_from2)

  if unmatched >= REFLOW_SUMMARY_THRESHOLD
    emit_reflow_summary(result, mid_from1, mid_to1, mid_from2, mid_to2)
  else
    (mid_from1...mid_to1).each do |idx|
      next if idx >= @lines1.length

      result << DiffLine.new(
        line_number: idx,
        content: @lines1[idx],
        type: :removed,
        formatting: true,
      )
    end

    (mid_from2...mid_to2).each do |idx|
      next if idx >= @lines2.length

      result << DiffLine.new(
        line_number: idx,
        new_position: idx,
        content: @lines2[idx],
        type: :added,
        formatting: true,
      )
    end
  end

  tail.times do |offset|
    idx1 = to1 - tail + offset
    idx2 = to2 - tail + offset
    in_text1 = idx1 < @lines1.length
    next unless in_text1 || idx2 < @lines2.length

    result << DiffLine.new(
      line_number: idx1,
      new_position: idx2,
      content: in_text1 ? @lines1[idx1] : @lines2[idx2],
      type: :unchanged,
    )
  end
end
445
+
446
# Emit +count+ paired :unchanged lines starting at from1 / from2.
#
# A pair is skipped when both indices lie past the end of their document.
# The content is taken from text1 when its index is in range, otherwise
# from text2.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] first line index in text1
# @param from2 [Integer] first line index in text2
# @param count [Integer] number of line pairs
def emit_unchanged_range(result, from1, from2, count)
  count.times do |offset|
    idx1 = from1 + offset
    idx2 = from2 + offset
    in_text1 = idx1 < @lines1.length
    next unless in_text1 || idx2 < @lines2.length

    result << DiffLine.new(
      line_number: idx1,
      new_position: idx2,
      content: in_text1 ? @lines1[idx1] : @lines2[idx2],
      type: :unchanged,
    )
  end
end
467
+
468
# Summarise a large reflow gap instead of listing every line.
#
# Emits, in this order:
# 1. the first text1 gap line as a formatting-only :removed line, but only
#    if that content also occurs somewhere in text2;
# 2. the first text2 gap line as a formatting-only :added line, but only if
#    that content also occurs somewhere in text1;
# 3. a :reflow_summary line counting the remaining lines on each side, when
#    any remain.
# Representative lines whose content is absent from the other document are
# not shown individually; they are only counted in the summary.
#
# @param result [Array<DiffLine>] output array
# @param mid_start1 [Integer] start line of the gap in text1
# @param mid_end1 [Integer] end line (exclusive) of the gap in text1
# @param mid_start2 [Integer] start line of the gap in text2
# @param mid_end2 [Integer] end line (exclusive) of the gap in text2
def emit_reflow_summary(result, mid_start1, mid_end1, mid_start2,
                        mid_end2)
  removed_total = mid_end1 - mid_start1
  added_total = mid_end2 - mid_start2

  # Candidate representative lines (nil when the side is empty or the
  # start index is past the end of the document).
  removed_sample = nil
  if removed_total.positive? && mid_start1 < @lines1.length
    removed_sample = @lines1[mid_start1]
  end
  added_sample = nil
  if added_total.positive? && mid_start2 < @lines2.length
    added_sample = @lines2[mid_start2]
  end

  show_removed = removed_sample && @line_to_indices2.key?(removed_sample)
  show_added = added_sample && @line_to_indices1.key?(added_sample)

  if show_removed
    result << DiffLine.new(
      line_number: mid_start1,
      content: removed_sample,
      type: :removed,
      formatting: true,
    )
  end

  if show_added
    result << DiffLine.new(
      line_number: mid_start2,
      new_position: mid_start2,
      content: added_sample,
      type: :added,
      formatting: true,
    )
  end

  # Lines not covered by the representative pair above.
  rest_removed = show_removed ? [removed_total - 1, 0].max : removed_total
  rest_added = show_added ? [added_total - 1, 0].max : added_total
  return unless rest_removed.positive? || rest_added.positive?

  parts = []
  parts << "#{rest_removed} more removed" if rest_removed.positive?
  parts << "#{rest_added} more added" if rest_added.positive?

  result << DiffLine.new(
    line_number: mid_start1,
    new_position: mid_start2,
    content: "... #{parts.join(', ')} (formatting only) ...",
    type: :reflow_summary,
    formatting: true,
  )
end
527
+
528
# Emit gap lines that also occur in the other document as :unchanged.
#
# Walks the gap of one document (text1 when +text1_orphaned+ is true, text2
# otherwise). Every line whose content is present in the other document's
# line index is emitted with line_number set to its own index and
# new_position set to the occurrence in the other document nearest to that
# document's gap start. Lines without a counterpart are skipped.
#
# Only the bounds of the walked side are read: to2 is not used when
# +text1_orphaned+ is true, and to1 is not used when it is false.
#
# @param result [Array<DiffLine>] output array
# @param from1 [Integer] start line in text1
# @param to1 [Integer] end line (exclusive) in text1
# @param from2 [Integer] start line in text2
# @param to2 [Integer] end line (exclusive) in text2
# @param text1_orphaned [Boolean] true if text1 has the orphaned lines
def emit_orphaned_unchanged(result, from1, to1, from2, to2,
                            text1_orphaned)
  if text1_orphaned
    source = @lines1
    lookup = @line_to_indices2
    first = from1
    count = to1 - from1
    anchor = from2
  else
    source = @lines2
    lookup = @line_to_indices1
    first = from2
    count = to2 - from2
    anchor = from1
  end

  count.times do |offset|
    line_idx = first + offset
    next if line_idx >= source.length

    content = source[line_idx]
    next unless content && lookup.key?(content)

    nearest = lookup[content].min_by { |idx| (idx - anchor).abs }
    result << DiffLine.new(
      line_number: line_idx,
      new_position: nearest,
      content: content,
      type: :unchanged,
    )
  end
end
584
+
585
# Detect text1 lines that were absorbed into the changed line of text2.
#
# The text1 lines between the cursor and the node start are stripped and
# concatenated. If that string is contained in the first new-side line of
# the DiffNode (see find_changed_line_in_text2), any DiffLine already in
# +result+ for those text1 lines that is :removed and not yet flagged as
# formatting is flagged formatting-only. No lines are added or removed.
#
# @param result [Array<DiffLine>] output array (entries may be mutated)
# @param cursor1 [Integer] current cursor in text1
# @param node_start1 [Integer] first text1 line of the DiffNode
# @param _cursor2 [Integer] unused
# @param _node_start2 [Integer] unused
# @param diff_node [DiffNode] the node about to be emitted
def handle_reflow(result, cursor1, node_start1, _cursor2, _node_start2,
                  diff_node)
  return unless node_start1 > cursor1

  absorbed = @lines1[cursor1...node_start1].map(&:strip).join

  target_line = find_changed_line_in_text2(diff_node)
  return unless target_line
  return unless target_line.include?(absorbed.strip)

  (cursor1...node_start1).each do |line_idx|
    pending = result.find do |dl|
      dl.line_number == line_idx && dl.removed? && !dl.formatting?
    end
    pending.formatting = true if pending
  end
end
616
+
617
# Look up the text2 line on which a DiffNode's new-side change begins.
#
# @param diff_node [DiffNode] node whose char_ranges are inspected
# @return [String, nil] the text2 line with the lowest line number among the
#   node's new-side char ranges; nil when the node has no char_ranges or
#   none of them is on the new side
def find_changed_line_in_text2(diff_node)
  all_ranges = diff_node.char_ranges
  return nil if all_ranges.nil?

  new_side = all_ranges.select(&:new_side?)
  return nil unless new_side.any?

  @lines2[new_side.map(&:line_number).min]
end
627
+
628
# Emit the DiffLines for one DiffNode, based on its char_ranges.
#
# The ranges are split by side and grouped by line number, then dispatched:
# ranges on both sides -> emit_changed_lines, old side only ->
# emit_removed_lines, new side only -> emit_added_lines. A node without
# char_ranges emits nothing.
#
# @param result [Array<DiffLine>] output array
# @param diff_node [DiffNode] node to emit
def emit_changed(result, diff_node)
  ranges = diff_node.char_ranges
  return unless ranges && !ranges.empty?

  old_side, new_side = ranges.partition(&:old_side?)
  old_by_line = old_side.group_by(&:line_number)
  new_by_line = new_side.group_by(&:line_number)

  if !old_by_line.empty? && !new_by_line.empty?
    # Present in both texts.
    emit_changed_lines(result, diff_node, old_by_line, new_by_line)
  elsif !old_by_line.empty?
    # Only in text1.
    emit_removed_lines(result, diff_node, old_by_line)
  elsif !new_by_line.empty?
    # Only in text2.
    emit_added_lines(result, diff_node, new_by_line)
  end
end
662
+
663
# Emit DiffLines for a DiffNode that has char ranges on both sides.
#
# When exactly one old line and one new line carry ranges, a single
# :changed DiffLine is emitted. Its formatting flag is computed from the
# node's serialized_before / serialized_after when present (falling back to
# the full line text). If the node's line ranges extend past those lines,
# the extra new lines are then emitted as formatting-only :added lines,
# followed by the extra old lines as formatting-only :removed lines.
#
# Otherwise every old line is emitted as :removed and then every new line as
# :added, all attached to the same DiffNode.
#
# @param result [Array<DiffLine>] output array
# @param diff_node [DiffNode] node being emitted
# @param old_line_ranges [Hash{Integer => Array}] old-side ranges by line
# @param new_line_ranges [Hash{Integer => Array}] new-side ranges by line
def emit_changed_lines(result, diff_node, old_line_ranges,
                       new_line_ranges)
  old_lines = old_line_ranges.keys.sort
  new_lines = new_line_ranges.keys.sort

  unless old_lines.length == 1 && new_lines.length == 1
    # Multi-line change: old lines as :removed, then new lines as :added.
    old_lines.each do |idx|
      text = @lines1[idx]
      result << DiffLine.new(
        line_number: idx,
        content: text,
        type: :removed,
        diff_node: diff_node,
        formatting: formatting?(diff_node, text, ""),
        char_ranges: sort_ranges(old_line_ranges[idx]),
      )
    end

    new_lines.each do |idx|
      text = @lines2[idx]
      result << DiffLine.new(
        line_number: idx, # Required; same as new_position for added lines
        new_position: idx,
        content: text,
        type: :added,
        diff_node: diff_node,
        formatting: formatting?(diff_node, "", text),
        new_char_ranges: sort_ranges(new_line_ranges[idx]),
      )
    end
    return
  end

  old_idx = old_lines[0]
  new_idx = new_lines[0]
  old_text = @lines1[old_idx]
  new_text = @lines2[new_idx]

  # Formatting detection works on the changed content itself rather than
  # the whole line, so surrounding markup does not mask whitespace-only
  # changes.
  probe_before = diff_node&.serialized_before || old_text
  probe_after = diff_node&.serialized_after || new_text

  result << DiffLine.new(
    line_number: old_idx,
    new_position: new_idx,
    content: old_text,
    new_content: new_text,
    type: :changed,
    diff_node: diff_node,
    formatting: formatting?(diff_node, probe_before, probe_after),
    char_ranges: sort_ranges(old_line_ranges[old_idx]),
    new_char_ranges: sort_ranges(new_line_ranges[new_idx]),
  )

  before_span = diff_node.line_range_before
  after_span = diff_node.line_range_after

  # Continuation lines of the new version (formatting-only).
  if after_span && new_idx < after_span[1]
    ((new_idx + 1)..after_span[1]).each do |idx|
      next if idx >= @lines2.length

      result << DiffLine.new(
        line_number: idx,
        new_position: idx,
        content: @lines2[idx],
        type: :added,
        formatting: true,
      )
    end
  end

  # Continuation lines of the old version (formatting-only).
  if before_span && old_idx < before_span[1]
    ((old_idx + 1)..before_span[1]).each do |idx|
      next if idx >= @lines1.length

      result << DiffLine.new(
        line_number: idx,
        content: @lines1[idx],
        type: :removed,
        formatting: true,
      )
    end
  end
end
762
+
763
# Emit DiffLines for a removal (content present only in text1).
#
# Each line that carries old-side char ranges becomes a :removed DiffLine
# attached to the DiffNode. If the node's line_range_before extends past the
# last such line, the following text1 lines up to the end of that range are
# emitted as formatting-only :removed continuation lines (no DiffNode).
#
# @param result [Array<DiffLine>] output array
# @param diff_node [DiffNode] node being emitted
# @param old_line_ranges [Hash{Integer => Array}] old-side ranges by line
def emit_removed_lines(result, diff_node, old_line_ranges)
  ranged_lines = old_line_ranges.keys.sort

  ranged_lines.each do |idx|
    text = @lines1[idx]
    result << DiffLine.new(
      line_number: idx,
      content: text,
      type: :removed,
      diff_node: diff_node,
      formatting: formatting?(diff_node, text, ""),
      char_ranges: sort_ranges(old_line_ranges[idx]),
    )
  end

  span = diff_node.line_range_before
  last_ranged = ranged_lines.last
  return unless span && last_ranged && last_ranged < span[1]

  ((last_ranged + 1)..span[1]).each do |idx|
    next if idx >= @lines1.length

    result << DiffLine.new(
      line_number: idx,
      content: @lines1[idx],
      type: :removed,
      formatting: true, # Continuation lines are formatting-only
    )
  end
end
797
+
798
# Emit DiffLines for an addition (content present only in text2).
#
# Each line that carries new-side char ranges becomes an :added DiffLine
# attached to the DiffNode. If the node's line_range_after extends past the
# last such line, the following text2 lines up to the end of that range are
# emitted as formatting-only :added continuation lines (no DiffNode).
#
# @param result [Array<DiffLine>] output array
# @param diff_node [DiffNode] node being emitted
# @param new_line_ranges [Hash{Integer => Array}] new-side ranges by line
def emit_added_lines(result, diff_node, new_line_ranges)
  ranged_lines = new_line_ranges.keys.sort

  ranged_lines.each do |idx|
    text = @lines2[idx]
    result << DiffLine.new(
      line_number: idx, # Required; same as new_position for added lines
      new_position: idx,
      content: text,
      type: :added,
      diff_node: diff_node,
      formatting: formatting?(diff_node, "", text),
      new_char_ranges: sort_ranges(new_line_ranges[idx]),
    )
  end

  span = diff_node.line_range_after
  last_ranged = ranged_lines.last
  return unless span && last_ranged && last_ranged < span[1]

  ((last_ranged + 1)..span[1]).each do |idx|
    next if idx >= @lines2.length

    result << DiffLine.new(
      line_number: idx,
      new_position: idx,
      content: @lines2[idx],
      type: :added,
      formatting: true, # Continuation lines are formatting-only
    )
  end
end
834
+
835
# Build a reverse index from line content to the positions where it occurs.
#
# The returned hash has a default block that creates (and stores) an empty
# array for unknown keys, so use +key?+ to test for presence without
# inserting.
#
# @param lines [Array<String>] Array of lines
# @return [Hash{String => Array<Integer>}] Map from content to ascending
#   line indices
def build_line_index(lines)
  positions = Hash.new { |hash, content| hash[content] = [] }
  lines.each.with_index do |content, position|
    positions[content].push(position)
  end
  positions
end
845
+
846
# Return the char ranges ordered by start_col for consistent rendering.
# A nil argument yields an empty array.
def sort_ranges(ranges)
  return [] unless ranges

  ranges.sort_by { |range| range.start_col }
end
850
+
851
# Normalise a line for prefix/suffix matching by dropping leading and
# trailing whitespace, so lines differing only in indentation compare equal.
def strip_for_compare(line)
  line.lstrip.rstrip
end
855
+
856
# Compute the formatting-only flag for a DiffLine.
#
# With a DiffNode:
# - node.formatting? truthy  -> true
# - node not normative       -> false
# - otherwise                -> FormattingDetector.formatting_only?(line1, line2)
#   (the comment-only heuristic is deliberately not applied here)
#
# Without a DiffNode:
# - either line is comment-only -> true
# - otherwise                   -> FormattingDetector.formatting_only?(line1, line2)
#
# @param diff_node [DiffNode, nil] The associated DiffNode
# @param line1 [String, nil] Old line content
# @param line2 [String, nil] New line content
# @return [Boolean] true if formatting-only
def formatting?(diff_node, line1, line2)
  if diff_node.nil?
    # No node to classify the change: fall back to heuristics.
    return true if comment_only_line?(line1) || comment_only_line?(line2)
  else
    return true if diff_node.formatting?
    return false unless diff_node.normative?
  end

  FormattingDetector.formatting_only?(line1, line2)
end
892
+
893
# Tell whether a line consists solely of an XML comment, ignoring
# surrounding whitespace: after stripping it must start with "<!--" and end
# with "-->".
#
# @param line [String, nil] Line content
# @return [Boolean] true if comment-only; false for nil
def comment_only_line?(line)
  return false if line.nil?

  text = line.strip
  return false unless text.start_with?("<!--")

  text.end_with?("-->")
end
905
+ end
906
+ end
907
+ end