@createiq/htmldiff 1.1.0-beta.0 → 1.2.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/TableDiff.ts CHANGED
@@ -1,3 +1,20 @@
1
+ import {
2
+ type Alignment,
3
+ findOptimalAlignmentSkips,
4
+ lcsAlign,
5
+ orderAlignmentForEmission,
6
+ pairSimilarUnmatched,
7
+ textSimilarity,
8
+ } from './Alignment'
9
+ import {
10
+ findMatchingClosingTag,
11
+ injectClass,
12
+ matchesClosingTagAt,
13
+ matchesTagAt,
14
+ parseOpeningTagAt,
15
+ } from './HtmlScanner'
16
+ import { wrapText } from './Utils'
17
+
1
18
  /**
2
19
  * Table-aware preprocessing for HtmlDiff.
3
20
  *
@@ -22,7 +39,7 @@
22
39
  * by the normal word-level pipeline.
23
40
  */
24
41
 
25
- interface CellRange {
42
+ export interface CellRange {
26
43
  /** Start index of the cell's opening tag in the original html. */
27
44
  cellStart: number
28
45
  /** Index just past the cell's closing tag. */
@@ -32,13 +49,13 @@ interface CellRange {
32
49
  contentEnd: number
33
50
  }
34
51
 
35
- interface RowRange {
52
+ export interface RowRange {
36
53
  rowStart: number
37
54
  rowEnd: number
38
55
  cells: CellRange[]
39
56
  }
40
57
 
41
- interface TableRange {
58
+ export interface TableRange {
42
59
  tableStart: number
43
60
  tableEnd: number
44
61
  rows: RowRange[]
@@ -73,9 +90,26 @@ const PLACEHOLDER_SUFFIX = '-->'
73
90
  const MAX_TABLE_ROWS = 1500
74
91
  const MAX_TABLE_CELLS_PER_ROW = 200
75
92
 
76
- function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
93
+ // Caps for the per-row column-position DP in
94
+ // findBestColumnInsertPositions / findBestColumnDeletePositions.
95
+ // MAX_COLUMN_DELTA is the *semantic* guard: a row with more than 6
96
+ // columns added or deleted is almost always a row rewrite rather than
97
+ // a structural column change, and is better handled by cell-LCS with
98
+ // fuzzy pairing. MAX_COLUMN_SEARCH_WIDTH bounds the per-row DP at
99
+ // O(MAX_COLUMN_SEARCH_WIDTH²) ≈ 40K ops; aligned with
100
+ // MAX_TABLE_CELLS_PER_ROW so any row that survives the table-size cap
101
+ // can still use the DP path.
102
+ const MAX_COLUMN_DELTA = 6
103
+ const MAX_COLUMN_SEARCH_WIDTH = 200
104
+
105
+ /**
106
+ * Generate a placeholder-prefix nonce that doesn't collide with any
107
+ * existing content in the inputs. Variadic so callers with N inputs
108
+ * (e.g. three-way diff with V1/V2/V3) check across all of them.
109
+ */
110
+ export function makePlaceholderPrefix(...inputs: string[]): string {
77
111
  // 4 random bytes → 8 hex chars → 16^8 ≈ 4.3 billion combinations. We
78
- // also retry if the generated nonce happens to occur in either input.
112
+ // also retry if the generated nonce happens to occur in any input.
79
113
  // Using `Math.random` here is fine: we're not defending against a
80
114
  // malicious adversary, just avoiding accidental collisions.
81
115
  for (let attempt = 0; attempt < 8; attempt++) {
@@ -83,7 +117,7 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
83
117
  .toString(16)
84
118
  .padStart(8, '0')
85
119
  const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
86
- if (!oldHtml.includes(prefix) && !newHtml.includes(prefix)) {
120
+ if (inputs.every(input => !input.includes(prefix))) {
87
121
  return prefix
88
122
  }
89
123
  }
@@ -93,6 +127,8 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
93
127
  return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`
94
128
  }
95
129
 
130
+ export { PLACEHOLDER_SUFFIX }
131
+
96
132
  type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
97
133
 
98
134
  /**
@@ -145,11 +181,11 @@ export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff:
145
181
  return result
146
182
  }
147
183
 
148
- function spliceString(s: string, start: number, end: number, replacement: string): string {
184
+ export function spliceString(s: string, start: number, end: number, replacement: string): string {
149
185
  return s.slice(0, start) + replacement + s.slice(end)
150
186
  }
151
187
 
152
- function exceedsSizeLimit(table: TableRange): boolean {
188
+ export function exceedsSizeLimit(table: TableRange): boolean {
153
189
  if (table.rows.length > MAX_TABLE_ROWS) return true
154
190
  for (const row of table.rows) {
155
191
  if (row.cells.length > MAX_TABLE_CELLS_PER_ROW) return true
@@ -320,7 +356,7 @@ function emitEmptyRow(html: string, row: RowRange): string {
320
356
  return html.slice(row.rowStart, row.rowEnd)
321
357
  }
322
358
 
323
- function sameDimensions(a: TableRange, b: TableRange): boolean {
359
+ export function sameDimensions(a: TableRange, b: TableRange): boolean {
324
360
  if (a.rows.length !== b.rows.length) return false
325
361
  for (let i = 0; i < a.rows.length; i++) {
326
362
  if (a.rows[i].cells.length !== b.rows[i].cells.length) return false
@@ -395,7 +431,7 @@ function diffStructurallyAlignedTable(
395
431
  // has no rows at all, fall back to a from-scratch reconstruction so
396
432
  // we still emit deleted rows.
397
433
  if (newTable.rows.length === 0) {
398
- return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
434
+ return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment)
399
435
  }
400
436
 
401
437
  const out: string[] = []
@@ -408,97 +444,23 @@ function diffStructurallyAlignedTable(
408
444
  if (align.oldIdx !== null) {
409
445
  out.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[align.oldIdx], newRow, diffCell))
410
446
  } else {
411
- out.push(emitFullRow(newHtml, newRow, 'ins', diffCell))
447
+ out.push(emitFullRow(newHtml, newRow, 'ins'))
412
448
  }
413
449
  cursor = newRow.rowEnd
414
450
  } else if (align.oldIdx !== null) {
415
- out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
451
+ out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
416
452
  }
417
453
  }
418
454
  out.push(newHtml.slice(cursor, newTable.tableEnd))
419
455
  return out.join('')
420
456
  }
421
457
 
422
- /**
423
- * Reorders the alignment so emission produces rows in the visually-
424
- * correct order. Each entry is assigned a fractional "position" in
425
- * new's flow:
426
- *
427
- * • Preserved/paired (oldIdx, newIdx): position = newIdx.
428
- * • Pure insert (null, newIdx): position = newIdx.
429
- * • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
430
- * before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
431
- * they appear in old's row order. The +0.5 places dels BEFORE any
432
- * insert at the same gap (insert at newIdx N1+1 has position N1+1
433
- * which is > N1+0.5), giving the natural "delete first, insert
434
- * second" reading order at a replaced position.
435
- *
436
- * This handles the full range:
437
- * • Run of unpaired dels at the start (no preserved predecessor):
438
- * position -0.5, sorted by oldIdx.
439
- * • Dels in the middle: positioned right after their preceding
440
- * preserved row.
441
- * • Dels at the end (no preserved successor): positioned after the
442
- * last preserved row.
443
- *
444
- * Without this reordering, a run of unpaired deletes at low alignment
445
- * indices got emitted at cursor = first-new-row position — putting
446
- * all deletes before any preserved row in the output, regardless of
447
- * where they came from in old.
448
- */
449
- function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
450
- const preserved: Array<{ oldIdx: number; newIdx: number }> = []
451
- for (const a of alignment) {
452
- if (a.oldIdx !== null && a.newIdx !== null) {
453
- preserved.push({ oldIdx: a.oldIdx, newIdx: a.newIdx })
454
- }
455
- }
456
- preserved.sort((a, b) => a.oldIdx - b.oldIdx)
457
-
458
- // For a deleted row with oldIdx K, return the newIdx of the preserved
459
- // entry with the largest oldIdx less than K, or -1 if none.
460
- function newIdxOfPreservedBefore(oldIdx: number): number {
461
- let result = -1
462
- for (const p of preserved) {
463
- if (p.oldIdx >= oldIdx) break
464
- result = p.newIdx
465
- }
466
- return result
467
- }
468
-
469
- // Decorate each alignment with a fractional position. We use
470
- // (primary, secondary) tuples so dels at the same gap sort by oldIdx
471
- // (in old's row order) and inserts at the same newIdx stay stable.
472
- const decorated = alignment.map((a, i) => {
473
- let primary: number
474
- let secondary: number
475
- if (a.newIdx !== null) {
476
- primary = a.newIdx
477
- secondary = a.oldIdx === null ? 1 : 0 // preserved before pure-insert at same newIdx (rare)
478
- } else {
479
- // Pure delete
480
- primary = newIdxOfPreservedBefore(a.oldIdx as number) + 0.5
481
- secondary = a.oldIdx as number
482
- }
483
- return { entry: a, primary, secondary, originalIdx: i }
484
- })
485
-
486
- decorated.sort((a, b) => {
487
- if (a.primary !== b.primary) return a.primary - b.primary
488
- if (a.secondary !== b.secondary) return a.secondary - b.secondary
489
- return a.originalIdx - b.originalIdx // stable
490
- })
491
-
492
- return decorated.map(d => d.entry)
493
- }
494
-
495
458
  function rebuildStructurallyAlignedTable(
496
459
  oldHtml: string,
497
460
  newHtml: string,
498
461
  oldTable: TableRange,
499
462
  newTable: TableRange,
500
- alignment: Alignment[],
501
- diffCell: DiffCellFn
463
+ alignment: Alignment[]
502
464
  ): string {
503
465
  // Used when new has no rows but old does — we lose the per-row
504
466
  // wrappers from new (there are none), so reconstruct from old's frame.
@@ -506,9 +468,9 @@ function rebuildStructurallyAlignedTable(
506
468
  out.push(headerSlice(newHtml, newTable, oldHtml, oldTable))
507
469
  for (const align of alignment) {
508
470
  if (align.oldIdx !== null) {
509
- out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
471
+ out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
510
472
  } else if (align.newIdx !== null) {
511
- out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins', diffCell))
473
+ out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins'))
512
474
  }
513
475
  }
514
476
  out.push('</table>')
@@ -526,7 +488,7 @@ function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, old
526
488
  return oldHtml.slice(oldTable.tableStart, oldFirstRow)
527
489
  }
528
490
 
529
- function rowKey(html: string, row: RowRange): string {
491
+ export function rowKey(html: string, row: RowRange): string {
530
492
  // Include cell tag text in the key so column-add doesn't accidentally
531
493
  // match a row to one with different cell counts. Whitespace-normalize to
532
494
  // tolerate formatting differences.
@@ -548,59 +510,51 @@ function diffPreservedRow(
548
510
  // on each affected cell.
549
511
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
550
512
  if (colspanAligned !== null) return colspanAligned
551
- // For a single-column add/delete (cell count differs by exactly 1),
552
- // detect the position via positional similarity scan and align the
553
- // remaining cells positionally. This handles the case where a column
554
- // was added AND a different cell got an unrelated content edit — the
555
- // edited cell still aligns by position rather than getting orphaned by
556
- // the cell-LCS exact-match.
557
- const delta = newRow.cells.length - oldRow.cells.length
558
513
  // For column add/delete (cell counts differ), find the best insertion
559
514
  // or deletion positions via positional similarity scan and align the
560
515
  // remaining cells positionally. This handles content-edit alongside
561
516
  // column-add by keeping the edited cell in its column position rather
562
517
  // than orphaning it via the cell-LCS exact match.
563
- // Guardrail: combinatorial search is C(newCount, k); we cap to avoid
564
- // explosion on very wide tables. The cap is generous for real legal
565
- // schedules; anything above falls through to cell-LCS.
518
+ // Guardrail: O(M × N) DP scales fine within MAX_COLUMN_SEARCH_WIDTH;
519
+ // wider rows fall through to cell-LCS so we don't run the per-row DP
520
+ // on multi-thousand-cell exotica. MAX_COLUMN_DELTA stays as a
521
+ // semantic guard — a delta > 6 usually means "row rewrite", not
522
+ // "column added", and is better handled by cell-LCS.
523
+ const delta = newRow.cells.length - oldRow.cells.length
566
524
  const absDelta = Math.abs(delta)
567
525
  if (
568
526
  absDelta > 0 &&
569
527
  absDelta <= MAX_COLUMN_DELTA &&
570
528
  Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
571
529
  ) {
572
- if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
573
- return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
530
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, diffCell)
531
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, diffCell)
574
532
  }
575
533
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
576
534
  }
577
535
 
578
- const MAX_COLUMN_DELTA = 6
579
- const MAX_COLUMN_SEARCH_WIDTH = 40
580
-
581
536
  /**
582
- * For a row where new has K more cells than old, find the K column
583
- * positions in new where cells were inserted by scanning all C(newCount,
584
- * K) combinations and picking the one that maximises positional content
585
- * similarity with the remaining cells. The inserted cells are emitted
586
- * with diff markers; the rest are aligned positionally with content
587
- * diff for matched pairs.
537
+ * For a row where new has more cells than old, find the column positions
538
+ * in new where cells were inserted by running a monotonic-alignment DP
539
+ * over the cell texts: pick the skip positions that maximise the sum-of-
540
+ * similarities of the unskipped new cells aligned positionally against
541
+ * the old cells. The inserted cells are emitted with diff markers; the
542
+ * rest are aligned positionally with content diff for matched pairs.
588
543
  */
589
544
  function diffMultiColumnAddRow(
590
545
  oldHtml: string,
591
546
  newHtml: string,
592
547
  oldRow: RowRange,
593
548
  newRow: RowRange,
594
- k: number,
595
549
  diffCell: DiffCellFn
596
550
  ): string {
597
- const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml)
551
+ const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, oldHtml, newHtml)
598
552
  const inserted = new Set(insertedPositions)
599
553
  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
600
554
  let oldIdx = 0
601
555
  for (let c = 0; c < newRow.cells.length; c++) {
602
556
  if (inserted.has(c)) {
603
- out.push(emitFullCell(newHtml, newRow.cells[c], 'ins', diffCell))
557
+ out.push(emitFullCell(newHtml, newRow.cells[c], 'ins'))
604
558
  } else {
605
559
  out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell))
606
560
  oldIdx++
@@ -615,16 +569,15 @@ function diffMultiColumnDeleteRow(
615
569
  newHtml: string,
616
570
  oldRow: RowRange,
617
571
  newRow: RowRange,
618
- k: number,
619
572
  diffCell: DiffCellFn
620
573
  ): string {
621
- const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml)
574
+ const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, oldHtml, newHtml)
622
575
  const deleted = new Set(deletedPositions)
623
576
  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
624
577
  let newIdx = 0
625
578
  for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
626
579
  if (deleted.has(oldIdx)) {
627
- out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell))
580
+ out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del'))
628
581
  continue
629
582
  }
630
583
  out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
@@ -634,74 +587,20 @@ function diffMultiColumnDeleteRow(
634
587
  return out.join('')
635
588
  }
636
589
 
637
- function findBestColumnInsertPositions(
638
- oldRow: RowRange,
639
- newRow: RowRange,
640
- k: number,
641
- oldHtml: string,
642
- newHtml: string
643
- ): number[] {
644
- let bestPositions: number[] = []
645
- let bestScore = -1
646
- for (const combo of combinationsOfRange(newRow.cells.length, k)) {
647
- const inserted = new Set(combo)
648
- let score = 0
649
- let oldIdx = 0
650
- for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
651
- if (inserted.has(newIdx)) continue
652
- score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
653
- oldIdx++
654
- }
655
- if (score > bestScore) {
656
- bestScore = score
657
- bestPositions = combo
658
- }
659
- }
660
- return bestPositions
661
- }
662
-
663
- function findBestColumnDeletePositions(
664
- oldRow: RowRange,
665
- newRow: RowRange,
666
- k: number,
667
- oldHtml: string,
668
- newHtml: string
669
- ): number[] {
670
- let bestPositions: number[] = []
671
- let bestScore = -1
672
- for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
673
- const deleted = new Set(combo)
674
- let score = 0
675
- let newIdx = 0
676
- for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
677
- if (deleted.has(oldIdx)) continue
678
- score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
679
- newIdx++
680
- }
681
- if (score > bestScore) {
682
- bestScore = score
683
- bestPositions = combo
684
- }
685
- }
686
- return bestPositions
590
+ function findBestColumnInsertPositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
591
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
592
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
593
+ return findOptimalAlignmentSkips(oldTexts, newTexts, (oldIdx, newIdx) =>
594
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
595
+ )
687
596
  }
688
597
 
689
- /**
690
- * Yields all sorted-ascending combinations of `k` distinct integers
691
- * from [0, n). Iterative implementation avoids recursion overhead and
692
- * keeps memory at O(k).
693
- */
694
- function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
695
- if (k === 0 || k > n) return
696
- const indices = Array.from({ length: k }, (_, i) => i)
697
- while (true) {
698
- yield indices.slice()
699
- let i = k - 1
700
- while (i >= 0 && indices[i] === n - k + i) i--
701
- if (i < 0) return
702
- indices[i]++
703
- for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1
704
- }
598
+ function findBestColumnDeletePositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
599
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
600
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
601
+ return findOptimalAlignmentSkips(newTexts, oldTexts, (newIdx, oldIdx) =>
602
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
603
+ )
705
604
  }
706
605
 
707
606
  /**
@@ -864,9 +763,9 @@ function diffStructurallyAlignedRow(
864
763
  const newCell = newRow.cells[align.newIdx]
865
764
  out.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
866
765
  } else if (align.newIdx !== null) {
867
- out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins', diffCell))
766
+ out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins'))
868
767
  } else if (align.oldIdx !== null) {
869
- out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del', diffCell))
768
+ out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del'))
870
769
  }
871
770
  }
872
771
 
@@ -888,7 +787,7 @@ function cellKey(html: string, cell: CellRange): string {
888
787
  * each `<td>`, with an `<ins>`/`<del>` wrapper around any cell content
889
788
  * (empty cells get the class but no wrapper).
890
789
  */
891
- function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell: DiffCellFn): string {
790
+ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del'): string {
892
791
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'
893
792
  const trOpening = parseOpeningTagAt(html, row.rowStart)
894
793
  if (!trOpening) return html.slice(row.rowStart, row.rowEnd)
@@ -898,7 +797,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
898
797
  let cursor = trOpening.end
899
798
  for (const cell of row.cells) {
900
799
  out.push(html.slice(cursor, cell.cellStart))
901
- out.push(emitFullCell(html, cell, kind, diffCell))
800
+ out.push(emitFullCell(html, cell, kind))
902
801
  cursor = cell.cellEnd
903
802
  }
904
803
  out.push(html.slice(cursor, row.rowEnd))
@@ -913,7 +812,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
913
812
  * the full recursive diff would produce for newly-inserted formatting.
914
813
  * Empty cells get the class on the `<td>` but no inner wrapping.
915
814
  */
916
- function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del', _diffCell: DiffCellFn): string {
815
+ function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del'): string {
917
816
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'
918
817
  const tdOpening = parseOpeningTagAt(html, cell.cellStart)
919
818
  if (!tdOpening) return html.slice(cell.cellStart, cell.cellEnd)
@@ -954,7 +853,7 @@ function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
954
853
  while (j < content.length && content[j] !== '<') j++
955
854
  const text = content.slice(i, j)
956
855
  if (text.trim().length > 0) {
957
- out.push(`<${tag} class='${cls}'>${text}</${tag}>`)
856
+ out.push(wrapText(text, tag, cls))
958
857
  } else {
959
858
  out.push(text)
960
859
  }
@@ -993,11 +892,6 @@ function rowHeaderSlice(html: string, row: RowRange): string {
993
892
  return html.slice(row.rowStart, row.cells[0].cellStart)
994
893
  }
995
894
 
996
- interface Alignment {
997
- oldIdx: number | null
998
- newIdx: number | null
999
- }
1000
-
1001
895
  /** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
1002
896
  const ROW_FUZZY_THRESHOLD = 0.5
1003
897
 
@@ -1024,8 +918,13 @@ function pairSimilarUnmatchedRows(
1024
918
  oldHtml: string,
1025
919
  newHtml: string
1026
920
  ): Alignment[] {
921
+ // Pre-compute row texts once; the similarity callback is invoked
922
+ // O(D × I) times per unmatched run (every del × every ins), and
923
+ // rowText walks every cell.
924
+ const oldTexts = oldTable.rows.map(r => rowText(oldHtml, r))
925
+ const newTexts = newTable.rows.map(r => rowText(newHtml, r))
1027
926
  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
1028
- rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
927
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
1029
928
  )
1030
929
  }
1031
930
 
@@ -1036,149 +935,14 @@ function pairSimilarUnmatchedCells(
1036
935
  oldHtml: string,
1037
936
  newHtml: string
1038
937
  ): Alignment[] {
938
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
939
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
1039
940
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
1040
- cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
941
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
1041
942
  )
1042
943
  }
1043
944
 
1044
- /**
1045
- * Identify pairings inside each unmatched-only run, then build the output
1046
- * alignment by walking the original and substituting paired entries at
1047
- * the *ins position* (not the del position). This keeps the result
1048
- * monotonic in newIdx — critical because the cursor-based emission
1049
- * downstream walks new's html in order. Emitting at the del position
1050
- * would be fine when del<ins in the alignment array (the typical case),
1051
- * but can violate monotonicity when there are mixed unpaired entries in
1052
- * between (column-add + row-add together, content-edit + column-add,
1053
- * etc.).
1054
- *
1055
- * Generic over what's being paired — works for both rows (by full row
1056
- * content similarity) and cells (by per-cell content similarity).
1057
- */
1058
- function pairSimilarUnmatched(
1059
- alignment: Alignment[],
1060
- threshold: number,
1061
- similarity: (oldIdx: number, newIdx: number) => number
1062
- ): Alignment[] {
1063
- const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
1064
- let i = 0
1065
- while (i < alignment.length) {
1066
- if (alignment[i].oldIdx !== null && alignment[i].newIdx !== null) {
1067
- i++
1068
- continue
1069
- }
1070
- const runStart = i
1071
- while (i < alignment.length && (alignment[i].oldIdx === null) !== (alignment[i].newIdx === null)) i++
1072
- const runEnd = i
1073
-
1074
- const delIndices: number[] = []
1075
- const insIndices: number[] = []
1076
- for (let k = runStart; k < runEnd; k++) {
1077
- if (alignment[k].oldIdx !== null) delIndices.push(k)
1078
- else insIndices.push(k)
1079
- }
1080
-
1081
- const usedIns = new Set<number>()
1082
- for (const di of delIndices) {
1083
- let bestIi = -1
1084
- let bestSim = threshold
1085
- for (const ii of insIndices) {
1086
- if (usedIns.has(ii)) continue
1087
- const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
1088
- if (sim > bestSim) {
1089
- bestSim = sim
1090
- bestIi = ii
1091
- }
1092
- }
1093
- if (bestIi >= 0) {
1094
- pairs.set(di, bestIi)
1095
- usedIns.add(bestIi)
1096
- }
1097
- }
1098
- }
1099
-
1100
- const insToDel = new Map<number, number>() // ins-alignment-idx → del-alignment-idx
1101
- for (const [delAi, insAi] of pairs) insToDel.set(insAi, delAi)
1102
- const pairedDels = new Set<number>(pairs.keys())
1103
-
1104
- const result: Alignment[] = []
1105
- for (let k = 0; k < alignment.length; k++) {
1106
- if (pairedDels.has(k)) continue // paired del — emitted when we reach its ins
1107
- if (insToDel.has(k)) {
1108
- const delAi = insToDel.get(k) as number
1109
- result.push({ oldIdx: alignment[delAi].oldIdx, newIdx: alignment[k].newIdx })
1110
- } else {
1111
- result.push(alignment[k])
1112
- }
1113
- }
1114
- return result
1115
- }
1116
-
1117
- /**
1118
- * Combined similarity metric used for both row-level and cell-level
1119
- * fuzzy pairing. Returns the MAX of two complementary metrics:
1120
- *
1121
- * 1. **Character prefix+suffix similarity** — fraction of the longer
1122
- * string covered by shared prefix + shared suffix. Catches small
1123
- * edits in the middle of a string (one word changed in a row).
1124
- * Misses cases where the bulk of common content is in the middle
1125
- * and the ends differ.
1126
- *
1127
- * 2. **Token Jaccard similarity** — intersection-over-union of the
1128
- * whitespace-split tokens. Catches "most of the content is the
1129
- * same but bookended by different bits" — e.g. a row whose only
1130
- * edit is a column added at the start and another at the end,
1131
- * where the ~50 chars in the middle that DO match would be
1132
- * invisible to prefix+suffix.
1133
- *
1134
- * Either metric exceeding the threshold means pair. Neither alone is
1135
- * sufficient for the full range of legal-doc edits we see in
1136
- * production tables.
1137
- */
1138
- function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
1139
- return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow))
1140
- }
1141
-
1142
- function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
1143
- return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell))
1144
- }
1145
-
1146
- function textSimilarity(a: string, b: string): number {
1147
- if (a === b) return 1
1148
- if (a.length === 0 || b.length === 0) return 0
1149
- return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b))
1150
- }
1151
-
1152
- function charPrefixSuffixSimilarity(a: string, b: string): number {
1153
- let prefix = 0
1154
- const minLen = Math.min(a.length, b.length)
1155
- while (prefix < minLen && a[prefix] === b[prefix]) prefix++
1156
-
1157
- let suffix = 0
1158
- while (
1159
- suffix < a.length - prefix &&
1160
- suffix < b.length - prefix &&
1161
- a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
1162
- ) {
1163
- suffix++
1164
- }
1165
-
1166
- return (prefix + suffix) / Math.max(a.length, b.length)
1167
- }
1168
-
1169
- function tokenJaccardSimilarity(a: string, b: string): number {
1170
- const tokensA = new Set(a.split(/\s+/).filter(Boolean))
1171
- const tokensB = new Set(b.split(/\s+/).filter(Boolean))
1172
- if (tokensA.size === 0 && tokensB.size === 0) return 1
1173
- let intersection = 0
1174
- for (const t of tokensA) {
1175
- if (tokensB.has(t)) intersection++
1176
- }
1177
- const union = tokensA.size + tokensB.size - intersection
1178
- return union === 0 ? 0 : intersection / union
1179
- }
1180
-
1181
- function rowText(html: string, row: RowRange): string {
945
+ export function rowText(html: string, row: RowRange): string {
1182
946
  const parts: string[] = []
1183
947
  for (const cell of row.cells) {
1184
948
  parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' '))
@@ -1195,140 +959,13 @@ function cellText(html: string, cell: CellRange): string {
1195
959
  .toLowerCase()
1196
960
  }
1197
961
 
/**
 * Standard LCS alignment of two key sequences.
 *
 * Builds the longest-common-subsequence table, then backtracks from the
 * bottom-right corner. Matching positions yield `{ oldIdx, newIdx }` with
 * both indices set; an entry present on only one side yields the other
 * index as `null`. Keys are compared with strict `===`.
 *
 * @param oldKeys - Keys of the old sequence.
 * @param newKeys - Keys of the new sequence.
 * @returns Alignment entries in ascending sequence order.
 */
function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const m = oldKeys.length
  const n = newKeys.length
  const dp: number[][] = Array.from({ length: m + 1 }, () => new Array<number>(n + 1).fill(0))
  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      if (oldKeys[i - 1] === newKeys[j - 1]) {
        dp[i][j] = dp[i - 1][j - 1] + 1
      } else {
        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1])
      }
    }
  }

  // Backtracking discovers entries last-to-first. Collect them with push
  // and reverse once at the end; prepending each entry would shift the
  // whole array every time and make the walk quadratic.
  const reversed: Alignment[] = []
  let i = m
  let j = n
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && oldKeys[i - 1] === newKeys[j - 1]) {
      reversed.push({ oldIdx: i - 1, newIdx: j - 1 })
      i--
      j--
    } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
      // Ties prefer consuming the new side first while walking backwards.
      reversed.push({ oldIdx: null, newIdx: j - 1 })
      j--
    } else {
      reversed.push({ oldIdx: i - 1, newIdx: null })
      i--
    }
  }
  return reversed.reverse()
}
1236
-
/**
 * Returns the opening tag with the given class injected. Locates the real
 * `class` attribute via attribute-aware walking (NOT a flat regex — that
 * would mis-match inside a foreign attribute value like
 * `title="see class='x'"`). When the class already partially overlaps with
 * `cls` — e.g. existing `class="mod"` and we're injecting `mod colspan` —
 * only the missing tokens get appended, so we never end up with
 * `class="mod mod colspan"`.
 *
 * If the existing value is unquoted (`class=mod`), the merged value is
 * wrapped in quotes; otherwise the appended tokens would be parsed as
 * separate attributes (`class=mod colspan`).
 *
 * @param openingTag - Full text of an opening tag, from `<` through `>`.
 * @param cls - One or more whitespace-separated class tokens to add.
 * @returns The tag text with the class tokens present; the input is
 *   returned unchanged when `cls` has no tokens or all are already there.
 */
function injectClass(openingTag: string, cls: string): string {
  // De-duplicate so `cls = "mod mod"` cannot append the same token twice.
  const clsTokens = [...new Set(cls.split(/\s+/).filter(Boolean))]
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag
    const updatedValue = [...existingTokens, ...missing].join(' ')

    // A quoted value is immediately preceded by its opening quote; an
    // unquoted one is preceded by `=` or whitespace and cannot hold spaces.
    const charBeforeValue = openingTag[classAttr.valueStart - 1]
    const isQuoted = charBeforeValue === '"' || charBeforeValue === "'"
    let replacement = updatedValue
    if (!isQuoted) {
      const quote = updatedValue.includes("'") ? '"' : "'"
      replacement = `${quote}${updatedValue}${quote}`
    }
    return openingTag.slice(0, classAttr.valueStart) + replacement + openingTag.slice(classAttr.valueEnd)
  }

  // No class attribute yet: insert one just before the closing `>` / `/>`.
  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
1268
-
/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute.
 *
 * @param openingTag - Full text of an opening tag, starting at `<`.
 * @returns The value range (start/end of the value content, *excluding*
 *   any surrounding quotes) and the value itself, or null if no `class`
 *   attribute is present. The attribute name is matched case-insensitively.
 */
function findClassAttribute(openingTag: string): { valueStart: number; valueEnd: number; value: string } | null {
  const len = openingTag.length
  const whitespace = /\s/
  const tagNameChar = /[A-Za-z0-9_:-]/
  const attrNameEnd = /[\s=>/]/

  // Skip past the tag name. Tag starts with `<`; the first run of
  // [A-Za-z0-9_:-] is the tag name. Anything after is attribute territory.
  let i = 1
  while (i < len && tagNameChar.test(openingTag[i])) i++

  while (i < len) {
    // Skip whitespace
    while (i < len && whitespace.test(openingTag[i])) i++
    if (i >= len || openingTag[i] === '>') break
    if (openingTag[i] === '/') {
      // `/>` (or a trailing `/`) ends the tag. A solitary `/` elsewhere is
      // just a separator and must not hide the attributes that follow it.
      if (i + 1 >= len || openingTag[i + 1] === '>') break
      i++
      continue
    }

    // Read attribute name
    const nameStart = i
    while (i < len && !attrNameEnd.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value
    while (i < len && whitespace.test(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value) — not class
      continue
    }
    i++ // past '='
    while (i < len && whitespace.test(openingTag[i])) i++

    // Value: quoted or unquoted
    let valueStart: number
    let valueEnd: number
    if (openingTag[i] === '"' || openingTag[i] === "'") {
      const quote = openingTag[i]
      i++
      valueStart = i
      while (i < len && openingTag[i] !== quote) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      // An unquoted value runs to whitespace or `>`. A `/` is part of the
      // value (e.g. `href=/a/b`) unless it is the `/` of a closing `/>`.
      valueStart = i
      while (
        i < len &&
        !whitespace.test(openingTag[i]) &&
        openingTag[i] !== '>' &&
        !(openingTag[i] === '/' && openingTag[i + 1] === '>')
      ) {
        i++
      }
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
1324
-
1325
962
  /**
1326
963
  * Walks html and returns ranges for every top-level `<table>...</table>`
1327
964
  * block. Nested tables aren't extracted as separate top-level entries —
1328
965
  * they're captured inside the parent's content range and handled when the
1329
966
  * cell-level diff recurses through them.
1330
967
  */
1331
- function findTopLevelTables(html: string): TableRange[] {
968
+ export function findTopLevelTables(html: string): TableRange[] {
1332
969
  const tables: TableRange[] = []
1333
970
  let i = 0
1334
971
  while (i < html.length) {
@@ -1414,91 +1051,3 @@ function findTopLevelCells(html: string, start: number, end: number): CellRange[
1414
1051
  }
1415
1052
  return cells
1416
1053
  }
1417
-
/**
 * Reports whether an opening tag named `tagName` starts at index `i`.
 *
 * The name in `html` is lower-cased before comparison, so `tagName` must
 * be given in lower case. The character after the name has to terminate
 * it: `>`, `/`, or HTML whitespace (space, tab, LF, CR, form feed). This
 * keeps `<td` from matching `<tdx>`.
 *
 * @param html - Text being scanned.
 * @param i - Index at which the `<` is expected.
 * @param tagName - Lower-case tag name to look for.
 * @returns True when `<tagName` followed by a terminator sits at `i`.
 */
function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false
  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false
  const after = html[nameEnd]
  return (
    after === '>' ||
    after === '/' ||
    after === ' ' ||
    after === '\t' ||
    after === '\n' ||
    after === '\r' ||
    after === '\f'
  )
}
1425
-
/**
 * Reports whether a closing tag `</tagName` starts at index `i`.
 *
 * The name in `html` is lower-cased before comparison, so `tagName` must
 * be given in lower case. The character after the name has to be `>` or
 * HTML whitespace (space, tab, LF, CR, form feed).
 *
 * @param html - Text being scanned.
 * @param i - Index at which the `<` is expected.
 * @param tagName - Lower-case tag name to look for.
 * @returns True when `</tagName` followed by a terminator sits at `i`.
 */
function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<' || html[i + 1] !== '/') return false
  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName) return false
  const after = html[nameEnd]
  return (
    after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r' || after === '\f'
  )
}
1433
-
interface OpeningTag {
  /** Index just past the closing `>` of the opening tag. */
  end: number
}

/**
 * Finds where the markup construct starting at index `i` ends.
 *
 * @param html - Text being scanned.
 * @param i - Index of the `<` that starts the construct.
 * @returns `{ end }` with the index just past the terminator, or null when
 *   the construct is not terminated before the end of `html`.
 */
function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  // HTML comments, CDATA, and processing instructions need their own
  // terminators — a plain `>`-walker would cut a comment like
  // `<!-- a > b -->` at the first inner `>`, treating the rest as text
  // and corrupting downstream offsets. Word-exported HTML routinely
  // emits comments inside tables (conditional comments, OLE markers) so
  // these have to be handled, not just be theoretical.
  if (html.startsWith('<!--', i)) {
    const close = html.indexOf('-->', i + 4)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<![CDATA[', i)) {
    const close = html.indexOf(']]>', i + 9)
    return close === -1 ? null : { end: close + 3 }
  }
  if (html.startsWith('<?', i)) {
    const close = html.indexOf('?>', i + 2)
    return close === -1 ? null : { end: close + 2 }
  }

  // Other `<!…>` declarations (e.g. DOCTYPE) carry bare quoted strings
  // with no `=` in front, so a quote may open a quoted run anywhere.
  const isDeclaration = html.startsWith('<!', i)

  // Walk to the next unquoted '>'. Handles attributes whose values contain
  // a literal '>' inside quotes, which a plain indexOf would mishandle.
  // In a tag, a quote only delimits a value when it directly follows `=`
  // (optionally after whitespace). A quote anywhere else — such as the
  // apostrophe in `title=don't` — is ordinary text and must not start a
  // quoted run that swallows the rest of the document.
  type ScanState = 'markup' | 'beforeValue' | 'unquotedValue' | 'quotedValue'
  let state: ScanState = 'markup'
  let quote = ''
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (state === 'quotedValue') {
      if (ch === quote) state = 'markup'
      continue
    }
    if (ch === '>') return { end: j + 1 }

    const isSpace = ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f'
    const isQuote = ch === '"' || ch === "'"
    if (state === 'beforeValue') {
      if (isSpace) continue
      if (isQuote) {
        quote = ch
        state = 'quotedValue'
      } else {
        state = 'unquotedValue'
      }
    } else if (state === 'unquotedValue') {
      if (isSpace) state = 'markup'
    } else if (ch === '=') {
      state = 'beforeValue'
    } else if (isDeclaration && isQuote) {
      quote = ch
      state = 'quotedValue'
    }
  }
  return null
}
1475
-
/**
 * Locates the closing tag that balances an already-opened `tagName`
 * element, counting nested elements of the same name along the way.
 *
 * Scanning starts at `from` with one element open. Each further opening
 * tag of the same name (unless written self-closing, `.../>`) raises the
 * nesting count; each closing tag lowers it.
 *
 * @param html - Text being scanned.
 * @param from - Index at which scanning starts.
 * @param tagName - Tag name to balance.
 * @param limit - Scanning stops once the position reaches this index.
 * @returns The index just past the balancing `</tagName>`, or -1 if the
 *   scan reaches `limit` without finding it.
 */
function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
  let openCount = 1
  let pos = from
  while (pos < limit) {
    if (matchesTagAt(html, pos, tagName)) {
      const parsedOpen = parseOpeningTagAt(html, pos)
      if (parsedOpen === null) {
        // Unterminated tag: treat the `<` as plain text and move on.
        pos += 1
        continue
      }
      const selfClosing = html.endsWith('/>', parsedOpen.end)
      if (!selfClosing) openCount += 1
      pos = parsedOpen.end
      continue
    }

    if (!matchesClosingTagAt(html, pos, tagName)) {
      pos += 1
      continue
    }

    openCount -= 1
    const parsedClose = parseOpeningTagAt(html, pos)
    // Fall back to the length of a bare `</tagName>` when no `>` is found.
    const closeEnd = parsedClose === null ? pos + tagName.length + 3 : parsedClose.end
    if (openCount === 0) return closeEnd
    pos = closeEnd
  }
  return -1
}