@createiq/htmldiff 1.1.0 → 1.2.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/TableDiff.ts CHANGED
@@ -1,3 +1,18 @@
1
+ import {
2
+ type Alignment,
3
+ findOptimalAlignmentSkips,
4
+ lcsAlign,
5
+ orderAlignmentForEmission,
6
+ pairSimilarUnmatched,
7
+ textSimilarity,
8
+ } from './Alignment'
9
+ import {
10
+ findMatchingClosingTag,
11
+ injectClass,
12
+ matchesClosingTagAt,
13
+ matchesTagAt,
14
+ parseOpeningTagAt,
15
+ } from './HtmlScanner'
1
16
  import { wrapText } from './Utils'
2
17
 
3
18
  /**
@@ -24,7 +39,7 @@ import { wrapText } from './Utils'
24
39
  * by the normal word-level pipeline.
25
40
  */
26
41
 
27
- interface CellRange {
42
+ export interface CellRange {
28
43
  /** Start index of the cell's opening tag in the original html. */
29
44
  cellStart: number
30
45
  /** Index just past the cell's closing tag. */
@@ -34,13 +49,13 @@ interface CellRange {
34
49
  contentEnd: number
35
50
  }
36
51
 
37
- interface RowRange {
52
+ export interface RowRange {
38
53
  rowStart: number
39
54
  rowEnd: number
40
55
  cells: CellRange[]
41
56
  }
42
57
 
43
- interface TableRange {
58
+ export interface TableRange {
44
59
  tableStart: number
45
60
  tableEnd: number
46
61
  rows: RowRange[]
@@ -75,16 +90,26 @@ const PLACEHOLDER_SUFFIX = '-->'
75
90
  const MAX_TABLE_ROWS = 1500
76
91
  const MAX_TABLE_CELLS_PER_ROW = 200
77
92
 
78
- // Caps for the per-row combinatorial column-position search in
79
- // findBestColumnInsertPositions / findBestColumnDeletePositions. Worst
80
- // case is C(MAX_COLUMN_SEARCH_WIDTH, MAX_COLUMN_DELTA) ≈ 3.8M combos at
81
- // the caps below; wider or more-skewed rows fall through to cell-LCS.
93
+ // Caps for the per-row column-position DP in
94
+ // findBestColumnInsertPositions / findBestColumnDeletePositions.
95
+ // MAX_COLUMN_DELTA is the *semantic* guard: a row with more than 6
96
+ // columns added or deleted is almost always a row rewrite rather than
97
+ // a structural column change, and is better handled by cell-LCS with
98
+ // fuzzy pairing. MAX_COLUMN_SEARCH_WIDTH bounds the per-row DP at
99
+ // O(MAX_COLUMN_SEARCH_WIDTH²) ≈ 40K ops; aligned with
100
+ // MAX_TABLE_CELLS_PER_ROW so any row that survives the table-size cap
101
+ // can still use the DP path.
82
102
  const MAX_COLUMN_DELTA = 6
83
- const MAX_COLUMN_SEARCH_WIDTH = 40
103
+ const MAX_COLUMN_SEARCH_WIDTH = 200
84
104
 
85
- function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
105
+ /**
106
+ * Generate a placeholder-prefix nonce that doesn't collide with any
107
+ * existing content in the inputs. Variadic so callers with N inputs
108
+ * (e.g. three-way diff with V1/V2/V3) check across all of them.
109
+ */
110
+ export function makePlaceholderPrefix(...inputs: string[]): string {
86
111
  // 4 random bytes → 8 hex chars → 16^8 ≈ 4.3 billion combinations. We
87
- // also retry if the generated nonce happens to occur in either input.
112
+ // also retry if the generated nonce happens to occur in any input.
88
113
  // Using `Math.random` here is fine: we're not defending against a
89
114
  // malicious adversary, just avoiding accidental collisions.
90
115
  for (let attempt = 0; attempt < 8; attempt++) {
@@ -92,7 +117,7 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
92
117
  .toString(16)
93
118
  .padStart(8, '0')
94
119
  const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
95
- if (!oldHtml.includes(prefix) && !newHtml.includes(prefix)) {
120
+ if (inputs.every(input => !input.includes(prefix))) {
96
121
  return prefix
97
122
  }
98
123
  }
@@ -102,6 +127,8 @@ function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
102
127
  return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`
103
128
  }
104
129
 
130
+ export { PLACEHOLDER_SUFFIX }
131
+
105
132
  type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
106
133
 
107
134
  /**
@@ -154,11 +181,11 @@ export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff:
154
181
  return result
155
182
  }
156
183
 
157
- function spliceString(s: string, start: number, end: number, replacement: string): string {
184
+ export function spliceString(s: string, start: number, end: number, replacement: string): string {
158
185
  return s.slice(0, start) + replacement + s.slice(end)
159
186
  }
160
187
 
161
- function exceedsSizeLimit(table: TableRange): boolean {
188
+ export function exceedsSizeLimit(table: TableRange): boolean {
162
189
  if (table.rows.length > MAX_TABLE_ROWS) return true
163
190
  for (const row of table.rows) {
164
191
  if (row.cells.length > MAX_TABLE_CELLS_PER_ROW) return true
@@ -329,7 +356,7 @@ function emitEmptyRow(html: string, row: RowRange): string {
329
356
  return html.slice(row.rowStart, row.rowEnd)
330
357
  }
331
358
 
332
- function sameDimensions(a: TableRange, b: TableRange): boolean {
359
+ export function sameDimensions(a: TableRange, b: TableRange): boolean {
333
360
  if (a.rows.length !== b.rows.length) return false
334
361
  for (let i = 0; i < a.rows.length; i++) {
335
362
  if (a.rows[i].cells.length !== b.rows[i].cells.length) return false
@@ -428,79 +455,6 @@ function diffStructurallyAlignedTable(
428
455
  return out.join('')
429
456
  }
430
457
 
431
- /**
432
- * Reorders the alignment so emission produces rows in the visually-
433
- * correct order. Each entry is assigned a fractional "position" in
434
- * new's flow:
435
- *
436
- * • Preserved/paired (oldIdx, newIdx): position = newIdx.
437
- * • Pure insert (null, newIdx): position = newIdx.
438
- * • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
439
- * before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
440
- * they appear in old's row order. The +0.5 places dels BEFORE any
441
- * insert at the same gap (insert at newIdx N1+1 has position N1+1
442
- * which is > N1+0.5), giving the natural "delete first, insert
443
- * second" reading order at a replaced position.
444
- *
445
- * This handles the full range:
446
- * • Run of unpaired dels at the start (no preserved predecessor):
447
- * position -0.5, sorted by oldIdx.
448
- * • Dels in the middle: positioned right after their preceding
449
- * preserved row.
450
- * • Dels at the end (no preserved successor): positioned after the
451
- * last preserved row.
452
- *
453
- * Without this reordering, a run of unpaired deletes at low alignment
454
- * indices got emitted at cursor = first-new-row position — putting
455
- * all deletes before any preserved row in the output, regardless of
456
- * where they came from in old.
457
- */
458
- function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
459
- const preserved: Array<{ oldIdx: number; newIdx: number }> = []
460
- for (const a of alignment) {
461
- if (a.oldIdx !== null && a.newIdx !== null) {
462
- preserved.push({ oldIdx: a.oldIdx, newIdx: a.newIdx })
463
- }
464
- }
465
- preserved.sort((a, b) => a.oldIdx - b.oldIdx)
466
-
467
- // For a deleted row with oldIdx K, return the newIdx of the preserved
468
- // entry with the largest oldIdx less than K, or -1 if none.
469
- function newIdxOfPreservedBefore(oldIdx: number): number {
470
- let result = -1
471
- for (const p of preserved) {
472
- if (p.oldIdx >= oldIdx) break
473
- result = p.newIdx
474
- }
475
- return result
476
- }
477
-
478
- // Decorate each alignment with a fractional position. We use
479
- // (primary, secondary) tuples so dels at the same gap sort by oldIdx
480
- // (in old's row order) and inserts at the same newIdx stay stable.
481
- const decorated = alignment.map((a, i) => {
482
- let primary: number
483
- let secondary: number
484
- if (a.newIdx !== null) {
485
- primary = a.newIdx
486
- secondary = a.oldIdx === null ? 1 : 0 // preserved before pure-insert at same newIdx (rare)
487
- } else {
488
- // Pure delete
489
- primary = newIdxOfPreservedBefore(a.oldIdx as number) + 0.5
490
- secondary = a.oldIdx as number
491
- }
492
- return { entry: a, primary, secondary, originalIdx: i }
493
- })
494
-
495
- decorated.sort((a, b) => {
496
- if (a.primary !== b.primary) return a.primary - b.primary
497
- if (a.secondary !== b.secondary) return a.secondary - b.secondary
498
- return a.originalIdx - b.originalIdx // stable
499
- })
500
-
501
- return decorated.map(d => d.entry)
502
- }
503
-
504
458
  function rebuildStructurallyAlignedTable(
505
459
  oldHtml: string,
506
460
  newHtml: string,
@@ -534,7 +488,7 @@ function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, old
534
488
  return oldHtml.slice(oldTable.tableStart, oldFirstRow)
535
489
  }
536
490
 
537
- function rowKey(html: string, row: RowRange): string {
491
+ export function rowKey(html: string, row: RowRange): string {
538
492
  // Include cell tag text in the key so column-add doesn't accidentally
539
493
  // match a row to one with different cell counts. Whitespace-normalize to
540
494
  // tolerate formatting differences.
@@ -561,9 +515,11 @@ function diffPreservedRow(
561
515
  // remaining cells positionally. This handles content-edit alongside
562
516
  // column-add by keeping the edited cell in its column position rather
563
517
  // than orphaning it via the cell-LCS exact match.
564
- // Guardrail: combinatorial search is C(newCount, k); we cap to avoid
565
- // explosion on very wide tables. Worst case at the caps is C(40, 6) ≈
566
- // 3.8M combos; above that we fall through to cell-LCS.
518
+ // Guardrail: O(M × N) DP scales fine within MAX_COLUMN_SEARCH_WIDTH;
519
+ // wider rows fall through to cell-LCS so we don't run the per-row DP
520
+ // on multi-thousand-cell exotica. MAX_COLUMN_DELTA stays as a
521
+ // semantic guard — a delta > 6 usually means "row rewrite", not
522
+ // "column added", and is better handled by cell-LCS.
567
523
  const delta = newRow.cells.length - oldRow.cells.length
568
524
  const absDelta = Math.abs(delta)
569
525
  if (
@@ -571,29 +527,28 @@ function diffPreservedRow(
571
527
  absDelta <= MAX_COLUMN_DELTA &&
572
528
  Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
573
529
  ) {
574
- if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
575
- return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
530
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, diffCell)
531
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, diffCell)
576
532
  }
577
533
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
578
534
  }
579
535
 
580
536
  /**
581
- * For a row where new has K more cells than old, find the K column
582
- * positions in new where cells were inserted by scanning all C(newCount,
583
- * K) combinations and picking the one that maximises positional content
584
- * similarity with the remaining cells. The inserted cells are emitted
585
- * with diff markers; the rest are aligned positionally with content
586
- * diff for matched pairs.
537
+ * For a row where new has more cells than old, find the column positions
538
+ * in new where cells were inserted by running a monotonic-alignment DP
539
+ * over the cell texts: pick the skip positions that maximise the sum-of-
540
+ * similarities of the unskipped new cells aligned positionally against
541
+ * the old cells. The inserted cells are emitted with diff markers; the
542
+ * rest are aligned positionally with content diff for matched pairs.
587
543
  */
588
544
  function diffMultiColumnAddRow(
589
545
  oldHtml: string,
590
546
  newHtml: string,
591
547
  oldRow: RowRange,
592
548
  newRow: RowRange,
593
- k: number,
594
549
  diffCell: DiffCellFn
595
550
  ): string {
596
- const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml)
551
+ const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, oldHtml, newHtml)
597
552
  const inserted = new Set(insertedPositions)
598
553
  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
599
554
  let oldIdx = 0
@@ -614,10 +569,9 @@ function diffMultiColumnDeleteRow(
614
569
  newHtml: string,
615
570
  oldRow: RowRange,
616
571
  newRow: RowRange,
617
- k: number,
618
572
  diffCell: DiffCellFn
619
573
  ): string {
620
- const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml)
574
+ const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, oldHtml, newHtml)
621
575
  const deleted = new Set(deletedPositions)
622
576
  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
623
577
  let newIdx = 0
@@ -633,81 +587,20 @@ function diffMultiColumnDeleteRow(
633
587
  return out.join('')
634
588
  }
635
589
 
636
- function findBestColumnInsertPositions(
637
- oldRow: RowRange,
638
- newRow: RowRange,
639
- k: number,
640
- oldHtml: string,
641
- newHtml: string
642
- ): number[] {
643
- // Pre-compute cell texts once instead of letting textSimilarity
644
- // recompute them inside every combo iteration — C(N, K) combos times
645
- // ~N text extractions each is a lot of wasted string work.
590
+ function findBestColumnInsertPositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
646
591
  const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
647
592
  const newTexts = newRow.cells.map(c => cellText(newHtml, c))
648
- let bestPositions: number[] = []
649
- let bestScore = -1
650
- for (const combo of combinationsOfRange(newRow.cells.length, k)) {
651
- const inserted = new Set(combo)
652
- let score = 0
653
- let oldIdx = 0
654
- for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
655
- if (inserted.has(newIdx)) continue
656
- score += textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
657
- oldIdx++
658
- }
659
- if (score > bestScore) {
660
- bestScore = score
661
- bestPositions = combo
662
- }
663
- }
664
- return bestPositions
593
+ return findOptimalAlignmentSkips(oldTexts, newTexts, (oldIdx, newIdx) =>
594
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
595
+ )
665
596
  }
666
597
 
667
- function findBestColumnDeletePositions(
668
- oldRow: RowRange,
669
- newRow: RowRange,
670
- k: number,
671
- oldHtml: string,
672
- newHtml: string
673
- ): number[] {
598
+ function findBestColumnDeletePositions(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number[] {
674
599
  const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
675
600
  const newTexts = newRow.cells.map(c => cellText(newHtml, c))
676
- let bestPositions: number[] = []
677
- let bestScore = -1
678
- for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
679
- const deleted = new Set(combo)
680
- let score = 0
681
- let newIdx = 0
682
- for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
683
- if (deleted.has(oldIdx)) continue
684
- score += textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
685
- newIdx++
686
- }
687
- if (score > bestScore) {
688
- bestScore = score
689
- bestPositions = combo
690
- }
691
- }
692
- return bestPositions
693
- }
694
-
695
- /**
696
- * Yields all sorted-ascending combinations of `k` distinct integers
697
- * from [0, n). Iterative implementation avoids recursion overhead and
698
- * keeps memory at O(k).
699
- */
700
- function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
701
- if (k === 0 || k > n) return
702
- const indices = Array.from({ length: k }, (_, i) => i)
703
- while (true) {
704
- yield indices.slice()
705
- let i = k - 1
706
- while (i >= 0 && indices[i] === n - k + i) i--
707
- if (i < 0) return
708
- indices[i]++
709
- for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1
710
- }
601
+ return findOptimalAlignmentSkips(newTexts, oldTexts, (newIdx, oldIdx) =>
602
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
603
+ )
711
604
  }
712
605
 
713
606
  /**
@@ -999,11 +892,6 @@ function rowHeaderSlice(html: string, row: RowRange): string {
999
892
  return html.slice(row.rowStart, row.cells[0].cellStart)
1000
893
  }
1001
894
 
1002
- interface Alignment {
1003
- oldIdx: number | null
1004
- newIdx: number | null
1005
- }
1006
-
1007
895
  /** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
1008
896
  const ROW_FUZZY_THRESHOLD = 0.5
1009
897
 
@@ -1054,136 +942,7 @@ function pairSimilarUnmatchedCells(
1054
942
  )
1055
943
  }
1056
944
 
1057
- /**
1058
- * Identify pairings inside each unmatched-only run, then build the output
1059
- * alignment by walking the original and substituting paired entries at
1060
- * the *ins position* (not the del position). This keeps the result
1061
- * monotonic in newIdx — critical because the cursor-based emission
1062
- * downstream walks new's html in order. Emitting at the del position
1063
- * would be fine when del<ins in the alignment array (the typical case),
1064
- * but can violate monotonicity when there are mixed unpaired entries in
1065
- * between (column-add + row-add together, content-edit + column-add,
1066
- * etc.).
1067
- *
1068
- * Generic over what's being paired — works for both rows (by full row
1069
- * content similarity) and cells (by per-cell content similarity).
1070
- */
1071
- function pairSimilarUnmatched(
1072
- alignment: Alignment[],
1073
- threshold: number,
1074
- similarity: (oldIdx: number, newIdx: number) => number
1075
- ): Alignment[] {
1076
- const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
1077
- let i = 0
1078
- while (i < alignment.length) {
1079
- if (alignment[i].oldIdx !== null && alignment[i].newIdx !== null) {
1080
- i++
1081
- continue
1082
- }
1083
- const runStart = i
1084
- while (i < alignment.length && (alignment[i].oldIdx === null) !== (alignment[i].newIdx === null)) i++
1085
- const runEnd = i
1086
-
1087
- const delIndices: number[] = []
1088
- const insIndices: number[] = []
1089
- for (let k = runStart; k < runEnd; k++) {
1090
- if (alignment[k].oldIdx !== null) delIndices.push(k)
1091
- else insIndices.push(k)
1092
- }
1093
-
1094
- const usedIns = new Set<number>()
1095
- for (const di of delIndices) {
1096
- let bestIi = -1
1097
- let bestSim = threshold
1098
- for (const ii of insIndices) {
1099
- if (usedIns.has(ii)) continue
1100
- const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
1101
- if (sim > bestSim) {
1102
- bestSim = sim
1103
- bestIi = ii
1104
- }
1105
- }
1106
- if (bestIi >= 0) {
1107
- pairs.set(di, bestIi)
1108
- usedIns.add(bestIi)
1109
- }
1110
- }
1111
- }
1112
-
1113
- const insToDel = new Map<number, number>() // ins-alignment-idx → del-alignment-idx
1114
- for (const [delAi, insAi] of pairs) insToDel.set(insAi, delAi)
1115
- const pairedDels = new Set<number>(pairs.keys())
1116
-
1117
- const result: Alignment[] = []
1118
- for (let k = 0; k < alignment.length; k++) {
1119
- if (pairedDels.has(k)) continue // paired del — emitted when we reach its ins
1120
- if (insToDel.has(k)) {
1121
- const delAi = insToDel.get(k) as number
1122
- result.push({ oldIdx: alignment[delAi].oldIdx, newIdx: alignment[k].newIdx })
1123
- } else {
1124
- result.push(alignment[k])
1125
- }
1126
- }
1127
- return result
1128
- }
1129
-
1130
- /**
1131
- * Combined similarity metric used for both row-level and cell-level
1132
- * fuzzy pairing. Returns the MAX of two complementary metrics:
1133
- *
1134
- * 1. **Character prefix+suffix similarity** — fraction of the longer
1135
- * string covered by shared prefix + shared suffix. Catches small
1136
- * edits in the middle of a string (one word changed in a row).
1137
- * Misses cases where the bulk of common content is in the middle
1138
- * and the ends differ.
1139
- *
1140
- * 2. **Token Jaccard similarity** — intersection-over-union of the
1141
- * whitespace-split tokens. Catches "most of the content is the
1142
- * same but bookended by different bits" — e.g. a row whose only
1143
- * edit is a column added at the start and another at the end,
1144
- * where the ~50 chars in the middle that DO match would be
1145
- * invisible to prefix+suffix.
1146
- *
1147
- * Either metric exceeding the threshold means pair. Neither alone is
1148
- * sufficient for the full range of legal-doc edits we see in
1149
- * production tables.
1150
- */
1151
- function textSimilarity(a: string, b: string): number {
1152
- if (a === b) return 1
1153
- if (a.length === 0 || b.length === 0) return 0
1154
- return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b))
1155
- }
1156
-
1157
- function charPrefixSuffixSimilarity(a: string, b: string): number {
1158
- let prefix = 0
1159
- const minLen = Math.min(a.length, b.length)
1160
- while (prefix < minLen && a[prefix] === b[prefix]) prefix++
1161
-
1162
- let suffix = 0
1163
- while (
1164
- suffix < a.length - prefix &&
1165
- suffix < b.length - prefix &&
1166
- a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
1167
- ) {
1168
- suffix++
1169
- }
1170
-
1171
- return (prefix + suffix) / Math.max(a.length, b.length)
1172
- }
1173
-
1174
- function tokenJaccardSimilarity(a: string, b: string): number {
1175
- const tokensA = new Set(a.split(/\s+/).filter(Boolean))
1176
- const tokensB = new Set(b.split(/\s+/).filter(Boolean))
1177
- if (tokensA.size === 0 && tokensB.size === 0) return 1
1178
- let intersection = 0
1179
- for (const t of tokensA) {
1180
- if (tokensB.has(t)) intersection++
1181
- }
1182
- const union = tokensA.size + tokensB.size - intersection
1183
- return union === 0 ? 0 : intersection / union
1184
- }
1185
-
1186
- function rowText(html: string, row: RowRange): string {
945
+ export function rowText(html: string, row: RowRange): string {
1187
946
  const parts: string[] = []
1188
947
  for (const cell of row.cells) {
1189
948
  parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' '))
@@ -1200,139 +959,13 @@ function cellText(html: string, cell: CellRange): string {
1200
959
  .toLowerCase()
1201
960
  }
1202
961
 
1203
- /**
1204
- * Standard LCS alignment: walks both sequences and emits a list of pairs
1205
- * where `(oldIdx, newIdx)` are both set for matching positions, and one
1206
- * side is null for an unmatched entry on the other side. Equality uses
1207
- * strict ===.
1208
- */
1209
- function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
1210
- const m = oldKeys.length
1211
- const n = newKeys.length
1212
- const dp: number[][] = Array.from({ length: m + 1 }, () => new Array<number>(n + 1).fill(0))
1213
- for (let i = 1; i <= m; i++) {
1214
- for (let j = 1; j <= n; j++) {
1215
- if (oldKeys[i - 1] === newKeys[j - 1]) {
1216
- dp[i][j] = dp[i - 1][j - 1] + 1
1217
- } else {
1218
- dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1])
1219
- }
1220
- }
1221
- }
1222
-
1223
- // Backtrack and push; reverse at the end. `unshift` is O(n) per call
1224
- // so the naive version was O(n²); push+reverse is O(n) total.
1225
- const result: Alignment[] = []
1226
- let i = m
1227
- let j = n
1228
- while (i > 0 || j > 0) {
1229
- if (i > 0 && j > 0 && oldKeys[i - 1] === newKeys[j - 1]) {
1230
- result.push({ oldIdx: i - 1, newIdx: j - 1 })
1231
- i--
1232
- j--
1233
- } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
1234
- result.push({ oldIdx: null, newIdx: j - 1 })
1235
- j--
1236
- } else {
1237
- result.push({ oldIdx: i - 1, newIdx: null })
1238
- i--
1239
- }
1240
- }
1241
- result.reverse()
1242
- return result
1243
- }
1244
-
1245
- /**
1246
- * Returns the opening tag with the given class injected. Locates the real
1247
- * `class` attribute via attribute-aware walking (NOT a flat regex — that
1248
- * would mis-match inside a foreign attribute value like
1249
- * `title="see class='x'"`). When the class already partially overlaps with
1250
- * `cls` — e.g. existing `class="mod"` and we're injecting `mod colspan` —
1251
- * only the missing tokens get appended, so we never end up with
1252
- * `class="mod mod colspan"`.
1253
- */
1254
- function injectClass(openingTag: string, cls: string): string {
1255
- const clsTokens = cls.split(/\s+/).filter(Boolean)
1256
- if (clsTokens.length === 0) return openingTag
1257
-
1258
- const classAttr = findClassAttribute(openingTag)
1259
- if (classAttr) {
1260
- const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
1261
- const missing = clsTokens.filter(t => !existingTokens.includes(t))
1262
- if (missing.length === 0) return openingTag
1263
- const updatedValue =
1264
- existingTokens.length === 0 ? missing.join(' ') : `${existingTokens.join(' ')} ${missing.join(' ')}`
1265
- return openingTag.slice(0, classAttr.valueStart) + updatedValue + openingTag.slice(classAttr.valueEnd)
1266
- }
1267
-
1268
- const isSelfClosing = openingTag.endsWith('/>')
1269
- const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
1270
- return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
1271
- }
1272
-
1273
- /**
1274
- * Walks the opening tag's attributes (respecting quoted values) to find
1275
- * the actual `class` attribute. Returns the value range (start/end of the
1276
- * value content, *excluding* the surrounding quotes) and the value, or
1277
- * null if no `class` attribute is present.
1278
- */
1279
- function findClassAttribute(openingTag: string): { valueStart: number; valueEnd: number; value: string } | null {
1280
- // Skip past the tag name. Tag starts with `<`; first run of [A-Za-z0-9_:-]
1281
- // is the tag name. Anything after is attribute territory.
1282
- let i = 1
1283
- while (i < openingTag.length && /[A-Za-z0-9_:-]/.test(openingTag[i])) i++
1284
-
1285
- while (i < openingTag.length) {
1286
- // Skip whitespace
1287
- while (i < openingTag.length && /\s/.test(openingTag[i])) i++
1288
- if (i >= openingTag.length) break
1289
- if (openingTag[i] === '>' || openingTag[i] === '/') break
1290
-
1291
- // Read attribute name
1292
- const nameStart = i
1293
- while (i < openingTag.length && !/[\s=>/]/.test(openingTag[i])) i++
1294
- const name = openingTag.slice(nameStart, i)
1295
-
1296
- // Optional whitespace + '=' + optional whitespace + value
1297
- while (i < openingTag.length && /\s/.test(openingTag[i])) i++
1298
- if (openingTag[i] !== '=') {
1299
- // Bare attribute (no value) — not class
1300
- continue
1301
- }
1302
- i++ // past '='
1303
- while (i < openingTag.length && /\s/.test(openingTag[i])) i++
1304
-
1305
- // Value: quoted or unquoted
1306
- let valueStart: number
1307
- let valueEnd: number
1308
- if (openingTag[i] === '"' || openingTag[i] === "'") {
1309
- const quote = openingTag[i]
1310
- i++
1311
- valueStart = i
1312
- while (i < openingTag.length && openingTag[i] !== quote) i++
1313
- valueEnd = i
1314
- if (i < openingTag.length) i++ // past closing quote
1315
- } else {
1316
- valueStart = i
1317
- while (i < openingTag.length && !/[\s>/]/.test(openingTag[i])) i++
1318
- valueEnd = i
1319
- }
1320
-
1321
- if (name.toLowerCase() === 'class') {
1322
- return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
1323
- }
1324
- }
1325
-
1326
- return null
1327
- }
1328
-
1329
962
  /**
1330
963
  * Walks html and returns ranges for every top-level `<table>...</table>`
1331
964
  * block. Nested tables aren't extracted as separate top-level entries —
1332
965
  * they're captured inside the parent's content range and handled when the
1333
966
  * cell-level diff recurses through them.
1334
967
  */
1335
- function findTopLevelTables(html: string): TableRange[] {
968
+ export function findTopLevelTables(html: string): TableRange[] {
1336
969
  const tables: TableRange[] = []
1337
970
  let i = 0
1338
971
  while (i < html.length) {
@@ -1418,91 +1051,3 @@ function findTopLevelCells(html: string, start: number, end: number): CellRange[
1418
1051
  }
1419
1052
  return cells
1420
1053
  }
1421
-
1422
- function matchesTagAt(html: string, i: number, tagName: string): boolean {
1423
- if (html[i] !== '<') return false
1424
- const candidate = html.slice(i + 1, i + 1 + tagName.length).toLowerCase()
1425
- if (candidate !== tagName) return false
1426
- const after = html[i + 1 + tagName.length]
1427
- return after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r' || after === '/'
1428
- }
1429
-
1430
- function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
1431
- if (html[i] !== '<' || html[i + 1] !== '/') return false
1432
- const candidate = html.slice(i + 2, i + 2 + tagName.length).toLowerCase()
1433
- if (candidate !== tagName) return false
1434
- const after = html[i + 2 + tagName.length]
1435
- return after === '>' || after === ' ' || after === '\t' || after === '\n' || after === '\r'
1436
- }
1437
-
1438
- interface OpeningTag {
1439
- /** Index just past the closing `>` of the opening tag. */
1440
- end: number
1441
- }
1442
-
1443
- function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
1444
- // HTML comments, CDATA, processing instructions, and DOCTYPE need their
1445
- // own terminators — a plain `>`-walker would cut a comment like
1446
- // `<!-- a > b -->` at the first inner `>`, treating the rest as text
1447
- // and corrupting downstream offsets. Word-exported HTML routinely
1448
- // emits comments inside tables (conditional comments, OLE markers) so
1449
- // these have to be handled, not just be theoretical.
1450
- if (html.startsWith('<!--', i)) {
1451
- const close = html.indexOf('-->', i + 4)
1452
- return close === -1 ? null : { end: close + 3 }
1453
- }
1454
- if (html.startsWith('<![CDATA[', i)) {
1455
- const close = html.indexOf(']]>', i + 9)
1456
- return close === -1 ? null : { end: close + 3 }
1457
- }
1458
- if (html.startsWith('<?', i)) {
1459
- const close = html.indexOf('?>', i + 2)
1460
- return close === -1 ? null : { end: close + 2 }
1461
- }
1462
- // Walk to the next unquoted '>'. Handles attributes whose values contain
1463
- // a literal '>' inside quotes, which a plain indexOf would mishandle.
1464
- let j = i + 1
1465
- let quote: string | null = null
1466
- while (j < html.length) {
1467
- const ch = html[j]
1468
- if (quote) {
1469
- if (ch === quote) quote = null
1470
- } else if (ch === '"' || ch === "'") {
1471
- quote = ch
1472
- } else if (ch === '>') {
1473
- return { end: j + 1 }
1474
- }
1475
- j++
1476
- }
1477
- return null
1478
- }
1479
-
1480
- /**
1481
- * Returns the index just past the matching `</tagName>`, accounting for
1482
- * nested tags of the same name. Returns -1 if no match before `limit`.
1483
- */
1484
- function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
1485
- let depth = 1
1486
- let i = from
1487
- while (i < limit) {
1488
- if (matchesTagAt(html, i, tagName)) {
1489
- const opening = parseOpeningTagAt(html, i)
1490
- if (!opening) {
1491
- i++
1492
- continue
1493
- }
1494
- const tagText = html.slice(i, opening.end)
1495
- if (!tagText.endsWith('/>')) depth++
1496
- i = opening.end
1497
- } else if (matchesClosingTagAt(html, i, tagName)) {
1498
- depth--
1499
- const closing = parseOpeningTagAt(html, i)
1500
- const closingEnd = closing?.end ?? i + `</${tagName}>`.length
1501
- if (depth === 0) return closingEnd
1502
- i = closingEnd
1503
- } else {
1504
- i++
1505
- }
1506
- }
1507
- return -1
1508
- }