@createiq/htmldiff 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1508 @@
1
+ import { wrapText } from './Utils'
2
+
3
+ /**
4
+ * Table-aware preprocessing for HtmlDiff.
5
+ *
6
+ * The word-level diff alone matches longest-common-subsequences across cell
7
+ * boundaries and produces structurally wrong output for table edits — it
8
+ * shuffles content between cells, introduces phantom `<td>`s, and provides
9
+ * no signal that an entire row or column was added/deleted. We pre-process
10
+ * the inputs to give Word-style results:
11
+ *
12
+ * • When dimensions match (same row count, same cell count per row), we
13
+ * diff cell content positionally so cross-cell shifts produce one
14
+ * independent del/ins per cell.
15
+ * • When dimensions don't match (added/deleted row, added/deleted column),
16
+ * we run a row-level LCS to identify structurally added/deleted rows,
17
+ * then within preserved rows a cell-level LCS to identify added/deleted
18
+ * columns. Structurally added rows/cells get `class='diffins'` on the
19
+ * `<tr>`/`<td>`; deleted ones get `class='diffdel'`. Preserved cells
20
+ * fall back to a content diff via the recursive HtmlDiff callback.
21
+ *
22
+ * Tables are spliced out into placeholders before the main diff runs and
23
+ * spliced back in after, so the surrounding (non-table) content is diffed
24
+ * by the normal word-level pipeline.
25
+ */
26
+
27
/**
 * Offsets of a single table cell, all measured against the html string
 * the cell was located in.
 */
interface CellRange {
  /** Offset of the first character of the cell's opening tag. */
  cellStart: number
  /** Offset immediately after the cell's closing tag. */
  cellEnd: number
  /** Offset where the cell content begins (the slice handed to the cell-level diff). */
  contentStart: number
  /** Offset where the cell content ends. */
  contentEnd: number
}
36
+
37
/** Offsets of one table row plus the cells located inside it. */
interface RowRange {
  /** Start offset of the row in the source html. */
  rowStart: number
  /** End offset of the row in the source html (used as an exclusive slice bound). */
  rowEnd: number
  /** The row's cells, in document order. */
  cells: CellRange[]
}
42
+
43
/** Offsets of one table plus the rows located inside it. */
interface TableRange {
  /** Start offset of the table in the source html. */
  tableStart: number
  /** End offset of the table in the source html (used as an exclusive slice bound). */
  tableEnd: number
  /** The table's rows, in document order. */
  rows: RowRange[]
}
48
+
49
/** Outcome of table preprocessing: both inputs with their tables swapped for placeholders. */
export interface PreprocessResult {
  /** Old-side html with every table replaced by its placeholder. */
  modifiedOld: string
  /** New-side html with every table replaced by its placeholder. */
  modifiedNew: string
  /** Placeholder marker → already-diffed table html to splice back in afterwards. */
  placeholderToDiff: Map<string, string>
}
55
+
56
// Table placeholders are HTML comments: WordSplitter keeps a comment as one
// atomic token and the diff treats it as equal on both sides, so the marker
// passes through untouched and is easy to substitute back afterwards.
//
// A fresh nonce is appended per call so that re-diffing a previously
// diffed document (or any input that legitimately contains an
// `<!--HTMLDIFF_TABLE_*-->` comment) cannot collide with our placeholder.
// The nonce is also regenerated when it happens to occur in either input.
const PLACEHOLDER_PREFIX_BASE = '<!--HTMLDIFF_TABLE_'
const PLACEHOLDER_SUFFIX = '-->'
65
+
66
/**
 * Upper bounds on the table size the structure-aware path will accept.
 *
 * Row alignment is O(rows²), per-row cell alignment is O(cells²), and every
 * comparison is a string equality over row content that may be many KB.
 * Without a cap, a table with several thousand rows can pin a CPU for
 * seconds. Anything larger falls through to the word-level diff, which
 * scales linearly. The values are tuned to comfortably cover real-world
 * ISDA schedules (which routinely have 1000+ rows).
 */
const MAX_TABLE_ROWS = 1500
const MAX_TABLE_CELLS_PER_ROW = 200
77
+
78
// Limits for the per-row combinatorial column-position search performed by
// findBestColumnInsertPositions / findBestColumnDeletePositions. At these
// limits the worst case is C(MAX_COLUMN_SEARCH_WIDTH, MAX_COLUMN_DELTA) =
// C(40, 6) ≈ 3.8M combinations; rows that are wider, or whose cell counts
// differ by more, fall through to cell-LCS instead.
const MAX_COLUMN_DELTA = 6
const MAX_COLUMN_SEARCH_WIDTH = 40
84
+
85
/**
 * Builds the per-call placeholder prefix `<!--HTMLDIFF_TABLE_<nonce>_`.
 *
 * A random 8-hex-digit nonce is tried up to eight times; the first prefix
 * that occurs in neither input is returned. If every attempt collides, a
 * timestamp-based prefix is returned without a collision check.
 *
 * @param oldHtml - Old-side document the prefix should not already occur in.
 * @param newHtml - New-side document the prefix should not already occur in.
 * @returns The prefix; callers append the table index and `PLACEHOLDER_SUFFIX`.
 */
function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
  // 4 random bytes → 8 hex chars → 16^8 ≈ 4.3 billion combinations. The
  // multiplier is 2^32 (not 2^32 - 1) so that `ffffffff` is reachable too.
  // `Math.random` is fine here: we are avoiding accidental collisions, not
  // defending against a malicious adversary.
  const nonceSpace = 0x100000000
  const maxAttempts = 8
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const nonce = Math.floor(Math.random() * nonceSpace)
      .toString(16)
      .padStart(8, '0')
    const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
    if (!oldHtml.includes(prefix) && !newHtml.includes(prefix)) {
      return prefix
    }
  }
  // Astronomically unlikely. Falling back to a timestamp guarantees
  // progress; any remaining collision will surface as a malformed diff
  // that the caller can detect.
  return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`
}
104
+
105
/** Callback that diffs the content of one old cell against one new cell and returns the diffed html. */
type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
106
+
107
/**
 * Pairs the top-level tables of both inputs by position, diffs each pair,
 * and swaps every source table for a placeholder comment.
 *
 * @param oldHtml - Old-side document.
 * @param newHtml - New-side document.
 * @param diffCell - Callback used to diff the content of matched cells.
 * @returns The placeholder-substituted inputs plus the placeholder→diff
 *   map, or `null` when neither input has tables, the table counts
 *   differ, or any table exceeds the size limits.
 */
export function preprocessTables(oldHtml: string, newHtml: string, diffCell: DiffCellFn): PreprocessResult | null {
  const oldTables = findTopLevelTables(oldHtml)
  const newTables = findTopLevelTables(newHtml)
  const tableCount = oldTables.length

  // Positional pairing needs the same, non-zero number of tables on both sides.
  if (tableCount !== newTables.length || tableCount === 0) return null

  // Bail out on pathologically large tables — see MAX_TABLE_ROWS comment.
  const tooLarge = oldTables.some((oldTable, i) => exceedsSizeLimit(oldTable) || exceedsSizeLimit(newTables[i]))
  if (tooLarge) return null

  // Diff every pair against the untouched inputs before any splicing happens.
  const diffedTables = oldTables.map((oldTable, i) => diffTable(oldHtml, newHtml, oldTable, newTables[i], diffCell))

  const placeholderPrefix = makePlaceholderPrefix(oldHtml, newHtml)
  const placeholderToDiff = new Map<string, string>()
  let modifiedOld = oldHtml
  let modifiedNew = newHtml
  // Work backwards so the offsets of earlier tables remain valid.
  for (let i = tableCount - 1; i >= 0; i--) {
    const placeholder = `${placeholderPrefix}${i}${PLACEHOLDER_SUFFIX}`
    const oldTable = oldTables[i]
    const newTable = newTables[i]
    placeholderToDiff.set(placeholder, diffedTables[i])
    modifiedOld = spliceString(modifiedOld, oldTable.tableStart, oldTable.tableEnd, placeholder)
    modifiedNew = spliceString(modifiedNew, newTable.tableStart, newTable.tableEnd, placeholder)
  }

  return { modifiedOld, modifiedNew, placeholderToDiff }
}
148
+
149
/**
 * Replaces every occurrence of each placeholder in the diff output with
 * its diffed table html.
 *
 * @param diffOutput - Diff output that still contains placeholder markers.
 * @param placeholderToDiff - Placeholder marker → replacement html.
 * @returns The output with all placeholders substituted.
 */
export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff: Map<string, string>): string {
  let restored = diffOutput
  placeholderToDiff.forEach((tableHtml, marker) => {
    // split/join substitutes literally, so `$`-sequences in the table html stay as-is.
    const segments = restored.split(marker)
    restored = segments.join(tableHtml)
  })
  return restored
}
156
+
157
/**
 * Returns `s` with the characters in `[start, end)` replaced by `replacement`.
 */
function spliceString(s: string, start: number, end: number, replacement: string): string {
  const head = s.slice(0, start)
  const tail = s.slice(end)
  return `${head}${replacement}${tail}`
}
160
+
161
/**
 * Reports whether a table has more than `MAX_TABLE_ROWS` rows or any row
 * with more than `MAX_TABLE_CELLS_PER_ROW` cells.
 */
function exceedsSizeLimit(table: TableRange): boolean {
  const { rows } = table
  return rows.length > MAX_TABLE_ROWS || rows.some(row => row.cells.length > MAX_TABLE_CELLS_PER_ROW)
}
168
+
169
/**
 * Chooses the diff strategy for one old/new table pair and runs it:
 *
 *  • identical dimensions → positional cell-by-cell diff;
 *  • same row count but differing cell counts → rows aligned positionally
 *    (column add/delete only), which avoids the LCS row-key mismatch that
 *    happens when rows have different cell counts;
 *  • otherwise → structural row alignment.
 *
 * @returns The diffed table html.
 */
function diffTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  let strategy: typeof diffPositionalTable
  if (sameDimensions(oldTable, newTable)) {
    strategy = diffPositionalTable
  } else if (oldTable.rows.length === newTable.rows.length) {
    strategy = diffSameRowCountTable
  } else {
    strategy = diffStructurallyAlignedTable
  }
  return strategy(oldHtml, newHtml, oldTable, newTable, diffCell)
}
187
+
188
/**
 * Diffs two tables that have the same number of rows but differing cell
 * counts in at least one row.
 *
 * The new table is walked verbatim, so everything between rows
 * (`<thead>`/`<tbody>` wrappers, whitespace, etc.) is copied through
 * untouched; only the rows themselves are replaced by their diffed form.
 * At each row a vertical merge is tried first, then a vertical split,
 * and finally the ordinary preserved-row diff.
 */
function diffSameRowCountTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const newRows = newTable.rows
  const pieces: string[] = []
  let emittedUpTo = newTable.tableStart
  for (let r = 0; r < newRows.length; ) {
    // A merge/split consumes `span` rows at once; a plain row consumes one.
    const spanChange =
      detectVerticalMerge(oldHtml, newHtml, oldTable, newTable, r) ??
      detectVerticalSplit(oldHtml, newHtml, oldTable, newTable, r)
    const rowsConsumed = spanChange ? spanChange.span : 1

    // Copy the markup between the previous row and this one verbatim.
    pieces.push(newHtml.slice(emittedUpTo, newRows[r].rowStart))
    pieces.push(
      spanChange ? spanChange.diff : diffPreservedRow(oldHtml, newHtml, oldTable.rows[r], newRows[r], diffCell)
    )

    emittedUpTo = newRows[r + rowsConsumed - 1].rowEnd
    r += rowsConsumed
  }
  pieces.push(newHtml.slice(emittedUpTo, newTable.tableEnd))
  return pieces.join('')
}
227
+
228
/**
 * Detects a vertical merge that starts at row `r`.
 *
 * Conditions checked:
 *  • new row `r` holds exactly one cell whose rowspan K is greater than 1,
 *    and rows `r..r+K-1` all exist in the new table;
 *  • new rows `r+1..r+K-1` contain no cells;
 *  • each old row `r..r+K-1` exists, has a summed colspan equal to the
 *    merged cell's colspan, and contains only cells with rowspan 1.
 *
 * This covers both single-column merges and rectangular merges (e.g. a
 * 2×2 region collapsing into one colspan=2 rowspan=2 cell).
 *
 * @returns The diffed html for the K rows (the merged cell is emitted via
 *   `emitSpanChangedCell(..., 'rowspan')`, the trailing rows are copied
 *   verbatim) together with K as `span`, or `null` when the pattern does
 *   not apply.
 */
function detectVerticalMerge(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const anchorRow = newTable.rows[r]
  if (anchorRow.cells.length !== 1) return null

  const mergedCell = anchorRow.cells[0]
  const span = getRowspan(newHtml, mergedCell)
  if (span <= 1 || r + span > newTable.rows.length) return null

  const mergedWidth = getColspan(newHtml, mergedCell)

  // The rows swallowed by the rowspan must be empty on the new side.
  const trailingRows = newTable.rows.slice(r + 1, r + span)
  if (trailingRows.some(row => row.cells.length !== 0)) return null

  for (let offset = 0; offset < span; offset++) {
    const oldRow = oldTable.rows[r + offset]
    if (!oldRow) return null
    // The absorbed region must be exactly as wide as the merged cell;
    // otherwise this is not a clean rectangular merge.
    if (sumColspans(oldHtml, oldRow.cells) !== mergedWidth) return null
    if (oldRow.cells.some(oldCell => getRowspan(oldHtml, oldCell) !== 1)) return null
  }

  const diff = [
    rowHeaderSlice(newHtml, anchorRow),
    emitSpanChangedCell(newHtml, mergedCell, 'rowspan'),
    '</tr>',
    ...trailingRows.map(row => emitEmptyRow(newHtml, row)),
  ].join('')
  return { diff, span }
}
278
+
279
/**
 * Detects a vertical split that starts at row `r` (the mirror image of a
 * vertical merge).
 *
 * Conditions checked:
 *  • old row `r` holds exactly one cell whose rowspan K is greater than 1,
 *    and rows `r..r+K-1` all exist in the old table;
 *  • old rows `r+1..r+K-1` contain no cells;
 *  • each new row `r..r+K-1` exists, has a summed colspan equal to the old
 *    cell's colspan, and contains only cells with rowspan 1.
 *
 * @returns The diffed html for the K new rows (every cell emitted via
 *   `emitSpanChangedCell(..., 'rowspan')`) together with K as `span`, or
 *   `null` when the pattern does not apply.
 */
function detectVerticalSplit(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const anchorRow = oldTable.rows[r]
  if (anchorRow.cells.length !== 1) return null

  const splitCell = anchorRow.cells[0]
  const span = getRowspan(oldHtml, splitCell)
  if (span <= 1 || r + span > oldTable.rows.length) return null

  const splitWidth = getColspan(oldHtml, splitCell)

  // The rows covered by the old rowspan must have been empty.
  const coveredOldRows = oldTable.rows.slice(r + 1, r + span)
  if (coveredOldRows.some(row => row.cells.length !== 0)) return null

  for (let offset = 0; offset < span; offset++) {
    const newRow = newTable.rows[r + offset]
    if (!newRow) return null
    // Each new row must cover the same logical width as the old merged
    // cell and must not introduce rowspans of its own.
    if (sumColspans(newHtml, newRow.cells) !== splitWidth) return null
    if (newRow.cells.some(newCell => getRowspan(newHtml, newCell) !== 1)) return null
  }

  const diff = newTable.rows
    .slice(r, r + span)
    .map(newRow => {
      const cells = newRow.cells.map(newCell => emitSpanChangedCell(newHtml, newCell, 'rowspan'))
      return rowHeaderSlice(newHtml, newRow) + cells.join('') + '</tr>'
    })
    .join('')
  return { diff, span }
}
326
+
327
/** Returns the row's source text (`rowStart` up to `rowEnd`) exactly as it appears in `html`. */
function emitEmptyRow(html: string, row: RowRange): string {
  const { rowStart, rowEnd } = row
  return html.slice(rowStart, rowEnd)
}
331
+
332
/** True when both tables have the same row count and, row by row, the same cell count. */
function sameDimensions(a: TableRange, b: TableRange): boolean {
  return (
    a.rows.length === b.rows.length &&
    a.rows.every((row, i) => row.cells.length === b.rows[i].cells.length)
  )
}
339
+
340
/**
 * Same-dimension path. Copies the new table verbatim and replaces only
 * each cell's content range with the result of `diffCell` applied to the
 * old and new cell at the same (row, column) position. Everything
 * outside cell content — tags, `<thead>`/`<tbody>`, whitespace — is
 * passed through from the new side unchanged.
 */
function diffPositionalTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = []
  let emittedUpTo = newTable.tableStart
  newTable.rows.forEach((newRow, r) => {
    const oldRow = oldTable.rows[r]
    newRow.cells.forEach((newCell, c) => {
      const oldCell = oldRow.cells[c]
      // Markup between the previous cell's content and this one's.
      pieces.push(newHtml.slice(emittedUpTo, newCell.contentStart))
      const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
      const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd)
      pieces.push(diffCell(oldContent, newContent))
      emittedUpTo = newCell.contentEnd
    })
  })
  pieces.push(newHtml.slice(emittedUpTo, newTable.tableEnd))
  return pieces.join('')
}
373
+
374
/**
 * Mismatched-row-count path.
 *
 * Rows are aligned by their normalized text (`rowKey`) via `lcsAlign`,
 * unmatched rows are then handed to `pairSimilarUnmatchedRows`, and the
 * result is reordered by `orderAlignmentForEmission` so that deleted rows
 * come out at their natural old-side position instead of all being
 * emitted before the first preserved row.
 *
 * Emission walks the new table from `tableStart` to `tableEnd`, replacing
 * each row by its diffed form so `<thead>`/`<tbody>` wrappers and
 * inter-row whitespace are copied verbatim. Deleted rows have no position
 * in new and are injected inline at the current cursor. When new has no
 * rows at all, the table is rebuilt from scratch so deleted rows are
 * still emitted.
 */
function diffStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const oldKeys = oldTable.rows.map(row => rowKey(oldHtml, row))
  const newKeys = newTable.rows.map(row => rowKey(newHtml, row))
  const alignment = orderAlignmentForEmission(
    pairSimilarUnmatchedRows(lcsAlign(oldKeys, newKeys), oldTable, newTable, oldHtml, newHtml)
  )

  if (newTable.rows.length === 0) {
    return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment)
  }

  const firstRowStart = newTable.rows[0].rowStart
  const pieces: string[] = [newHtml.slice(newTable.tableStart, firstRowStart)]
  let emittedUpTo = firstRowStart
  for (const { oldIdx, newIdx } of alignment) {
    if (newIdx === null) {
      // Deleted row: inject at the cursor without advancing it.
      if (oldIdx !== null) pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del'))
      continue
    }
    const newRow = newTable.rows[newIdx]
    pieces.push(newHtml.slice(emittedUpTo, newRow.rowStart))
    pieces.push(
      oldIdx === null
        ? emitFullRow(newHtml, newRow, 'ins')
        : diffPreservedRow(oldHtml, newHtml, oldTable.rows[oldIdx], newRow, diffCell)
    )
    emittedUpTo = newRow.rowEnd
  }
  pieces.push(newHtml.slice(emittedUpTo, newTable.tableEnd))
  return pieces.join('')
}
430
+
431
/**
 * Returns the alignment entries sorted into the order in which rows
 * should be emitted. Every entry gets a sort key `(primary, secondary)`:
 *
 *  • Preserved/paired (oldIdx, newIdx): primary = newIdx, secondary = 0.
 *  • Pure insert (null, newIdx): primary = newIdx, secondary = 1.
 *  • Pure delete (oldIdx, null): primary = N + 0.5, where N is the newIdx
 *    of the preserved entry with the largest oldIdx below this one (or -1
 *    when there is none); secondary = oldIdx.
 *
 * Consequences:
 *  • Deletes with no preserved predecessor get primary -0.5 and therefore
 *    come first, in old's row order.
 *  • Deletes in the middle or at the end land right after their preceding
 *    preserved row.
 *  • At a replaced position the delete (N + 0.5) sorts before an insert
 *    at newIdx N + 1, giving "delete first, insert second".
 *
 * Ties on both key parts keep the entries' original relative order. The
 * input array is not mutated.
 */
function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
  // Entries present on both sides, ordered by their old-side index.
  const anchors: Array<{ oldIdx: number; newIdx: number }> = []
  alignment.forEach(({ oldIdx, newIdx }) => {
    if (oldIdx !== null && newIdx !== null) anchors.push({ oldIdx, newIdx })
  })
  anchors.sort((x, y) => x.oldIdx - y.oldIdx)

  // newIdx of the anchor with the largest oldIdx below `deletedOldIdx`, or -1.
  const anchorNewIdxBefore = (deletedOldIdx: number): number => {
    let found = -1
    for (const anchor of anchors) {
      if (anchor.oldIdx >= deletedOldIdx) break
      found = anchor.newIdx
    }
    return found
  }

  const keyed = alignment.map((entry, originalIdx) => {
    if (entry.newIdx !== null) {
      // Preserved sorts before a pure insert at the same newIdx (rare).
      return { entry, primary: entry.newIdx, secondary: entry.oldIdx === null ? 1 : 0, originalIdx }
    }
    // Pure delete.
    const deletedOldIdx = entry.oldIdx as number
    return { entry, primary: anchorNewIdxBefore(deletedOldIdx) + 0.5, secondary: deletedOldIdx, originalIdx }
  })

  keyed.sort((x, y) => {
    const byPrimary = x.primary - y.primary
    if (x.primary !== y.primary) return byPrimary
    const bySecondary = x.secondary - y.secondary
    if (x.secondary !== y.secondary) return bySecondary
    return x.originalIdx - y.originalIdx // stable
  })

  return keyed.map(item => item.entry)
}
503
+
504
/**
 * From-scratch reconstruction, called by `diffStructurallyAlignedTable`
 * when the new table has no rows (so there are no per-row positions in
 * new to walk). Emits the table header slice, then one full row per
 * alignment entry — tagged 'del' when the entry has an old row, otherwise
 * 'ins' when it has a new row — and finally a literal `</table>`.
 */
function rebuildStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  alignment: Alignment[]
): string {
  const header = headerSlice(newHtml, newTable, oldHtml, oldTable)
  const rows = alignment.map(({ oldIdx, newIdx }) => {
    if (oldIdx !== null) return emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del')
    if (newIdx !== null) return emitFullRow(newHtml, newTable.rows[newIdx], 'ins')
    return ''
  })
  return header + rows.join('') + '</table>'
}
525
+
526
/**
 * Returns the text from the start of the table up to its first row. When
 * the table has no rows, the cut point is `tableEnd` minus the length of
 * `</table>`. The new side is preferred (attribute changes on the table
 * itself should follow new); the old side is used only when the new
 * slice would be empty.
 */
function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, oldTable: TableRange): string {
  const closingTagLength = '</table>'.length
  const headerEnd = (table: TableRange): number => table.rows[0]?.rowStart ?? (table.tableEnd - closingTagLength)

  const newHeaderEnd = headerEnd(newTable)
  if (newHeaderEnd > newTable.tableStart) {
    return newHtml.slice(newTable.tableStart, newHeaderEnd)
  }
  return oldHtml.slice(oldTable.tableStart, headerEnd(oldTable))
}
536
+
537
/**
 * Comparison key for a row: the row's full source text (tags included, so
 * a column add does not make rows with different cell counts look equal)
 * with every whitespace run collapsed to one space and the ends trimmed.
 */
function rowKey(html: string, row: RowRange): string {
  const rawRow = html.slice(row.rowStart, row.rowEnd)
  const collapsed = rawRow.replace(/\s+/g, ' ')
  return collapsed.trim()
}
543
+
544
/**
 * Diffs a row that exists on both sides. Strategies, in order:
 *
 *  1. Same cell count → positional cell diff.
 *  2. Horizontal merge/split explained by colspan changes
 *     (`diffColspanChangedRow`), when it aligns cleanly.
 *  3. Column add/delete located by a combinatorial similarity search,
 *     which keeps an edited cell in its column rather than orphaning it.
 *     Only used when the cell-count difference is at most
 *     MAX_COLUMN_DELTA and the wider row has at most
 *     MAX_COLUMN_SEARCH_WIDTH cells, to bound the C(n, k) search.
 *  4. Otherwise the generic `diffStructurallyAlignedRow` fallback.
 */
function diffPreservedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const oldCount = oldRow.cells.length
  const newCount = newRow.cells.length
  if (oldCount === newCount) {
    return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  if (colspanAligned !== null) return colspanAligned

  const delta = newCount - oldCount
  const columnsChanged = Math.abs(delta)
  const withinSearchCaps =
    columnsChanged > 0 &&
    columnsChanged <= MAX_COLUMN_DELTA &&
    Math.max(oldCount, newCount) <= MAX_COLUMN_SEARCH_WIDTH
  if (!withinSearchCaps) {
    return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  return delta > 0
    ? diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
    : diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
}
579
+
580
/**
 * Diffs a row in which new has `k` more cells than old.
 *
 * `findBestColumnInsertPositions` picks the `k` new-side column indices
 * treated as inserted. Those cells are emitted whole with the 'ins'
 * marker; every other new cell is paired, left to right, with the next
 * unused old cell and emitted via `emitDiffedCell`. The row opens with
 * new's row header and closes with a literal `</tr>`.
 */
function diffMultiColumnAddRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const insertedAt = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml))
  const header = rowHeaderSlice(newHtml, newRow)
  let nextOld = 0
  const cells = newRow.cells.map((newCell, c) => {
    if (insertedAt.has(c)) return emitFullCell(newHtml, newCell, 'ins')
    const oldCell = oldRow.cells[nextOld]
    nextOld++
    return emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell)
  })
  return header + cells.join('') + '</tr>'
}
611
+
612
/**
 * Diffs a row in which old has `k` more cells than new.
 *
 * `findBestColumnDeletePositions` picks the `k` old-side column indices
 * treated as deleted. Those cells are emitted whole with the 'del'
 * marker; every other old cell is paired, left to right, with the next
 * unused new cell and emitted via `emitDiffedCell`. The row opens with
 * new's row header and closes with a literal `</tr>`.
 */
function diffMultiColumnDeleteRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const deletedAt = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml))
  const header = rowHeaderSlice(newHtml, newRow)
  let nextNew = 0
  const cells = oldRow.cells.map((oldCell, c) => {
    if (deletedAt.has(c)) return emitFullCell(oldHtml, oldCell, 'del')
    const newCell = newRow.cells[nextNew]
    nextNew++
    return emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell)
  })
  return header + cells.join('') + '</tr>'
}
635
+
636
/**
 * Chooses which `k` new-side column indices to treat as inserted.
 *
 * Every combination of `k` indices from the new row is scored by pairing
 * the remaining new cells, left to right, with the old cells and summing
 * `textSimilarity` over the pairs. The first combination with the
 * strictly highest score wins.
 *
 * @returns The winning indices, or an empty array when no combination
 *   was produced.
 */
function findBestColumnInsertPositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  // Extract cell texts once up front; doing it inside the scoring loop
  // would repeat ~N extractions for each of the C(N, K) combinations.
  const oldTexts = oldRow.cells.map(cell => cellText(oldHtml, cell))
  const newTexts = newRow.cells.map(cell => cellText(newHtml, cell))

  // Summed similarity of the positional pairing left once the candidate
  // insert positions are skipped on the new side.
  const scoreWithout = (insertedAt: ReadonlySet<number>): number => {
    let total = 0
    let oldIdx = 0
    newTexts.forEach((newText, newIdx) => {
      if (insertedAt.has(newIdx)) return
      total += textSimilarity(oldTexts[oldIdx], newText)
      oldIdx++
    })
    return total
  }

  let winner: number[] = []
  let winnerScore = -1
  for (const candidate of combinationsOfRange(newRow.cells.length, k)) {
    const candidateScore = scoreWithout(new Set(candidate))
    if (candidateScore > winnerScore) {
      winnerScore = candidateScore
      winner = candidate
    }
  }
  return winner
}
666
+
667
/**
 * Chooses which `k` old-side column indices to treat as deleted.
 *
 * Every combination of `k` indices from the old row is scored by pairing
 * the remaining old cells, left to right, with the new cells and summing
 * `textSimilarity` over the pairs. The first combination with the
 * strictly highest score wins.
 *
 * @returns The winning indices, or an empty array when no combination
 *   was produced.
 */
function findBestColumnDeletePositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  // Cell texts are extracted once, outside the combination loop.
  const oldTexts = oldRow.cells.map(cell => cellText(oldHtml, cell))
  const newTexts = newRow.cells.map(cell => cellText(newHtml, cell))

  // Summed similarity of the positional pairing left once the candidate
  // delete positions are skipped on the old side.
  const scoreWithout = (deletedAt: ReadonlySet<number>): number => {
    let total = 0
    let newIdx = 0
    oldTexts.forEach((oldText, oldIdx) => {
      if (deletedAt.has(oldIdx)) return
      total += textSimilarity(oldText, newTexts[newIdx])
      newIdx++
    })
    return total
  }

  let winner: number[] = []
  let winnerScore = -1
  for (const candidate of combinationsOfRange(oldRow.cells.length, k)) {
    const candidateScore = scoreWithout(new Set(candidate))
    if (candidateScore > winnerScore) {
      winnerScore = candidateScore
      winner = candidate
    }
  }
  return winner
}
694
+
695
/**
 * Lazily yields every ascending combination of `k` distinct integers
 * taken from [0, n), in lexicographic order. Each yielded array is a
 * fresh copy. Nothing is yielded when `k` is 0 or `k` exceeds `n`.
 * Iterative, so working memory stays at O(k).
 */
function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
  if (k === 0 || k > n) return
  // Start from the smallest combination: 0, 1, …, k-1.
  const current = Array.from({ length: k }, (_, slot) => slot)
  for (;;) {
    yield [...current]
    // Find the rightmost slot that has not yet reached its maximum value.
    let pivot = k - 1
    while (pivot >= 0 && current[pivot] === n - k + pivot) pivot -= 1
    if (pivot < 0) return
    // Advance it and reset everything to its right to the tightest run.
    current[pivot] += 1
    for (let slot = pivot + 1; slot < k; slot++) {
      current[slot] = current[slot - 1] + 1
    }
  }
}
712
+
713
/**
 * Tries to explain a cell-count difference as horizontal merges/splits by
 * aligning both rows on logical column position (running sum of colspans).
 *
 *  • Equal colspans at the current position → the pair is emitted via
 *    `emitDiffedCell`.
 *  • New colspan larger → the new cell must cover a run of old cells
 *    exactly; it is emitted via `emitSpanChangedCell(..., 'colspan')`.
 *  • Old colspan larger → a run of new cells must cover the old cell
 *    exactly; each of them is emitted via `emitSpanChangedCell(..., 'colspan')`.
 *
 * @returns The diffed row, or `null` when the total widths differ, a run
 *   does not add up exactly, or either side has cells left over — the
 *   caller then falls back to another strategy.
 */
function diffColspanChangedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string | null {
  const oldCells = oldRow.cells
  const newCells = newRow.cells
  if (sumColspans(oldHtml, oldCells) !== sumColspans(newHtml, newCells)) return null

  // Consumes cells starting at `from` until their summed colspan reaches
  // (or overshoots) `target`; reports the width covered and the next index.
  const absorb = (
    html: string,
    cells: CellRange[],
    from: number,
    target: number
  ): { covered: number; next: number } => {
    let covered = 0
    let next = from
    while (next < cells.length && covered < target) {
      covered += getColspan(html, cells[next])
      next++
    }
    return { covered, next }
  }

  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]
  let oi = 0
  let ni = 0
  while (oi < oldCells.length && ni < newCells.length) {
    const oldCell = oldCells[oi]
    const newCell = newCells[ni]
    const oldSpan = getColspan(oldHtml, oldCell)
    const newSpan = getColspan(newHtml, newCell)

    if (oldSpan === newSpan) {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
      oi++
      ni++
      continue
    }

    if (newSpan > oldSpan) {
      // Horizontal merge: one new cell absorbs several old cells.
      const run = absorb(oldHtml, oldCells, oi, newSpan)
      if (run.covered !== newSpan) return null
      pieces.push(emitSpanChangedCell(newHtml, newCell, 'colspan'))
      oi = run.next
      ni++
      continue
    }

    // Horizontal split: one old cell becomes several new cells.
    const run = absorb(newHtml, newCells, ni, oldSpan)
    if (run.covered !== oldSpan) return null
    for (let k = ni; k < run.next; k++) {
      pieces.push(emitSpanChangedCell(newHtml, newCells[k], 'colspan'))
    }
    oi++
    ni = run.next
  }

  // Leftover cells on either side mean the rows did not align cleanly.
  if (oi !== oldCells.length || ni !== newCells.length) return null

  pieces.push('</tr>')
  return pieces.join('')
}
781
+
782
/** Logical width of a run of cells: the sum of their colspans. */
function sumColspans(html: string, cells: CellRange[]): number {
  return cells.reduce((width, cell) => width + getColspan(html, cell), 0)
}
787
+
788
/** Colspan parsed from the text between the cell's start and its content start; 1 when absent or invalid. */
function getColspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'colspan')
}
791
+
792
/**
 * Reads a cell's `rowspan` from its opening tag (the text between
 * `cellStart` and `contentStart`).
 */
function getRowspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'rowspan')
}
795
+
796
/**
 * Extracts the `colspan`/`rowspan` value from a cell's opening tag.
 *
 * The tag is walked attribute by attribute (quoted values are consumed as a
 * unit) rather than searched with a flat regex, so look-alikes such as
 * `data-colspan="3"` or `title="colspan=4"` are never mistaken for the real
 * attribute. The attribute name is compared case-insensitively and the first
 * occurrence wins.
 *
 * @param openingTag Text of the opening tag, e.g. `<td colspan="2">`.
 * @param name       Which span attribute to read.
 * @returns The positive integer value, or 1 when the attribute is missing,
 *          has no leading digits, or is not greater than zero.
 */
function parseSpanAttribute(openingTag: string, name: 'colspan' | 'rowspan'): number {
  // Drop `<tagname` so the tag name itself is never read as an attribute.
  const attributes = openingTag.replace(/^<\s*[^\s/>]+/, '')
  // name, then optionally = "double" | 'single' | unquoted
  const attrRe = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g

  for (let match = attrRe.exec(attributes); match !== null; match = attrRe.exec(attributes)) {
    if (match[1].toLowerCase() !== name) continue
    const raw = match[2] ?? match[3] ?? match[4] ?? ''
    const digits = /^\s*(\d+)/.exec(raw)
    if (!digits) return 1
    const value = Number.parseInt(digits[1], 10)
    return Number.isFinite(value) && value > 0 ? value : 1
  }
  return 1
}
803
+
804
/**
 * Renders a cell produced by a merge or split, marking its opening tag with
 * `mod colspan` or `mod rowspan`. The cell's content and closing tag are
 * copied through untouched: wrapping unchanged content in del/ins would
 * misreport it as edited.
 *
 * If the opening tag cannot be parsed, the cell is returned verbatim.
 */
function emitSpanChangedCell(html: string, cell: CellRange, kind: 'colspan' | 'rowspan'): string {
  const opening = parseOpeningTagAt(html, cell.cellStart)
  if (!opening) {
    return html.slice(cell.cellStart, cell.cellEnd)
  }
  const originalTag = html.slice(cell.cellStart, opening.end)
  const markedTag = injectClass(originalTag, `mod ${kind}`)
  const remainder = html.slice(cell.contentStart, cell.cellEnd)
  return `${markedTag}${remainder}`
}
817
+
818
/**
 * Diffs two rows cell-by-cell by position. The new row supplies all markup
 * (its `<tr>` header, cell tags and inter-cell text); only each cell's
 * content is replaced by `diffCell(oldContent, newContent)`.
 *
 * Indexes `oldRow.cells` by the new row's cell positions, so the caller is
 * expected to pass rows with the same number of cells.
 */
function diffPositionalRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]

  // `position` tracks how far into the new row's html we have copied.
  let position = newRow.cells[0]?.cellStart ?? newRow.rowEnd
  newRow.cells.forEach((newCell, index) => {
    const oldCell = oldRow.cells[index]
    pieces.push(
      // Markup between the previous cell's content and this cell's content.
      newHtml.slice(position, newCell.contentStart),
      diffCell(
        oldHtml.slice(oldCell.contentStart, oldCell.contentEnd),
        newHtml.slice(newCell.contentStart, newCell.contentEnd)
      )
    )
    position = newCell.contentEnd
  })

  // Trailing markup: last closing cell tag through `</tr>`.
  pieces.push(newHtml.slice(position, newRow.rowEnd))
  return pieces.join('')
}
846
+
847
/**
 * Diffs two rows whose cell counts may differ. Cells are aligned by an exact
 * LCS over their normalised content, then unmatched neighbours that look
 * alike are fuzzy-paired so an edited cell beside an added column shows as a
 * content diff rather than a delete plus an insert.
 *
 * Output is the new row's `<tr>` header, one rendered cell per alignment
 * entry (diffed, inserted or deleted), and a literal `</tr>`.
 */
function diffStructurallyAlignedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const oldKeys = oldRow.cells.map(cell => cellKey(oldHtml, cell))
  const newKeys = newRow.cells.map(cell => cellKey(newHtml, cell))
  const pairing = pairSimilarUnmatchedCells(lcsAlign(oldKeys, newKeys), oldRow, newRow, oldHtml, newHtml)

  // The row header always comes from the new row.
  const header = rowHeaderSlice(newHtml, newRow)

  const renderedCells = pairing.map(({ oldIdx, newIdx }) => {
    if (oldIdx !== null && newIdx !== null) {
      return emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell)
    }
    if (newIdx !== null) return emitFullCell(newHtml, newRow.cells[newIdx], 'ins')
    if (oldIdx !== null) return emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del')
    return ''
  })

  return header + renderedCells.join('') + '</tr>'
}
882
+
883
/**
 * Builds the key used for exact cell matching: the cell's content with every
 * whitespace run collapsed to one space and the ends trimmed. Tag attributes
 * are deliberately left out of the key so styling-only differences on the
 * `<td>` do not prevent a match.
 */
function cellKey(html: string, cell: CellRange): string {
  const content = html.slice(cell.contentStart, cell.contentEnd)
  const collapsed = content.replace(/\s+/g, ' ')
  return collapsed.trim()
}
890
+
891
/**
 * Renders a row that is entirely inserted (`kind='ins'`) or entirely deleted
 * (`kind='del'`). The `<tr>` gets `diffins`/`diffdel` injected into its class
 * and every cell is rendered through `emitFullCell`; markup between cells is
 * copied as-is.
 *
 * If the `<tr>` opening tag cannot be parsed, the row is returned verbatim.
 */
function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del'): string {
  const opening = parseOpeningTagAt(html, row.rowStart)
  if (!opening) return html.slice(row.rowStart, row.rowEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  const pieces: string[] = [injectClass(html.slice(row.rowStart, opening.end), marker)]

  let consumed = opening.end
  for (const cell of row.cells) {
    pieces.push(html.slice(consumed, cell.cellStart), emitFullCell(html, cell, kind))
    consumed = cell.cellEnd
  }
  // Whatever follows the last cell, including the closing `</tr>`.
  pieces.push(html.slice(consumed, row.rowEnd))
  return pieces.join('')
}
913
+
914
/**
 * Renders a cell that is entirely inserted or entirely deleted. The opening
 * tag gets `diffins`/`diffdel` injected into its class. Non-blank content has
 * its text runs wrapped via `wrapInlineTextRuns`, leaving formatting tags in
 * place (so `<strong>B</strong>` becomes `<strong><ins>B</ins></strong>`);
 * whitespace-only content is passed through unwrapped.
 *
 * If the opening tag cannot be parsed, the cell is returned verbatim.
 */
function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del'): string {
  const opening = parseOpeningTagAt(html, cell.cellStart)
  if (!opening) return html.slice(cell.cellStart, cell.cellEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  const openTag = injectClass(html.slice(cell.cellStart, opening.end), marker)

  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const isBlank = inner.trim().length === 0
  const body = isBlank ? inner : wrapInlineTextRuns(inner, kind)

  const closeTag = html.slice(cell.contentEnd, cell.cellEnd)
  return `${openTag}${body}${closeTag}`
}
933
+
934
/**
 * Wraps each text run that contains non-whitespace in `<ins>`/`<del>` (via
 * `wrapText`) while copying tags and whitespace-only runs through unchanged.
 * Fully-inserted formatted content therefore comes out shaped like
 * `<strong><ins>X</ins></strong>`.
 *
 * If a `<` starts something that cannot be parsed as a tag, the remainder of
 * the content is emitted verbatim and scanning stops.
 */
function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
  const tag = kind === 'ins' ? 'ins' : 'del'
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'

  const pieces: string[] = []
  let pos = 0
  while (pos < content.length) {
    if (content[pos] === '<') {
      const parsed = parseOpeningTagAt(content, pos)
      if (!parsed) {
        // Malformed markup: give up wrapping and keep the tail as-is.
        pieces.push(content.slice(pos))
        break
      }
      pieces.push(content.slice(pos, parsed.end))
      pos = parsed.end
      continue
    }

    // Text run: everything up to the next '<' (or the end of the content).
    const nextTag = content.indexOf('<', pos)
    const stop = nextTag === -1 ? content.length : nextTag
    const run = content.slice(pos, stop)
    pieces.push(run.trim().length > 0 ? wrapText(run, tag, cls) : run)
    pos = stop
  }
  return pieces.join('')
}
971
+
972
/**
 * Renders a cell present on both sides. The opening and closing tags come
 * from the new cell; the content is `diffCell(oldContent, newContent)`.
 *
 * If the new cell's opening tag cannot be parsed, the new cell is returned
 * verbatim and no diff is run.
 */
function emitDiffedCell(
  oldHtml: string,
  newHtml: string,
  oldCell: CellRange,
  newCell: CellRange,
  diffCell: DiffCellFn
): string {
  const opening = parseOpeningTagAt(newHtml, newCell.cellStart)
  if (!opening) {
    return newHtml.slice(newCell.cellStart, newCell.cellEnd)
  }

  const openTag = newHtml.slice(newCell.cellStart, opening.end)
  const before = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
  const after = newHtml.slice(newCell.contentStart, newCell.contentEnd)
  const diffed = diffCell(before, after)
  const closeTag = newHtml.slice(newCell.contentEnd, newCell.cellEnd)

  return `${openTag}${diffed}${closeTag}`
}
989
+
990
/**
 * Returns the leading part of a row: from `<tr` up to (not including) the
 * first cell's opening tag, so the `<tr ...>` attributes and any text before
 * the first cell are preserved.
 *
 * A row without cells yields only its `<tr ...>` opening tag; callers append
 * `</tr>` themselves, so returning the whole `<tr></tr>` would duplicate the
 * close. Returns '' when the opening tag cannot be parsed.
 */
function rowHeaderSlice(html: string, row: RowRange): string {
  const opening = parseOpeningTagAt(html, row.rowStart)
  if (!opening) return ''

  const headerEnd = row.cells.length === 0 ? opening.end : row.cells[0].cellStart
  return html.slice(row.rowStart, headerEnd)
}
1001
+
1002
/**
 * One entry of an old/new alignment. Both indices set means the items are
 * paired; a null on one side means the item exists only on the other side.
 */
interface Alignment {
  /** Index into the old sequence, or null for an item only present in new. */
  oldIdx: number | null
  /** Index into the new sequence, or null for an item only present in old. */
  newIdx: number | null
}
1006
+
1007
/**
 * Similarity score a pair of rows must exceed to be treated as "the same
 * row, edited" rather than one row deleted and another inserted.
 */
const ROW_FUZZY_THRESHOLD = 0.5

/**
 * Similarity score a pair of cells must exceed to be treated as a content
 * edit of the same logical cell. Deliberately the same value as
 * ROW_FUZZY_THRESHOLD: cells in legal documents that share most of their
 * content are typically the same cell with a body edit, so 0.5 works at
 * both granularities in practice.
 */
const CELL_FUZZY_THRESHOLD = 0.5
1017
+
1018
/**
 * Row-level fuzzy pairing. Takes the exact-LCS row alignment and pairs
 * unmatched old/new rows whose text is similar enough (above
 * ROW_FUZZY_THRESHOLD), so a small edit inside a row shows up as an edited
 * row instead of a whole row vanishing and another appearing.
 */
function pairSimilarUnmatchedRows(
  alignment: Alignment[],
  oldTable: TableRange,
  newTable: TableRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  // Row texts are computed once up front: the scorer is called for every
  // (deleted, inserted) combination in a run and rowText walks every cell.
  const oldRowTexts = oldTable.rows.map(row => rowText(oldHtml, row))
  const newRowTexts = newTable.rows.map(row => rowText(newHtml, row))

  const score = (oldIdx: number, newIdx: number): number =>
    textSimilarity(oldRowTexts[oldIdx], newRowTexts[newIdx])

  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, score)
}
1042
+
1043
/**
 * Cell-level fuzzy pairing. Takes the exact-LCS cell alignment of one row
 * pair and pairs unmatched old/new cells whose text similarity is above
 * CELL_FUZZY_THRESHOLD.
 */
function pairSimilarUnmatchedCells(
  alignment: Alignment[],
  oldRow: RowRange,
  newRow: RowRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  // Cell texts are computed once so the scorer is a pair of array lookups.
  const oldCellTexts = oldRow.cells.map(cell => cellText(oldHtml, cell))
  const newCellTexts = newRow.cells.map(cell => cellText(newHtml, cell))

  const score = (oldIdx: number, newIdx: number): number =>
    textSimilarity(oldCellTexts[oldIdx], newCellTexts[newIdx])

  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, score)
}
1056
+
1057
/**
 * Fuzzy-pairs unmatched entries of an alignment. Generic over what is being
 * paired: rows and cells both use it with their own similarity callback.
 *
 * The alignment is split into maximal runs of one-sided entries (old-only
 * "dels" and new-only "inses"). Within a run each del, in order, greedily
 * claims the still-unclaimed ins with the highest similarity strictly above
 * `threshold` (the first one wins a tie).
 *
 * The result is built by walking the original alignment and emitting each
 * pair at the *ins* position, dropping the paired del. That keeps the result
 * monotonic in newIdx, which the cursor-based emission downstream relies on;
 * emitting at the del position could break monotonicity when unpaired
 * entries sit between the two.
 *
 * Entries with both indices null carry nothing to pair; they are passed
 * through unchanged and act as run separators. (Previously such an entry
 * produced an empty run and the scan never advanced.)
 *
 * @param alignment  Alignment to refine; not mutated.
 * @param threshold  Similarity a pair must exceed to be accepted.
 * @param similarity Scores an (oldIdx, newIdx) candidate pair.
 * @returns A new alignment array; unpaired entries are the original objects.
 */
function pairSimilarUnmatched(
  alignment: Alignment[],
  threshold: number,
  similarity: (oldIdx: number, newIdx: number) => number
): Alignment[] {
  const isOneSided = (entry: Alignment): boolean => (entry.oldIdx === null) !== (entry.newIdx === null)

  const delForIns = new Map<number, number>() // ins-alignment-idx → del-alignment-idx
  const pairedDels = new Set<number>() // del-alignment-idx values that found a partner

  let i = 0
  while (i < alignment.length) {
    if (!isOneSided(alignment[i])) {
      // Already matched, or degenerate (both null): never part of a run.
      i++
      continue
    }

    // Collect one maximal run of one-sided entries, split by side.
    const delIndices: number[] = []
    const insIndices: number[] = []
    while (i < alignment.length && isOneSided(alignment[i])) {
      if (alignment[i].oldIdx !== null) delIndices.push(i)
      else insIndices.push(i)
      i++
    }

    // Greedy best-match: dels choose in order, each ins is used at most once.
    for (const di of delIndices) {
      let bestIi = -1
      let bestSim = threshold
      for (const ii of insIndices) {
        if (delForIns.has(ii)) continue
        const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
        if (sim > bestSim) {
          bestSim = sim
          bestIi = ii
        }
      }
      if (bestIi >= 0) {
        delForIns.set(bestIi, di)
        pairedDels.add(di)
      }
    }
  }

  const result: Alignment[] = []
  alignment.forEach((entry, k) => {
    if (pairedDels.has(k)) return // paired del: emitted at its ins position
    const delAi = delForIns.get(k)
    if (delAi !== undefined) {
      result.push({ oldIdx: alignment[delAi].oldIdx, newIdx: entry.newIdx })
    } else {
      result.push(entry)
    }
  })
  return result
}
1129
+
1130
/**
 * Similarity score shared by row-level and cell-level fuzzy pairing. It is
 * the larger of two complementary metrics:
 *
 *  1. Character prefix+suffix similarity: the share of the longer string
 *     covered by the common prefix plus the common suffix. Good at spotting
 *     a small edit in the middle; blind when the shared bulk is in the
 *     middle and the ends differ.
 *
 *  2. Token Jaccard similarity: intersection-over-union of the
 *     whitespace-split tokens. Good at "mostly the same content, bookended
 *     by different bits", which prefix+suffix cannot see.
 *
 * Identical strings score 1; if exactly one side is empty the score is 0.
 */
function textSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length === 0 || b.length === 0) return 0

  const byCharacters = charPrefixSuffixSimilarity(a, b)
  const byTokens = tokenJaccardSimilarity(a, b)
  return Math.max(byCharacters, byTokens)
}
1156
+
1157
/**
 * Fraction of the longer string that is covered by the common prefix plus
 * the common suffix of `a` and `b`. The suffix is only counted over the part
 * of the shorter string the prefix did not already consume, so no character
 * is counted twice.
 */
function charPrefixSuffixSimilarity(a: string, b: string): number {
  const shorter = Math.min(a.length, b.length)
  const longer = Math.max(a.length, b.length)

  let head = 0
  for (; head < shorter; head++) {
    if (a[head] !== b[head]) break
  }

  // Characters of the shorter string still available for a suffix match.
  const tailBudget = shorter - head
  let tail = 0
  while (tail < tailBudget && a[a.length - 1 - tail] === b[b.length - 1 - tail]) {
    tail++
  }

  return (head + tail) / longer
}
1173
+
1174
/**
 * Jaccard similarity of the two strings' distinct whitespace-separated
 * tokens: |A ∩ B| / |A ∪ B|. Two token-less strings count as identical (1).
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokenize = (text: string): Set<string> => new Set(text.split(/\s+/).filter(Boolean))
  const left = tokenize(a)
  const right = tokenize(b)

  if (left.size === 0 && right.size === 0) return 1

  const shared = Array.from(left).filter(token => right.has(token)).length
  const union = left.size + right.size - shared
  return union === 0 ? 0 : shared / union
}
1185
+
1186
/**
 * Flattens a row to comparison text: each cell's content with tag-like
 * `<...>` spans replaced by a space, the cells joined by spaces, whitespace
 * runs collapsed, ends trimmed and everything lower-cased.
 */
function rowText(html: string, row: RowRange): string {
  const strippedCells = row.cells.map(cell =>
    html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' ')
  )
  const joined = strippedCells.join(' ')
  return joined.replace(/\s+/g, ' ').trim().toLowerCase()
}
1193
+
1194
/**
 * Flattens a cell to comparison text: tag-like `<...>` spans become a space,
 * whitespace runs collapse to one space, ends are trimmed and the result is
 * lower-cased.
 */
function cellText(html: string, cell: CellRange): string {
  const content = html.slice(cell.contentStart, cell.contentEnd)
  const withoutTags = content.replace(/<[^>]+>/g, ' ')
  const collapsed = withoutTags.replace(/\s+/g, ' ')
  return collapsed.trim().toLowerCase()
}
1202
+
1203
/**
 * Longest-common-subsequence alignment of two key lists. The result lists,
 * in order, a `{ oldIdx, newIdx }` pair for every position where the keys
 * are strictly equal (`===`), and a one-sided entry (other index null) for
 * every key that has no partner.
 */
function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const rows = oldKeys.length
  const cols = newKeys.length

  // table[r][c] = LCS length of oldKeys[0..r) and newKeys[0..c).
  const table: number[][] = []
  for (let r = 0; r <= rows; r++) {
    table.push(new Array<number>(cols + 1).fill(0))
  }
  for (let r = 1; r <= rows; r++) {
    for (let c = 1; c <= cols; c++) {
      table[r][c] =
        oldKeys[r - 1] === newKeys[c - 1] ? table[r - 1][c - 1] + 1 : Math.max(table[r - 1][c], table[r][c - 1])
    }
  }

  // Walk back from the bottom-right corner collecting entries last-to-first,
  // then flip once at the end (cheaper than prepending each entry).
  const reversed: Alignment[] = []
  let r = rows
  let c = cols
  while (r > 0 || c > 0) {
    const keysMatch = r > 0 && c > 0 && oldKeys[r - 1] === newKeys[c - 1]
    if (keysMatch) {
      r--
      c--
      reversed.push({ oldIdx: r, newIdx: c })
      continue
    }
    const takeNew = c > 0 && (r === 0 || table[r][c - 1] >= table[r - 1][c])
    if (takeNew) {
      c--
      reversed.push({ oldIdx: null, newIdx: c })
    } else {
      r--
      reversed.push({ oldIdx: r, newIdx: null })
    }
  }
  return reversed.reverse()
}
1244
+
1245
/**
 * Returns `openingTag` with the class token(s) in `cls` added.
 *
 * The real `class` attribute is located with `findClassAttribute` rather
 * than a flat regex, so text such as `title="see class='x'"` is never
 * mistaken for it. Only tokens not already present are appended, so
 * injecting `mod colspan` into `class="mod"` gives `class="mod colspan"`,
 * never `class="mod mod colspan"`.
 *
 * If the existing value is unquoted (`class=foo`), the rewritten value is
 * wrapped in quotes. Without them the appended token would follow a space
 * and be parsed as a separate, valueless attribute.
 *
 * When the tag has no `class` attribute, ` class='…'` is inserted just
 * before the closing `>` (or `/>`).
 *
 * @param openingTag Full opening tag text, e.g. `<td colspan="2">`.
 * @param cls        Whitespace-separated class tokens to add.
 * @returns The updated tag, or `openingTag` unchanged if there is nothing to add.
 */
function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag

    const merged = [...existingTokens, ...missing].join(' ')

    // A quoted value is immediately preceded by its opening quote; an
    // unquoted one is preceded by '=' or whitespace.
    const before = openingTag[classAttr.valueStart - 1]
    const isQuoted = before === '"' || before === "'"
    let replacement = merged
    if (!isQuoted) {
      // Prefer single quotes; fall back to double if the value contains one.
      const quote = merged.includes("'") ? '"' : "'"
      replacement = `${quote}${merged}${quote}`
    }
    return openingTag.slice(0, classAttr.valueStart) + replacement + openingTag.slice(classAttr.valueEnd)
  }

  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
1272
+
1273
/**
 * Walks the attributes of an opening tag, respecting quoted values, to find
 * the real `class` attribute (name compared case-insensitively).
 *
 * A `/` does not end the walk unless it is the `/>` that closes the tag: a
 * stray solidus between attributes is skipped, and an unquoted value may
 * contain `/` (e.g. `background=/img/x.png`). Stopping at the first `/`
 * would hide any `class` attribute that follows such a value.
 *
 * @param openingTag Full opening tag text starting with `<`.
 * @returns The value text plus its start/end offsets within `openingTag`
 *          (excluding any surrounding quotes), or null when the tag has no
 *          `class` attribute with a value.
 */
function findClassAttribute(openingTag: string): { valueStart: number; valueEnd: number; value: string } | null {
  const len = openingTag.length
  const isSpace = (ch: string): boolean => /\s/.test(ch)

  // Skip '<' and the tag name; everything after is attribute territory.
  let i = 1
  while (i < len && /[A-Za-z0-9_:-]/.test(openingTag[i])) i++

  while (i < len) {
    while (i < len && isSpace(openingTag[i])) i++
    if (i >= len || openingTag[i] === '>') break
    if (openingTag[i] === '/') {
      // Self-closing marker or stray solidus: step over it and keep going
      // (a following '>' ends the walk on the next iteration).
      i++
      continue
    }

    // Attribute name
    const nameStart = i
    while (i < len && !/[\s=>/]/.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace, then '=' if the attribute has a value.
    while (i < len && isSpace(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value): cannot be a usable class.
      continue
    }
    i++ // past '='
    while (i < len && isSpace(openingTag[i])) i++

    let valueStart: number
    let valueEnd: number
    if (openingTag[i] === '"' || openingTag[i] === "'") {
      const quote = openingTag[i]
      i++
      valueStart = i
      while (i < len && openingTag[i] !== quote) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      // Unquoted: runs to whitespace, '>' or the closing '/>'.
      valueStart = i
      while (
        i < len &&
        !isSpace(openingTag[i]) &&
        openingTag[i] !== '>' &&
        !(openingTag[i] === '/' && openingTag[i + 1] === '>')
      ) {
        i++
      }
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
1328
+
1329
/**
 * Scans html for every top-level `<table>...</table>` block and returns its
 * range together with its rows. After a table is recorded the scan resumes
 * past its end, so nested tables are not reported as separate entries; they
 * stay inside the parent's range and are dealt with when the cell-level diff
 * recurses into them.
 *
 * A `<table` whose opening tag cannot be parsed, or that has no matching
 * close, is skipped.
 */
function findTopLevelTables(html: string): TableRange[] {
  const found: TableRange[] = []
  let pos = 0
  while (pos < html.length) {
    if (!matchesTagAt(html, pos, 'table')) {
      pos++
      continue
    }
    const opening = parseOpeningTagAt(html, pos)
    if (!opening) {
      pos++
      continue
    }
    const tableEnd = findMatchingClosingTag(html, opening.end, 'table')
    if (tableEnd === -1) {
      // Unclosed table: step inside it and keep looking.
      pos = opening.end
      continue
    }
    // Rows are searched between the opening tag and the closing `</table>`.
    const rows = findTopLevelRows(html, opening.end, tableEnd - '</table>'.length)
    found.push({ tableStart: pos, tableEnd, rows })
    pos = tableEnd
  }
  return found
}
1361
+
1362
/**
 * Collects the `<tr>` rows found between `start` and `end`, each with its
 * cells. After a row is recorded the scan resumes past its end, so rows
 * belonging to a nested table inside it are not reported here.
 *
 * A `<tr` whose opening tag cannot be parsed, or that has no matching close
 * before `end`, is skipped. Scanning stops early at a `</table>`.
 */
function findTopLevelRows(html: string, start: number, end: number): RowRange[] {
  const found: RowRange[] = []
  let pos = start
  while (pos < end) {
    if (matchesTagAt(html, pos, 'tr')) {
      const opening = parseOpeningTagAt(html, pos)
      if (!opening) {
        pos++
        continue
      }
      const rowEnd = findMatchingClosingTag(html, opening.end, 'tr', end)
      if (rowEnd === -1) {
        // Unclosed row: step inside it and keep looking.
        pos = opening.end
        continue
      }
      const cells = findTopLevelCells(html, opening.end, rowEnd - '</tr>'.length)
      found.push({ rowStart: pos, rowEnd, cells })
      pos = rowEnd
      continue
    }
    // Defensive: a closing </table> means we ran past the table body.
    if (matchesClosingTagAt(html, pos, 'table')) break
    pos++
  }
  return found
}
1392
+
1393
/**
 * Collects the `<td>`/`<th>` cells found between `start` and `end`. After a
 * cell is recorded the scan resumes past its end, so cells of a table nested
 * inside it are not reported here.
 *
 * `contentEnd` is the real start of the closing tag, found by searching
 * backwards from the cell end, not a fixed `</td>` length. The closing-tag
 * matcher accepts whitespace before `>` (`</td >`), and a fixed offset would
 * then pull the start of the closing tag into the cell content.
 *
 * A cell whose opening tag cannot be parsed, or that has no matching close
 * before `end`, is skipped. Scanning stops early at a `</tr>`.
 */
function findTopLevelCells(html: string, start: number, end: number): CellRange[] {
  const cells: CellRange[] = []
  let i = start
  while (i < end) {
    const tagName = matchesTagAt(html, i, 'td') ? 'td' : matchesTagAt(html, i, 'th') ? 'th' : null
    if (tagName !== null) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        i++
        continue
      }
      const contentStart = opening.end
      const cellEnd = findMatchingClosingTag(html, contentStart, tagName, end)
      if (cellEnd === -1) {
        i = opening.end
        continue
      }
      // The closing tag is the last `</` before cellEnd; fall back to the
      // canonical length if that search somehow lands before the content.
      const closeStart = html.lastIndexOf('</', cellEnd - 1)
      const contentEnd = closeStart >= contentStart ? closeStart : cellEnd - `</${tagName}>`.length
      cells.push({ cellStart: i, cellEnd, contentStart, contentEnd })
      i = cellEnd
    } else if (matchesClosingTagAt(html, i, 'tr')) {
      break
    } else {
      i++
    }
  }
  return cells
}
1421
+
1422
/**
 * True when an opening `<tagName` starts at index `i`. The name is compared
 * case-insensitively against `tagName` (expected in lower case) and must be
 * followed by `>`, `/`, space, tab, LF or CR, so `<tdx>` does not match `td`.
 */
function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false

  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '/':
      return true
    default:
      return false
  }
}
1429
+
1430
/**
 * True when a closing `</tagName` starts at index `i`. The name is compared
 * case-insensitively against `tagName` (expected in lower case) and must be
 * followed by `>`, space, tab, LF or CR.
 */
function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  const startsClosingTag = html[i] === '<' && html[i + 1] === '/'
  if (!startsClosingTag) return false

  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true
    default:
      return false
  }
}
1437
+
1438
/** Result of parsing a tag (or comment-like construct) starting at some index. */
interface OpeningTag {
  /** Index just past the construct's terminating `>`. */
  end: number
}
1442
+
1443
/**
 * Finds where the markup construct starting at index `i` ends.
 *
 * Comments (`<!-- -->`), CDATA sections (`<![CDATA[ ]]>`) and processing
 * instructions (`<? ?>`) are ended by their own terminator, because a plain
 * `>` search would cut `<!-- a > b -->` at the inner `>` and corrupt every
 * later offset. Anything else is treated as a tag and ends at the first `>`
 * that is not inside a single- or double-quoted attribute value.
 *
 * @returns `{ end }` with the index just past the terminator, or null when
 *          the construct is never terminated.
 */
function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  let opener = ''
  let terminator = ''
  if (html.startsWith('<!--', i)) {
    opener = '<!--'
    terminator = '-->'
  } else if (html.startsWith('<![CDATA[', i)) {
    opener = '<![CDATA['
    terminator = ']]>'
  } else if (html.startsWith('<?', i)) {
    opener = '<?'
    terminator = '?>'
  }
  if (terminator !== '') {
    const at = html.indexOf(terminator, i + opener.length)
    return at === -1 ? null : { end: at + terminator.length }
  }

  // Ordinary tag: scan for '>' while tracking whether we are inside quotes.
  let openQuote: string | null = null
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (openQuote !== null) {
      if (ch === openQuote) openQuote = null
      continue
    }
    if (ch === '"' || ch === "'") {
      openQuote = ch
      continue
    }
    if (ch === '>') return { end: j + 1 }
  }
  return null
}
1479
+
1480
/**
 * Finds the `</tagName>` that closes an element whose content starts at
 * `from`, accounting for nested elements of the same name.
 *
 * HTML comments are skipped whole. Their content is not markup, so a
 * `<td>` or `</td>` written inside `<!-- ... -->` must not change the
 * nesting depth. An unterminated comment is scanned as plain text.
 *
 * @param html    Html being scanned.
 * @param from    Index just past the element's opening tag.
 * @param tagName Lower-case tag name to balance.
 * @param limit   Scanning stops at this index (defaults to the end of html).
 * @returns Index just past the matching closing tag, or -1 if none starts
 *          before `limit`.
 */
function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
  let depth = 1
  let i = from
  while (i < limit) {
    if (html.startsWith('<!--', i)) {
      const comment = parseOpeningTagAt(html, i)
      if (comment) {
        i = comment.end
        continue
      }
      // Unterminated comment: fall through and advance one character.
    }

    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        i++
        continue
      }
      // `<tag ... />` opens nothing, so it does not deepen the nesting.
      const tagText = html.slice(i, opening.end)
      if (!tagText.endsWith('/>')) depth++
      i = opening.end
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--
      // Use the real end of the closing tag (it may contain whitespace);
      // fall back to the canonical length if no '>' follows.
      const closing = parseOpeningTagAt(html, i)
      const closingEnd = closing?.end ?? i + `</${tagName}>`.length
      if (depth === 0) return closingEnd
      i = closingEnd
    } else {
      i++
    }
  }
  return -1
}