@createiq/htmldiff 1.0.5-beta.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/TableDiff.ts DELETED
@@ -1,1428 +0,0 @@
1
- /**
2
- * Table-aware preprocessing for HtmlDiff.
3
- *
4
- * The word-level diff alone matches longest-common-subsequences across cell
5
- * boundaries and produces structurally wrong output for table edits — it
6
- * shuffles content between cells, introduces phantom `<td>`s, and provides
7
- * no signal that an entire row or column was added/deleted. We pre-process
8
- * the inputs to give Word-style results:
9
- *
10
- * • When dimensions match (same row count, same cell count per row), we
11
- * diff cell content positionally so cross-cell shifts produce one
12
- * independent del/ins per cell.
13
- * • When dimensions don't match (added/deleted row, added/deleted column),
14
- * we run a row-level LCS to identify structurally added/deleted rows,
15
- * then within preserved rows a cell-level LCS to identify added/deleted
16
- * columns. Structurally added rows/cells get `class='diffins'` on the
17
- * `<tr>`/`<td>`; deleted ones get `class='diffdel'`. Preserved cells
18
- * fall back to a content diff via the recursive HtmlDiff callback.
19
- *
20
- * Tables are spliced out into placeholders before the main diff runs and
21
- * spliced back in after, so the surrounding (non-table) content is diffed
22
- * by the normal word-level pipeline.
23
- */
24
-
25
/** Character offsets, into the source HTML string, that delimit one table cell. */
interface CellRange {
  /** Offset at which the cell's opening tag begins. */
  cellStart: number
  /** Offset immediately after the cell's closing tag. */
  cellEnd: number
  /** Offset where the cell's inner content begins; this slice is what the cell-level diff receives. */
  contentStart: number
  /** Offset where the cell's inner content ends (used as an exclusive `slice` bound). */
  contentEnd: number
}
34
-
35
/** Offsets delimiting one table row, plus the cells located inside it. */
interface RowRange {
  /** Offset at which the row begins in the source HTML. */
  rowStart: number
  /** Offset at which the row ends (used as an exclusive `slice` bound). */
  rowEnd: number
  /** The row's cells, in document order. */
  cells: CellRange[]
}
40
-
41
/** Offsets delimiting one table, plus the rows located inside it. */
interface TableRange {
  /** Offset at which the table begins in the source HTML. */
  tableStart: number
  /** Offset at which the table ends (used as an exclusive `slice` bound). */
  tableEnd: number
  /** The table's rows, in document order. */
  rows: RowRange[]
}
46
-
47
/** Outcome of table preprocessing: both inputs with tables swapped for placeholders, plus the diffs to restore. */
export interface PreprocessResult {
  /** The old HTML with every table replaced by its placeholder marker. */
  modifiedOld: string
  /** The new HTML with every table replaced by its placeholder marker. */
  modifiedNew: string
  /** Lookup from placeholder marker to the already-diffed table HTML that should replace it. */
  placeholderToDiff: Map<string, string>
}
53
-
54
// Table placeholders are HTML comments: WordSplitter keeps a comment as one
// atomic token and both sides compare equal, so the marker travels through
// the word-level diff unchanged and is easy to swap back afterwards. A fresh
// nonce is appended to this base prefix on every call, so re-diffing earlier
// output (or any input that legitimately holds an `<!--HTMLDIFF_TABLE_*-->`
// comment) cannot clash with the markers we substitute. The nonce is also
// regenerated if it happens to appear in either input.
const PLACEHOLDER_PREFIX_BASE = '<!--HTMLDIFF_TABLE_'
const PLACEHOLDER_SUFFIX = '-->'
63
-
64
/**
 * Upper bounds on the table dimensions the structure-aware path accepts.
 *
 * Row alignment is an O(rows²) LCS, per-row cell alignment is O(cells²),
 * and every comparison is a string equality over row content that may be
 * many KB long, so an uncapped table with several thousand rows can pin a
 * CPU for seconds. Anything larger than these limits is left to the
 * word-level diff, which scales linearly. The values are tuned to
 * comfortably cover real-world ISDA schedules, which routinely exceed
 * 1000 rows.
 */
const MAX_TABLE_ROWS = 1500
const MAX_TABLE_CELLS_PER_ROW = 200
75
-
76
/**
 * Builds the per-call placeholder prefix `<!--HTMLDIFF_TABLE_<nonce>_`.
 *
 * The nonce is 8 lowercase hex digits. A candidate is rejected and redrawn
 * when the resulting prefix already occurs in either input, so placeholders
 * cannot collide with content that is already present.
 *
 * @param oldHtml - Old document; checked for an accidental prefix collision.
 * @param newHtml - New document; checked for an accidental prefix collision.
 * @returns A prefix absent from both inputs, or, after 8 failed draws, a
 *   timestamp-based fallback prefix that is NOT collision-checked.
 */
function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
  // `Math.random()` lies in [0, 1), so the multiplier must be 2^32 (not
  // 2^32 - 1) for every 32-bit value, including 0xffffffff, to be reachable.
  // `Math.random` is adequate here: the goal is avoiding accidental
  // collisions, not resisting a malicious adversary.
  const nonceSpace = 0x100000000
  const maxAttempts = 8
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const nonce = Math.floor(Math.random() * nonceSpace)
      .toString(16)
      .padStart(8, '0')
    const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
    if (!oldHtml.includes(prefix) && !newHtml.includes(prefix)) {
      return prefix
    }
  }
  // Astronomically unlikely. Falling back guarantees progress, and any
  // remaining collision would simply surface as a malformed diff that the
  // caller can detect.
  return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`
}
95
-
96
/** Callback that diffs one old cell's inner content against one new cell's inner content and returns the diffed markup. */
type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
97
-
98
/**
 * Pairs the top-level tables of both inputs by position, diffs each pair,
 * and swaps every source table for a placeholder comment.
 *
 * @param oldHtml - The old document.
 * @param newHtml - The new document.
 * @param diffCell - Callback used to diff the content of paired cells.
 * @returns The placeholder-substituted inputs and the placeholder→diff map,
 *   or `null` when neither input has tables, the table counts differ, or
 *   any table exceeds the size limits.
 */
export function preprocessTables(oldHtml: string, newHtml: string, diffCell: DiffCellFn): PreprocessResult | null {
  const oldTables = findTopLevelTables(oldHtml)
  const newTables = findTopLevelTables(newHtml)
  const tableCount = oldTables.length

  // Tables are paired one-to-one by position, so the counts must agree and
  // there must be at least one table to process.
  if (tableCount !== newTables.length || tableCount === 0) return null

  // Pathologically large tables are left to the word-level diff.
  const tooLarge = (table: TableRange): boolean => exceedsSizeLimit(table)
  if (oldTables.some(tooLarge) || newTables.some(tooLarge)) return null

  const diffedTables = oldTables.map((oldTable, idx) => diffTable(oldHtml, newHtml, oldTable, newTables[idx], diffCell))

  const placeholderPrefix = makePlaceholderPrefix(oldHtml, newHtml)
  const placeholderToDiff = new Map<string, string>()
  let modifiedOld = oldHtml
  let modifiedNew = newHtml
  // Work backwards so offsets of tables not yet replaced remain valid.
  for (let idx = tableCount - 1; idx >= 0; idx -= 1) {
    const marker = placeholderPrefix + idx + PLACEHOLDER_SUFFIX
    const oldSpan = oldTables[idx]
    const newSpan = newTables[idx]
    placeholderToDiff.set(marker, diffedTables[idx])
    modifiedOld = spliceString(modifiedOld, oldSpan.tableStart, oldSpan.tableEnd, marker)
    modifiedNew = spliceString(modifiedNew, newSpan.tableStart, newSpan.tableEnd, marker)
  }

  return { modifiedOld, modifiedNew, placeholderToDiff }
}
139
-
140
/**
 * Replaces every occurrence of each placeholder marker in the diff output
 * with its already-diffed table HTML.
 *
 * Replacement uses split/join, so the table HTML is inserted literally
 * (no `$`-pattern interpretation).
 *
 * @param diffOutput - Word-level diff output that still contains markers.
 * @param placeholderToDiff - Marker → diffed table HTML, applied in map order.
 * @returns The diff output with all markers substituted.
 */
export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff: Map<string, string>): string {
  return Array.from(placeholderToDiff).reduce(
    (html, [marker, tableHtml]) => html.split(marker).join(tableHtml),
    diffOutput
  )
}
147
-
148
/** Returns `s` with the characters in `[start, end)` replaced by `replacement`. */
function spliceString(s: string, start: number, end: number, replacement: string): string {
  const head = s.slice(0, start)
  const tail = s.slice(end)
  return `${head}${replacement}${tail}`
}
151
-
152
/** True when the table has more than `MAX_TABLE_ROWS` rows or any row has more than `MAX_TABLE_CELLS_PER_ROW` cells. */
function exceedsSizeLimit(table: TableRange): boolean {
  const { rows } = table
  return rows.length > MAX_TABLE_ROWS || rows.some(row => row.cells.length > MAX_TABLE_CELLS_PER_ROW)
}
159
-
160
/**
 * Chooses the diff strategy for one old/new table pair.
 *
 * - Different row counts → row-level structural alignment.
 * - Same row count and same cell count in every row → positional cell diff.
 * - Same row count but differing cell counts → rows are aligned by
 *   position (column add/delete only), which avoids the row-key mismatch
 *   an LCS would hit when rows differ in cell count.
 */
function diffTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const rowCountsMatch = oldTable.rows.length === newTable.rows.length
  if (!rowCountsMatch) {
    return diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diffCell)
  }
  return sameDimensions(oldTable, newTable)
    ? diffPositionalTable(oldHtml, newHtml, oldTable, newTable, diffCell)
    : diffSameRowCountTable(oldHtml, newHtml, oldTable, newTable, diffCell)
}
178
-
179
/**
 * Diffs two tables that have the same number of rows but differing cell
 * counts, pairing rows by position.
 *
 * The new table is copied through verbatim (keeping `<thead>`/`<tbody>`
 * wrappers, whitespace, etc.); only the rows themselves are replaced by
 * their diffed form. A vertical merge is tried first, then a vertical
 * split; either one consumes `span` rows at once. Otherwise the single
 * row is diffed against its positional counterpart.
 */
function diffSameRowCountTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const newRows = newTable.rows
  const pieces: string[] = []
  let copiedUpTo = newTable.tableStart
  for (let r = 0; r < newRows.length; ) {
    const regrouped =
      detectVerticalMerge(oldHtml, newHtml, oldTable, newTable, r) ??
      detectVerticalSplit(oldHtml, newHtml, oldTable, newTable, r)

    // Everything between the previous row and this one passes through as-is.
    pieces.push(newHtml.slice(copiedUpTo, newRows[r].rowStart))

    if (regrouped) {
      pieces.push(regrouped.diff)
      copiedUpTo = newRows[r + regrouped.span - 1].rowEnd
      r += regrouped.span
    } else {
      pieces.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[r], newRows[r], diffCell))
      copiedUpTo = newRows[r].rowEnd
      r += 1
    }
  }
  pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd))
  return pieces.join('')
}
218
-
219
/**
 * Recognises a vertical merge whose first row is `r`.
 *
 * Conditions, all required:
 *  - new row `r` holds exactly one cell, with rowspan K > 1, and rows
 *    `r+1 .. r+K-1` exist in new and hold no cells;
 *  - each of old rows `r .. r+K-1` exists, has colspans summing to the
 *    merged cell's colspan, and contains no cell with a rowspan other
 *    than 1.
 *
 * This covers single-column merges as well as rectangular ones (e.g. a
 * 2×2 region collapsing into one `colspan=2 rowspan=2` cell).
 *
 * @returns The emitted markup (the merged cell tagged `mod rowspan`,
 *   followed by the empty trailing rows copied verbatim) together with the
 *   number of rows consumed, or `null` when the pattern does not match.
 */
function detectVerticalMerge(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const anchorRow = newTable.rows[r]
  if (anchorRow.cells.length !== 1) return null

  const mergedCell = anchorRow.cells[0]
  const span = getRowspan(newHtml, mergedCell)
  if (span <= 1 || r + span > newTable.rows.length) return null

  const mergedWidth = getColspan(newHtml, mergedCell)

  // Rows covered by the rowspan must be empty in new.
  for (let offset = 1; offset < span; offset += 1) {
    if (newTable.rows[r + offset].cells.length !== 0) return null
  }

  // The absorbed old region must be a clean rectangle of the same width.
  for (let offset = 0; offset < span; offset += 1) {
    const absorbedRow = oldTable.rows[r + offset]
    if (!absorbedRow) return null
    if (sumColspans(oldHtml, absorbedRow.cells) !== mergedWidth) return null
    if (absorbedRow.cells.some(c => getRowspan(oldHtml, c) !== 1)) return null
  }

  const pieces: string[] = [
    rowHeaderSlice(newHtml, anchorRow),
    emitSpanChangedCell(newHtml, mergedCell, 'rowspan'),
    '</tr>',
  ]
  for (let offset = 1; offset < span; offset += 1) {
    pieces.push(emitEmptyRow(newHtml, newTable.rows[r + offset]))
  }
  return { diff: pieces.join(''), span }
}
269
-
270
/**
 * Recognises a vertical split whose first row is `r`.
 *
 * Conditions, all required:
 *  - old row `r` holds exactly one cell, with rowspan K > 1, and rows
 *    `r+1 .. r+K-1` exist in old and hold no cells;
 *  - each of new rows `r .. r+K-1` exists, has colspans summing to the old
 *    cell's colspan, and contains no cell with a rowspan other than 1.
 *
 * @returns The emitted markup (every cell of the K new rows tagged
 *   `mod rowspan`) together with the number of rows consumed, or `null`
 *   when the pattern does not match.
 */
function detectVerticalSplit(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const anchorRow = oldTable.rows[r]
  if (anchorRow.cells.length !== 1) return null

  const spanningCell = anchorRow.cells[0]
  const span = getRowspan(oldHtml, spanningCell)
  if (span <= 1 || r + span > oldTable.rows.length) return null

  const originalWidth = getColspan(oldHtml, spanningCell)

  // Rows covered by the old rowspan must have been empty.
  for (let offset = 1; offset < span; offset += 1) {
    if (oldTable.rows[r + offset].cells.length !== 0) return null
  }

  // Each replacement row must span the same logical width, without rowspans.
  const replacementRows: RowRange[] = []
  for (let offset = 0; offset < span; offset += 1) {
    const candidate = newTable.rows[r + offset]
    if (!candidate) return null
    if (sumColspans(newHtml, candidate.cells) !== originalWidth) return null
    if (candidate.cells.some(c => getRowspan(newHtml, c) !== 1)) return null
    replacementRows.push(candidate)
  }

  const pieces: string[] = []
  for (const row of replacementRows) {
    pieces.push(rowHeaderSlice(newHtml, row))
    for (const c of row.cells) {
      pieces.push(emitSpanChangedCell(newHtml, c, 'rowspan'))
    }
    pieces.push('</tr>')
  }
  return { diff: pieces.join(''), span }
}
317
-
318
/** Copies the row's source text (`rowStart` up to `rowEnd`) out of `html` unchanged. */
function emitEmptyRow(html: string, row: RowRange): string {
  const { rowStart, rowEnd } = row
  return html.slice(rowStart, rowEnd)
}
322
-
323
/** True when both tables have the same number of rows and, row by row, the same number of cells. */
function sameDimensions(a: TableRange, b: TableRange): boolean {
  return (
    a.rows.length === b.rows.length &&
    a.rows.every((row, idx) => row.cells.length === b.rows[idx].cells.length)
  )
}
330
-
331
/**
 * Diffs two tables of identical dimensions cell by cell.
 *
 * The new table's markup is copied through untouched (so
 * `<thead>`/`<tbody>` wrappers and whitespace survive); only each cell's
 * content range is replaced by `diffCell(oldContent, newContent)` for the
 * cell at the same row/column position in the old table.
 */
function diffPositionalTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = []
  let copiedUpTo = newTable.tableStart
  newTable.rows.forEach((newRow, r) => {
    const oldRow = oldTable.rows[r]
    newRow.cells.forEach((newCell, c) => {
      const oldCell = oldRow.cells[c]
      // Markup preceding this cell's content is carried over verbatim.
      pieces.push(newHtml.slice(copiedUpTo, newCell.contentStart))
      const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
      const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd)
      pieces.push(diffCell(oldContent, newContent))
      copiedUpTo = newCell.contentEnd
    })
  })
  pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd))
  return pieces.join('')
}
364
-
365
/**
 * Diffs two tables whose row counts differ.
 *
 * Rows are first aligned with an exact LCS over normalised row text, then
 * leftover rows are paired by similarity. The new table is walked from
 * `tableStart` to `tableEnd` so wrappers and inter-row whitespace are kept
 * verbatim while each row is replaced by its diffed form:
 *  - paired rows → per-row diff,
 *  - rows only in new → emitted as inserted,
 *  - rows only in old → emitted as deleted, injected right after whatever
 *    was emitted last.
 *
 * When the new table has no rows at all there is nothing to walk, so the
 * table is rebuilt from scratch instead (deleted rows are still emitted).
 */
function diffStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const oldKeys = oldTable.rows.map(row => rowKey(oldHtml, row))
  const newKeys = newTable.rows.map(row => rowKey(newHtml, row))
  const alignment = pairSimilarUnmatchedRows(lcsAlign(oldKeys, newKeys), oldTable, newTable, oldHtml, newHtml)

  if (newTable.rows.length === 0) {
    return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
  }

  // Emit everything up to the first row immediately, so a leading run of
  // deleted-only entries cannot land in front of the `<table>` opening.
  const firstRowStart = newTable.rows[0].rowStart
  const pieces: string[] = [newHtml.slice(newTable.tableStart, firstRowStart)]
  let copiedUpTo = firstRowStart

  for (const { oldIdx, newIdx } of alignment) {
    if (newIdx === null) {
      // Row exists only in old: inject it at the current position.
      if (oldIdx !== null) {
        pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del', diffCell))
      }
      continue
    }
    const newRow = newTable.rows[newIdx]
    pieces.push(newHtml.slice(copiedUpTo, newRow.rowStart))
    pieces.push(
      oldIdx !== null
        ? diffPreservedRow(oldHtml, newHtml, oldTable.rows[oldIdx], newRow, diffCell)
        : emitFullRow(newHtml, newRow, 'ins', diffCell)
    )
    copiedUpTo = newRow.rowEnd
  }

  pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd))
  return pieces.join('')
}
418
-
419
/**
 * Reconstructs a table from scratch: header slice, then one fully marked
 * row per alignment entry, then a literal `</table>`.
 *
 * Called when the new table has no rows, so there is no new row structure
 * to walk. Entries that have an old index are emitted as deleted rows;
 * entries that only have a new index are emitted as inserted rows.
 */
function rebuildStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  alignment: Alignment[],
  diffCell: DiffCellFn
): string {
  const pieces: string[] = [headerSlice(newHtml, newTable, oldHtml, oldTable)]
  for (const { oldIdx, newIdx } of alignment) {
    if (oldIdx !== null) {
      pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del', diffCell))
    } else if (newIdx !== null) {
      pieces.push(emitFullRow(newHtml, newTable.rows[newIdx], 'ins', diffCell))
    }
  }
  pieces.push('</table>')
  return pieces.join('')
}
441
-
442
/**
 * Returns the markup from the table's start up to its first row.
 *
 * The new table is preferred so attribute changes on `<table>` follow new.
 * When a table has no rows, the header is taken to end `'</table>'.length`
 * characters before `tableEnd`. If that leaves nothing for the new table,
 * the old table's header is used instead.
 */
function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, oldTable: TableRange): string {
  const closingTagLength = '</table>'.length

  const newHeaderEnd = newTable.rows[0]?.rowStart ?? newTable.tableEnd - closingTagLength
  if (newHeaderEnd > newTable.tableStart) {
    return newHtml.slice(newTable.tableStart, newHeaderEnd)
  }

  const oldHeaderEnd = oldTable.rows[0]?.rowStart ?? oldTable.tableEnd - closingTagLength
  return oldHtml.slice(oldTable.tableStart, oldHeaderEnd)
}
452
-
453
/**
 * Builds the key used to match rows exactly: the row's full source text
 * (cell tags included, so rows with different cell counts never match)
 * with each whitespace run collapsed to one space and the ends trimmed,
 * which tolerates formatting-only differences.
 */
function rowKey(html: string, row: RowRange): string {
  const rawRow = html.slice(row.rowStart, row.rowEnd)
  const collapsed = rawRow.replace(/\s+/g, ' ')
  return collapsed.trim()
}
459
-
460
/**
 * Diffs a row that exists on both sides, choosing a strategy by how the
 * cell counts relate:
 *
 *  1. Equal counts → positional cell diff.
 *  2. Different counts that can be explained as a horizontal merge/split
 *     via colspan → the new structure with `mod colspan` markers.
 *  3. A column add/delete of up to `MAX_COLUMN_DELTA` cells in a row no
 *     wider than `MAX_COLUMN_SEARCH_WIDTH` → the inserted/deleted
 *     positions are found by a positional similarity scan and the
 *     remaining cells are paired by position, so a cell that was edited
 *     alongside a column add stays in its column rather than being
 *     orphaned by an exact-match cell LCS.
 *  4. Anything else → generic cell-level LCS.
 *
 * The caps in (3) exist because the scan enumerates C(cells, delta)
 * combinations and would explode on very wide rows.
 */
function diffPreservedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const oldCount = oldRow.cells.length
  const newCount = newRow.cells.length
  if (oldCount === newCount) {
    return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  const viaColspan = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  if (viaColspan !== null) return viaColspan

  const delta = newCount - oldCount
  const columnsChanged = Math.abs(delta)
  const searchIsAffordable =
    columnsChanged > 0 &&
    columnsChanged <= MAX_COLUMN_DELTA &&
    Math.max(oldCount, newCount) <= MAX_COLUMN_SEARCH_WIDTH
  if (!searchIsAffordable) {
    return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  return delta > 0
    ? diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
    : diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
}
501
-
502
// Guardrails for the combinatorial column add/delete search: the largest
// difference in cell count it will handle, and the widest row (in cells)
// it will scan. Rows outside these bounds fall through to the cell-level LCS.
const MAX_COLUMN_DELTA = 6
const MAX_COLUMN_SEARCH_WIDTH = 40
504
-
505
/**
 * Diffs a row in which new has `k` more cells than old.
 *
 * The `k` inserted column positions are chosen by
 * `findBestColumnInsertPositions`. Cells at those positions are emitted as
 * inserted; every other new cell is paired, in order, with the next old
 * cell and emitted as a content diff. The row opens with new's row header
 * and closes with a literal `</tr>`.
 */
function diffMultiColumnAddRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const insertedAt = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml))
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]
  let nextOld = 0
  newRow.cells.forEach((newCell, position) => {
    if (insertedAt.has(position)) {
      pieces.push(emitFullCell(newHtml, newCell, 'ins', diffCell))
      return
    }
    pieces.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[nextOld], newCell, diffCell))
    nextOld += 1
  })
  pieces.push('</tr>')
  return pieces.join('')
}
536
-
537
/**
 * Diffs a row in which old has `k` more cells than new.
 *
 * The `k` deleted column positions are chosen by
 * `findBestColumnDeletePositions`. Old cells at those positions are
 * emitted as deleted; every other old cell is paired, in order, with the
 * next new cell and emitted as a content diff. The row opens with new's
 * row header and closes with a literal `</tr>`.
 */
function diffMultiColumnDeleteRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const deletedAt = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml))
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]
  let nextNew = 0
  oldRow.cells.forEach((oldCell, position) => {
    if (deletedAt.has(position)) {
      pieces.push(emitFullCell(oldHtml, oldCell, 'del', diffCell))
      return
    }
    pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newRow.cells[nextNew], diffCell))
    nextNew += 1
  })
  pieces.push('</tr>')
  return pieces.join('')
}
560
-
561
/**
 * Picks the `k` positions in `newRow` most likely to be inserted columns.
 *
 * Every ascending `k`-combination of new cell positions is scored by
 * skipping those positions, pairing the remaining new cells in order with
 * old cells, and summing `cellSimilarity` over the pairs. The first
 * combination with the strictly highest score wins.
 *
 * Similarity scores are cached per (old index, new index) pair: the same
 * pair recurs across a very large number of combinations, and only a
 * handful of distinct pairs are ever possible.
 * NOTE(review): the cache assumes `cellSimilarity` is deterministic for a
 * given pair of cells — confirm against its definition.
 *
 * @param oldRow - Row from the old table (the narrower row).
 * @param newRow - Row from the new table (the wider row).
 * @param k - Number of inserted columns to locate.
 * @param oldHtml - Source HTML that `oldRow` offsets refer to.
 * @param newHtml - Source HTML that `newRow` offsets refer to.
 * @returns The chosen new-cell positions in ascending order, or an empty
 *   array when no combination is produced.
 */
function findBestColumnInsertPositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  const newCount = newRow.cells.length
  const similarityCache = new Map<number, number>()
  const similarityAt = (oldIdx: number, newIdx: number): number => {
    const key = oldIdx * newCount + newIdx
    let cached = similarityCache.get(key)
    if (cached === undefined) {
      cached = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
      similarityCache.set(key, cached)
    }
    return cached
  }

  let bestPositions: number[] = []
  let bestScore = -1
  for (const combo of combinationsOfRange(newCount, k)) {
    const inserted = new Set(combo)
    let score = 0
    let oldIdx = 0
    for (let newIdx = 0; newIdx < newCount; newIdx++) {
      if (inserted.has(newIdx)) continue
      score += similarityAt(oldIdx, newIdx)
      oldIdx++
    }
    // Strict comparison keeps the earliest combination on ties.
    if (score > bestScore) {
      bestScore = score
      bestPositions = combo
    }
  }
  return bestPositions
}
586
-
587
/**
 * Picks the `k` positions in `oldRow` most likely to be deleted columns.
 *
 * Every ascending `k`-combination of old cell positions is scored by
 * skipping those positions, pairing the remaining old cells in order with
 * new cells, and summing `cellSimilarity` over the pairs. The first
 * combination with the strictly highest score wins.
 *
 * Similarity scores are cached per (old index, new index) pair: the same
 * pair recurs across a very large number of combinations, and only a
 * handful of distinct pairs are ever possible.
 * NOTE(review): the cache assumes `cellSimilarity` is deterministic for a
 * given pair of cells — confirm against its definition.
 *
 * @param oldRow - Row from the old table (the wider row).
 * @param newRow - Row from the new table (the narrower row).
 * @param k - Number of deleted columns to locate.
 * @param oldHtml - Source HTML that `oldRow` offsets refer to.
 * @param newHtml - Source HTML that `newRow` offsets refer to.
 * @returns The chosen old-cell positions in ascending order, or an empty
 *   array when no combination is produced.
 */
function findBestColumnDeletePositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  const oldCount = oldRow.cells.length
  const newCount = newRow.cells.length
  const similarityCache = new Map<number, number>()
  const similarityAt = (oldIdx: number, newIdx: number): number => {
    const key = oldIdx * (newCount + 1) + newIdx
    let cached = similarityCache.get(key)
    if (cached === undefined) {
      cached = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
      similarityCache.set(key, cached)
    }
    return cached
  }

  let bestPositions: number[] = []
  let bestScore = -1
  for (const combo of combinationsOfRange(oldCount, k)) {
    const deleted = new Set(combo)
    let score = 0
    let newIdx = 0
    for (let oldIdx = 0; oldIdx < oldCount; oldIdx++) {
      if (deleted.has(oldIdx)) continue
      score += similarityAt(oldIdx, newIdx)
      newIdx++
    }
    // Strict comparison keeps the earliest combination on ties.
    if (score > bestScore) {
      bestScore = score
      bestPositions = combo
    }
  }
  return bestPositions
}
612
-
613
/**
 * Lazily enumerates every ascending combination of `k` distinct integers
 * drawn from `[0, n)`, in lexicographic order. Each yielded array is a
 * fresh copy. Nothing is yielded when `k` is 0 or exceeds `n`.
 *
 * Implemented iteratively over a single working array of length `k`.
 */
function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
  if (k === 0 || k > n) return

  const current = Array.from({ length: k }, (_unused, position) => position)
  for (;;) {
    yield [...current]

    // Find the rightmost slot that has not yet reached its maximum value.
    let pivot = k - 1
    while (pivot >= 0 && current[pivot] === n - k + pivot) pivot -= 1
    if (pivot < 0) return

    // Advance it, then reset everything to its right to the smallest run.
    current[pivot] += 1
    for (let slot = pivot + 1; slot < k; slot += 1) {
      current[slot] = current[slot - 1] + 1
    }
  }
}
630
-
631
/**
 * Attempts to explain a cell-count difference as horizontal merges and
 * splits by walking both rows in logical-column order (colspans summed).
 *
 *  - Cells with equal colspan are paired and content-diffed.
 *  - A new cell wider than the current old cell must be covered exactly
 *    by a run of old cells (merge); it is emitted tagged `mod colspan`.
 *  - An old cell wider than the current new cell must be covered exactly
 *    by a run of new cells (split); each is emitted tagged `mod colspan`.
 *
 * @returns The diffed row, or `null` when the total widths differ, a run
 *   does not line up exactly, or either side has cells left over. The
 *   caller then falls back to another strategy.
 */
function diffColspanChangedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string | null {
  if (sumColspans(oldHtml, oldRow.cells) !== sumColspans(newHtml, newRow.cells)) return null

  // Starting at `from`, consumes cells until their colspans reach
  // `targetSpan`. Returns the index just past the run, or -1 when the run
  // overshoots or runs out of cells before hitting the target exactly.
  const absorbRun = (html: string, cells: CellRange[], from: number, targetSpan: number): number => {
    let covered = 0
    let next = from
    while (next < cells.length && covered < targetSpan) {
      covered += getColspan(html, cells[next])
      next += 1
    }
    return covered === targetSpan ? next : -1
  }

  const oldCells = oldRow.cells
  const newCells = newRow.cells
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]

  let oldPos = 0
  let newPos = 0
  while (oldPos < oldCells.length && newPos < newCells.length) {
    const oldCell = oldCells[oldPos]
    const newCell = newCells[newPos]
    const oldSpan = getColspan(oldHtml, oldCell)
    const newSpan = getColspan(newHtml, newCell)

    if (oldSpan === newSpan) {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
      oldPos += 1
      newPos += 1
      continue
    }

    if (newSpan > oldSpan) {
      // Horizontal merge: one new cell swallows several old cells.
      const oldRunEnd = absorbRun(oldHtml, oldCells, oldPos, newSpan)
      if (oldRunEnd < 0) return null
      pieces.push(emitSpanChangedCell(newHtml, newCell, 'colspan'))
      oldPos = oldRunEnd
      newPos += 1
      continue
    }

    // Horizontal split: one old cell becomes several new cells.
    const newRunEnd = absorbRun(newHtml, newCells, newPos, oldSpan)
    if (newRunEnd < 0) return null
    for (let idx = newPos; idx < newRunEnd; idx += 1) {
      pieces.push(emitSpanChangedCell(newHtml, newCells[idx], 'colspan'))
    }
    oldPos += 1
    newPos = newRunEnd
  }

  // Both rows must be fully consumed for the interpretation to hold.
  if (oldPos !== oldCells.length || newPos !== newCells.length) return null

  pieces.push('</tr>')
  return pieces.join('')
}
699
-
700
/** Adds up the colspans of the given cells, i.e. the logical column width they cover. */
function sumColspans(html: string, cells: CellRange[]): number {
  return cells.reduce((width, cell) => width + getColspan(html, cell), 0)
}
705
-
706
/** Reads the `colspan` of a cell from its opening-tag text (`cellStart` up to `contentStart`). */
function getColspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'colspan')
}
709
-
710
/** Reads the `rowspan` of a cell from its opening-tag text (`cellStart` up to `contentStart`). */
function getRowspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'rowspan')
}
713
-
714
/**
 * Extracts a `colspan`/`rowspan` value from a cell's opening-tag text.
 *
 * The attribute name is matched case-insensitively and its value may be
 * unquoted, single-quoted or double-quoted. The name must not be preceded
 * by a word character, `-` or `:`, so longer attribute names that merely
 * end in the same word (`aria-colspan`, `data-rowspan`, …) are not
 * mistaken for the real attribute.
 *
 * @param openingTag - Text of the opening tag, e.g. `<td colspan="2">`.
 * @param name - Which span attribute to read.
 * @returns The parsed positive integer, or 1 when the attribute is
 *   missing, zero, or not a finite number.
 */
function parseSpanAttribute(openingTag: string, name: 'colspan' | 'rowspan'): number {
  // A plain `\b` would also match right after the hyphen in `aria-colspan`,
  // so the preceding character is constrained explicitly instead.
  const re =
    name === 'colspan'
      ? /(?:^|[^\w:-])colspan\s*=\s*["']?(\d+)["']?/i
      : /(?:^|[^\w:-])rowspan\s*=\s*["']?(\d+)["']?/i
  const m = re.exec(openingTag)
  if (!m) return 1
  const value = Number.parseInt(m[1], 10)
  return Number.isFinite(value) && value > 0 ? value : 1
}
721
-
722
/**
 * Emits a cell produced by a structural merge or split, adding
 * `mod colspan` or `mod rowspan` to its opening tag's classes.
 *
 * The cell content is passed through unchanged: Word does not track these
 * changes, and wrapping content that did not really change in del/ins
 * would be misleading. If the opening tag cannot be parsed, the cell's
 * source text is returned as-is.
 */
function emitSpanChangedCell(html: string, cell: CellRange, kind: 'colspan' | 'rowspan'): string {
  const opening = parseOpeningTagAt(html, cell.cellStart)
  if (!opening) {
    return html.slice(cell.cellStart, cell.cellEnd)
  }
  const originalOpenTag = html.slice(cell.cellStart, opening.end)
  const taggedOpenTag = injectClass(originalOpenTag, `mod ${kind}`)
  const contentAndClose = html.slice(cell.contentStart, cell.cellEnd)
  return taggedOpenTag + contentAndClose
}
735
-
736
/**
 * Diffs two rows with the same number of cells, pairing cells by position.
 *
 * Output starts with new's row header (so new's `<tr>` attributes win),
 * then copies new's markup from the first cell onwards, replacing only
 * each cell's content range with `diffCell(oldContent, newContent)`.
 */
function diffPositionalRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]

  // Start copying at the first cell; a row without cells has nothing to copy.
  let copiedUpTo = newRow.cells[0]?.cellStart ?? newRow.rowEnd
  newRow.cells.forEach((newCell, position) => {
    const oldCell = oldRow.cells[position]
    pieces.push(newHtml.slice(copiedUpTo, newCell.contentStart))
    const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
    const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd)
    pieces.push(diffCell(oldContent, newContent))
    copiedUpTo = newCell.contentEnd
  })

  pieces.push(newHtml.slice(copiedUpTo, newRow.rowEnd))
  return pieces.join('')
}
764
-
765
/**
 * Generic fallback for rows whose cell counts differ: aligns cells with an
 * exact LCS over normalised cell content, then pairs leftover old/new
 * cells whose content is similar, so an edited cell next to an added
 * column yields a content diff rather than a phantom delete + insert.
 *
 * Paired cells are content-diffed, new-only cells are emitted as
 * inserted, and old-only cells as deleted. The row opens with new's row
 * header and closes with a literal `</tr>`.
 */
function diffStructurallyAlignedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const oldKeys = oldRow.cells.map(cell => cellKey(oldHtml, cell))
  const newKeys = newRow.cells.map(cell => cellKey(newHtml, cell))
  const alignment = pairSimilarUnmatchedCells(lcsAlign(oldKeys, newKeys), oldRow, newRow, oldHtml, newHtml)

  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]
  for (const { oldIdx, newIdx } of alignment) {
    if (oldIdx !== null && newIdx !== null) {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
    } else if (newIdx !== null) {
      pieces.push(emitFullCell(newHtml, newRow.cells[newIdx], 'ins', diffCell))
    } else if (oldIdx !== null) {
      pieces.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell))
    }
  }
  pieces.push('</tr>')
  return pieces.join('')
}
800
-
801
/**
 * Builds the key used to match cells exactly: the cell's content with
 * whitespace runs collapsed to single spaces and the ends trimmed.
 *
 * The key is derived from content, not from the tag's attributes — a
 * column-add typically changes content but not attributes, and matching
 * on attributes alone would mis-pair cells that share content but differ
 * in styling.
 */
function cellKey(html: string, cell: CellRange): string {
  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const collapsed = inner.replace(/\s+/g, ' ')
  return collapsed.trim()
}
808
-
809
/**
 * Emits a row in which every cell is inserted (`kind='ins'`) or deleted
 * (`kind='del'`). The `<tr>` receives `class='diffins'`/`'diffdel'` and
 * each cell is rendered through `emitFullCell` with the same kind; the
 * markup between cells and after the last cell is copied through as-is.
 *
 * If the row's opening tag cannot be parsed, the row is returned verbatim.
 */
function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell: DiffCellFn): string {
  const trTag = parseOpeningTagAt(html, row.rowStart)
  if (trTag === null) return html.slice(row.rowStart, row.rowEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  let rendered = injectClass(html.slice(row.rowStart, trTag.end), marker)

  // `pos` tracks how far into the original row we have copied.
  let pos = trTag.end
  row.cells.forEach(cell => {
    rendered += html.slice(pos, cell.cellStart)
    rendered += emitFullCell(html, cell, kind, diffCell)
    pos = cell.cellEnd
  })

  return rendered + html.slice(pos, row.rowEnd)
}
831
-
832
/**
 * Emits a cell that is entirely inserted or entirely deleted.
 *
 * The cell's opening tag gets `diffins`/`diffdel` injected. Non-blank
 * content has each text run wrapped in `<ins>`/`<del>` while tags pass
 * through untouched, so `<strong>B</strong>` becomes
 * `<strong><ins>B</ins></strong>` — htmldiff's usual shape, without the
 * doubled `<ins>` a full recursive diff would give for new formatting.
 * Blank content is emitted unchanged (class on the cell, no wrapper).
 *
 * If the cell's opening tag cannot be parsed, the cell is returned verbatim.
 * The `_diffCell` parameter is accepted but not used.
 */
function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del', _diffCell: DiffCellFn): string {
  const tdTag = parseOpeningTagAt(html, cell.cellStart)
  if (tdTag === null) return html.slice(cell.cellStart, cell.cellEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  const openTag = injectClass(html.slice(cell.cellStart, tdTag.end), marker)

  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const isBlank = inner.trim().length === 0
  const body = isBlank ? inner : wrapInlineTextRuns(inner, kind)

  const closeTag = html.slice(cell.contentEnd, cell.cellEnd)
  return `${openTag}${body}${closeTag}`
}
851
-
852
/**
 * Wraps each text run that contains non-whitespace in an `<ins>`/`<del>`
 * element (with the matching `diffins`/`diffdel` class) and copies tags
 * and whitespace-only runs through unchanged. Fully-inserted formatted
 * content therefore comes out as `<strong><ins>X</ins></strong>` — the
 * same shape the rest of htmldiff uses for insertions inside formatting.
 *
 * If a `<` starts something that cannot be parsed as a tag, the remainder
 * of the content is emitted verbatim.
 */
function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
  const tag = kind === 'ins' ? 'ins' : 'del'
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'

  let rendered = ''
  let pos = 0
  while (pos < content.length) {
    if (content[pos] === '<') {
      const parsed = parseOpeningTagAt(content, pos)
      if (parsed === null) {
        // Malformed markup: stop interpreting and pass the tail through.
        return rendered + content.slice(pos)
      }
      rendered += content.slice(pos, parsed.end)
      pos = parsed.end
      continue
    }

    // Text run: everything up to the next '<' (or the end of the content).
    const nextTag = content.indexOf('<', pos)
    const stop = nextTag === -1 ? content.length : nextTag
    const text = content.slice(pos, stop)
    rendered += text.trim().length > 0 ? `<${tag} class='${cls}'>${text}</${tag}>` : text
    pos = stop
  }
  return rendered
}
889
-
890
/**
 * Emits a cell present on both sides: new's opening tag, the `diffCell`
 * result for the old vs. new content, then new's closing markup.
 *
 * If new's opening tag cannot be parsed, the new cell is returned verbatim
 * and `diffCell` is not invoked.
 */
function emitDiffedCell(
  oldHtml: string,
  newHtml: string,
  oldCell: CellRange,
  newCell: CellRange,
  diffCell: DiffCellFn
): string {
  const tdTag = parseOpeningTagAt(newHtml, newCell.cellStart)
  if (tdTag === null) return newHtml.slice(newCell.cellStart, newCell.cellEnd)

  const before = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
  const after = newHtml.slice(newCell.contentStart, newCell.contentEnd)

  const openTag = newHtml.slice(newCell.cellStart, tdTag.end)
  const closeTag = newHtml.slice(newCell.contentEnd, newCell.cellEnd)
  return `${openTag}${diffCell(before, after)}${closeTag}`
}
907
-
908
/**
 * Returns the leading part of a row: from the start of `<tr ...>` up to
 * (not including) the first cell's opening tag, which keeps the `<tr>`
 * attributes and any whitespace before the first cell.
 *
 * For a row without cells only the `<tr ...>` opening tag is returned —
 * callers append `</tr>` themselves, so returning the whole `<tr></tr>`
 * would double the close. Returns `''` when the opening tag cannot be
 * parsed.
 */
function rowHeaderSlice(html: string, row: RowRange): string {
  const trTag = parseOpeningTagAt(html, row.rowStart)
  if (trTag === null) return ''

  const headerEnd = row.cells.length === 0 ? trTag.end : row.cells[0].cellStart
  return html.slice(row.rowStart, headerEnd)
}
919
-
920
/**
 * One entry of an old/new alignment. Both indices set means the two
 * positions are paired; a `null` on one side means the entry exists only
 * on the other side (an insertion when `oldIdx` is null, a deletion when
 * `newIdx` is null).
 */
interface Alignment {
  /** Index into the old sequence, or null when the entry is new-only. */
  oldIdx: number | null
  /** Index into the new sequence, or null when the entry is old-only. */
  newIdx: number | null
}
924
-
925
/**
 * Similarity a pair of unmatched rows must exceed to be treated as
 * "the same row, edited" instead of a deleted row plus an inserted row.
 */
const ROW_FUZZY_THRESHOLD = 0.5

/**
 * Similarity a pair of unmatched cells must exceed to be treated as a
 * content edit of one cell. Deliberately tuned the same as
 * ROW_FUZZY_THRESHOLD: cells in legal docs that share most of their
 * content typically ARE the same logical cell with a body edit, so 0.5
 * works for both granularities in practice.
 */
const CELL_FUZZY_THRESHOLD = 0.5
935
-
936
/**
 * Row-level fuzzy pairing. After the exact LCS, runs of unmatched old and
 * new rows are scanned and rows whose content is similar enough (above
 * ROW_FUZZY_THRESHOLD) are paired, so a small edit — a typo fix, a single
 * changed word — is shown as an edited row rather than a whole row
 * vanishing and another appearing, which is what users expect from a
 * track-changes view.
 */
function pairSimilarUnmatchedRows(
  alignment: Alignment[],
  oldTable: TableRange,
  newTable: TableRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  const scoreRows = (oldIdx: number, newIdx: number): number =>
    rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)

  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRows)
}
955
-
956
/**
 * Cell-level fuzzy pairing: pairs unmatched old/new cells of one row whose
 * content similarity exceeds CELL_FUZZY_THRESHOLD.
 */
function pairSimilarUnmatchedCells(
  alignment: Alignment[],
  oldRow: RowRange,
  newRow: RowRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  const scoreCells = (oldIdx: number, newIdx: number): number =>
    cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)

  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCells)
}
967
-
968
/**
 * Fuzzy-pairs unmatched entries of an alignment.
 *
 * Within each contiguous run of one-sided entries, every deletion (taken
 * in order) claims the still-unclaimed insertion with the highest
 * similarity, provided that similarity is strictly greater than
 * `threshold`. The output is then built by walking the original alignment
 * and substituting each pair at the *insertion's* position (the paired
 * deletion is dropped). Emitting at the insertion keeps the result
 * monotonic in `newIdx`, which matters because downstream emission walks
 * new's html with a cursor; emitting at the deletion's position could
 * break that ordering when unpaired entries sit between the two
 * (column-add + row-add together, content-edit + column-add, etc.).
 *
 * Generic over what is being paired — rows or cells — via `similarity`.
 *
 * @param alignment - Alignment produced by the exact LCS.
 * @param threshold - Similarity a candidate must strictly exceed.
 * @param similarity - Scores an (old index, new index) candidate pair.
 * @returns A new alignment array; unpaired entries are reused as-is.
 */
function pairSimilarUnmatched(
  alignment: Alignment[],
  threshold: number,
  similarity: (oldIdx: number, newIdx: number) => number
): Alignment[] {
  const isMatched = (entry: Alignment): boolean => entry.oldIdx !== null && entry.newIdx !== null
  const isOneSided = (entry: Alignment): boolean => (entry.oldIdx === null) !== (entry.newIdx === null)

  const delForIns = new Map<number, number>() // ins position → paired del position
  const pairedDels = new Set<number>()

  let pos = 0
  while (pos < alignment.length) {
    if (isMatched(alignment[pos])) {
      pos += 1
      continue
    }

    // Collect the positions of one run of one-sided entries.
    const delPositions: number[] = []
    const insPositions: number[] = []
    while (pos < alignment.length && isOneSided(alignment[pos])) {
      if (alignment[pos].oldIdx !== null) {
        delPositions.push(pos)
      } else {
        insPositions.push(pos)
      }
      pos += 1
    }

    // Greedy: each deletion takes its best remaining insertion in this run.
    for (const delPos of delPositions) {
      let winner = -1
      let winnerScore = threshold
      for (const insPos of insPositions) {
        if (delForIns.has(insPos)) continue
        const score = similarity(alignment[delPos].oldIdx as number, alignment[insPos].newIdx as number)
        if (score > winnerScore) {
          winnerScore = score
          winner = insPos
        }
      }
      if (winner >= 0) {
        delForIns.set(winner, delPos)
        pairedDels.add(delPos)
      }
    }
  }

  const merged: Alignment[] = []
  alignment.forEach((entry, at) => {
    // A paired deletion is emitted later, at its insertion's position.
    if (pairedDels.has(at)) return
    const delPos = delForIns.get(at)
    if (delPos === undefined) {
      merged.push(entry)
    } else {
      merged.push({ oldIdx: alignment[delPos].oldIdx, newIdx: entry.newIdx })
    }
  })
  return merged
}
1040
-
1041
/**
 * Similarity of two rows, computed on their normalised text with the
 * combined metric shared by row-level and cell-level fuzzy pairing. That
 * metric is the MAX of two complementary measures:
 *
 *   1. **Character prefix+suffix similarity** — the fraction of the longer
 *      string covered by the shared prefix plus the shared suffix. Catches
 *      small edits in the middle of a string (one word changed in a row);
 *      misses cases where the common bulk is in the middle and the ends
 *      differ.
 *
 *   2. **Token Jaccard similarity** — intersection-over-union of the
 *      whitespace-split tokens. Catches "mostly the same content, bookended
 *      by different bits" — e.g. a row whose only edit is a column added at
 *      the start and another at the end, where the ~50 matching characters
 *      in the middle would be invisible to prefix+suffix.
 *
 * Either measure exceeding the threshold means "pair"; neither alone is
 * sufficient for the full range of legal-doc edits seen in production
 * tables.
 */
function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
  const before = rowText(oldHtml, oldRow)
  const after = rowText(newHtml, newRow)
  return textSimilarity(before, after)
}
1065
-
1066
/** Similarity of two cells, computed on their normalised text content. */
function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
  const before = cellText(oldHtml, oldCell)
  const after = cellText(newHtml, newCell)
  return textSimilarity(before, after)
}
1069
-
1070
/**
 * Similarity of two strings: 1 for identical strings, 0 when exactly one
 * side is empty, otherwise the larger of the character prefix+suffix
 * score and the token Jaccard score.
 */
function textSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a === '' || b === '') return 0

  const byChars = charPrefixSuffixSimilarity(a, b)
  const byTokens = tokenJaccardSimilarity(a, b)
  return Math.max(byChars, byTokens)
}
1075
-
1076
/**
 * Fraction of the longer string covered by the common prefix plus the
 * common suffix of `a` and `b`. The suffix scan is bounded so it never
 * re-counts characters already consumed by the prefix.
 */
function charPrefixSuffixSimilarity(a: string, b: string): number {
  const shorter = Math.min(a.length, b.length)
  const longer = Math.max(a.length, b.length)

  let head = 0
  for (; head < shorter; head++) {
    if (a[head] !== b[head]) break
  }

  // Characters left in the shorter string once the prefix is removed.
  const tailBudget = shorter - head
  let tail = 0
  while (tail < tailBudget && a[a.length - 1 - tail] === b[b.length - 1 - tail]) {
    tail += 1
  }

  return (head + tail) / longer
}
1092
-
1093
/**
 * Jaccard index (intersection size over union size) of the distinct
 * whitespace-separated tokens of `a` and `b`. Two token-less inputs count
 * as fully similar (1).
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokenize = (text: string): Set<string> => new Set(text.split(/\s+/).filter(token => token.length > 0))

  const left = tokenize(a)
  const right = tokenize(b)
  if (left.size === 0 && right.size === 0) return 1

  const shared = Array.from(left).filter(token => right.has(token)).length
  const unionSize = left.size + right.size - shared
  return unionSize === 0 ? 0 : shared / unionSize
}
1104
-
1105
/**
 * Normalised text of a row for similarity scoring: each cell's content
 * with tags replaced by spaces, cells joined by a space, whitespace runs
 * collapsed, ends trimmed and everything lower-cased.
 */
function rowText(html: string, row: RowRange): string {
  const cellTexts = row.cells.map(cell => html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' '))
  const joined = cellTexts.join(' ')
  return joined.replace(/\s+/g, ' ').trim().toLowerCase()
}
1112
-
1113
/**
 * Normalised text of a cell for similarity scoring: tags replaced by
 * spaces, whitespace runs collapsed, ends trimmed, lower-cased.
 */
function cellText(html: string, cell: CellRange): string {
  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const withoutTags = inner.replace(/<[^>]+>/g, ' ')
  const collapsed = withoutTags.replace(/\s+/g, ' ')
  return collapsed.trim().toLowerCase()
}
1121
-
1122
/**
 * Standard LCS alignment of two key sequences. The result lists, in
 * sequence order, entries with both `oldIdx` and `newIdx` set for matched
 * positions and entries with one side `null` for keys present on only the
 * other side. Keys are compared with strict `===`.
 */
function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const oldCount = oldKeys.length
  const newCount = newKeys.length

  // lengths[r][c] = LCS length of the first r old keys and first c new keys.
  const lengths: number[][] = []
  for (let r = 0; r <= oldCount; r++) {
    lengths.push(new Array<number>(newCount + 1).fill(0))
  }
  for (let r = 1; r <= oldCount; r++) {
    for (let c = 1; c <= newCount; c++) {
      lengths[r][c] =
        oldKeys[r - 1] === newKeys[c - 1] ? lengths[r - 1][c - 1] + 1 : Math.max(lengths[r - 1][c], lengths[r][c - 1])
    }
  }

  // Backtrack from the bottom-right corner, collecting entries last-to-first.
  const backwards: Alignment[] = []
  let r = oldCount
  let c = newCount
  while (r > 0 || c > 0) {
    if (r > 0 && c > 0 && oldKeys[r - 1] === newKeys[c - 1]) {
      backwards.push({ oldIdx: r - 1, newIdx: c - 1 })
      r -= 1
      c -= 1
    } else if (c > 0 && (r === 0 || lengths[r][c - 1] >= lengths[r - 1][c])) {
      backwards.push({ oldIdx: null, newIdx: c - 1 })
      c -= 1
    } else {
      backwards.push({ oldIdx: r - 1, newIdx: null })
      r -= 1
    }
  }
  return backwards.reverse()
}
1160
-
1161
/**
 * Returns the opening tag with the given class (or space-separated
 * classes) injected.
 *
 * The real `class` attribute is located via attribute-aware walking (NOT a
 * flat regex — that would mis-match inside a foreign attribute value like
 * `title="see class='x'"`). Existing classes are preserved; only the
 * tokens of `cls` that are not already present get appended, so injecting
 * `mod colspan` into `class="mod"` never yields `class="mod mod colspan"`.
 *
 * When the existing class value is unquoted (`class=foo`), the rewritten
 * value is wrapped in single quotes: an unquoted attribute value cannot
 * contain a space, so `class=foo diffins` would turn the injected token
 * into a separate bare attribute.
 *
 * When the tag has no `class` attribute, ` class='…'` is inserted before
 * the closing `>` (or `/>` for a self-closing tag).
 *
 * @param openingTag - The full opening tag text, from `<` through `>`.
 * @param cls - Class token(s) to inject; blank input returns the tag as-is.
 * @returns The opening tag carrying the requested class token(s).
 */
function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag

    const updatedValue = [...existingTokens, ...missing].join(' ')
    // The value range excludes surrounding quotes, so the character just
    // before it tells us whether the value was quoted in the source.
    const opener = openingTag[classAttr.valueStart - 1]
    const isQuoted = opener === '"' || opener === "'"
    const replacement = isQuoted ? updatedValue : `'${updatedValue}'`
    return openingTag.slice(0, classAttr.valueStart) + replacement + openingTag.slice(classAttr.valueEnd)
  }

  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
1192
-
1193
/**
 * Walks the opening tag's attributes (respecting quoted values) to find
 * the actual `class` attribute.
 *
 * Unquoted attribute values run until whitespace or `>`, so a value such
 * as `background=/img.png` is consumed whole and does not stop the scan
 * before a later `class` attribute. The one exception is the `/` of a
 * trailing `/>`: it is treated as the self-closing marker rather than as
 * part of the last unquoted value. A stray `/` between attributes is
 * skipped.
 *
 * @param openingTag - The full opening tag text, starting at `<`.
 * @returns The value range (start/end of the value content, *excluding*
 *   any surrounding quotes) and the value itself, or null if the tag has
 *   no `class` attribute with a value.
 */
function findClassAttribute(openingTag: string): { valueStart: number; valueEnd: number; value: string } | null {
  const len = openingTag.length

  // Skip past the tag name. Tag starts with `<`; the following run of
  // name characters is the tag name. Anything after is attribute territory.
  let i = 1
  while (i < len && /[A-Za-z0-9_:-]/.test(openingTag[i])) i++

  while (i < len) {
    // Skip whitespace
    while (i < len && /\s/.test(openingTag[i])) i++
    if (i >= len || openingTag[i] === '>') break
    if (openingTag[i] === '/') {
      // Self-closing marker or stray slash — not an attribute; keep scanning.
      i++
      continue
    }

    // Read attribute name
    const nameStart = i
    while (i < len && !/[\s=>/]/.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace + '=' + optional whitespace + value
    while (i < len && /\s/.test(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value) — not class
      continue
    }
    i++ // past '='
    while (i < len && /\s/.test(openingTag[i])) i++

    // Value: quoted or unquoted
    let valueStart: number
    let valueEnd: number
    if (openingTag[i] === '"' || openingTag[i] === "'") {
      const quote = openingTag[i]
      i++
      valueStart = i
      while (i < len && openingTag[i] !== quote) i++
      valueEnd = i
      if (i < len) i++ // past closing quote
    } else {
      valueStart = i
      while (i < len && !/[\s>]/.test(openingTag[i])) i++
      valueEnd = i
      // `<td class=foo/>`: the slash belongs to the `/>` terminator.
      if (valueEnd > valueStart && openingTag[valueEnd - 1] === '/' && i === len - 1 && openingTag[i] === '>') {
        valueEnd--
      }
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
1248
-
1249
/**
 * Scans html and returns a range for every top-level `<table>...</table>`
 * block, each with its top-level rows. Nested tables are not reported as
 * separate entries: the scan resumes after the outer table's end, so they
 * stay inside the parent's range and are handled when the cell-level diff
 * recurses through them.
 *
 * A `<table` whose opening tag cannot be parsed, or which has no matching
 * close, is skipped.
 */
function findTopLevelTables(html: string): TableRange[] {
  const found: TableRange[] = []
  let pos = 0
  while (pos < html.length) {
    if (!matchesTagAt(html, pos, 'table')) {
      pos += 1
      continue
    }

    const openTag = parseOpeningTagAt(html, pos)
    if (openTag === null) {
      pos += 1
      continue
    }

    const bodyStart = openTag.end
    const tableEnd = findMatchingClosingTag(html, bodyStart, 'table')
    if (tableEnd === -1) {
      // Unclosed table: step past its opening tag and keep looking.
      pos = bodyStart
      continue
    }

    const bodyEnd = tableEnd - '</table>'.length
    found.push({ tableStart: pos, tableEnd, rows: findTopLevelRows(html, bodyStart, bodyEnd) })
    pos = tableEnd
  }
  return found
}
1281
-
1282
/**
 * Returns the `<tr>` rows found between `start` and `end`, each with its
 * top-level cells. Scanning resumes after each row's matching `</tr>`, so
 * rows nested inside a row's content are not listed separately.
 *
 * A `<tr` whose opening tag cannot be parsed, or which has no matching
 * close before `end`, is skipped.
 */
function findTopLevelRows(html: string, start: number, end: number): RowRange[] {
  const found: RowRange[] = []
  let pos = start
  while (pos < end) {
    if (matchesTagAt(html, pos, 'tr')) {
      const openTag = parseOpeningTagAt(html, pos)
      if (openTag === null) {
        pos += 1
        continue
      }

      const bodyStart = openTag.end
      const rowEnd = findMatchingClosingTag(html, bodyStart, 'tr', end)
      if (rowEnd === -1) {
        pos = bodyStart
        continue
      }

      const bodyEnd = rowEnd - '</tr>'.length
      found.push({ rowStart: pos, rowEnd, cells: findTopLevelCells(html, bodyStart, bodyEnd) })
      pos = rowEnd
      continue
    }

    // Defensive: a closing </table> while scanning rows means we ran past
    // the table body (we should have stopped at `end` already).
    if (matchesClosingTagAt(html, pos, 'table')) break

    pos += 1
  }
  return found
}
1312
-
1313
/**
 * Returns the `<td>`/`<th>` cells found between `start` and `end`.
 * Scanning resumes after each cell's matching close tag, so cells nested
 * inside a cell's content are not listed separately; it stops early at a
 * `</tr>`.
 *
 * `contentEnd` is the real start of the cell's closing tag. It is located
 * by searching backwards from the cell's end rather than by subtracting
 * the length of a canonical `</td>`, because closing tags may legally
 * contain whitespace before `>` (`</td >`); assuming the canonical length
 * would leak the closing tag's `<` into the cell content.
 *
 * A cell whose opening tag cannot be parsed, or which has no matching
 * close before `end`, is skipped.
 */
function findTopLevelCells(html: string, start: number, end: number): CellRange[] {
  const cells: CellRange[] = []
  let i = start
  while (i < end) {
    const isTd = matchesTagAt(html, i, 'td')
    if (isTd || matchesTagAt(html, i, 'th')) {
      const tagName = isTd ? 'td' : 'th'
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        i++
        continue
      }
      const contentStart = opening.end
      const cellEnd = findMatchingClosingTag(html, contentStart, tagName, end)
      if (cellEnd === -1) {
        i = opening.end
        continue
      }
      // Last `</` before the cell's end is where its closing tag begins.
      // Fall back to the canonical tag length if that lands before the
      // content (should not happen for a well-formed match).
      const closingStart = html.lastIndexOf('</', cellEnd - 1)
      const contentEnd = closingStart >= contentStart ? closingStart : cellEnd - `</${tagName}>`.length
      cells.push({ cellStart: i, cellEnd, contentStart, contentEnd })
      i = cellEnd
    } else if (matchesClosingTagAt(html, i, 'tr')) {
      break
    } else {
      i++
    }
  }
  return cells
}
1341
-
1342
/**
 * True when an opening tag named `tagName` starts at index `i`: a `<`,
 * the name (compared case-insensitively against the lower-case `tagName`),
 * then one of `>`, space, tab, newline, carriage return or `/`. The
 * trailing-character check keeps `<td` from matching `<tdx`.
 */
function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false

  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '/':
      return true
    default:
      return false
  }
}
1349
-
1350
/**
 * True when a closing tag named `tagName` starts at index `i`: `</`, the
 * name (compared case-insensitively against the lower-case `tagName`),
 * then one of `>`, space, tab, newline or carriage return.
 */
function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  const startsClose = html[i] === '<' && html[i + 1] === '/'
  if (!startsClose) return false

  const nameEnd = i + 2 + tagName.length
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true
    default:
      return false
  }
}
1357
-
1358
/** Result of parsing a tag (or comment-like construct) that starts at a known index. */
interface OpeningTag {
  /** Index of the first character after the construct's terminating `>`. */
  end: number
}
1362
-
1363
/**
 * Finds where the markup construct starting at index `i` ends.
 *
 * HTML comments, CDATA sections and processing instructions are closed by
 * their own terminators (`-->`, `]]>`, `?>`): a plain `>`-walker would cut
 * a comment like `<!-- a > b -->` at the first inner `>`, treat the rest
 * as text and corrupt downstream offsets. Word-exported HTML routinely
 * puts comments inside tables (conditional comments, OLE markers), so this
 * is a practical concern rather than a theoretical one.
 *
 * Anything else is treated as a tag and ends at the next `>` that is not
 * inside a single- or double-quoted attribute value.
 *
 * @returns The end offset (just past the terminator), or null when no
 *   terminator is found.
 */
function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  // [opener, terminator] pairs, checked in this order.
  const specialForms: Array<[string, string]> = [
    ['<!--', '-->'],
    ['<![CDATA[', ']]>'],
    ['<?', '?>']
  ]
  for (const [opener, terminator] of specialForms) {
    if (!html.startsWith(opener, i)) continue
    const at = html.indexOf(terminator, i + opener.length)
    return at === -1 ? null : { end: at + terminator.length }
  }

  // Regular tag: scan for the first '>' outside quotes, so attribute
  // values containing a literal '>' do not end the tag early.
  let openQuote: string | null = null
  for (let k = i + 1; k < html.length; k++) {
    const ch = html[k]
    if (openQuote !== null) {
      if (ch === openQuote) openQuote = null
      continue
    }
    if (ch === '"' || ch === "'") {
      openQuote = ch
    } else if (ch === '>') {
      return { end: k + 1 }
    }
  }
  return null
}
1399
-
1400
/**
 * Finds the `</tagName>` that closes an element whose content starts at
 * `from`, taking nested elements of the same name into account
 * (self-closing `<tagName .../>` does not open a new level).
 *
 * @param html - Html being scanned.
 * @param from - Index just past the element's opening tag.
 * @param tagName - Lower-case tag name to balance.
 * @param limit - Scanning stops at this index (defaults to the html length).
 * @returns The index just past the matching closing tag, or -1 if none
 *   starts before `limit`.
 */
function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
  let openLevels = 1
  let pos = from
  while (pos < limit) {
    if (matchesTagAt(html, pos, tagName)) {
      const nested = parseOpeningTagAt(html, pos)
      if (nested === null) {
        pos += 1
        continue
      }
      const isSelfClosing = html.slice(pos, nested.end).endsWith('/>')
      if (!isSelfClosing) openLevels += 1
      pos = nested.end
      continue
    }

    if (!matchesClosingTagAt(html, pos, tagName)) {
      pos += 1
      continue
    }

    openLevels -= 1
    // Reuse the tag walker to find the closing tag's `>`; if it has none,
    // assume the canonical `</tagName>` length.
    const parsedClose = parseOpeningTagAt(html, pos)
    const afterClose = parsedClose !== null ? parsedClose.end : pos + `</${tagName}>`.length
    if (openLevels === 0) return afterClose
    pos = afterClose
  }
  return -1
}