@createiq/htmldiff 1.0.5-beta.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +5 -973
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +2 -16
- package/dist/HtmlDiff.d.mts +2 -16
- package/dist/HtmlDiff.mjs +5 -973
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +3 -3
- package/src/HtmlDiff.ts +5 -51
- package/test/HtmlDiff.spec.ts +1 -1
- package/.claude/settings.local.json +0 -15
- package/src/TableDiff.ts +0 -1428
- package/test/HtmlDiff.tables.matrix.spec.ts +0 -327
- package/test/HtmlDiff.tables.spec.ts +0 -1458
- package/test/TableDiff.bench.ts +0 -244
package/src/TableDiff.ts
DELETED
|
@@ -1,1428 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Table-aware preprocessing for HtmlDiff.
|
|
3
|
-
*
|
|
4
|
-
* The word-level diff alone matches longest-common-subsequences across cell
|
|
5
|
-
* boundaries and produces structurally wrong output for table edits — it
|
|
6
|
-
* shuffles content between cells, introduces phantom `<td>`s, and provides
|
|
7
|
-
* no signal that an entire row or column was added/deleted. We pre-process
|
|
8
|
-
* the inputs to give Word-style results:
|
|
9
|
-
*
|
|
10
|
-
* • When dimensions match (same row count, same cell count per row), we
|
|
11
|
-
* diff cell content positionally so cross-cell shifts produce one
|
|
12
|
-
* independent del/ins per cell.
|
|
13
|
-
* • When dimensions don't match (added/deleted row, added/deleted column),
|
|
14
|
-
* we run a row-level LCS to identify structurally added/deleted rows,
|
|
15
|
-
* then within preserved rows a cell-level LCS to identify added/deleted
|
|
16
|
-
* columns. Structurally added rows/cells get `class='diffins'` on the
|
|
17
|
-
* `<tr>`/`<td>`; deleted ones get `class='diffdel'`. Preserved cells
|
|
18
|
-
* fall back to a content diff via the recursive HtmlDiff callback.
|
|
19
|
-
*
|
|
20
|
-
* Tables are spliced out into placeholders before the main diff runs and
|
|
21
|
-
* spliced back in after, so the surrounding (non-table) content is diffed
|
|
22
|
-
* by the normal word-level pipeline.
|
|
23
|
-
*/
|
|
24
|
-
|
|
25
|
-
/** Character offsets locating a single table cell inside its source HTML string. */
interface CellRange {
  /** Offset at which the cell's opening tag begins. */
  cellStart: number
  /** Offset immediately after the cell's closing tag. */
  cellEnd: number
  /** Offset at which the cell's inner content begins (end of the opening tag). */
  contentStart: number
  /** Exclusive end offset of the inner content; `[contentStart, contentEnd)` is what gets diffed. */
  contentEnd: number
}
|
|
34
|
-
|
|
35
|
-
/** Character offsets locating one table row, together with the cells found inside it. */
interface RowRange {
  /** Offset at which the row's source text begins. */
  rowStart: number
  /** Exclusive end offset of the row's source text. */
  rowEnd: number
  /** Cells belonging to this row, in document order. */
  cells: CellRange[]
}
|
|
40
|
-
|
|
41
|
-
/** Character offsets locating one table, together with the rows found inside it. */
interface TableRange {
  /** Offset at which the table's source text begins. */
  tableStart: number
  /** Exclusive end offset of the table's source text. */
  tableEnd: number
  /** Rows belonging to this table, in document order. */
  rows: RowRange[]
}
|
|
46
|
-
|
|
47
|
-
/** Outcome of table preprocessing: both inputs with tables swapped for placeholders, plus the diffs to restore. */
export interface PreprocessResult {
  /** The old HTML with every table replaced by its placeholder. */
  modifiedOld: string
  /** The new HTML with every table replaced by its placeholder. */
  modifiedNew: string
  /** Placeholder marker → already-diffed table HTML that should replace it in the final output. */
  placeholderToDiff: Map<string, string>
}
|
|
53
|
-
|
|
54
|
-
// HTML comments survive WordSplitter as a single atomic token and are
|
|
55
|
-
// treated as equal on both sides, so they pass through the diff
|
|
56
|
-
// untouched and are easy to substitute back later. The nonce is generated
|
|
57
|
-
// per call so a previously-diffed document being re-diffed (or any input
|
|
58
|
-
// that legitimately contains an `<!--HTMLDIFF_TABLE_*-->` comment) can't
|
|
59
|
-
// collide with the placeholder we substitute. We additionally regenerate
|
|
60
|
-
// the nonce if it appears in either input.
|
|
61
|
-
const PLACEHOLDER_PREFIX_BASE = '<!--HTMLDIFF_TABLE_'
|
|
62
|
-
const PLACEHOLDER_SUFFIX = '-->'
|
|
63
|
-
|
|
64
|
-
/**
|
|
65
|
-
* Hard cap on table dimensions handled by the structural-aware path.
|
|
66
|
-
* The row-LCS is O(rows²), the per-row cell-LCS is O(cells²), and each
|
|
67
|
-
* comparison string-equals row content (potentially many KB). Without a
|
|
68
|
-
* cap, a several-thousand-row table can pin a CPU for seconds. Tables
|
|
69
|
-
* larger than this fall through to the word-level diff, which scales
|
|
70
|
-
* linearly. Tuned to comfortably cover real-world ISDA schedules
|
|
71
|
-
* (which routinely have 1000+ rows).
|
|
72
|
-
*/
|
|
73
|
-
const MAX_TABLE_ROWS = 1500
|
|
74
|
-
const MAX_TABLE_CELLS_PER_ROW = 200
|
|
75
|
-
|
|
76
|
-
/**
 * Builds a placeholder prefix (`<!--HTMLDIFF_TABLE_<nonce>_`) that occurs in
 * neither input, so substituted placeholders cannot be confused with text the
 * caller supplied.
 *
 * @param oldHtml - The old document; the prefix must not occur in it.
 * @param newHtml - The new document; the prefix must not occur in it.
 * @returns A prefix string that is absent from both inputs.
 */
function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
  const isUnused = (candidate: string): boolean => !oldHtml.includes(candidate) && !newHtml.includes(candidate)

  // 8 hex chars of randomness. `Math.random` is sufficient: this only avoids
  // accidental collisions, it is not a defence against a malicious adversary.
  for (let attempt = 0; attempt < 8; attempt++) {
    // Scale by 2^32 (not 0xffffffff) so every 8-digit value, including
    // `ffffffff`, is reachable.
    const nonce = Math.floor(Math.random() * 0x100000000)
      .toString(16)
      .padStart(8, '0')
    const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`
    if (isUnused(prefix)) return prefix
  }

  // Astronomically unlikely. Walk a counter until the candidate is absent from
  // both inputs: the inputs are finite, so this terminates, and unlike an
  // unchecked fallback it can never hand back a colliding prefix.
  const stamp = Date.now()
  for (let counter = 0; ; counter++) {
    const prefix = `${PLACEHOLDER_PREFIX_BASE}fallback_${stamp}_${counter}_`
    if (isUnused(prefix)) return prefix
  }
}
|
|
95
|
-
|
|
96
|
-
type DiffCellFn = (oldCellContent: string, newCellContent: string) => string
|
|
97
|
-
|
|
98
|
-
/**
 * Pairs the top-level tables of both inputs by position, diffs each pair, and
 * swaps every source table for a placeholder comment.
 *
 * @param oldHtml - The old document.
 * @param newHtml - The new document.
 * @param diffCell - Callback producing the diff of two cell-content strings.
 * @returns The placeholder-substituted inputs plus the placeholder→diff map,
 *   or `null` when there are no tables, the table counts differ, or any table
 *   exceeds the size limits.
 */
export function preprocessTables(oldHtml: string, newHtml: string, diffCell: DiffCellFn): PreprocessResult | null {
  const oldTables = findTopLevelTables(oldHtml)
  const newTables = findTopLevelTables(newHtml)

  const tableCount = oldTables.length
  if (tableCount !== newTables.length || tableCount === 0) return null

  // Refuse pathologically large tables before doing any diff work.
  const tooLarge = oldTables.some(table => exceedsSizeLimit(table)) || newTables.some(table => exceedsSizeLimit(table))
  if (tooLarge) return null

  const diffedTables = oldTables.map((oldTable, index) =>
    diffTable(oldHtml, newHtml, oldTable, newTables[index], diffCell)
  )

  const placeholderPrefix = makePlaceholderPrefix(oldHtml, newHtml)
  const placeholderToDiff = new Map<string, string>()
  let modifiedOld = oldHtml
  let modifiedNew = newHtml

  // Replace the last table first: offsets of earlier tables are then still
  // valid against the partially rewritten strings.
  for (let index = tableCount - 1; index >= 0; index--) {
    const placeholder = `${placeholderPrefix}${index}${PLACEHOLDER_SUFFIX}`
    const oldTable = oldTables[index]
    const newTable = newTables[index]
    placeholderToDiff.set(placeholder, diffedTables[index])
    modifiedOld = spliceString(modifiedOld, oldTable.tableStart, oldTable.tableEnd, placeholder)
    modifiedNew = spliceString(modifiedNew, newTable.tableStart, newTable.tableEnd, placeholder)
  }

  return { modifiedOld, modifiedNew, placeholderToDiff }
}
|
|
139
|
-
|
|
140
|
-
/**
 * Substitutes every placeholder occurrence in the diff output with its
 * pre-computed table diff.
 *
 * @param diffOutput - Diff output still containing placeholder markers.
 * @param placeholderToDiff - Placeholder marker → replacement HTML.
 * @returns The output with all known placeholders replaced.
 */
export function restoreTablePlaceholders(diffOutput: string, placeholderToDiff: Map<string, string>): string {
  // split/join performs a literal replace-all, so `$`-sequences in the
  // replacement HTML are never interpreted as substitution patterns.
  return Array.from(placeholderToDiff).reduce(
    (restored, [marker, tableHtml]) => restored.split(marker).join(tableHtml),
    diffOutput
  )
}
|
|
147
|
-
|
|
148
|
-
/** Returns `s` with the range `[start, end)` replaced by `replacement`. */
function spliceString(s: string, start: number, end: number, replacement: string): string {
  const head = s.slice(0, start)
  const tail = s.slice(end)
  return `${head}${replacement}${tail}`
}
|
|
151
|
-
|
|
152
|
-
/** True when the table has more rows than `MAX_TABLE_ROWS` or any row has more cells than `MAX_TABLE_CELLS_PER_ROW`. */
function exceedsSizeLimit(table: TableRange): boolean {
  return table.rows.length > MAX_TABLE_ROWS || table.rows.some(row => row.cells.length > MAX_TABLE_CELLS_PER_ROW)
}
|
|
159
|
-
|
|
160
|
-
/**
 * Chooses the diff strategy for one old/new table pair:
 *  - identical row count and per-row cell counts → positional cell diff;
 *  - identical row count only → rows paired by position (column add/delete),
 *    which avoids the row-key mismatch LCS would hit on differing cell counts;
 *  - otherwise → row-level structural alignment.
 */
function diffTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  let strategy = diffStructurallyAlignedTable
  if (sameDimensions(oldTable, newTable)) {
    strategy = diffPositionalTable
  } else if (oldTable.rows.length === newTable.rows.length) {
    strategy = diffSameRowCountTable
  }
  return strategy(oldHtml, newHtml, oldTable, newTable, diffCell)
}
|
|
178
|
-
|
|
179
|
-
/**
 * Diffs two tables with the same row count but differing cell counts by
 * pairing rows positionally. The new table's text is copied verbatim between
 * rows (wrappers, whitespace) and only each row's span is replaced with its
 * diffed form. Vertical merges/splits are detected first and consume several
 * rows at once.
 */
function diffSameRowCountTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = []
  let cursor = newTable.tableStart

  for (let rowIdx = 0; rowIdx < newTable.rows.length; ) {
    const currentRow = newTable.rows[rowIdx]
    // Whatever sits between the previous row and this one passes through.
    pieces.push(newHtml.slice(cursor, currentRow.rowStart))

    // Merge detection takes priority; split is only attempted when no merge starts here.
    const spanChange =
      detectVerticalMerge(oldHtml, newHtml, oldTable, newTable, rowIdx) ??
      detectVerticalSplit(oldHtml, newHtml, oldTable, newTable, rowIdx)

    if (spanChange) {
      pieces.push(spanChange.diff)
      const lastCoveredIdx = rowIdx + spanChange.span - 1
      cursor = newTable.rows[lastCoveredIdx].rowEnd
      rowIdx += spanChange.span
    } else {
      pieces.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[rowIdx], currentRow, diffCell))
      cursor = currentRow.rowEnd
      rowIdx++
    }
  }

  pieces.push(newHtml.slice(cursor, newTable.tableEnd))
  return pieces.join('')
}
|
|
218
|
-
|
|
219
|
-
/**
 * Recognises a vertical merge beginning at row `r`.
 *
 * Conditions, all required:
 *  - new row `r` holds exactly one cell whose rowspan K is greater than 1 and
 *    fits inside the new table;
 *  - new rows `r+1 .. r+K-1` contain no cells;
 *  - each old row `r .. r+K-1` exists, has summed colspans equal to the merged
 *    cell's colspan, and contains no cell with a rowspan other than 1.
 *
 * @returns The emitted HTML (merged cell tagged as a rowspan change, followed
 *   by the empty trailing rows verbatim) and the number of rows consumed, or
 *   `null` when the pattern does not match.
 */
function detectVerticalMerge(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const mergedRow = newTable.rows[r]
  if (mergedRow.cells.length !== 1) return null

  const [mergedCell] = mergedRow.cells
  const span = getRowspan(newHtml, mergedCell)
  if (span <= 1 || r + span > newTable.rows.length) return null

  const colspan = getColspan(newHtml, mergedCell)

  // Rows covered by the rowspan must be empty in the new table.
  const trailingNewRows = newTable.rows.slice(r + 1, r + span)
  if (trailingNewRows.some(row => row.cells.length !== 0)) return null

  // The old table must supply every absorbed row.
  const absorbedOldRows = oldTable.rows.slice(r, r + span)
  if (absorbedOldRows.length !== span) return null

  for (const oldRow of absorbedOldRows) {
    // Width mismatch means this is not a clean rectangular merge.
    if (sumColspans(oldHtml, oldRow.cells) !== colspan) return null
    if (oldRow.cells.some(oldCell => getRowspan(oldHtml, oldCell) !== 1)) return null
  }

  const pieces = [rowHeaderSlice(newHtml, mergedRow), emitSpanChangedCell(newHtml, mergedCell, 'rowspan'), '</tr>']
  for (const emptyRow of trailingNewRows) {
    pieces.push(emitEmptyRow(newHtml, emptyRow))
  }
  return { diff: pieces.join(''), span }
}
|
|
269
|
-
|
|
270
|
-
/**
 * Recognises a vertical split beginning at row `r` (the mirror image of a
 * vertical merge).
 *
 * Conditions, all required:
 *  - old row `r` holds exactly one cell whose rowspan K is greater than 1 and
 *    fits inside the old table;
 *  - old rows `r+1 .. r+K-1` contain no cells;
 *  - each new row `r .. r+K-1` exists, has summed colspans equal to the old
 *    cell's colspan, and contains no cell with a rowspan other than 1.
 *
 * @returns The emitted HTML (every cell of the K new rows tagged as a rowspan
 *   change) and the number of rows consumed, or `null` when the pattern does
 *   not match.
 */
function detectVerticalSplit(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  r: number
): { diff: string; span: number } | null {
  const spanningRow = oldTable.rows[r]
  if (spanningRow.cells.length !== 1) return null

  const [spanningCell] = spanningRow.cells
  const span = getRowspan(oldHtml, spanningCell)
  if (span <= 1 || r + span > oldTable.rows.length) return null

  const colspan = getColspan(oldHtml, spanningCell)

  // Rows covered by the old rowspan must have been empty.
  const trailingOldRows = oldTable.rows.slice(r + 1, r + span)
  if (trailingOldRows.some(row => row.cells.length !== 0)) return null

  // The new table must supply every row the old cell used to cover.
  const splitRows = newTable.rows.slice(r, r + span)
  if (splitRows.length !== span) return null

  for (const newRow of splitRows) {
    if (sumColspans(newHtml, newRow.cells) !== colspan) return null
    if (newRow.cells.some(newCell => getRowspan(newHtml, newCell) !== 1)) return null
  }

  const pieces: string[] = []
  for (const newRow of splitRows) {
    pieces.push(rowHeaderSlice(newHtml, newRow))
    for (const newCell of newRow.cells) {
      pieces.push(emitSpanChangedCell(newHtml, newCell, 'rowspan'))
    }
    pieces.push('</tr>')
  }
  return { diff: pieces.join(''), span }
}
|
|
317
|
-
|
|
318
|
-
/** Returns the row's source text (`rowStart` up to `rowEnd`) exactly as it appears in `html`. */
function emitEmptyRow(html: string, row: RowRange): string {
  const { rowStart, rowEnd } = row
  return html.slice(rowStart, rowEnd)
}
|
|
322
|
-
|
|
323
|
-
/** True when both tables have the same number of rows and each positional row pair has the same number of cells. */
function sameDimensions(a: TableRange, b: TableRange): boolean {
  return (
    a.rows.length === b.rows.length &&
    a.rows.every((row, index) => row.cells.length === b.rows[index].cells.length)
  )
}
|
|
330
|
-
|
|
331
|
-
/**
 * Same-dimension path. Copies the new table's text verbatim and replaces only
 * each cell's content range with the diff of the old and new cell at the same
 * row/column position, so wrappers and whitespace around cells are untouched.
 */
function diffPositionalTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const pieces: string[] = []
  let cursor = newTable.tableStart

  newTable.rows.forEach((newRow, rowIdx) => {
    const oldRow = oldTable.rows[rowIdx]
    newRow.cells.forEach((newCell, cellIdx) => {
      const oldCell = oldRow.cells[cellIdx]
      // Everything up to the cell content (tags, attributes, gaps) is kept as-is.
      pieces.push(newHtml.slice(cursor, newCell.contentStart))
      const before = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
      const after = newHtml.slice(newCell.contentStart, newCell.contentEnd)
      pieces.push(diffCell(before, after))
      cursor = newCell.contentEnd
    })
  })

  pieces.push(newHtml.slice(cursor, newTable.tableEnd))
  return pieces.join('')
}
|
|
364
|
-
|
|
365
|
-
/**
 * Mismatched-dimensions path. Aligns rows with an LCS over whitespace-normalised
 * row text, then lets `pairSimilarUnmatchedRows` refine that alignment. The
 * output follows the new table's text: rows present in new are replaced by
 * their diffed (or fully inserted) form, and rows present only in old are
 * injected as deleted rows at the current position.
 */
function diffStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  diffCell: DiffCellFn
): string {
  const oldKeys = oldTable.rows.map(row => rowKey(oldHtml, row))
  const newKeys = newTable.rows.map(row => rowKey(newHtml, row))
  const alignment = pairSimilarUnmatchedRows(lcsAlign(oldKeys, newKeys), oldTable, newTable, oldHtml, newHtml)

  // With no rows in new there is nothing to walk; rebuild from old's frame so
  // the deleted rows are still emitted.
  if (newTable.rows.length === 0) {
    return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
  }

  // Emit everything before the first new row up-front, so leading deleted
  // rows land inside the table rather than before it opens.
  const firstRowStart = newTable.rows[0].rowStart
  const pieces: string[] = [newHtml.slice(newTable.tableStart, firstRowStart)]
  let cursor = firstRowStart

  for (const { oldIdx, newIdx } of alignment) {
    if (newIdx === null) {
      // Row exists only in old: inject it between the rows already emitted
      // and the next row of new.
      if (oldIdx !== null) {
        pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del', diffCell))
      }
      continue
    }

    const newRow = newTable.rows[newIdx]
    pieces.push(newHtml.slice(cursor, newRow.rowStart))
    pieces.push(
      oldIdx !== null
        ? diffPreservedRow(oldHtml, newHtml, oldTable.rows[oldIdx], newRow, diffCell)
        : emitFullRow(newHtml, newRow, 'ins', diffCell)
    )
    cursor = newRow.rowEnd
  }

  pieces.push(newHtml.slice(cursor, newTable.tableEnd))
  return pieces.join('')
}
|
|
418
|
-
|
|
419
|
-
/**
 * From-scratch reconstruction used when the new table has no rows: emits a
 * table header slice, then one row per alignment entry (old rows as deleted,
 * otherwise new rows as inserted), then a literal `</table>`.
 */
function rebuildStructurallyAlignedTable(
  oldHtml: string,
  newHtml: string,
  oldTable: TableRange,
  newTable: TableRange,
  alignment: Alignment[],
  diffCell: DiffCellFn
): string {
  const pieces = [headerSlice(newHtml, newTable, oldHtml, oldTable)]

  for (const { oldIdx, newIdx } of alignment) {
    if (oldIdx !== null) {
      pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], 'del', diffCell))
    } else if (newIdx !== null) {
      pieces.push(emitFullRow(newHtml, newTable.rows[newIdx], 'ins', diffCell))
    }
  }

  pieces.push('</table>')
  return pieces.join('')
}
|
|
441
|
-
|
|
442
|
-
/**
 * Returns the text from the start of the table up to its first row. The new
 * table is preferred so attribute changes on `<table>` follow new; the old
 * table is used only when new yields an empty slice. For a table without
 * rows, the boundary is `tableEnd` minus the length of `</table>`.
 */
function headerSlice(newHtml: string, newTable: TableRange, oldHtml: string, oldTable: TableRange): string {
  const closingTagLength = '</table>'.length
  const headerEnd = (table: TableRange): number => table.rows[0]?.rowStart ?? table.tableEnd - closingTagLength

  const newBoundary = headerEnd(newTable)
  if (newBoundary > newTable.tableStart) {
    return newHtml.slice(newTable.tableStart, newBoundary)
  }
  return oldHtml.slice(oldTable.tableStart, headerEnd(oldTable))
}
|
|
452
|
-
|
|
453
|
-
/**
 * Builds the comparison key for a row: its full source text (cell tags
 * included, so rows with different cell counts do not match) with each
 * whitespace run collapsed to a single space and the ends trimmed.
 */
function rowKey(html: string, row: RowRange): string {
  const rowText = html.slice(row.rowStart, row.rowEnd)
  const collapsed = rowText.replace(/\s+/g, ' ')
  return collapsed.trim()
}
|
|
459
|
-
|
|
460
|
-
/**
 * Diffs a row that exists on both sides, trying strategies in order:
 *  1. equal cell counts → positional cell diff;
 *  2. differing counts explainable as a horizontal merge/split → colspan-aware diff;
 *  3. column add/delete within the search budget → pick the inserted/deleted
 *     column positions by similarity and pair the remaining cells by position,
 *     so a cell edited alongside a column change stays in its column;
 *  4. anything else → generic structural row alignment.
 *
 * The budget (`MAX_COLUMN_DELTA`, `MAX_COLUMN_SEARCH_WIDTH`) bounds step 3,
 * which enumerates combinations of column positions.
 */
function diffPreservedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const oldCount = oldRow.cells.length
  const newCount = newRow.cells.length
  if (oldCount === newCount) {
    return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  const viaColspan = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  if (viaColspan !== null) return viaColspan

  // Counts differ here, so `addedColumns` is never zero.
  const addedColumns = newCount - oldCount
  const withinSearchBudget =
    Math.abs(addedColumns) <= MAX_COLUMN_DELTA && Math.max(oldCount, newCount) <= MAX_COLUMN_SEARCH_WIDTH
  if (!withinSearchBudget) {
    return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
  }

  return addedColumns > 0
    ? diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, addedColumns, diffCell)
    : diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -addedColumns, diffCell)
}
|
|
501
|
-
|
|
502
|
-
const MAX_COLUMN_DELTA = 6
|
|
503
|
-
const MAX_COLUMN_SEARCH_WIDTH = 40
|
|
504
|
-
|
|
505
|
-
/**
 * Diffs a row where new has `k` more cells than old. The `k` inserted column
 * positions are chosen by `findBestColumnInsertPositions`; cells at those
 * positions are emitted as inserted, and every other new cell is diffed
 * against the next old cell in order.
 */
function diffMultiColumnAddRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const inserted = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml))
  const rowOpening = rowHeaderSlice(newHtml, newRow)

  let oldCursor = 0
  const cellPieces = newRow.cells.map((newCell, newIdx) =>
    inserted.has(newIdx)
      ? emitFullCell(newHtml, newCell, 'ins', diffCell)
      : emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldCursor++], newCell, diffCell)
  )

  return [rowOpening, ...cellPieces, '</tr>'].join('')
}
|
|
536
|
-
|
|
537
|
-
/**
 * Diffs a row where old has `k` more cells than new. The `k` deleted column
 * positions are chosen by `findBestColumnDeletePositions`; old cells at those
 * positions are emitted as deleted, and every other old cell is diffed
 * against the next new cell in order.
 */
function diffMultiColumnDeleteRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const removed = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml))
  const rowOpening = rowHeaderSlice(newHtml, newRow)

  let newCursor = 0
  const cellPieces = oldRow.cells.map((oldCell, oldIdx) =>
    removed.has(oldIdx)
      ? emitFullCell(oldHtml, oldCell, 'del', diffCell)
      : emitDiffedCell(oldHtml, newHtml, oldCell, newRow.cells[newCursor++], diffCell)
  )

  return [rowOpening, ...cellPieces, '</tr>'].join('')
}
|
|
560
|
-
|
|
561
|
-
/**
 * Chooses which `k` positions of the new row to treat as inserted columns.
 * Every ascending combination of `k` new-cell indices is scored by summing
 * `cellSimilarity` over the remaining new cells paired, in order, with the old
 * cells. The highest score wins; on ties the earliest combination is kept.
 *
 * @returns The winning indices, or an empty array if no combination was generated.
 */
function findBestColumnInsertPositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  let winner: number[] = []
  let winnerScore = -1

  for (const candidate of combinationsOfRange(newRow.cells.length, k)) {
    const skipped = new Set(candidate)
    let oldCursor = 0
    let total = 0
    newRow.cells.forEach((newCell, newIdx) => {
      if (skipped.has(newIdx)) return
      total += cellSimilarity(oldRow.cells[oldCursor], newCell, oldHtml, newHtml)
      oldCursor++
    })
    if (total > winnerScore) {
      winnerScore = total
      winner = candidate
    }
  }

  return winner
}
|
|
586
|
-
|
|
587
|
-
/**
 * Chooses which `k` positions of the old row to treat as deleted columns.
 * Every ascending combination of `k` old-cell indices is scored by summing
 * `cellSimilarity` over the remaining old cells paired, in order, with the new
 * cells. The highest score wins; on ties the earliest combination is kept.
 *
 * @returns The winning indices, or an empty array if no combination was generated.
 */
function findBestColumnDeletePositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  let winner: number[] = []
  let winnerScore = -1

  for (const candidate of combinationsOfRange(oldRow.cells.length, k)) {
    const skipped = new Set(candidate)
    let newCursor = 0
    let total = 0
    oldRow.cells.forEach((oldCell, oldIdx) => {
      if (skipped.has(oldIdx)) return
      total += cellSimilarity(oldCell, newRow.cells[newCursor], oldHtml, newHtml)
      newCursor++
    })
    if (total > winnerScore) {
      winnerScore = total
      winner = candidate
    }
  }

  return winner
}
|
|
612
|
-
|
|
613
|
-
/**
 * Lazily generates every combination of `k` distinct integers drawn from
 * `[0, n)`, each sorted ascending, in lexicographic order. Yields nothing when
 * `k` is 0 or exceeds `n`. Each yielded array is a fresh copy, so callers may
 * keep it.
 */
function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
  if (k === 0 || k > n) return

  // Start from the smallest combination: 0, 1, …, k-1.
  const picks = Array.from({ length: k }, (_, position) => position)

  for (;;) {
    yield [...picks]

    // Find the rightmost slot that has not reached its maximum (n - k + slot).
    let slot = k - 1
    while (slot >= 0 && picks[slot] === n - k + slot) {
      slot -= 1
    }
    if (slot < 0) return

    // Advance that slot and reset everything after it to the tightest run.
    picks[slot] += 1
    for (let next = slot + 1; next < k; next += 1) {
      picks[next] = picks[next - 1] + 1
    }
  }
}
|
|
630
|
-
|
|
631
|
-
/**
 * Attempts to explain a cell-count difference as horizontal merges/splits by
 * walking both rows in lock-step over colspans:
 *  - equal colspans → the two cells are diffed as a pair;
 *  - wider new cell → it must exactly cover a run of old cells (merge) and is
 *    emitted as a colspan change;
 *  - wider old cell → a run of new cells must exactly cover it (split) and
 *    each of them is emitted as a colspan change.
 *
 * @returns The diffed row, or `null` when the summed colspans differ, a run
 *   does not land exactly on the wider cell's span, or either side has cells
 *   left over; the caller then falls back to another strategy.
 */
function diffColspanChangedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string | null {
  if (sumColspans(oldHtml, oldRow.cells) !== sumColspans(newHtml, newRow.cells)) return null

  // Consumes cells from `from` until their colspans reach `target`. Returns the
  // index just past the run, or null when the run does not hit it exactly.
  const takeRun = (html: string, cells: CellRange[], from: number, target: number): number | null => {
    let covered = 0
    let idx = from
    while (idx < cells.length && covered < target) {
      covered += getColspan(html, cells[idx])
      idx++
    }
    return covered === target ? idx : null
  }

  const oldCells = oldRow.cells
  const newCells = newRow.cells
  const pieces: string[] = [rowHeaderSlice(newHtml, newRow)]
  let oldPos = 0
  let newPos = 0

  while (oldPos < oldCells.length && newPos < newCells.length) {
    const oldCell = oldCells[oldPos]
    const newCell = newCells[newPos]
    const oldSpan = getColspan(oldHtml, oldCell)
    const newSpan = getColspan(newHtml, newCell)

    if (oldSpan === newSpan) {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
      oldPos++
      newPos++
      continue
    }

    if (newSpan > oldSpan) {
      // Horizontal merge: one new cell swallows several old cells.
      const oldRunEnd = takeRun(oldHtml, oldCells, oldPos, newSpan)
      if (oldRunEnd === null) return null
      pieces.push(emitSpanChangedCell(newHtml, newCell, 'colspan'))
      oldPos = oldRunEnd
      newPos++
      continue
    }

    // Horizontal split: one old cell became several new cells.
    const newRunEnd = takeRun(newHtml, newCells, newPos, oldSpan)
    if (newRunEnd === null) return null
    for (const splitCell of newCells.slice(newPos, newRunEnd)) {
      pieces.push(emitSpanChangedCell(newHtml, splitCell, 'colspan'))
    }
    oldPos++
    newPos = newRunEnd
  }

  // Leftover cells on either side mean the rows did not line up cleanly.
  if (oldPos !== oldCells.length || newPos !== newCells.length) return null

  pieces.push('</tr>')
  return pieces.join('')
}
|
|
699
|
-
|
|
700
|
-
/** Adds up the `colspan` of every cell, i.e. the logical column width of the row. */
function sumColspans(html: string, cells: CellRange[]): number {
  return cells.reduce((width, cell) => width + getColspan(html, cell), 0)
}
|
|
705
|
-
|
|
706
|
-
/** Reads `colspan` from the cell's opening tag (the text between `cellStart` and `contentStart`). */
function getColspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'colspan')
}
|
|
709
|
-
|
|
710
|
-
/** Reads `rowspan` from the cell's opening tag (the text between `cellStart` and `contentStart`). */
function getRowspan(html: string, cell: CellRange): number {
  const openingTag = html.slice(cell.cellStart, cell.contentStart)
  return parseSpanAttribute(openingTag, 'rowspan')
}
|
|
713
|
-
|
|
714
|
-
/**
 * Extracts a `colspan`/`rowspan` value from an opening tag.
 *
 * The tag is scanned attribute by attribute, so quoted values are consumed
 * as a unit. A flat regex would wrongly match `data-colspan="5"` (a word
 * boundary sits after the `-`) or text inside another attribute's value such
 * as `title="colspan=7"`.
 *
 * @param openingTag The raw opening tag, e.g. `<td colspan="2">`.
 * @param name Which span attribute to read (matched case-insensitively).
 * @returns The parsed span, or 1 when the attribute is absent, has no value,
 * is not a number, or is not a positive finite integer. When the attribute
 * is repeated, the first occurrence wins.
 */
function parseSpanAttribute(openingTag: string, name: 'colspan' | 'rowspan'): number {
  // One attribute per match: name, then an optional double-quoted,
  // single-quoted or unquoted value. Created per call because the `g` flag
  // makes the regex stateful.
  const attribute = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g

  // Start scanning after `<tagname` so the tag name is never read as an attribute.
  const tagName = /^<[^\s/>]*/.exec(openingTag)
  attribute.lastIndex = tagName ? tagName[0].length : 0

  let match = attribute.exec(openingTag)
  while (match !== null) {
    if (match[1].toLowerCase() === name) {
      const raw = match[2] ?? match[3] ?? match[4]
      if (raw === undefined) return 1
      const value = Number.parseInt(raw, 10)
      return Number.isFinite(value) && value > 0 ? value : 1
    }
    match = attribute.exec(openingTag)
  }
  return 1
}
|
|
721
|
-
|
|
722
|
-
/**
 * Emits a cell produced by a structural merge or split, tagging its opening
 * tag with `mod colspan` or `mod rowspan` via `injectClass`.
 *
 * The cell content is copied through untouched: Word does not track these
 * changes, and wrapping unchanged content in del/ins would be misleading.
 * If the opening tag cannot be parsed, the whole cell is returned verbatim.
 */
function emitSpanChangedCell(html: string, cell: CellRange, kind: 'colspan' | 'rowspan'): string {
  const opening = parseOpeningTagAt(html, cell.cellStart)
  if (!opening) {
    return html.slice(cell.cellStart, cell.cellEnd)
  }
  const rawOpenTag = html.slice(cell.cellStart, opening.end)
  const contentAndCloser = html.slice(cell.contentStart, cell.cellEnd)
  return injectClass(rawOpenTag, `mod ${kind}`) + contentAndCloser
}
|
|
735
|
-
|
|
736
|
-
/**
 * Diffs two rows cell by cell at the same index, keeping the new row's
 * markup (its `<tr>` tag, cell tags and inter-cell text) and replacing each
 * cell's content with `diffCell(oldContent, newContent)`.
 *
 * @param oldHtml Source string that `oldRow` offsets refer to.
 * @param newHtml Source string that `newRow` offsets refer to.
 * @param oldRow Row from the old document; expected to have a cell for every
 * cell in `newRow`.
 * @param newRow Row from the new document; drives the output structure.
 * @param diffCell Content-level diff callback.
 * @returns The rebuilt row, from `<tr ...>` through `</tr>`.
 */
function diffPositionalRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const out: string[] = []
  // Use new's <tr> opening tag (preserves attributes from new).
  const trHeader = rowHeaderSlice(newHtml, newRow)
  out.push(trHeader)

  // Resume copying right after whatever the header covered. For a row with
  // no cells the header is just the `<tr ...>` opening, so the cursor must sit
  // immediately behind it. Starting at `rowEnd` (the previous behavior) made
  // the final slice empty and dropped the closing `</tr>`.
  const firstCell = newRow.cells[0]
  let cursor = firstCell ? firstCell.cellStart : newRow.rowStart + trHeader.length

  for (let c = 0; c < newRow.cells.length; c++) {
    const oldCell = oldRow.cells[c]
    const newCell = newRow.cells[c]
    // Everything between the previous cell's content and this cell's content:
    // the previous closing tag, inter-cell text and this cell's opening tag.
    out.push(newHtml.slice(cursor, newCell.contentStart))
    // Callers are expected to pass rows of equal length; if the old row is
    // shorter, diff against empty content instead of throwing.
    const oldContent = oldCell ? oldHtml.slice(oldCell.contentStart, oldCell.contentEnd) : ''
    out.push(diffCell(oldContent, newHtml.slice(newCell.contentStart, newCell.contentEnd)))
    cursor = newCell.contentEnd
  }

  // Last cell's closing tag (if any) plus the row's own closing tag.
  out.push(newHtml.slice(cursor, newRow.rowEnd))
  return out.join('')
}
|
|
764
|
-
|
|
765
|
-
/**
 * Diffs two rows whose cell counts may differ. Cells are aligned first by an
 * exact LCS over their whitespace-normalized content (`cellKey`), then
 * unmatched neighbours are fuzzy-paired so that an edited cell next to an
 * added column shows up as a content diff rather than a delete + insert.
 *
 * Paired cells are diffed, new-only cells are emitted as inserted and
 * old-only cells as deleted. The row uses the new side's `<tr>` header and
 * an explicit `</tr>`.
 */
function diffStructurallyAlignedRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  diffCell: DiffCellFn
): string {
  const keysOf = (html: string, row: RowRange): string[] => row.cells.map(cell => cellKey(html, cell))

  const exactMatches = lcsAlign(keysOf(oldHtml, oldRow), keysOf(newHtml, newRow))
  const pairing = pairSimilarUnmatchedCells(exactMatches, oldRow, newRow, oldHtml, newHtml)

  const header = rowHeaderSlice(newHtml, newRow)
  const cellsHtml = pairing.map(({ oldIdx, newIdx }) => {
    if (newIdx === null) {
      // Old-only entry (or, defensively, an entry with neither side set).
      return oldIdx === null ? '' : emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell)
    }
    if (oldIdx === null) {
      return emitFullCell(newHtml, newRow.cells[newIdx], 'ins', diffCell)
    }
    return emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell)
  })

  return `${header}${cellsHtml.join('')}</tr>`
}
|
|
800
|
-
|
|
801
|
-
/**
 * Builds the LCS matching key for a cell: its content with whitespace runs
 * collapsed to single spaces and the ends trimmed. Tag attributes are left
 * out on purpose — a column-add usually changes content rather than
 * attributes, and keying on attributes would mis-pair cells that share
 * content but differ in styling.
 */
function cellKey(html: string, cell: CellRange): string {
  const rawContent = html.slice(cell.contentStart, cell.contentEnd)
  const collapsed = rawContent.replace(/\s+/g, ' ')
  return collapsed.trim()
}
|
|
808
|
-
|
|
809
|
-
/**
 * Emits a row in which every cell is inserted (`kind='ins'`) or deleted
 * (`kind='del'`). The `<tr>` receives `diffins`/`diffdel`, each cell is
 * rendered through `emitFullCell`, and all markup between cells is copied
 * verbatim. If the `<tr>` opening tag cannot be parsed, the row is returned
 * unchanged.
 */
function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell: DiffCellFn): string {
  const trOpening = parseOpeningTagAt(html, row.rowStart)
  if (!trOpening) return html.slice(row.rowStart, row.rowEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  let result = injectClass(html.slice(row.rowStart, trOpening.end), marker)

  let resumeAt = trOpening.end
  row.cells.forEach(cell => {
    // Inter-cell markup first, then the re-rendered cell itself.
    result += html.slice(resumeAt, cell.cellStart)
    result += emitFullCell(html, cell, kind, diffCell)
    resumeAt = cell.cellEnd
  })

  return result + html.slice(resumeAt, row.rowEnd)
}
|
|
831
|
-
|
|
832
|
-
/**
 * Emits a fully inserted or fully deleted cell. The opening tag gets
 * `diffins`/`diffdel`; text runs inside the cell are wrapped in
 * `<ins>`/`<del>` while formatting tags pass through, so
 * `<strong>B</strong>` becomes `<strong><ins>B</ins></strong>` rather than
 * the doubled wrapper a full recursive diff would give. Whitespace-only
 * cells get the class but no inner wrapper.
 *
 * `_diffCell` is accepted for signature compatibility and is not used.
 */
function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del', _diffCell: DiffCellFn): string {
  const tdOpening = parseOpeningTagAt(html, cell.cellStart)
  if (!tdOpening) return html.slice(cell.cellStart, cell.cellEnd)

  const marker = kind === 'ins' ? 'diffins' : 'diffdel'
  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const isBlank = inner.trim().length === 0
  const body = isBlank ? inner : wrapInlineTextRuns(inner, kind)

  const openTag = injectClass(html.slice(cell.cellStart, tdOpening.end), marker)
  const closeTag = html.slice(cell.contentEnd, cell.cellEnd)
  return `${openTag}${body}${closeTag}`
}
|
|
851
|
-
|
|
852
|
-
/**
 * Wraps each text run that contains non-whitespace in an `<ins>`/`<del>`
 * element carrying `diffins`/`diffdel`, and copies tags through as they are.
 * The result for fully inserted formatted content looks like
 * `<strong><ins>X</ins></strong>` — the same shape htmldiff produces for
 * insertions inside existing formatting. If something starting with `<`
 * cannot be parsed, the remainder is passed through verbatim.
 */
function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
  const tag = kind === 'ins' ? 'ins' : 'del'
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'

  let result = ''
  let pos = 0
  while (pos < content.length) {
    if (content[pos] === '<') {
      const parsed = parseOpeningTagAt(content, pos)
      if (!parsed) {
        // Malformed markup: give up wrapping and keep the rest untouched.
        return result + content.slice(pos)
      }
      result += content.slice(pos, parsed.end)
      pos = parsed.end
    } else {
      // A text run extends to the next `<` or to the end of the content.
      const nextTag = content.indexOf('<', pos)
      const runEnd = nextTag === -1 ? content.length : nextTag
      const run = content.slice(pos, runEnd)
      result += run.trim().length > 0 ? `<${tag} class='${cls}'>${run}</${tag}>` : run
      pos = runEnd
    }
  }
  return result
}
|
|
889
|
-
|
|
890
|
-
/**
 * Emits a cell present on both sides: the new cell's opening and closing
 * tags around `diffCell(oldContent, newContent)`. If the new cell's opening
 * tag cannot be parsed, the new cell is returned verbatim.
 */
function emitDiffedCell(
  oldHtml: string,
  newHtml: string,
  oldCell: CellRange,
  newCell: CellRange,
  diffCell: DiffCellFn
): string {
  const tdOpening = parseOpeningTagAt(newHtml, newCell.cellStart)
  if (!tdOpening) {
    return newHtml.slice(newCell.cellStart, newCell.cellEnd)
  }

  const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd)
  const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd)

  return [
    newHtml.slice(newCell.cellStart, tdOpening.end),
    diffCell(oldContent, newContent),
    newHtml.slice(newCell.contentEnd, newCell.cellEnd),
  ].join('')
}
|
|
907
|
-
|
|
908
|
-
/**
 * Returns the leading part of a row: from `<tr` up to (not including) the
 * first cell's opening tag, which keeps the `<tr ...>` attributes and any
 * text before the first cell. For a row without cells only the `<tr ...>`
 * opening tag is returned, so a caller that appends `</tr>` itself does not
 * end up with two closers. Returns `''` if the opening tag cannot be parsed.
 */
function rowHeaderSlice(html: string, row: RowRange): string {
  const opening = parseOpeningTagAt(html, row.rowStart)
  if (!opening) return ''
  const headerEnd = row.cells.length === 0 ? opening.end : row.cells[0].cellStart
  return html.slice(row.rowStart, headerEnd)
}
|
|
919
|
-
|
|
920
|
-
/**
 * One entry of an old/new alignment. Both indices set means the items are
 * paired; a `null` on one side means the item exists only on the other side.
 */
interface Alignment {
  /** Index into the old sequence, or `null` for an item that only exists in new. */
  oldIdx: number | null
  /** Index into the new sequence, or `null` for an item that only exists in old. */
  newIdx: number | null
}
|
|
924
|
-
|
|
925
|
-
/**
 * Similarity score a pair of rows must exceed to be treated as "the same
 * row, edited" instead of a deleted row plus an inserted row.
 */
const ROW_FUZZY_THRESHOLD = 0.5

/**
 * Similarity score a pair of cells must exceed to be treated as a content
 * edit of one cell. Deliberately the same value as ROW_FUZZY_THRESHOLD:
 * cells in legal documents that share most of their content usually are the
 * same logical cell with a body edit, so 0.5 works at both granularities in
 * practice.
 */
const CELL_FUZZY_THRESHOLD = 0.5
|
|
935
|
-
|
|
936
|
-
/**
 * Row-level fuzzy pairing. After the exact LCS, unmatched old and new rows
 * that sit in the same run and whose content similarity exceeds
 * ROW_FUZZY_THRESHOLD are re-paired as an edit. This stops a small change
 * (a typo fix, one word replaced) from being rendered as a whole row
 * disappearing and another appearing, which is what users expect from a
 * track-changes view.
 */
function pairSimilarUnmatchedRows(
  alignment: Alignment[],
  oldTable: TableRange,
  newTable: TableRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  const scoreRows = (oldIdx: number, newIdx: number): number => {
    return rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
  }
  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRows)
}
|
|
955
|
-
|
|
956
|
-
/**
 * Cell-level fuzzy pairing: same procedure as the row-level variant, scoring
 * candidate pairs with `cellSimilarity` against CELL_FUZZY_THRESHOLD.
 */
function pairSimilarUnmatchedCells(
  alignment: Alignment[],
  oldRow: RowRange,
  newRow: RowRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  const scoreCells = (oldIdx: number, newIdx: number): number => {
    return cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
  }
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCells)
}
|
|
967
|
-
|
|
968
|
-
/**
 * Re-pairs unmatched entries of an alignment when they are similar enough.
 *
 * The alignment is split into runs of one-sided entries (delimited by fully
 * matched entries). Inside each run, every deleted entry — in order — claims
 * the not-yet-claimed inserted entry with the highest similarity strictly
 * above `threshold`. The output is then rebuilt by walking the original
 * alignment: a claimed deletion is dropped at its own position and the
 * combined pair is emitted at the *insertion's* position. That keeps the
 * result monotonic in `newIdx`, which matters because downstream emission
 * walks the new HTML with a cursor; emitting at the deletion's position
 * could break that ordering when other unpaired entries lie in between.
 *
 * Works for rows and cells alike — the caller supplies the scoring function.
 *
 * @param alignment Output of the exact alignment step.
 * @param threshold Score a pair must exceed (strictly) to be joined.
 * @param similarity Scores an old index against a new index.
 */
function pairSimilarUnmatched(
  alignment: Alignment[],
  threshold: number,
  similarity: (oldIdx: number, newIdx: number) => number
): Alignment[] {
  const isMatched = (entry: Alignment): boolean => entry.oldIdx !== null && entry.newIdx !== null
  const isOneSided = (entry: Alignment): boolean => (entry.oldIdx === null) !== (entry.newIdx === null)

  const delForIns = new Map<number, number>() // ins alignment position → del alignment position
  const claimedDels = new Set<number>()

  let pos = 0
  while (pos < alignment.length) {
    if (isMatched(alignment[pos])) {
      pos++
      continue
    }

    // Collect one run of consecutive one-sided entries.
    const delPositions: number[] = []
    const insPositions: number[] = []
    while (pos < alignment.length && isOneSided(alignment[pos])) {
      if (alignment[pos].oldIdx !== null) delPositions.push(pos)
      else insPositions.push(pos)
      pos++
    }

    // Greedy: each deletion takes its best still-free insertion.
    for (const delAt of delPositions) {
      let bestAt = -1
      let bestScore = threshold
      for (const insAt of insPositions) {
        if (delForIns.has(insAt)) continue
        const score = similarity(alignment[delAt].oldIdx as number, alignment[insAt].newIdx as number)
        if (score > bestScore) {
          bestScore = score
          bestAt = insAt
        }
      }
      if (bestAt >= 0) {
        delForIns.set(bestAt, delAt)
        claimedDels.add(delAt)
      }
    }
  }

  const merged: Alignment[] = []
  alignment.forEach((entry, at) => {
    if (claimedDels.has(at)) return // surfaces later, at its partner's position
    const delAt = delForIns.get(at)
    merged.push(delAt === undefined ? entry : { oldIdx: alignment[delAt].oldIdx, newIdx: entry.newIdx })
  })
  return merged
}
|
|
1040
|
-
|
|
1041
|
-
/**
 * Similarity of two rows, computed by `textSimilarity` over their flattened
 * text (`rowText`).
 *
 * The underlying score, shared by row- and cell-level fuzzy pairing, is the
 * MAX of two complementary metrics:
 *
 *   1. **Character prefix+suffix similarity** — the share of the longer
 *      string covered by the common prefix plus the common suffix. Good at
 *      spotting a small edit in the middle (one word changed in a row);
 *      blind when the shared bulk is in the middle and the ends differ.
 *
 *   2. **Token Jaccard similarity** — intersection-over-union of the
 *      whitespace-separated tokens. Covers the "mostly the same, but
 *      bookended by different bits" case, e.g. a row whose only change is a
 *      column added at the start and another at the end, where the ~50
 *      matching characters in the middle are invisible to prefix+suffix.
 *
 * A pair is joined when either metric exceeds the threshold; neither one
 * alone handles the full range of legal-document edits seen in production
 * tables.
 */
function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
  const oldText = rowText(oldHtml, oldRow)
  const newText = rowText(newHtml, newRow)
  return textSimilarity(oldText, newText)
}
|
|
1065
|
-
|
|
1066
|
-
/** Similarity of two cells: `textSimilarity` over their flattened text (`cellText`). */
function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
  const oldText = cellText(oldHtml, oldCell)
  const newText = cellText(newHtml, newCell)
  return textSimilarity(oldText, newText)
}
|
|
1069
|
-
|
|
1070
|
-
/**
 * Scores how alike two strings are: 1 for identical strings, 0 when exactly
 * one of them is empty, otherwise the larger of the character
 * prefix+suffix score and the token Jaccard score.
 */
function textSimilarity(a: string, b: string): number {
  if (a === b) return 1
  if (a.length === 0 || b.length === 0) return 0

  const bySharedEnds = charPrefixSuffixSimilarity(a, b)
  const bySharedTokens = tokenJaccardSimilarity(a, b)
  return Math.max(bySharedEnds, bySharedTokens)
}
|
|
1075
|
-
|
|
1076
|
-
/**
 * Fraction of the longer string covered by the common prefix plus the
 * common suffix of `a` and `b`. The suffix is only counted over characters
 * the prefix has not already used, so the two never overlap. Two empty
 * strings yield `NaN` (0 / 0).
 */
function charPrefixSuffixSimilarity(a: string, b: string): number {
  const shorter = Math.min(a.length, b.length)
  const longer = Math.max(a.length, b.length)

  let head = 0
  for (; head < shorter; head++) {
    if (a[head] !== b[head]) break
  }

  // At most `shorter - head` characters remain available for the suffix.
  const tailLimit = shorter - head
  let tail = 0
  while (tail < tailLimit && a[a.length - 1 - tail] === b[b.length - 1 - tail]) {
    tail++
  }

  return (head + tail) / longer
}
|
|
1092
|
-
|
|
1093
|
-
/**
 * Jaccard index (intersection over union) of the distinct
 * whitespace-separated tokens of `a` and `b`. Two strings without any
 * tokens score 1.
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokenize = (text: string): Set<string> => new Set(text.split(/\s+/).filter(token => token.length > 0))

  const left = tokenize(a)
  const right = tokenize(b)
  if (left.size === 0 && right.size === 0) return 1

  const shared = [...left].filter(token => right.has(token)).length
  const union = left.size + right.size - shared
  return union === 0 ? 0 : shared / union
}
|
|
1104
|
-
|
|
1105
|
-
/**
 * Flattens a row to comparable text: each cell's content with tags replaced
 * by spaces, cells joined by a space, whitespace collapsed, trimmed and
 * lower-cased.
 */
function rowText(html: string, row: RowRange): string {
  return row.cells
    .map(cell => html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, ' '))
    .join(' ')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase()
}
|
|
1112
|
-
|
|
1113
|
-
/**
 * Flattens a cell to comparable text: tags replaced by spaces, whitespace
 * collapsed, trimmed and lower-cased.
 */
function cellText(html: string, cell: CellRange): string {
  const content = html.slice(cell.contentStart, cell.contentEnd)
  const withoutTags = content.replace(/<[^>]+>/g, ' ')
  const collapsed = withoutTags.replace(/\s+/g, ' ')
  return collapsed.trim().toLowerCase()
}
|
|
1121
|
-
|
|
1122
|
-
/**
 * Longest-common-subsequence alignment of two key sequences. The result
 * lists, in order, matched positions (`oldIdx` and `newIdx` both set) and
 * unmatched ones (the missing side is `null`). Keys are compared with
 * strict `===`.
 */
function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const oldLen = oldKeys.length
  const newLen = newKeys.length

  // lengths[r][c] = LCS length of the first r old keys and first c new keys.
  const lengths: number[][] = []
  for (let r = 0; r <= oldLen; r++) {
    lengths.push(new Array<number>(newLen + 1).fill(0))
  }
  for (let r = 1; r <= oldLen; r++) {
    for (let c = 1; c <= newLen; c++) {
      lengths[r][c] =
        oldKeys[r - 1] === newKeys[c - 1]
          ? lengths[r - 1][c - 1] + 1
          : Math.max(lengths[r - 1][c], lengths[r][c - 1])
    }
  }

  // Trace back from the bottom-right corner, collecting entries last-to-first.
  const reversed: Alignment[] = []
  let r = oldLen
  let c = newLen
  while (r > 0 || c > 0) {
    if (r > 0 && c > 0 && oldKeys[r - 1] === newKeys[c - 1]) {
      reversed.push({ oldIdx: r - 1, newIdx: c - 1 })
      r--
      c--
    } else if (c > 0 && (r === 0 || lengths[r][c - 1] >= lengths[r - 1][c])) {
      reversed.push({ oldIdx: null, newIdx: c - 1 })
      c--
    } else {
      reversed.push({ oldIdx: r - 1, newIdx: null })
      r--
    }
  }
  return reversed.reverse()
}
|
|
1160
|
-
|
|
1161
|
-
/**
 * Returns the opening tag with the given class token(s) added.
 *
 * The real `class` attribute is located with `findClassAttribute`, which
 * walks the attributes instead of using a flat regex (a regex would also
 * match inside a foreign value such as `title="see class='x'"`). Existing
 * tokens are kept and only the tokens of `cls` that are not yet present are
 * appended, so `class="mod"` plus `mod colspan` becomes `class="mod colspan"`
 * rather than `class="mod mod colspan"`.
 *
 * When the tag has no `class` attribute, `class='<cls>'` is inserted just
 * before the closing `>` (or `/>`).
 *
 * @param openingTag A complete opening tag, e.g. `<td style="x">`.
 * @param cls One or more whitespace-separated class tokens.
 * @returns The updated tag, or `openingTag` itself when nothing has to change.
 */
function injectClass(openingTag: string, cls: string): string {
  const clsTokens = cls.split(/\s+/).filter(Boolean)
  if (clsTokens.length === 0) return openingTag

  const classAttr = findClassAttribute(openingTag)
  if (classAttr) {
    const existingTokens = classAttr.value.split(/\s+/).filter(Boolean)
    const missing = clsTokens.filter(t => !existingTokens.includes(t))
    if (missing.length === 0) return openingTag

    const updatedValue = [...existingTokens, ...missing].join(' ')
    const before = openingTag.slice(0, classAttr.valueStart)
    const after = openingTag.slice(classAttr.valueEnd)

    // The value range excludes the quotes, so a quoted value has its opening
    // quote immediately before `valueStart`.
    const opener = openingTag[classAttr.valueStart - 1]
    if (opener === '"' || opener === "'") return before + updatedValue + after

    // Unquoted value (`class=foo`): appending ` diffins` verbatim would turn
    // the new token into a separate bare attribute, so quote the whole value.
    const quote = updatedValue.includes("'") ? '"' : "'"
    return `${before}${quote}${updatedValue}${quote}${after}`
  }

  const isSelfClosing = openingTag.endsWith('/>')
  const insertAt = isSelfClosing ? openingTag.length - 2 : openingTag.length - 1
  return `${openingTag.slice(0, insertAt).replace(/\s*$/, '')} class='${cls}'${openingTag.slice(insertAt)}`
}
|
|
1192
|
-
|
|
1193
|
-
/**
 * Walks the attributes of an opening tag (respecting quoted values) to find
 * its actual `class` attribute.
 *
 * A `/` only ends the scan when it belongs to a trailing `/>` (or is the
 * last character). A stray slash between attributes is skipped and a slash
 * inside an unquoted value (`data-x=a/b`) is part of that value. Treating
 * every `/` as the end of the tag made the scan miss a `class` attribute
 * that came later in the tag.
 *
 * @param openingTag A complete opening tag starting with `<`.
 * @returns The start/end offsets of the class value (quotes excluded) and
 * the value itself, or `null` when the tag has no `class` attribute with a
 * value. The attribute name is matched case-insensitively.
 */
function findClassAttribute(openingTag: string): { valueStart: number; valueEnd: number; value: string } | null {
  const len = openingTag.length
  const whitespace = /\s/
  const tagNameChar = /[A-Za-z0-9_:-]/
  const nameTerminator = /[\s=>/]/
  const isTagEndSlash = (idx: number): boolean =>
    openingTag[idx] === '/' && (idx + 1 >= len || openingTag[idx + 1] === '>')

  // Skip `<` and the tag name; everything after is attribute territory.
  let i = 1
  while (i < len && tagNameChar.test(openingTag[i])) i++

  while (i < len) {
    while (i < len && whitespace.test(openingTag[i])) i++
    if (i >= len || openingTag[i] === '>' || isTagEndSlash(i)) break
    if (openingTag[i] === '/') {
      // Stray slash between attributes: ignore it and keep scanning.
      i++
      continue
    }

    // Attribute name.
    const nameStart = i
    while (i < len && !nameTerminator.test(openingTag[i])) i++
    const name = openingTag.slice(nameStart, i)

    // Optional whitespace, `=`, optional whitespace, value.
    while (i < len && whitespace.test(openingTag[i])) i++
    if (openingTag[i] !== '=') {
      // Bare attribute (no value) — cannot be a usable class attribute.
      continue
    }
    i++ // past '='
    while (i < len && whitespace.test(openingTag[i])) i++

    let valueStart: number
    let valueEnd: number
    const quote = openingTag[i]
    if (quote === '"' || quote === "'") {
      valueStart = i + 1
      const closeAt = openingTag.indexOf(quote, valueStart)
      // An unterminated quote swallows the rest of the tag.
      valueEnd = closeAt === -1 ? len : closeAt
      i = closeAt === -1 ? len : closeAt + 1
    } else {
      valueStart = i
      while (i < len && !whitespace.test(openingTag[i]) && openingTag[i] !== '>' && !isTagEndSlash(i)) i++
      valueEnd = i
    }

    if (name.toLowerCase() === 'class') {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) }
    }
  }

  return null
}
|
|
1248
|
-
|
|
1249
|
-
/**
 * Scans `html` and returns a range (with its parsed rows) for every
 * top-level `<table>...</table>` block. Nested tables do not get their own
 * entry: scanning resumes after the outer table's end, so they stay inside
 * the parent's range and are dealt with when the cell-level diff recurses
 * into them. A `<table` whose opening tag cannot be parsed, or that has no
 * matching closing tag, is skipped.
 */
function findTopLevelTables(html: string): TableRange[] {
  const found: TableRange[] = []
  let pos = 0
  while (pos < html.length) {
    if (!matchesTagAt(html, pos, 'table')) {
      pos++
      continue
    }
    const opening = parseOpeningTagAt(html, pos)
    if (!opening) {
      pos++
      continue
    }
    const tableEnd = findMatchingClosingTag(html, opening.end, 'table')
    if (tableEnd === -1) {
      // Unclosed table: step past its opening tag and keep looking.
      pos = opening.end
      continue
    }
    const rowsEnd = tableEnd - '</table>'.length
    found.push({ tableStart: pos, tableEnd, rows: findTopLevelRows(html, opening.end, rowsEnd) })
    pos = tableEnd
  }
  return found
}
|
|
1281
|
-
|
|
1282
|
-
/**
 * Collects the `<tr>` ranges (with their parsed cells) found between
 * `start` and `end`. Rows nested inside a row are not reported separately
 * because scanning resumes after each row's end. Stops early on a
 * `</table>` closing tag.
 */
function findTopLevelRows(html: string, start: number, end: number): RowRange[] {
  const rows: RowRange[] = []
  for (let pos = start; pos < end; ) {
    if (!matchesTagAt(html, pos, 'tr')) {
      // Defensive: a closing </table> should lie beyond `end`, but stop if
      // one is met anyway.
      if (matchesClosingTagAt(html, pos, 'table')) break
      pos++
      continue
    }
    const opening = parseOpeningTagAt(html, pos)
    if (!opening) {
      pos++
      continue
    }
    const rowEnd = findMatchingClosingTag(html, opening.end, 'tr', end)
    if (rowEnd === -1) {
      pos = opening.end
      continue
    }
    const cellsEnd = rowEnd - '</tr>'.length
    rows.push({ rowStart: pos, rowEnd, cells: findTopLevelCells(html, opening.end, cellsEnd) })
    pos = rowEnd
  }
  return rows
}
|
|
1312
|
-
|
|
1313
|
-
/**
 * Collects the `<td>`/`<th>` ranges found between `start` and `end`. Cells
 * nested inside a cell are not reported separately because scanning resumes
 * after each cell's end. Stops early on a `</tr>` closing tag.
 *
 * @param html Source string.
 * @param start Offset at which to start scanning (just past the `<tr ...>`).
 * @param end Offset at which to stop scanning.
 * @returns One range per cell: `cellStart`/`cellEnd` span the whole element,
 * `contentStart`/`contentEnd` span what lies between its opening and closing
 * tags.
 */
function findTopLevelCells(html: string, start: number, end: number): CellRange[] {
  const cells: CellRange[] = []
  let i = start
  while (i < end) {
    const tagName = matchesTagAt(html, i, 'td') ? 'td' : matchesTagAt(html, i, 'th') ? 'th' : null
    if (tagName === null) {
      if (matchesClosingTagAt(html, i, 'tr')) break
      i++
      continue
    }
    const opening = parseOpeningTagAt(html, i)
    if (!opening) {
      i++
      continue
    }
    const contentStart = opening.end
    const cellEnd = findMatchingClosingTag(html, contentStart, tagName, end)
    if (cellEnd === -1) {
      i = opening.end
      continue
    }
    // Locate the start of the closing tag by searching back for its `<`
    // rather than subtracting a fixed length. The closing-tag matcher accepts
    // whitespace before `>` (e.g. `</td >`), and a fixed-length subtraction
    // would then leave the closer's `<` inside the content range.
    const closerStart = html.lastIndexOf('<', cellEnd - 1)
    const contentEnd = closerStart >= contentStart ? closerStart : cellEnd - `</${tagName}>`.length
    cells.push({ cellStart: i, cellEnd, contentStart, contentEnd })
    i = cellEnd
  }
  return cells
}
|
|
1341
|
-
|
|
1342
|
-
/**
 * Tells whether an opening tag named `tagName` starts at index `i`. The name
 * is compared case-insensitively against `tagName` (which must already be
 * lower-case) and has to be followed by `>`, `/`, a space, tab, LF or CR —
 * so `<tdx>` does not count as `<td>`.
 */
function matchesTagAt(html: string, i: number, tagName: string): boolean {
  if (html[i] !== '<') return false

  const nameEnd = i + 1 + tagName.length
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '/':
      return true
    default:
      return false
  }
}
|
|
1349
|
-
|
|
1350
|
-
/**
 * Tells whether a closing tag named `tagName` starts at index `i`. The name
 * is compared case-insensitively against `tagName` (which must already be
 * lower-case) and has to be followed by `>`, a space, tab, LF or CR.
 */
function matchesClosingTagAt(html: string, i: number, tagName: string): boolean {
  const startsCloser = html[i] === '<' && html[i + 1] === '/'
  if (!startsCloser) return false

  const nameStart = i + 2
  const nameEnd = nameStart + tagName.length
  if (html.slice(nameStart, nameEnd).toLowerCase() !== tagName) return false

  switch (html[nameEnd]) {
    case '>':
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true
    default:
      return false
  }
}
|
|
1357
|
-
|
|
1358
|
-
/** Result of parsing a tag (or comment-like construct) at a given offset. */
interface OpeningTag {
  /** Offset of the first character after the construct's terminating `>`. */
  end: number
}
|
|
1362
|
-
|
|
1363
|
-
/**
 * Finds where the markup construct starting at index `i` ends.
 *
 * Comments (`<!-- -->`), CDATA sections (`<![CDATA[ ]]>`) and processing
 * instructions (`<? ?>`) are closed by their own terminators; a plain
 * `>`-walker would cut `<!-- a > b -->` at the inner `>` and corrupt every
 * later offset. Word-exported HTML routinely puts comments inside tables
 * (conditional comments, OLE markers), so this is not a theoretical case.
 * Anything else is treated as a tag and ends at the first `>` outside a
 * quoted attribute value.
 *
 * @returns `{ end }` pointing just past the terminator, or `null` when no
 * terminator is found.
 */
function parseOpeningTagAt(html: string, i: number): OpeningTag | null {
  const delimited: ReadonlyArray<readonly [string, string]> = [
    ['<!--', '-->'],
    ['<![CDATA[', ']]>'],
    ['<?', '?>'],
  ]
  for (const [opener, closer] of delimited) {
    if (html.startsWith(opener, i)) {
      const closeAt = html.indexOf(closer, i + opener.length)
      return closeAt === -1 ? null : { end: closeAt + closer.length }
    }
  }

  // Ordinary tag: look for the next `>` outside quotes, so an attribute value
  // containing a literal `>` does not end the tag early.
  let openQuote = ''
  for (let j = i + 1; j < html.length; j++) {
    const ch = html[j]
    if (openQuote !== '') {
      if (ch === openQuote) openQuote = ''
      continue
    }
    if (ch === '"' || ch === "'") {
      openQuote = ch
    } else if (ch === '>') {
      return { end: j + 1 }
    }
  }
  return null
}
|
|
1399
|
-
|
|
1400
|
-
/**
 * Returns the index just past the matching `</tagName>`, accounting for
 * nested tags of the same name. Returns -1 if no match before `limit`.
 *
 * Scanning starts at `from`, which must lie inside the element (after its
 * opening tag), so the nesting depth starts at 1. Each further non-self-closing
 * `<tagName …>` raises the depth and each `</tagName>` lowers it; the closing
 * tag that brings the depth to 0 is the match. Tags that appear inside an
 * HTML comment are ignored, because commented-out markup is not part of the
 * element structure.
 *
 * @param html - Markup being scanned.
 * @param from - Index at which to start scanning.
 * @param tagName - Lower-case name of the element to balance.
 * @param limit - Scanning stops once the cursor reaches this index.
 * @returns Index just past the matching closing tag, or -1.
 */
function findMatchingClosingTag(html: string, from: number, tagName: string, limit: number = html.length): number {
  let depth = 1
  let i = from
  while (i < limit) {
    // Fast path: only a '<' can start a comment or a tag.
    if (html[i] !== '<') {
      i++
      continue
    }
    if (html.startsWith('<!--', i)) {
      // Jump over the whole comment so a `<td>` or `</td>` inside it cannot
      // unbalance the depth count. An unterminated comment is stepped into
      // character by character, as best-effort recovery.
      const comment = parseOpeningTagAt(html, i)
      i = comment ? comment.end : i + 1
      continue
    }
    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i)
      if (!opening) {
        // Truncated opening tag: nothing to balance, keep scanning.
        i++
        continue
      }
      // A self-closing `<tag … />` opens nothing that needs closing.
      if (!html.slice(i, opening.end).endsWith('/>')) depth++
      i = opening.end
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--
      const closing = parseOpeningTagAt(html, i)
      // If the closing tag has no `>`, fall back to the length of a plain
      // `</tagName>` so the cursor still advances past it.
      const closingEnd = closing ? closing.end : i + tagName.length + 3
      if (depth === 0) return closingEnd
      i = closingEnd
    } else {
      i++
    }
  }
  return -1
}
|