@createiq/htmldiff 1.0.4 → 1.0.5-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/dist/HtmlDiff.cjs +881 -46
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +55 -19
- package/dist/HtmlDiff.d.mts +55 -19
- package/dist/HtmlDiff.mjs +881 -46
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +8 -8
- package/src/HtmlDiff.ts +156 -69
- package/src/TableDiff.ts +1196 -0
- package/test/HtmlDiff.spec.ts +119 -1
- package/test/HtmlDiff.tables.spec.ts +1419 -0
- package/test/TableDiff.bench.ts +244 -0
package/dist/HtmlDiff.mjs
CHANGED
|
@@ -202,6 +202,782 @@ var Operation = class {
|
|
|
202
202
|
}
|
|
203
203
|
};
|
|
204
204
|
//#endregion
|
|
205
|
+
//#region src/TableDiff.ts
|
|
206
|
+
// Table placeholders are HTML comments of the form
// `<prefix><table index>-->`, where the prefix starts with
// PLACEHOLDER_PREFIX_BASE and is completed by makePlaceholderPrefix().
const PLACEHOLDER_PREFIX_BASE = "<!--HTMLDIFF_TABLE_";
const PLACEHOLDER_SUFFIX = "-->";
/**
 * Hard cap on table dimensions handled by the structural-aware path.
 * The row-LCS is O(rows²), the per-row cell-LCS is O(cells²), and each
 * comparison string-equals row content (potentially many KB). Without a
 * cap, a several-thousand-row table can pin a CPU for seconds. Tables
 * larger than this fall through to the word-level diff, which scales
 * linearly. Tuned to comfortably cover real-world ISDA schedules
 * (which routinely have 1000+ rows).
 */
const MAX_TABLE_ROWS = 1500;
// Per-row companion to MAX_TABLE_ROWS: a table with any row holding more
// cells than this is also rejected by exceedsSizeLimit().
const MAX_TABLE_CELLS_PER_ROW = 200;
|
|
219
|
+
/**
 * Builds a placeholder prefix that is guaranteed not to occur in either
 * input, so placeholders can never be confused with real document content.
 *
 * Eight random 32-bit hex nonces are tried first. If all collide, a
 * timestamp-based fallback is used; unlike the random attempts this
 * fallback used to be returned unchecked, so it is now verified too and,
 * if it collides, a counter is appended until a free prefix is found.
 * The counter loop always terminates because the finite inputs can only
 * contain finitely many distinct candidates.
 *
 * @param {string} oldHtml - The "before" document.
 * @param {string} newHtml - The "after" document.
 * @returns {string} A prefix absent from both `oldHtml` and `newHtml`.
 */
function makePlaceholderPrefix(oldHtml, newHtml) {
	const isFree = (candidate) => !oldHtml.includes(candidate) && !newHtml.includes(candidate);
	for (let attempt = 0; attempt < 8; attempt++) {
		const nonce = Math.floor(Math.random() * 4294967295).toString(16).padStart(8, "0");
		const prefix = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`;
		if (isFree(prefix)) return prefix;
	}
	const stamp = Date.now();
	const fallback = `${PLACEHOLDER_PREFIX_BASE}fallback_${stamp}_`;
	if (isFree(fallback)) return fallback;
	for (let counter = 0;; counter++) {
		const candidate = `${PLACEHOLDER_PREFIX_BASE}fallback_${stamp}_${counter}_`;
		if (isFree(candidate)) return candidate;
	}
}
|
|
226
|
+
/**
 * Pairs the top-level tables of both documents by position, diffs each
 * pair, and swaps every source table for a placeholder comment.
 *
 * @param {string} oldHtml - The "before" document.
 * @param {string} newHtml - The "after" document.
 * @param {Function} diffCell - Cell-content differ, forwarded to diffTable.
 * @returns {{modifiedOld: string, modifiedNew: string, placeholderToDiff: Map<string, string>}|null}
 *   The placeholder-substituted inputs plus the placeholder→diff mapping,
 *   or null when neither input has tables, the table counts differ, or
 *   any table exceeds the size limits.
 */
function preprocessTables(oldHtml, newHtml, diffCell) {
	const oldTables = findTopLevelTables(oldHtml);
	const newTables = findTopLevelTables(newHtml);
	const tableCount = oldTables.length;
	if (tableCount !== newTables.length) return null;
	if (tableCount === 0) return null;
	const tooLarge = oldTables.some((table, idx) => exceedsSizeLimit(table) || exceedsSizeLimit(newTables[idx]));
	if (tooLarge) return null;
	const diffs = oldTables.map((oldTable, idx) => diffTable(oldHtml, newHtml, oldTable, newTables[idx], diffCell));
	const prefix = makePlaceholderPrefix(oldHtml, newHtml);
	const placeholderToDiff = new Map();
	let modifiedOld = oldHtml;
	let modifiedNew = newHtml;
	// Walk from the last table to the first: the recorded offsets refer to
	// the untouched inputs, and replacing a later range never shifts an
	// earlier one (presumably tables are listed in document order).
	for (let idx = tableCount - 1; idx >= 0; idx--) {
		const placeholder = `${prefix}${idx}${PLACEHOLDER_SUFFIX}`;
		const oldTable = oldTables[idx];
		const newTable = newTables[idx];
		placeholderToDiff.set(placeholder, diffs[idx]);
		modifiedOld = spliceString(modifiedOld, oldTable.tableStart, oldTable.tableEnd, placeholder);
		modifiedNew = spliceString(modifiedNew, newTable.tableStart, newTable.tableEnd, placeholder);
	}
	return { modifiedOld, modifiedNew, placeholderToDiff };
}
|
|
260
|
+
/**
 * Substitutes every placeholder in the diff output with its table diff.
 * Replacement is literal (split/join), so `$` sequences in the table
 * HTML are never interpreted as replacement patterns.
 *
 * @param {string} diffOutput - Diff text still containing placeholders.
 * @param {Map<string, string>} placeholderToDiff - Placeholder → table diff HTML.
 * @returns {string} The diff output with all placeholders expanded.
 */
function restoreTablePlaceholders(diffOutput, placeholderToDiff) {
	return [...placeholderToDiff].reduce((text, [marker, tableHtml]) => text.split(marker).join(tableHtml), diffOutput);
}
|
|
265
|
+
/**
 * Returns `s` with the half-open range [start, end) replaced by `replacement`.
 *
 * @param {string} s - Source string.
 * @param {number} start - Index of the first replaced character.
 * @param {number} end - Index just past the last replaced character.
 * @param {string} replacement - Text inserted in place of the range.
 * @returns {string} The spliced string.
 */
function spliceString(s, start, end, replacement) {
	const head = s.slice(0, start);
	const tail = s.slice(end);
	return `${head}${replacement}${tail}`;
}
|
|
268
|
+
/**
 * Reports whether a table is too large for the structural diff path:
 * more than MAX_TABLE_ROWS rows, or any row with more than
 * MAX_TABLE_CELLS_PER_ROW cells.
 *
 * @param {{rows: Array<{cells: Array}>}} table - Parsed table.
 * @returns {boolean} True when either limit is exceeded.
 */
function exceedsSizeLimit(table) {
	const { rows } = table;
	return rows.length > MAX_TABLE_ROWS || rows.some((row) => row.cells.length > MAX_TABLE_CELLS_PER_ROW);
}
|
|
273
|
+
/**
 * Picks the diff strategy for one old/new table pair:
 * identical row and per-row cell counts → positional diff;
 * same row count only → row-by-row diff with rowspan merge/split detection;
 * anything else → LCS-based structural alignment.
 *
 * @returns {string} The diffed table HTML.
 */
function diffTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
	let strategy = diffStructurallyAlignedTable;
	if (sameDimensions(oldTable, newTable)) strategy = diffPositionalTable;
	else if (oldTable.rows.length === newTable.rows.length) strategy = diffSameRowCountTable;
	return strategy(oldHtml, newHtml, oldTable, newTable, diffCell);
}
|
|
278
|
+
/**
 * Diffs two tables that have the same number of rows but differing cell
 * counts in at least one row. Rows are walked in order; at each row a
 * vertical merge is tried first, then a vertical split, and otherwise the
 * row is diffed against its positional counterpart. Markup between rows is
 * copied from the new table.
 *
 * @returns {string} The diffed table HTML.
 */
function diffSameRowCountTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
	const { rows } = newTable;
	const pieces = [];
	let copiedUpTo = newTable.tableStart;
	let rowIdx = 0;
	while (rowIdx < rows.length) {
		const current = rows[rowIdx];
		const spanChange = detectVerticalMerge(oldHtml, newHtml, oldTable, newTable, rowIdx) ?? detectVerticalSplit(oldHtml, newHtml, oldTable, newTable, rowIdx);
		pieces.push(newHtml.slice(copiedUpTo, current.rowStart));
		if (spanChange) {
			// A merge/split consumes `span` rows at once.
			pieces.push(spanChange.diff);
			copiedUpTo = rows[rowIdx + spanChange.span - 1].rowEnd;
			rowIdx += spanChange.span;
		} else {
			pieces.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[rowIdx], current, diffCell));
			copiedUpTo = current.rowEnd;
			rowIdx += 1;
		}
	}
	pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd));
	return pieces.join("");
}
|
|
308
|
+
/**
 * Detects a vertical merge that starts at new row `r`.
 *
 * Conditions: new row `r` holds exactly one cell with rowspan K > 1 that
 * fits inside the table; new rows r+1..r+K-1 have no cells; and each old
 * row r..r+K-1 exists, has summed colspans equal to the merged cell's
 * colspan, and contains only rowspan=1 cells. This covers single-column
 * merges as well as rectangular ones (e.g. 2×2 → one colspan=2 rowspan=2
 * cell).
 *
 * @returns {{diff: string, span: number}|null} The merged row with its
 *   cell tagged `mod rowspan`, followed by the covered empty rows copied
 *   unchanged, plus the number of rows consumed; null when no merge matches.
 */
function detectVerticalMerge(oldHtml, newHtml, oldTable, newTable, r) {
	const newRows = newTable.rows;
	const anchorRow = newRows[r];
	if (anchorRow.cells.length !== 1) return null;
	const mergedCell = anchorRow.cells[0];
	const span = getRowspan(newHtml, mergedCell);
	if (span <= 1 || r + span > newRows.length) return null;
	const width = getColspan(newHtml, mergedCell);
	const coveredRows = newRows.slice(r + 1, r + span);
	if (coveredRows.some((row) => row.cells.length !== 0)) return null;
	for (let offset = 0; offset < span; offset++) {
		const oldRow = oldTable.rows[r + offset];
		if (!oldRow) return null;
		if (sumColspans(oldHtml, oldRow.cells) !== width) return null;
		if (oldRow.cells.some((oldCell) => getRowspan(oldHtml, oldCell) !== 1)) return null;
	}
	const diff = [
		rowHeaderSlice(newHtml, anchorRow),
		emitSpanChangedCell(newHtml, mergedCell, "rowspan"),
		"</tr>",
		...coveredRows.map((row) => emitEmptyRow(newHtml, row))
	].join("");
	return { diff, span };
}
|
|
343
|
+
/**
 * Detects a vertical split that starts at old row `r`.
 *
 * Conditions: old row `r` holds exactly one cell with rowspan K > 1 that
 * fits inside the table; old rows r+1..r+K-1 have no cells; and each new
 * row r..r+K-1 exists, has summed colspans equal to the old cell's
 * colspan, and contains only rowspan=1 cells (a new row may therefore
 * hold several cells, as long as their colspans add up).
 *
 * @returns {{diff: string, span: number}|null} The K new rows with every
 *   cell tagged `mod rowspan`, plus the number of rows consumed; null when
 *   no split matches.
 */
function detectVerticalSplit(oldHtml, newHtml, oldTable, newTable, r) {
	const oldRows = oldTable.rows;
	const anchorRow = oldRows[r];
	if (anchorRow.cells.length !== 1) return null;
	const spanningCell = anchorRow.cells[0];
	const span = getRowspan(oldHtml, spanningCell);
	if (span <= 1 || r + span > oldRows.length) return null;
	const width = getColspan(oldHtml, spanningCell);
	if (oldRows.slice(r + 1, r + span).some((row) => row.cells.length !== 0)) return null;
	const splitRows = [];
	for (let offset = 0; offset < span; offset++) {
		const newRow = newTable.rows[r + offset];
		if (!newRow) return null;
		if (sumColspans(newHtml, newRow.cells) !== width) return null;
		if (newRow.cells.some((newCell) => getRowspan(newHtml, newCell) !== 1)) return null;
		splitRows.push(newRow);
	}
	const diff = splitRows.map((newRow) => {
		const cells = newRow.cells.map((newCell) => emitSpanChangedCell(newHtml, newCell, "rowspan"));
		return rowHeaderSlice(newHtml, newRow) + cells.join("") + "</tr>";
	}).join("");
	return { diff, span };
}
|
|
376
|
+
/**
 * Returns the row's source text, from `rowStart` to `rowEnd`, unchanged.
 *
 * @param {string} html - Document the row offsets refer to.
 * @param {{rowStart: number, rowEnd: number}} row - Row offsets.
 * @returns {string} The verbatim row markup.
 */
function emitEmptyRow(html, row) {
	const { rowStart, rowEnd } = row;
	return html.slice(rowStart, rowEnd);
}
|
|
379
|
+
/**
 * Reports whether two tables have the same number of rows and, row by
 * row, the same number of cells.
 *
 * @param {{rows: Array<{cells: Array}>}} a - First table.
 * @param {{rows: Array<{cells: Array}>}} b - Second table.
 * @returns {boolean} True when every row count and cell count matches.
 */
function sameDimensions(a, b) {
	return a.rows.length === b.rows.length && a.rows.every((row, idx) => row.cells.length === b.rows[idx].cells.length);
}
|
|
384
|
+
/**
 * Same-dimension path: copies the new table verbatim and replaces only
 * each cell's content range with `diffCell(oldContent, newContent)` for
 * the cell at the same row/column position. Everything between cell
 * contents (`<thead>`/`<tbody>`, tags, whitespace) comes from the new table.
 *
 * @param {Function} diffCell - `(oldContent, newContent) => string`.
 * @returns {string} The diffed table HTML.
 */
function diffPositionalTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
	const pieces = [];
	let copiedUpTo = newTable.tableStart;
	for (const [rowIdx, newRow] of newTable.rows.entries()) {
		const oldRow = oldTable.rows[rowIdx];
		for (const [cellIdx, newCell] of newRow.cells.entries()) {
			const oldCell = oldRow.cells[cellIdx];
			const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd);
			const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd);
			pieces.push(newHtml.slice(copiedUpTo, newCell.contentStart), diffCell(oldContent, newContent));
			copiedUpTo = newCell.contentEnd;
		}
	}
	pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd));
	return pieces.join("");
}
|
|
406
|
+
/**
 * Mismatched-dimensions path. Rows are aligned with an exact-match LCS on
 * their whitespace-normalized markup, then similar unmatched rows are
 * paired as edits. The output follows the new table: markup between new
 * rows is copied from `newHtml`, paired rows are cell-diffed, unpaired new
 * rows are emitted as insertions, and unpaired old rows are emitted as
 * deletions at the point where the alignment places them. When the new
 * table has no rows at all, the table is rebuilt from scratch instead.
 *
 * @returns {string} The diffed table HTML.
 */
function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
	const oldKeys = oldTable.rows.map((row) => rowKey(oldHtml, row));
	const newKeys = newTable.rows.map((row) => rowKey(newHtml, row));
	const alignment = pairSimilarUnmatchedRows(lcsAlign(oldKeys, newKeys), oldTable, newTable, oldHtml, newHtml);
	if (newTable.rows.length === 0) return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell);
	const firstRowStart = newTable.rows[0].rowStart;
	const pieces = [newHtml.slice(newTable.tableStart, firstRowStart)];
	let copiedUpTo = firstRowStart;
	for (const { oldIdx, newIdx } of alignment) {
		if (newIdx === null) {
			// Deleted row: emitted in place, without advancing through newHtml.
			if (oldIdx !== null) pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], "del", diffCell));
			continue;
		}
		const newRow = newTable.rows[newIdx];
		pieces.push(newHtml.slice(copiedUpTo, newRow.rowStart));
		pieces.push(oldIdx === null ? emitFullRow(newHtml, newRow, "ins", diffCell) : diffPreservedRow(oldHtml, newHtml, oldTable.rows[oldIdx], newRow, diffCell));
		copiedUpTo = newRow.rowEnd;
	}
	pieces.push(newHtml.slice(copiedUpTo, newTable.tableEnd));
	return pieces.join("");
}
|
|
429
|
+
/**
 * Rebuilds a table from its alignment when the new table has no rows to
 * walk: table header markup, then each aligned entry as a fully deleted
 * row (when it has an old index) or a fully inserted row (when it only
 * has a new index), then a literal `</table>`.
 *
 * @returns {string} The rebuilt table HTML.
 */
function rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell) {
	const pieces = [headerSlice(newHtml, newTable, oldHtml, oldTable)];
	for (const { oldIdx, newIdx } of alignment) {
		if (oldIdx !== null) {
			pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], "del", diffCell));
		} else if (newIdx !== null) {
			pieces.push(emitFullRow(newHtml, newTable.rows[newIdx], "ins", diffCell));
		}
	}
	pieces.push("</table>");
	return pieces.join("");
}
|
|
437
|
+
/**
 * Returns the markup that precedes a table's first row. The new table is
 * preferred; the old table is used when the new one yields an empty
 * header. For a table without rows the header is taken to end where a
 * trailing `</table>` (8 characters) would begin.
 *
 * @returns {string} Header markup from `tableStart` up to the first row.
 */
function headerSlice(newHtml, newTable, oldHtml, oldTable) {
	const closingTagLength = "</table>".length;
	const headerEnd = (table) => table.rows[0]?.rowStart ?? table.tableEnd - closingTagLength;
	const newHeaderEnd = headerEnd(newTable);
	if (newHeaderEnd > newTable.tableStart) return newHtml.slice(newTable.tableStart, newHeaderEnd);
	return oldHtml.slice(oldTable.tableStart, headerEnd(oldTable));
}
|
|
443
|
+
/**
 * Builds the comparison key for a row: its full markup with every run of
 * whitespace collapsed to one space and the ends trimmed.
 *
 * @param {string} html - Document the row offsets refer to.
 * @param {{rowStart: number, rowEnd: number}} row - Row offsets.
 * @returns {string} Whitespace-normalized row markup.
 */
function rowKey(html, row) {
	const markup = html.slice(row.rowStart, row.rowEnd);
	return markup.split(/\s+/).filter(Boolean).join(" ");
}
|
|
446
|
+
/**
 * Diffs a row that exists on both sides. Equal cell counts use the
 * positional diff; otherwise colspan-based alignment is attempted and,
 * when that reports no clean alignment (null), the cell-LCS fallback runs.
 *
 * @returns {string} The diffed row HTML.
 */
function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
	if (oldRow.cells.length === newRow.cells.length) {
		return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
	}
	return diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell) ?? diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
}
|
|
452
|
+
/**
 * Aligns the cells of two rows by logical column position (running sum
 * of colspans). Cells with equal colspan are content-diffed. Where one
 * new cell spans exactly the width of several old cells (merge), or
 * several new cells exactly fill one old cell's width (split), the new
 * cells are emitted tagged `mod colspan` with their content unchanged.
 *
 * @returns {string|null} The diffed row, or null when the total widths
 *   differ or the cells do not line up on exact column boundaries — the
 *   caller then falls back to a generic cell-LCS.
 */
function diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
	const oldCells = oldRow.cells;
	const newCells = newRow.cells;
	if (sumColspans(oldHtml, oldCells) !== sumColspans(newHtml, newCells)) return null;
	// Index just past the run of cells starting at `from` whose colspans add
	// up to exactly `target`, or -1 when the run overshoots or falls short.
	const endOfRun = (html, cells, from, target) => {
		let covered = 0;
		let idx = from;
		while (idx < cells.length && covered < target) {
			covered += getColspan(html, cells[idx]);
			idx++;
		}
		return covered === target ? idx : -1;
	};
	const pieces = [rowHeaderSlice(newHtml, newRow)];
	let oldPos = 0;
	let newPos = 0;
	while (oldPos < oldCells.length && newPos < newCells.length) {
		const oldCell = oldCells[oldPos];
		const newCell = newCells[newPos];
		const oldSpan = getColspan(oldHtml, oldCell);
		const newSpan = getColspan(newHtml, newCell);
		if (oldSpan === newSpan) {
			pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell));
			oldPos++;
			newPos++;
			continue;
		}
		if (newSpan > oldSpan) {
			// Merge: one new cell absorbs a run of old cells.
			const oldRunEnd = endOfRun(oldHtml, oldCells, oldPos, newSpan);
			if (oldRunEnd < 0) return null;
			pieces.push(emitSpanChangedCell(newHtml, newCell, "colspan"));
			oldPos = oldRunEnd;
			newPos++;
			continue;
		}
		// Split: a run of new cells replaces one old cell.
		const newRunEnd = endOfRun(newHtml, newCells, newPos, oldSpan);
		if (newRunEnd < 0) return null;
		for (const splitCell of newCells.slice(newPos, newRunEnd)) {
			pieces.push(emitSpanChangedCell(newHtml, splitCell, "colspan"));
		}
		oldPos++;
		newPos = newRunEnd;
	}
	if (oldPos !== oldCells.length || newPos !== newCells.length) return null;
	pieces.push("</tr>");
	return pieces.join("");
}
|
|
502
|
+
/**
 * Adds up the colspans of the given cells, i.e. the row's logical width.
 *
 * @param {string} html - Document the cell offsets refer to.
 * @param {Array} cells - Cells of one row.
 * @returns {number} Sum of the cells' colspan values (0 for no cells).
 */
function sumColspans(html, cells) {
	return cells.reduce((width, cell) => width + getColspan(html, cell), 0);
}
|
|
507
|
+
/**
 * Reads the `colspan` of a cell from the markup between `cellStart` and
 * `contentStart` (the cell's opening tag).
 *
 * @returns {number} The colspan as resolved by parseSpanAttribute.
 */
function getColspan(html, cell) {
	const openingTag = html.slice(cell.cellStart, cell.contentStart);
	return parseSpanAttribute(openingTag, "colspan");
}
|
|
510
|
+
/**
 * Reads the `rowspan` of a cell from the markup between `cellStart` and
 * `contentStart` (the cell's opening tag).
 *
 * @returns {number} The rowspan as resolved by parseSpanAttribute.
 */
function getRowspan(html, cell) {
	const openingTag = html.slice(cell.cellStart, cell.contentStart);
	return parseSpanAttribute(openingTag, "rowspan");
}
|
|
513
|
+
/**
 * Extracts a `colspan`/`rowspan` value from a cell's opening tag.
 *
 * The tag's attributes are tokenized (respecting quoted values) and the
 * first attribute whose name is exactly the requested one is used. A flat
 * regex search is not enough here: `\bcolspan` also matches the tail of
 * `data-colspan="5"` and text inside another attribute's value such as
 * `title="colspan=3"`, which would mis-size the cell.
 *
 * @param {string} openingTag - The opening tag, e.g. `<td colspan="2">`.
 * @param {string} name - `"colspan"`; any other value selects `rowspan`.
 * @returns {number} The leading positive integer of the attribute value,
 *   or 1 when the attribute is missing, non-numeric, zero, or not finite.
 */
function parseSpanAttribute(openingTag, name) {
	const wanted = name === "colspan" ? "colspan" : "rowspan";
	// Drop `<tagname` so the tag name is never mistaken for an attribute.
	const attributes = openingTag.replace(/^\s*<[^\s/>]*/, "");
	// name, then optionally = followed by a "double", 'single' or unquoted value.
	const attributePattern = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]*)))?/g;
	for (const match of attributes.matchAll(attributePattern)) {
		if (match[1].toLowerCase() !== wanted) continue;
		const rawValue = match[2] ?? match[3] ?? match[4] ?? "";
		const digits = /^\s*(\d+)/.exec(rawValue);
		if (!digits) return 1;
		const value = Number.parseInt(digits[1], 10);
		return Number.isFinite(value) && value > 0 ? value : 1;
	}
	return 1;
}
|
|
519
|
+
/**
 * Emits a cell produced by a structural merge/split, with `mod colspan`
 * or `mod rowspan` injected into its opening tag. The cell content is
 * carried through untouched — Word doesn't track these changes, and
 * wrapping unchanged content in del/ins would be misleading. When the
 * opening tag cannot be parsed, the cell is returned verbatim.
 *
 * @param {string} html - Document the cell offsets refer to.
 * @param {object} cell - Cell offsets (`cellStart`, `contentStart`, `cellEnd`).
 * @param {string} kind - `"colspan"` or `"rowspan"`.
 * @returns {string} The tagged cell markup.
 */
function emitSpanChangedCell(html, cell, kind) {
	const { cellStart, contentStart, cellEnd } = cell;
	const opening = parseOpeningTagAt(html, cellStart);
	if (!opening) return html.slice(cellStart, cellEnd);
	const taggedOpening = injectClass(html.slice(cellStart, opening.end), `mod ${kind}`);
	return taggedOpening + html.slice(contentStart, cellEnd);
}
|
|
531
|
+
/**
 * Diffs two rows that have the same number of cells by pairing cells
 * positionally. The new row's markup is copied verbatim and only each
 * cell's content range is replaced with `diffCell(oldContent, newContent)`.
 *
 * The copy cursor starts right after the emitted row header. Starting it
 * at `rowEnd` for a cell-less row (as was done before) dropped everything
 * after the `<tr …>` opening tag, including the closing `</tr>`.
 * If the header could not be produced (empty string), copying starts at
 * `rowStart`, so the row's opening markup is still preserved.
 *
 * @param {string} oldHtml - The "before" document.
 * @param {string} newHtml - The "after" document.
 * @param {object} oldRow - Old row (`cells` with content offsets).
 * @param {object} newRow - New row (`rowStart`, `rowEnd`, `cells`).
 * @param {Function} diffCell - `(oldContent, newContent) => string`.
 * @returns {string} The diffed row HTML.
 */
function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
	const trHeader = rowHeaderSlice(newHtml, newRow);
	const out = [trHeader];
	let cursor = newRow.rowStart + trHeader.length;
	for (let c = 0; c < newRow.cells.length; c++) {
		const oldCell = oldRow.cells[c];
		const newCell = newRow.cells[c];
		out.push(newHtml.slice(cursor, newCell.contentStart));
		out.push(diffCell(oldHtml.slice(oldCell.contentStart, oldCell.contentEnd), newHtml.slice(newCell.contentStart, newCell.contentEnd)));
		cursor = newCell.contentEnd;
	}
	out.push(newHtml.slice(cursor, newRow.rowEnd));
	return out.join("");
}
|
|
546
|
+
/**
 * Generic fallback for rows whose cell counts differ: cells are aligned
 * with an exact-match LCS on their whitespace-normalized content. Matched
 * cells are content-diffed, unmatched new cells are emitted as insertions
 * and unmatched old cells as deletions. The row is rebuilt from the new
 * row's header, the emitted cells, and a literal `</tr>`.
 *
 * @returns {string} The diffed row HTML.
 */
function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
	const oldKeys = oldRow.cells.map((cell) => cellKey(oldHtml, cell));
	const newKeys = newRow.cells.map((cell) => cellKey(newHtml, cell));
	const alignment = lcsAlign(oldKeys, newKeys);
	const pieces = [rowHeaderSlice(newHtml, newRow)];
	for (const { oldIdx, newIdx } of alignment) {
		if (newIdx !== null) {
			const newCell = newRow.cells[newIdx];
			pieces.push(oldIdx !== null ? emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newCell, diffCell) : emitFullCell(newHtml, newCell, "ins", diffCell));
		} else if (oldIdx !== null) {
			pieces.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], "del", diffCell));
		}
	}
	pieces.push("</tr>");
	return pieces.join("");
}
|
|
559
|
+
/**
 * Builds the comparison key for a cell: its content (between
 * `contentStart` and `contentEnd`) with every run of whitespace collapsed
 * to one space and the ends trimmed.
 *
 * @param {string} html - Document the cell offsets refer to.
 * @param {{contentStart: number, contentEnd: number}} cell - Cell offsets.
 * @returns {string} Whitespace-normalized cell content.
 */
function cellKey(html, cell) {
	const content = html.slice(cell.contentStart, cell.contentEnd);
	return content.split(/\s+/).filter(Boolean).join(" ");
}
|
|
562
|
+
/**
 * Emits a row whose cells are all inserted (`kind === "ins"`) or all
 * deleted (any other kind). The `<tr>` opening tag receives the class
 * `diffins`/`diffdel`, each cell is rendered through emitFullCell, and the
 * markup between cells is copied unchanged. When the row's opening tag
 * cannot be parsed, the row is returned verbatim.
 *
 * @param {string} html - Document the row offsets refer to.
 * @param {object} row - Row offsets and cells.
 * @param {string} kind - `"ins"` or `"del"`.
 * @param {Function} diffCell - Forwarded to emitFullCell.
 * @returns {string} The marked-up row.
 */
function emitFullRow(html, row, kind, diffCell) {
	const opening = parseOpeningTagAt(html, row.rowStart);
	if (!opening) return html.slice(row.rowStart, row.rowEnd);
	const marker = kind === "ins" ? "diffins" : "diffdel";
	const pieces = [injectClass(html.slice(row.rowStart, opening.end), marker)];
	let copiedUpTo = opening.end;
	for (const cell of row.cells) {
		pieces.push(html.slice(copiedUpTo, cell.cellStart), emitFullCell(html, cell, kind, diffCell));
		copiedUpTo = cell.cellEnd;
	}
	pieces.push(html.slice(copiedUpTo, row.rowEnd));
	return pieces.join("");
}
|
|
582
|
+
/**
 * Emits a fully inserted or fully deleted cell. The cell's opening tag
 * receives `diffins`/`diffdel`; non-blank content has its text runs
 * wrapped in `<ins>`/`<del>` while tags pass through, so
 * `<strong>B</strong>` becomes `<strong><ins>B</ins></strong>` rather than
 * the doubled `<ins>` a full recursive diff would produce.
 * Whitespace-only content is left unwrapped. When the opening tag cannot
 * be parsed, the cell is returned verbatim.
 *
 * @param {string} html - Document the cell offsets refer to.
 * @param {object} cell - Cell offsets.
 * @param {string} kind - `"ins"` or `"del"`.
 * @param {Function} _diffCell - Unused; kept for signature parity.
 * @returns {string} The marked-up cell.
 */
function emitFullCell(html, cell, kind, _diffCell) {
	const { cellStart, contentStart, contentEnd, cellEnd } = cell;
	const opening = parseOpeningTagAt(html, cellStart);
	if (!opening) return html.slice(cellStart, cellEnd);
	const marker = kind === "ins" ? "diffins" : "diffdel";
	const markedOpening = injectClass(html.slice(cellStart, opening.end), marker);
	const inner = html.slice(contentStart, contentEnd);
	const body = inner.trim().length === 0 ? inner : wrapInlineTextRuns(inner, kind);
	return markedOpening + body + html.slice(contentEnd, cellEnd);
}
|
|
600
|
+
/**
 * Wraps each text run that contains non-whitespace in
 * `<ins class='diffins'>` / `<del class='diffdel'>`, leaving tags and
 * whitespace-only runs untouched. This yields output such as
 * `<strong><ins>X</ins></strong>` for inserted formatted content. If a
 * `<` cannot be parsed as a tag, the remainder of the content is appended
 * as-is and processing stops.
 *
 * @param {string} content - Cell content to wrap.
 * @param {string} kind - `"ins"`; any other value selects `del`.
 * @returns {string} The wrapped content.
 */
function wrapInlineTextRuns(content, kind) {
	const tag = kind === "ins" ? "ins" : "del";
	const cls = kind === "ins" ? "diffins" : "diffdel";
	const pieces = [];
	let pos = 0;
	while (pos < content.length) {
		if (content[pos] !== "<") {
			// Text run: everything up to the next "<" (or the end).
			const nextTag = content.indexOf("<", pos);
			const runEnd = nextTag === -1 ? content.length : nextTag;
			const text = content.slice(pos, runEnd);
			pieces.push(text.trim().length > 0 ? `<${tag} class='${cls}'>${text}</${tag}>` : text);
			pos = runEnd;
			continue;
		}
		const parsedTag = parseOpeningTagAt(content, pos);
		if (!parsedTag) {
			pieces.push(content.slice(pos));
			break;
		}
		pieces.push(content.slice(pos, parsedTag.end));
		pos = parsedTag.end;
	}
	return pieces.join("");
}
|
|
632
|
+
/**
 * Emits a cell present on both sides: the new cell's opening tag, the
 * result of `diffCell(oldContent, newContent)`, and the new cell's
 * closing markup. When the new cell's opening tag cannot be parsed, the
 * new cell is returned verbatim.
 *
 * @returns {string} The diffed cell HTML.
 */
function emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell) {
	const opening = parseOpeningTagAt(newHtml, newCell.cellStart);
	if (!opening) return newHtml.slice(newCell.cellStart, newCell.cellEnd);
	const openTag = newHtml.slice(newCell.cellStart, opening.end);
	const oldContent = oldHtml.slice(oldCell.contentStart, oldCell.contentEnd);
	const newContent = newHtml.slice(newCell.contentStart, newCell.contentEnd);
	const diffed = diffCell(oldContent, newContent);
	return openTag + diffed + newHtml.slice(newCell.contentEnd, newCell.cellEnd);
}
|
|
640
|
+
/**
 * Returns the leading markup of a row: from `rowStart` up to the first
 * cell, or just the `<tr …>` opening tag when the row has no cells.
 * Returns an empty string when the opening tag cannot be parsed.
 *
 * @param {string} html - Document the row offsets refer to.
 * @param {object} row - Row offsets and cells.
 * @returns {string} The row's header markup.
 */
function rowHeaderSlice(html, row) {
	const opening = parseOpeningTagAt(html, row.rowStart);
	if (!opening) return "";
	const headerEnd = row.cells.length === 0 ? opening.end : row.cells[0].cellStart;
	return html.slice(row.rowStart, headerEnd);
}
|
|
646
|
+
/**
 * Similarity score a pair of rows must strictly exceed to be treated as
 * "the same row, edited". The score comes from rowSimilarity(), which is a
 * shared prefix + suffix character ratio (not a Jaccard index).
 */
const ROW_FUZZY_THRESHOLD = .5;
|
|
648
|
+
/**
 * Post-processes an exact LCS alignment. Within each contiguous run of
 * one-sided entries (deletions and insertions), every deleted row is
 * greedily paired with the not-yet-taken inserted row of highest
 * similarity, provided that similarity is strictly above
 * ROW_FUZZY_THRESHOLD. Paired rows become a single edit entry placed at
 * the insertion's position, so a small edit (a typo fix, a single word
 * change) is not shown as a whole row vanishing and a new one appearing.
 *
 * @param {Array<{oldIdx: number|null, newIdx: number|null}>} alignment - LCS alignment.
 * @returns {Array<{oldIdx: number|null, newIdx: number|null}>} The
 *   alignment with similar delete/insert entries merged.
 */
function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
	const isMatched = (entry) => entry.oldIdx !== null && entry.newIdx !== null;
	const isOneSided = (entry) => (entry.oldIdx === null) !== (entry.newIdx === null);
	// Alignment position of a deletion → alignment position of its insertion.
	const insertionForDeletion = new Map();
	let pos = 0;
	while (pos < alignment.length) {
		if (isMatched(alignment[pos])) {
			pos++;
			continue;
		}
		const deletions = [];
		const insertions = [];
		while (pos < alignment.length && isOneSided(alignment[pos])) {
			if (alignment[pos].oldIdx !== null) deletions.push(pos);
			else insertions.push(pos);
			pos++;
		}
		const taken = new Set();
		for (const delPos of deletions) {
			const oldRow = oldTable.rows[alignment[delPos].oldIdx];
			let bestPos = -1;
			let bestScore = ROW_FUZZY_THRESHOLD;
			for (const insPos of insertions) {
				if (taken.has(insPos)) continue;
				const score = rowSimilarity(oldRow, newTable.rows[alignment[insPos].newIdx], oldHtml, newHtml);
				if (score > bestScore) {
					bestScore = score;
					bestPos = insPos;
				}
			}
			if (bestPos >= 0) {
				insertionForDeletion.set(delPos, bestPos);
				taken.add(bestPos);
			}
		}
	}
	const deletionForInsertion = new Map();
	for (const [delPos, insPos] of insertionForDeletion) deletionForInsertion.set(insPos, delPos);
	const merged = [];
	alignment.forEach((entry, idx) => {
		if (insertionForDeletion.has(idx)) return;
		if (!deletionForInsertion.has(idx)) {
			merged.push(entry);
			return;
		}
		merged.push({
			oldIdx: alignment[deletionForInsertion.get(idx)].oldIdx,
			newIdx: entry.newIdx
		});
	});
	return merged;
}
|
|
705
|
+
/**
 * Character-level similarity of two rows' text: the length of their
 * shared prefix plus shared (non-overlapping) suffix, divided by the
 * length of the longer text. This catches a single edit somewhere in a
 * long row while rejecting rows with no positional overlap. The compared
 * text comes from rowText(), which strips tags, collapses whitespace and
 * lowercases.
 *
 * @returns {number} 1 for identical text, 0 when either text is empty,
 *   otherwise a ratio between 0 and 1.
 */
function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
	const before = rowText(oldHtml, oldRow);
	const after = rowText(newHtml, newRow);
	if (before === after) return 1;
	if (before.length === 0 || after.length === 0) return 0;
	const shorter = Math.min(before.length, after.length);
	let head = 0;
	while (head < shorter && before[head] === after[head]) head++;
	// The suffix may not reach back into characters already counted as prefix.
	const tailLimit = shorter - head;
	let tail = 0;
	while (tail < tailLimit && before[before.length - 1 - tail] === after[after.length - 1 - tail]) tail++;
	return (head + tail) / Math.max(before.length, after.length);
}
|
|
724
|
+
/**
 * Produces the plain comparison text of a row: each cell's content with
 * tags replaced by a space, cells joined by a space, whitespace runs
 * collapsed, ends trimmed, and everything lowercased.
 *
 * @param {string} html - Document the cell offsets refer to.
 * @param {{cells: Array<{contentStart: number, contentEnd: number}>}} row - Row.
 * @returns {string} Normalized lowercase row text.
 */
function rowText(html, row) {
	const cellTexts = row.cells.map((cell) => html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " "));
	return cellTexts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
}
|
|
729
|
+
/**
 * Standard LCS alignment of two key sequences. Keys are compared with
 * strict `===`.
 *
 * The alignment is recovered by backtracking through the DP table from
 * the end, so entries are discovered last-to-first. They are pushed and
 * the list is reversed once at the end; the earlier `unshift` per entry
 * made the backtrack quadratic in the alignment length.
 *
 * @param {Array} oldKeys - Keys of the old sequence.
 * @param {Array} newKeys - Keys of the new sequence.
 * @returns {Array<{oldIdx: number|null, newIdx: number|null}>} Entries in
 *   sequence order: both indices set for a match, `oldIdx: null` for an
 *   entry only in `newKeys`, `newIdx: null` for an entry only in `oldKeys`.
 */
function lcsAlign(oldKeys, newKeys) {
	const m = oldKeys.length;
	const n = newKeys.length;
	// dp[i][j] = LCS length of oldKeys[0..i) and newKeys[0..j).
	const dp = Array.from({ length: m + 1 }, () => new Array(n + 1).fill(0));
	for (let i = 1; i <= m; i++) {
		for (let j = 1; j <= n; j++) {
			if (oldKeys[i - 1] === newKeys[j - 1]) dp[i][j] = dp[i - 1][j - 1] + 1;
			else dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
		}
	}
	const reversed = [];
	let i = m;
	let j = n;
	while (i > 0 || j > 0) {
		if (i > 0 && j > 0 && oldKeys[i - 1] === newKeys[j - 1]) {
			reversed.push({ oldIdx: i - 1, newIdx: j - 1 });
			i--;
			j--;
		} else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
			// On ties the insertion is taken first while walking backwards,
			// so in the final order a deletion precedes the insertion.
			reversed.push({ oldIdx: null, newIdx: j - 1 });
			j--;
		} else {
			reversed.push({ oldIdx: i - 1, newIdx: null });
			i--;
		}
	}
	return reversed.reverse();
}
|
|
766
|
+
/**
 * Returns the opening tag with the given class token(s) injected.
 *
 * The real `class` attribute is located via findClassAttribute, not a
 * flat regex, which would mis-match inside a foreign attribute value like
 * `title="see class='x'"`. When a class attribute exists, only the tokens
 * it does not already contain are appended, so `class="mod"` plus
 * `mod colspan` never becomes `class="mod mod colspan"`. When it does not
 * exist, ` class='<cls>'` is inserted just before the closing `>` or `/>`.
 *
 * @param {string} openingTag - The opening tag, e.g. `<td id="x">`.
 * @param {string} cls - One or more whitespace-separated class tokens.
 * @returns {string} The updated tag, or `openingTag` unchanged when there
 *   is nothing to add.
 */
function injectClass(openingTag, cls) {
	const wanted = cls.split(/\s+/).filter(Boolean);
	if (wanted.length === 0) return openingTag;
	const classAttr = findClassAttribute(openingTag);
	if (!classAttr) {
		const closerLength = openingTag.endsWith("/>") ? 2 : 1;
		const insertAt = openingTag.length - closerLength;
		const head = openingTag.slice(0, insertAt).replace(/\s*$/, "");
		return `${head} class='${cls}'${openingTag.slice(insertAt)}`;
	}
	const present = classAttr.value.split(/\s+/).filter(Boolean);
	const missing = wanted.filter((token) => !present.includes(token));
	if (missing.length === 0) return openingTag;
	const mergedValue = [...present, ...missing].join(" ");
	return openingTag.slice(0, classAttr.valueStart) + mergedValue + openingTag.slice(classAttr.valueEnd);
}
|
|
793
|
+
/**
|
|
794
|
+
* Walks the opening tag's attributes (respecting quoted values) to find
|
|
795
|
+
* the actual `class` attribute. Returns the value range (start/end of the
|
|
796
|
+
* value content, *excluding* the surrounding quotes) and the value, or
|
|
797
|
+
* null if no `class` attribute is present.
|
|
798
|
+
*/
|
|
799
|
+
function findClassAttribute(openingTag) {
|
|
800
|
+
let i = 1;
|
|
801
|
+
while (i < openingTag.length && /[A-Za-z0-9_:-]/.test(openingTag[i])) i++;
|
|
802
|
+
while (i < openingTag.length) {
|
|
803
|
+
while (i < openingTag.length && /\s/.test(openingTag[i])) i++;
|
|
804
|
+
if (i >= openingTag.length) break;
|
|
805
|
+
if (openingTag[i] === ">" || openingTag[i] === "/") break;
|
|
806
|
+
const nameStart = i;
|
|
807
|
+
while (i < openingTag.length && !/[\s=>/]/.test(openingTag[i])) i++;
|
|
808
|
+
const name = openingTag.slice(nameStart, i);
|
|
809
|
+
while (i < openingTag.length && /\s/.test(openingTag[i])) i++;
|
|
810
|
+
if (openingTag[i] !== "=") continue;
|
|
811
|
+
i++;
|
|
812
|
+
while (i < openingTag.length && /\s/.test(openingTag[i])) i++;
|
|
813
|
+
let valueStart;
|
|
814
|
+
let valueEnd;
|
|
815
|
+
if (openingTag[i] === "\"" || openingTag[i] === "'") {
|
|
816
|
+
const quote = openingTag[i];
|
|
817
|
+
i++;
|
|
818
|
+
valueStart = i;
|
|
819
|
+
while (i < openingTag.length && openingTag[i] !== quote) i++;
|
|
820
|
+
valueEnd = i;
|
|
821
|
+
if (i < openingTag.length) i++;
|
|
822
|
+
} else {
|
|
823
|
+
valueStart = i;
|
|
824
|
+
while (i < openingTag.length && !/[\s>/]/.test(openingTag[i])) i++;
|
|
825
|
+
valueEnd = i;
|
|
826
|
+
}
|
|
827
|
+
if (name.toLowerCase() === "class") return {
|
|
828
|
+
valueStart,
|
|
829
|
+
valueEnd,
|
|
830
|
+
value: openingTag.slice(valueStart, valueEnd)
|
|
831
|
+
};
|
|
832
|
+
}
|
|
833
|
+
return null;
|
|
834
|
+
}
|
|
835
|
+
/**
 * Scans `html` and collects a range record for every top-level
 * `<table>...</table>` block.
 *
 * Nested tables do not produce their own entries: after a table is recorded
 * the scan resumes past its matching closing tag, so inner tables stay part
 * of the enclosing table's range.
 *
 * @param {string} html The markup to scan.
 * @returns {Array<{tableStart: number, tableEnd: number, rows: Array}>}
 *   One record per table: index of the opening `<`, index just past the
 *   matching closing tag, and the rows found by `findTopLevelRows`.
 */
function findTopLevelTables(html) {
	const found = [];
	let pos = 0;
	while (pos < html.length) {
		if (!matchesTagAt(html, pos, "table")) {
			pos++;
			continue;
		}
		const openTag = parseOpeningTagAt(html, pos);
		if (!openTag) {
			// Unterminated opening tag: treat the "<" as plain text.
			pos++;
			continue;
		}
		const bodyStart = openTag.end;
		const tableEnd = findMatchingClosingTag(html, bodyStart, "table");
		if (tableEnd === -1) {
			// No matching close: skip this opening tag and keep scanning.
			pos = bodyStart;
			continue;
		}
		// 8 is the length of "</table>"; rows are searched up to there.
		const rows = findTopLevelRows(html, bodyStart, tableEnd - 8);
		found.push({
			tableStart: pos,
			tableEnd,
			rows
		});
		pos = tableEnd;
	}
	return found;
}
|
|
866
|
+
/**
 * Collects range records for the `<tr>` elements found in `html[start, end)`.
 *
 * After a row is recorded the scan resumes past its matching `</tr>`, so
 * rows nested inside that row are not reported separately. The scan stops
 * early when it reaches a `</table>` closing tag.
 *
 * @param {string} html The markup to scan.
 * @param {number} start Index at which scanning begins.
 * @param {number} end Exclusive upper bound for scanning.
 * @returns {Array<{rowStart: number, rowEnd: number, cells: Array}>}
 */
function findTopLevelRows(html, start, end) {
	const collected = [];
	let pos = start;
	while (pos < end) {
		if (matchesTagAt(html, pos, "tr")) {
			const openTag = parseOpeningTagAt(html, pos);
			if (!openTag) {
				pos++;
				continue;
			}
			const bodyStart = openTag.end;
			const rowEnd = findMatchingClosingTag(html, bodyStart, "tr", end);
			if (rowEnd === -1) {
				// Unclosed row: step over the opening tag and keep looking.
				pos = bodyStart;
				continue;
			}
			// 5 is the length of "</tr>"; cells are searched up to there.
			const cells = findTopLevelCells(html, bodyStart, rowEnd - 5);
			collected.push({
				rowStart: pos,
				rowEnd,
				cells
			});
			pos = rowEnd;
			continue;
		}
		if (matchesClosingTagAt(html, pos, "table")) break;
		pos++;
	}
	return collected;
}
|
|
892
|
+
/**
 * Collects range records for the `<td>` / `<th>` cells found in
 * `html[start, end)`.
 *
 * After a cell is recorded the scan resumes past its matching closing tag,
 * so cells nested inside it are not reported separately. The scan stops
 * early when it reaches a `</tr>` closing tag.
 *
 * @param {string} html The markup to scan.
 * @param {number} start Index at which scanning begins.
 * @param {number} end Exclusive upper bound for scanning.
 * @returns {Array<{cellStart: number, cellEnd: number, contentStart: number, contentEnd: number}>}
 *   `cellStart`/`cellEnd` bound the whole element; `contentStart`/`contentEnd`
 *   bound the text between the opening and closing tags.
 */
function findTopLevelCells(html, start, end) {
	const cells = [];
	let i = start;
	while (i < end) {
		let tagName = null;
		if (matchesTagAt(html, i, "td")) tagName = "td";
		else if (matchesTagAt(html, i, "th")) tagName = "th";
		if (tagName === null) {
			if (matchesClosingTagAt(html, i, "tr")) break;
			i++;
			continue;
		}
		const opening = parseOpeningTagAt(html, i);
		if (!opening) {
			i++;
			continue;
		}
		const contentStart = opening.end;
		const cellEnd = findMatchingClosingTag(html, contentStart, tagName, end);
		if (cellEnd === -1) {
			i = opening.end;
			continue;
		}
		// The closing tag is not always exactly `</td>`: `</td >` (trailing
		// whitespace) is accepted by the closing-tag matcher too. Locate the
		// real start of the closing tag instead of subtracting a fixed
		// length, so the content range never swallows the closing tag's "<".
		const closingStart = html.lastIndexOf("</", cellEnd - 1);
		const contentEnd = closingStart >= contentStart ? closingStart : cellEnd - `</${tagName}>`.length;
		cells.push({
			cellStart: i,
			cellEnd,
			contentStart,
			contentEnd
		});
		i = cellEnd;
	}
	return cells;
}
|
|
920
|
+
/**
 * Tests whether an opening tag named `tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on the `html` side; `tagName` is
 * expected in lower case. The character after the name must terminate it
 * (`>`, `/`, or HTML whitespace: space, tab, LF, FF, CR), so `<tdata>` does
 * not match `td`.
 *
 * @param {string} html The markup being scanned.
 * @param {number} i Index of the candidate `<`.
 * @param {string} tagName Lower-case tag name to look for.
 * @returns {boolean} True when `html` has `<tagName` followed by a terminator at `i`.
 */
function matchesTagAt(html, i, tagName) {
	if (html[i] !== "<") return false;
	const nameEnd = i + 1 + tagName.length;
	if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false;
	const after = html[nameEnd];
	return after === ">" || after === "/" || after === " " || after === "\t" || after === "\n" || after === "\f" || after === "\r";
}
|
|
926
|
+
/**
 * Tests whether a closing tag named `tagName` starts at index `i`.
 *
 * The name comparison is case-insensitive on the `html` side; `tagName` is
 * expected in lower case. The character after the name must be `>` or HTML
 * whitespace (space, tab, LF, FF, CR), so `</tdx>` does not match `td`.
 *
 * @param {string} html The markup being scanned.
 * @param {number} i Index of the candidate `<`.
 * @param {string} tagName Lower-case tag name to look for.
 * @returns {boolean} True when `html` has `</tagName` followed by a terminator at `i`.
 */
function matchesClosingTagAt(html, i, tagName) {
	if (html[i] !== "<" || html[i + 1] !== "/") return false;
	const nameEnd = i + 2 + tagName.length;
	if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName) return false;
	const after = html[nameEnd];
	return after === ">" || after === " " || after === "\t" || after === "\n" || after === "\f" || after === "\r";
}
|
|
932
|
+
/**
 * Finds where the markup construct beginning at index `i` ends.
 *
 * Comments (`<!-- -->`), CDATA sections (`<![CDATA[ ]]>`) and processing
 * instructions (`<? ?>`) end at their own terminator. Anything else is
 * treated as a tag that ends at the first `>` outside a quoted attribute
 * value, so `<td title="a>b">` is consumed as one tag.
 *
 * @param {string} html The markup being scanned.
 * @param {number} i Index of the construct's leading `<`.
 * @returns {{end: number}|null} Index just past the construct, or `null`
 *   when no terminator is found.
 */
function parseOpeningTagAt(html, i) {
	const delimitedForms = [
		["<!--", "-->"],
		["<![CDATA[", "]]>"],
		["<?", "?>"]
	];
	for (const [opener, closer] of delimitedForms) {
		if (!html.startsWith(opener, i)) continue;
		const closerAt = html.indexOf(closer, i + opener.length);
		return closerAt === -1 ? null : { end: closerAt + closer.length };
	}
	// Ordinary tag: look for ">" while ignoring anything inside quotes.
	let activeQuote = null;
	for (let pos = i + 1; pos < html.length; pos++) {
		const ch = html[pos];
		if (activeQuote) {
			if (ch === activeQuote) activeQuote = null;
			continue;
		}
		if (ch === "\"" || ch === "'") {
			activeQuote = ch;
			continue;
		}
		if (ch === ">") return { end: pos + 1 };
	}
	return null;
}
|
|
957
|
+
/**
 * Locates the closing tag that balances an already-opened `tagName` element.
 *
 * Scanning starts at `from` with one element open. Each further opening tag
 * of the same name (unless its text ends in `/>`) adds a level and each
 * closing tag removes one; the search succeeds when the count reaches zero.
 *
 * @param {string} html The markup being scanned.
 * @param {number} from Index just past the element's opening tag.
 * @param {string} tagName Lower-case tag name to balance.
 * @param {number} [limit=html.length] Exclusive upper bound for the scan.
 * @returns {number} Index just past the matching closing tag, or -1 if none
 *   starts before `limit`.
 */
function findMatchingClosingTag(html, from, tagName, limit = html.length) {
	let openCount = 1;
	let pos = from;
	while (pos < limit) {
		if (matchesTagAt(html, pos, tagName)) {
			const nested = parseOpeningTagAt(html, pos);
			if (!nested) {
				pos++;
				continue;
			}
			const selfClosing = html.slice(pos, nested.end).endsWith("/>");
			if (!selfClosing) openCount++;
			pos = nested.end;
			continue;
		}
		if (matchesClosingTagAt(html, pos, tagName)) {
			openCount--;
			// Prefer the parsed end of the closing tag; fall back to the
			// canonical `</tagName>` length when it has no terminating ">".
			const parsedClose = parseOpeningTagAt(html, pos);
			const closingEnd = parsedClose?.end ?? pos + `</${tagName}>`.length;
			if (openCount === 0) return closingEnd;
			pos = closingEnd;
			continue;
		}
		pos++;
	}
	return -1;
}
|
|
980
|
+
//#endregion
|
|
205
981
|
//#region src/WordSplitter.ts
|
|
206
982
|
var WordSplitter = class WordSplitter {
|
|
207
983
|
text;
|
|
@@ -447,9 +1223,20 @@ var HtmlDiff = class HtmlDiff {
|
|
|
447
1223
|
"s",
|
|
448
1224
|
"span"
|
|
449
1225
|
]);
|
|
1226
|
+
/**
|
|
1227
|
+
* Hard cap on nested `HtmlDiff.execute` calls (table preprocessing
|
|
1228
|
+
* recurses through `diffCell` for cell content). Each level allocates
|
|
1229
|
+
* fresh DP matrices and word arrays; without a guard a maliciously
|
|
1230
|
+
* nested table-in-cell-in-table-in-cell input could blow stack and
|
|
1231
|
+
* memory. Set high enough to comfortably handle real legal documents
|
|
1232
|
+
* (tables nested 2-3 deep at most), low enough to short-circuit
|
|
1233
|
+
* pathological input.
|
|
1234
|
+
*/
|
|
1235
|
+
static MaxTablePreprocessDepth = 8;
|
|
450
1236
|
content = [];
|
|
451
1237
|
newText;
|
|
452
1238
|
oldText;
|
|
1239
|
+
tablePreprocessDepth;
|
|
453
1240
|
specialTagDiffStack = [];
|
|
454
1241
|
newWords = [];
|
|
455
1242
|
oldWords = [];
|
|
@@ -462,8 +1249,17 @@ var HtmlDiff = class HtmlDiff {
|
|
|
462
1249
|
/** Maps content-word index → original word index */
|
|
463
1250
|
oldContentToOriginal = null;
|
|
464
1251
|
newContentToOriginal = null;
|
|
465
|
-
/**
|
|
1252
|
+
/**
|
|
1253
|
+
* Tracks the next unwritten word index in oldWords/newWords. Mutated only by
|
|
1254
|
+
* {@link sliceOriginalWordsForOp} (each op reads a slice and advances its cursor).
|
|
1255
|
+
* Advances monotonically. Used so:
|
|
1256
|
+
* - subsequent equal/delete ops know where in old to resume from
|
|
1257
|
+
* - subsequent insert ops know where in new to resume from
|
|
1258
|
+
* The two cursors are independent: equal/delete output from old and advance the old
|
|
1259
|
+
* cursor; insert outputs from new and advances the new cursor.
|
|
1260
|
+
*/
|
|
466
1261
|
lastOriginalOldOutputIndex = 0;
|
|
1262
|
+
lastOriginalNewOutputIndex = 0;
|
|
467
1263
|
matchGranularity = 0;
|
|
468
1264
|
blockExpressions = [];
|
|
469
1265
|
/**
|
|
@@ -503,13 +1299,17 @@ var HtmlDiff = class HtmlDiff {
|
|
|
503
1299
|
* Initializes a new instance of the class.
|
|
504
1300
|
* @param oldText The old text.
|
|
505
1301
|
* @param newText The new text.
|
|
1302
|
+
* @param tablePreprocessDepth Internal: nested-call depth for table
|
|
1303
|
+
* preprocessing. Callers should leave at default (0); the recursive
|
|
1304
|
+
* `diffCell` callback in TableDiff bumps it.
|
|
506
1305
|
*/
|
|
507
|
-
constructor(oldText, newText) {
|
|
1306
|
+
constructor(oldText, newText, tablePreprocessDepth = 0) {
|
|
508
1307
|
this.oldText = oldText;
|
|
509
1308
|
this.newText = newText;
|
|
1309
|
+
this.tablePreprocessDepth = tablePreprocessDepth;
|
|
510
1310
|
}
|
|
511
|
-
static execute(oldText, newText) {
|
|
512
|
-
return new HtmlDiff(oldText, newText).build();
|
|
1311
|
+
static execute(oldText, newText, tablePreprocessDepth = 0) {
|
|
1312
|
+
return new HtmlDiff(oldText, newText, tablePreprocessDepth).build();
|
|
513
1313
|
}
|
|
514
1314
|
/**
|
|
515
1315
|
* Builds the HTML diff output
|
|
@@ -517,6 +1317,22 @@ var HtmlDiff = class HtmlDiff {
|
|
|
517
1317
|
*/
|
|
518
1318
|
build() {
|
|
519
1319
|
if (this.oldText === this.newText) return this.newText;
|
|
1320
|
+
const blockExpressions = this.blockExpressions;
|
|
1321
|
+
const repeatingWordsAccuracy = this.repeatingWordsAccuracy;
|
|
1322
|
+
const orphanMatchThreshold = this.orphanMatchThreshold;
|
|
1323
|
+
const ignoreWhitespaceDifferences = this.ignoreWhitespaceDifferences;
|
|
1324
|
+
const tablePreprocess = this.tablePreprocessDepth >= HtmlDiff.MaxTablePreprocessDepth ? null : preprocessTables(this.oldText, this.newText, (oldCell, newCell) => {
|
|
1325
|
+
const inner = new HtmlDiff(oldCell, newCell, this.tablePreprocessDepth + 1);
|
|
1326
|
+
for (const expr of blockExpressions) inner.addBlockExpression(expr);
|
|
1327
|
+
inner.repeatingWordsAccuracy = repeatingWordsAccuracy;
|
|
1328
|
+
inner.orphanMatchThreshold = orphanMatchThreshold;
|
|
1329
|
+
inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences;
|
|
1330
|
+
return inner.build();
|
|
1331
|
+
});
|
|
1332
|
+
if (tablePreprocess) {
|
|
1333
|
+
this.oldText = tablePreprocess.modifiedOld;
|
|
1334
|
+
this.newText = tablePreprocess.modifiedNew;
|
|
1335
|
+
}
|
|
520
1336
|
this.splitInputsToWords();
|
|
521
1337
|
this.buildContentProjections();
|
|
522
1338
|
const wordsForDiffOld = this.oldContentWords ?? this.oldWords;
|
|
@@ -524,7 +1340,8 @@ var HtmlDiff = class HtmlDiff {
|
|
|
524
1340
|
this.matchGranularity = Math.min(HtmlDiff.MatchGranularityMaximum, Math.min(wordsForDiffOld.length, wordsForDiffNew.length));
|
|
525
1341
|
const operations = this.operations();
|
|
526
1342
|
for (const op of operations) this.performOperation(op);
|
|
527
|
-
|
|
1343
|
+
const result = this.content.join("");
|
|
1344
|
+
return tablePreprocess ? restoreTablePlaceholders(result, tablePreprocess.placeholderToDiff) : result;
|
|
528
1345
|
}
|
|
529
1346
|
/**
|
|
530
1347
|
* Uses {@link expression} to group text together so that any change detected within the group is treated as a single block
|
|
@@ -540,21 +1357,32 @@ var HtmlDiff = class HtmlDiff {
|
|
|
540
1357
|
this.newText = "";
|
|
541
1358
|
}
|
|
542
1359
|
/**
|
|
543
|
-
*
|
|
544
|
-
*
|
|
545
|
-
*
|
|
1360
|
+
* Builds "content projections" — word arrays with structural wrapper tags stripped — when
|
|
1361
|
+
* structural normalization is appropriate for these inputs. The diff algorithm operates on
|
|
1362
|
+
* the projections so wrapper-tag differences (e.g. `<p>` vs `<div>`) don't appear as content
|
|
1363
|
+
* changes; structural tags are then folded back in at output time.
|
|
546
1364
|
*/
|
|
547
1365
|
buildContentProjections() {
|
|
548
1366
|
if (!HtmlDiff.hasStructuralDifferences(this.oldWords, this.newWords)) return;
|
|
549
1367
|
const oldProjection = HtmlDiff.createContentProjection(this.oldWords);
|
|
550
1368
|
const newProjection = HtmlDiff.createContentProjection(this.newWords);
|
|
551
|
-
if (
|
|
1369
|
+
if (!HtmlDiff.shouldUseContentProjections(this.oldWords, this.newWords, oldProjection, newProjection)) return;
|
|
552
1370
|
this.oldContentWords = oldProjection.contentWords;
|
|
553
1371
|
this.oldContentToOriginal = oldProjection.contentToOriginal;
|
|
554
1372
|
this.newContentWords = newProjection.contentWords;
|
|
555
1373
|
this.newContentToOriginal = newProjection.contentToOriginal;
|
|
556
1374
|
}
|
|
557
1375
|
/**
|
|
1376
|
+
* Decides whether structural normalization should be activated for this pair of inputs.
|
|
1377
|
+
* Each clause is a distinct correctness or fitness check — extend by adding a named
|
|
1378
|
+
* sub-predicate rather than chaining ad-hoc conditions.
|
|
1379
|
+
*/
|
|
1380
|
+
static shouldUseContentProjections(oldWords, newWords, oldProjection, newProjection) {
|
|
1381
|
+
if (oldProjection.contentWords.length === 0 || newProjection.contentWords.length === 0) return false;
|
|
1382
|
+
if (oldProjection.contentWords.length < oldWords.length !== newProjection.contentWords.length < newWords.length) return false;
|
|
1383
|
+
return true;
|
|
1384
|
+
}
|
|
1385
|
+
/**
|
|
558
1386
|
* Tags that commonly serve as content wrappers and may change structurally
|
|
559
1387
|
* without affecting the actual content. Only these tags are stripped during
|
|
560
1388
|
* structural normalization.
|
|
@@ -575,6 +1403,10 @@ var HtmlDiff = class HtmlDiff {
|
|
|
575
1403
|
const tagName = Utils_default.getTagName(word);
|
|
576
1404
|
return HtmlDiff.WrapperTags.has(tagName);
|
|
577
1405
|
}
|
|
1406
|
+
/** True when the word is a structural opening tag (e.g. `<p>`, `<div>`). */
|
|
1407
|
+
static isOpeningStructuralTag(word) {
|
|
1408
|
+
return HtmlDiff.isStructuralTag(word) && !word.startsWith("</");
|
|
1409
|
+
}
|
|
578
1410
|
/**
|
|
579
1411
|
* Returns true if words between structural tags are just whitespace (indentation).
|
|
580
1412
|
*/
|
|
@@ -629,58 +1461,61 @@ var HtmlDiff = class HtmlDiff {
|
|
|
629
1461
|
this.processInsertOperation(operation, "diffmod");
|
|
630
1462
|
}
|
|
631
1463
|
processInsertOperation(operation, cssClass) {
|
|
632
|
-
const words = this.
|
|
1464
|
+
const words = this.usingContentProjections() ? this.sliceOriginalWordsForOp("new", operation.startInNew, operation.endInNew) : this.newWords.slice(operation.startInNew, operation.endInNew);
|
|
633
1465
|
this.insertTag(HtmlDiff.InsTag, cssClass, words);
|
|
634
1466
|
}
|
|
635
1467
|
processDeleteOperation(operation, cssClass) {
|
|
636
|
-
const words = this.
|
|
1468
|
+
const words = this.usingContentProjections() ? this.sliceOriginalWordsForOp("old", operation.startInOld, operation.endInOld) : this.oldWords.slice(operation.startInOld, operation.endInOld);
|
|
637
1469
|
this.insertTag(HtmlDiff.DelTag, cssClass, words);
|
|
638
|
-
if (this.oldContentToOriginal && operation.endInOld > 0) {
|
|
639
|
-
const lastDeletedOrigIdx = this.oldContentToOriginal[operation.endInOld - 1];
|
|
640
|
-
this.lastOriginalOldOutputIndex = Math.max(this.lastOriginalOldOutputIndex, lastDeletedOrigIdx + 1);
|
|
641
|
-
}
|
|
642
1470
|
}
|
|
643
1471
|
processEqualOperation(operation) {
|
|
644
|
-
if (this.
|
|
645
|
-
const result = this.
|
|
1472
|
+
if (this.usingContentProjections()) {
|
|
1473
|
+
const result = this.sliceOriginalWordsForOp("old", operation.startInOld, operation.endInOld);
|
|
646
1474
|
this.content.push(result.join(""));
|
|
1475
|
+
this.sliceOriginalWordsForOp("new", operation.startInNew, operation.endInNew);
|
|
647
1476
|
} else {
|
|
648
1477
|
const result = this.newWords.slice(operation.startInNew, operation.endInNew);
|
|
649
1478
|
this.content.push(result.join(""));
|
|
650
1479
|
}
|
|
651
1480
|
}
|
|
652
|
-
/**
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
*/
|
|
656
|
-
getOriginalOldWords(contentStart, contentEnd) {
|
|
657
|
-
if (!this.oldContentToOriginal) return this.oldWords.slice(contentStart, contentEnd);
|
|
658
|
-
const result = [];
|
|
659
|
-
for (let i = contentStart; i < contentEnd; i++) result.push(this.oldWords[this.oldContentToOriginal[i]]);
|
|
660
|
-
return result;
|
|
661
|
-
}
|
|
662
|
-
/**
|
|
663
|
-
* Gets original new words for a content-index range, including only content and formatting tags
|
|
664
|
-
* (used for insert/replace operations where we don't want structural tags).
|
|
665
|
-
*/
|
|
666
|
-
getOriginalNewWords(contentStart, contentEnd) {
|
|
667
|
-
if (!this.newContentToOriginal) return this.newWords.slice(contentStart, contentEnd);
|
|
668
|
-
const result = [];
|
|
669
|
-
for (let i = contentStart; i < contentEnd; i++) result.push(this.newWords[this.newContentToOriginal[i]]);
|
|
670
|
-
return result;
|
|
1481
|
+
/** True when content projections are active for both sides — i.e. structural normalization is in effect. */
|
|
1482
|
+
usingContentProjections() {
|
|
1483
|
+
return this.oldContentToOriginal !== null && this.newContentToOriginal !== null;
|
|
671
1484
|
}
|
|
672
1485
|
/**
|
|
673
|
-
*
|
|
674
|
-
*
|
|
1486
|
+
* Returns the slice of original (old or new) words covering a content-index range,
|
|
1487
|
+
* including the structural tags that surround the content. Advances the side's cursor
|
|
1488
|
+
* past the slice so the next op resumes correctly.
|
|
1489
|
+
*
|
|
1490
|
+
* The slice extends:
|
|
1491
|
+
* - LEADING: from the side's cursor (or the first content word's original index,
|
|
1492
|
+
* whichever is smaller) so structural tags that precede the first content word
|
|
1493
|
+
* are picked up by this op rather than left orphaned.
|
|
1494
|
+
* - TRAILING (non-last range): from just after the last content word, including
|
|
1495
|
+
* closing structural tags that close *this* op's paragraphs, but stopping at
|
|
1496
|
+
* the first opening structural tag — that opening tag belongs to the next
|
|
1497
|
+
* op's paragraph and would otherwise be emitted twice.
|
|
1498
|
+
* - TRAILING (last range): all the way to the end of words, since there is no next
|
|
1499
|
+
* op to claim the trailing tags.
|
|
675
1500
|
*/
|
|
676
|
-
|
|
677
|
-
|
|
1501
|
+
sliceOriginalWordsForOp(side, contentStart, contentEnd) {
|
|
1502
|
+
const words = side === "old" ? this.oldWords : this.newWords;
|
|
1503
|
+
const contentToOriginal = side === "old" ? this.oldContentToOriginal : this.newContentToOriginal;
|
|
1504
|
+
if (!contentToOriginal) return words.slice(contentStart, contentEnd);
|
|
678
1505
|
if (contentStart >= contentEnd) return [];
|
|
679
|
-
const firstContentOrigIdx =
|
|
680
|
-
const
|
|
681
|
-
const
|
|
682
|
-
|
|
683
|
-
|
|
1506
|
+
const firstContentOrigIdx = contentToOriginal[contentStart];
|
|
1507
|
+
const lastContentOrigIdx = contentToOriginal[contentEnd - 1];
|
|
1508
|
+
const cursor = side === "old" ? this.lastOriginalOldOutputIndex : this.lastOriginalNewOutputIndex;
|
|
1509
|
+
const origStart = Math.min(cursor, firstContentOrigIdx);
|
|
1510
|
+
let origEnd;
|
|
1511
|
+
if (contentEnd < contentToOriginal.length) {
|
|
1512
|
+
const limit = contentToOriginal[contentEnd];
|
|
1513
|
+
origEnd = lastContentOrigIdx + 1;
|
|
1514
|
+
while (origEnd < limit && !HtmlDiff.isOpeningStructuralTag(words[origEnd])) origEnd++;
|
|
1515
|
+
} else origEnd = words.length;
|
|
1516
|
+
if (side === "old") this.lastOriginalOldOutputIndex = origEnd;
|
|
1517
|
+
else this.lastOriginalNewOutputIndex = origEnd;
|
|
1518
|
+
return words.slice(origStart, origEnd);
|
|
684
1519
|
}
|
|
685
1520
|
/**
|
|
686
1521
|
* This method encloses words within a specified tag (ins or del), and adds this into "content",
|
|
@@ -733,7 +1568,7 @@ var HtmlDiff = class HtmlDiff {
|
|
|
733
1568
|
if (words.slice(0, indexLastTagInFirstTagBlock + 1).some((w) => !HtmlDiff.SpecialCaseClosingTagsSet.has(w.toLowerCase()))) tagIndexToCompare = 0;
|
|
734
1569
|
}
|
|
735
1570
|
const openingAndClosingTagsMatch = !!openingTag && Utils_default.getTagName(openingTag) === Utils_default.getTagName(words[tagIndexToCompare]);
|
|
736
|
-
if (
|
|
1571
|
+
if (openingTag && openingAndClosingTagsMatch) {
|
|
737
1572
|
specialCaseTagInjection = "</ins>";
|
|
738
1573
|
specialCaseTagInjectionIsBefore = true;
|
|
739
1574
|
} else if (openingTag) this.specialTagDiffStack.push(openingTag);
|