@createiq/htmldiff 1.0.5-beta.2 → 1.0.5-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +204 -26
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.mjs +204 -26
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/TableDiff.ts +280 -45
- package/test/HtmlDiff.tables.matrix.spec.ts +367 -0
- package/test/HtmlDiff.tables.spec.ts +80 -0
package/dist/HtmlDiff.mjs
CHANGED
|
@@ -411,7 +411,7 @@ function diffPositionalTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
|
|
|
411
411
|
* sides.
|
|
412
412
|
*/
|
|
413
413
|
function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
|
|
414
|
-
const alignment = pairSimilarUnmatchedRows(lcsAlign(oldTable.rows.map((row) => rowKey(oldHtml, row)), newTable.rows.map((row) => rowKey(newHtml, row))), oldTable, newTable, oldHtml, newHtml);
|
|
414
|
+
const alignment = orderAlignmentForEmission(pairSimilarUnmatchedRows(lcsAlign(oldTable.rows.map((row) => rowKey(oldHtml, row)), newTable.rows.map((row) => rowKey(newHtml, row))), oldTable, newTable, oldHtml, newHtml));
|
|
415
415
|
if (newTable.rows.length === 0) return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell);
|
|
416
416
|
const out = [];
|
|
417
417
|
out.push(newHtml.slice(newTable.tableStart, newTable.rows[0].rowStart));
|
|
@@ -426,6 +426,72 @@ function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diff
|
|
|
426
426
|
out.push(newHtml.slice(cursor, newTable.tableEnd));
|
|
427
427
|
return out.join("");
|
|
428
428
|
}
|
|
429
|
+
/**
 * Reorders alignment entries so that emitting them one after another
 * produces table rows in reading order.
 *
 * Every entry receives a sort position relative to new's row flow:
 *
 *   - Entry with a newIdx (paired row or pure insert): position = newIdx.
 *   - Pure delete (newIdx is null): position = N + 0.5, where N is the
 *     newIdx of the paired row with the greatest oldIdx below the
 *     delete's oldIdx, or -1 when no paired row precedes it in old.
 *
 * Consequences of that scheme:
 *
 *   - Deletes with no paired predecessor sort first (position -0.5).
 *   - Deletes that share a gap are ordered by oldIdx, i.e. old's order.
 *   - A delete sorts before an insert at the same gap, because the
 *     insert's integer newIdx is greater than N + 0.5.
 *
 * Remaining ties fall back to the entry's index in the input, so the
 * result does not depend on the engine's sort stability.
 *
 * @param {Array<{oldIdx: (number|null), newIdx: (number|null)}>} alignment
 *   Alignment entries; `null` on one side marks a pure insert or delete.
 * @returns {Array<{oldIdx: (number|null), newIdx: (number|null)}>}
 *   A new array holding the same entry objects in emission order. The
 *   input array is not modified.
 */
function orderAlignmentForEmission(alignment) {
	// Paired rows ordered by their position in old; `filter` returns a
	// fresh array, so sorting it leaves `alignment` untouched.
	const preserved = alignment
		.filter((a) => a.oldIdx !== null && a.newIdx !== null)
		.sort((a, b) => a.oldIdx - b.oldIdx);

	// Binary search for the last paired row whose oldIdx is strictly
	// below `oldIdx`; returns its newIdx, or -1 when there is none.
	function newIdxOfPreservedBefore(oldIdx) {
		let lo = 0;
		let hi = preserved.length;
		while (lo < hi) {
			const mid = (lo + hi) >>> 1;
			if (preserved[mid].oldIdx < oldIdx) lo = mid + 1;
			else hi = mid;
		}
		return lo === 0 ? -1 : preserved[lo - 1].newIdx;
	}

	const decorated = alignment.map((a, i) => {
		let primary;
		let secondary;
		if (a.newIdx !== null) {
			primary = a.newIdx;
			// At an equal newIdx a paired row sorts before a pure insert.
			secondary = a.oldIdx === null ? 1 : 0;
		} else {
			primary = newIdxOfPreservedBefore(a.oldIdx) + .5;
			secondary = a.oldIdx;
		}
		return {
			entry: a,
			primary,
			secondary,
			originalIdx: i
		};
	});

	decorated.sort((a, b) => {
		if (a.primary !== b.primary) return a.primary - b.primary;
		if (a.secondary !== b.secondary) return a.secondary - b.secondary;
		return a.originalIdx - b.originalIdx;
	});
	return decorated.map((d) => d.entry);
}
|
|
429
495
|
function rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell) {
|
|
430
496
|
const out = [];
|
|
431
497
|
out.push(headerSlice(newHtml, newTable, oldHtml, oldTable));
|
|
@@ -447,8 +513,108 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
|
447
513
|
if (oldRow.cells.length === newRow.cells.length) return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
448
514
|
const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
449
515
|
if (colspanAligned !== null) return colspanAligned;
|
|
516
|
+
const delta = newRow.cells.length - oldRow.cells.length;
|
|
517
|
+
const absDelta = Math.abs(delta);
|
|
518
|
+
if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
|
|
519
|
+
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell);
|
|
520
|
+
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell);
|
|
521
|
+
}
|
|
450
522
|
return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
451
523
|
}
|
|
524
|
+
// Largest difference in cell count between the old and new row for which diffPreservedRow attempts the multi-column add/delete search.
const MAX_COLUMN_DELTA = 6;
|
|
525
|
+
// Widest row (the larger of the old and new cell counts) for which diffPreservedRow attempts the multi-column add/delete search; wider rows fall through to diffStructurallyAlignedRow.
const MAX_COLUMN_SEARCH_WIDTH = 40;
|
|
526
|
+
/**
 * Renders a row in which new has `k` more cells than old.
 *
 * findBestColumnInsertPositions chooses which `k` columns of new are
 * treated as inserted. Those cells are emitted whole with an "ins"
 * marker; every other new cell is paired, left to right, with the next
 * unused old cell and emitted as a content diff.
 *
 * @param oldHtml  Source HTML that `oldRow` offsets refer to.
 * @param newHtml  Source HTML that `newRow` offsets refer to.
 * @param oldRow   Row from the old table (reads `cells`).
 * @param newRow   Row from the new table (reads `cells`).
 * @param k        Number of extra cells in `newRow`.
 * @param diffCell Cell-diff callback, forwarded to the emit helpers.
 * @returns {string} The row's HTML, closed with `</tr>`.
 */
function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
	const addedColumns = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml));
	const pieces = [rowHeaderSlice(newHtml, newRow)];
	// Index of the next old cell that has not been paired yet.
	let nextOld = 0;
	for (let column = 0; column < newRow.cells.length; column += 1) {
		const newCell = newRow.cells[column];
		if (addedColumns.has(column)) {
			pieces.push(emitFullCell(newHtml, newCell, "ins", diffCell));
			continue;
		}
		pieces.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[nextOld], newCell, diffCell));
		nextOld += 1;
	}
	return `${pieces.join("")}</tr>`;
}
|
|
547
|
+
/**
 * Renders a row in which old has `k` more cells than new.
 *
 * findBestColumnDeletePositions chooses which `k` columns of old are
 * treated as deleted. Those cells are emitted whole with a "del"
 * marker; every other old cell is paired, left to right, with the next
 * unused new cell and emitted as a content diff. The row's opening
 * markup is taken from new.
 *
 * @returns {string} The row's HTML, closed with `</tr>`.
 */
function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
	const removedColumns = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml));
	const pieces = [rowHeaderSlice(newHtml, newRow)];
	// Index of the next new cell that has not been paired yet.
	let nextNew = 0;
	for (let column = 0; column < oldRow.cells.length; column += 1) {
		const oldCell = oldRow.cells[column];
		if (removedColumns.has(column)) pieces.push(emitFullCell(oldHtml, oldCell, "del", diffCell));
		else {
			pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newRow.cells[nextNew], diffCell));
			nextNew += 1;
		}
	}
	return `${pieces.join("")}</tr>`;
}
|
|
563
|
+
/**
 * Chooses which `k` column positions of `newRow` to treat as inserted.
 *
 * Every combination of `k` positions from combinationsOfRange is
 * scored by pairing the remaining new cells, left to right, with the
 * old cells and summing cellSimilarity over the pairs. The first
 * combination that reaches the highest total wins.
 *
 * The similarity of an (old cell, new cell) pair does not depend on
 * the combination being scored, so it is computed once per distinct
 * pair and cached instead of once per combination.
 *
 * @param oldRow  Row from the old table (reads `cells`).
 * @param newRow  Row from the new table (reads `cells`).
 * @param k       Number of positions to select.
 * @param oldHtml Source HTML that `oldRow` offsets refer to.
 * @param newHtml Source HTML that `newRow` offsets refer to.
 * @returns {number[]} Selected positions in `newRow.cells`; empty when
 *   combinationsOfRange yields no combination.
 */
function findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml) {
	const newCount = newRow.cells.length;

	// newIdx is always < newCount, so oldIdx * newCount + newIdx is unique per pair.
	const pairScores = new Map();
	const pairScore = (oldIdx, newIdx) => {
		const key = oldIdx * newCount + newIdx;
		let value = pairScores.get(key);
		if (value === undefined) {
			value = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
			pairScores.set(key, value);
		}
		return value;
	};

	let bestPositions = [];
	let bestScore = -1;
	for (const combo of combinationsOfRange(newCount, k)) {
		const inserted = new Set(combo);
		let score = 0;
		let oldIdx = 0;
		for (let newIdx = 0; newIdx < newCount; newIdx++) {
			if (inserted.has(newIdx)) continue;
			score += pairScore(oldIdx, newIdx);
			oldIdx++;
		}
		// Strict comparison keeps the earliest combination on ties.
		if (score > bestScore) {
			bestScore = score;
			bestPositions = combo;
		}
	}
	return bestPositions;
}
|
|
582
|
+
/**
 * Chooses which `k` column positions of `oldRow` to treat as deleted.
 *
 * Every combination of `k` positions from combinationsOfRange is
 * scored by pairing the remaining old cells, left to right, with the
 * new cells and summing cellSimilarity over the pairs. The first
 * combination that reaches the highest total wins.
 *
 * The similarity of an (old cell, new cell) pair does not depend on
 * the combination being scored, so it is computed once per distinct
 * pair and cached instead of once per combination.
 *
 * @param oldRow  Row from the old table (reads `cells`).
 * @param newRow  Row from the new table (reads `cells`).
 * @param k       Number of positions to select.
 * @param oldHtml Source HTML that `oldRow` offsets refer to.
 * @param newHtml Source HTML that `newRow` offsets refer to.
 * @returns {number[]} Selected positions in `oldRow.cells`; empty when
 *   combinationsOfRange yields no combination.
 */
function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
	const oldCount = oldRow.cells.length;

	// newIdx never exceeds the number of kept old cells, so it is < oldCount
	// and oldIdx * oldCount + newIdx is unique per pair.
	const pairScores = new Map();
	const pairScore = (oldIdx, newIdx) => {
		const key = oldIdx * oldCount + newIdx;
		let value = pairScores.get(key);
		if (value === undefined) {
			value = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
			pairScores.set(key, value);
		}
		return value;
	};

	let bestPositions = [];
	let bestScore = -1;
	for (const combo of combinationsOfRange(oldCount, k)) {
		const deleted = new Set(combo);
		let score = 0;
		let newIdx = 0;
		for (let oldIdx = 0; oldIdx < oldCount; oldIdx++) {
			if (deleted.has(oldIdx)) continue;
			score += pairScore(oldIdx, newIdx);
			newIdx++;
		}
		// Strict comparison keeps the earliest combination on ties.
		if (score > bestScore) {
			bestScore = score;
			bestPositions = combo;
		}
	}
	return bestPositions;
}
|
|
601
|
+
/**
 * Yields every ascending combination of `k` distinct integers drawn
 * from [0, n), in lexicographic order. Each yielded array is a fresh
 * copy, so callers may keep or mutate it.
 *
 * Nothing is yielded unless `n` and `k` are integers with
 * 1 <= k <= n. In particular `k === 0` yields nothing rather than a
 * single empty combination.
 *
 * The implementation is iterative and keeps only the current index
 * tuple between yields.
 *
 * @param {number} n Exclusive upper bound of the range.
 * @param {number} k Size of each combination.
 * @yields {number[]} The next combination.
 */
function* combinationsOfRange(n, k) {
	// A negative k would yield one bogus empty tuple, and a fractional k
	// would never reach the termination test below.
	if (!Number.isInteger(n) || !Number.isInteger(k) || k <= 0 || k > n) return;
	const indices = Array.from({ length: k }, (_, i) => i);
	while (true) {
		yield indices.slice();
		// Find the rightmost index that can still be advanced.
		let i = k - 1;
		while (i >= 0 && indices[i] === n - k + i) i--;
		if (i < 0) return;
		indices[i]++;
		// Reset everything to its right to the smallest ascending run.
		for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1;
	}
}
|
|
452
618
|
/**
|
|
453
619
|
* Try to align cells by logical column position (sum of colspans). When
|
|
454
620
|
* one side has a colspan'd cell that absorbs multiple cells on the other
|
|
@@ -730,17 +896,38 @@ function pairSimilarUnmatched(alignment, threshold, similarity) {
|
|
|
730
896
|
return result;
|
|
731
897
|
}
|
|
732
898
|
/**
|
|
733
|
-
*
|
|
734
|
-
*
|
|
735
|
-
*
|
|
736
|
-
*
|
|
737
|
-
*
|
|
899
|
+
* Combined similarity metric used for both row-level and cell-level
|
|
900
|
+
* fuzzy pairing. Returns the MAX of two complementary metrics:
|
|
901
|
+
*
|
|
902
|
+
* 1. **Character prefix+suffix similarity** — fraction of the longer
|
|
903
|
+
* string covered by shared prefix + shared suffix. Catches small
|
|
904
|
+
* edits in the middle of a string (one word changed in a row).
|
|
905
|
+
* Misses cases where the bulk of common content is in the middle
|
|
906
|
+
* and the ends differ.
|
|
907
|
+
*
|
|
908
|
+
* 2. **Token Jaccard similarity** — intersection-over-union of the
|
|
909
|
+
* whitespace-split tokens. Catches "most of the content is the
|
|
910
|
+
* same but bookended by different bits" — e.g. a row whose only
|
|
911
|
+
* edit is a column added at the start and another at the end,
|
|
912
|
+
* where the ~50 chars in the middle that DO match would be
|
|
913
|
+
* invisible to prefix+suffix.
|
|
914
|
+
*
|
|
915
|
+
* Either metric exceeding the threshold means pair. Neither alone is
|
|
916
|
+
* sufficient for the full range of legal-doc edits we see in
|
|
917
|
+
* production tables.
|
|
738
918
|
*/
|
|
739
919
|
function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
	// Rows are compared on the text rowText extracts for each side; the
	// score itself comes from textSimilarity.
	const oldText = rowText(oldHtml, oldRow);
	const newText = rowText(newHtml, newRow);
	return textSimilarity(oldText, newText);
}
|
|
922
|
+
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
	// Cells are compared on the text cellText extracts for each side; the
	// score itself comes from textSimilarity.
	const oldText = cellText(oldHtml, oldCell);
	const newText = cellText(newHtml, newCell);
	return textSimilarity(oldText, newText);
}
|
|
925
|
+
/**
 * Scores how alike two strings are.
 *
 * Identical strings score 1. If exactly one side is empty the score is
 * 0. Otherwise the result is the larger of
 * charPrefixSuffixSimilarity and tokenJaccardSimilarity.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function textSimilarity(a, b) {
	if (a === b) return 1;
	const bothNonEmpty = a.length !== 0 && b.length !== 0;
	if (!bothNonEmpty) return 0;
	const byCharacters = charPrefixSuffixSimilarity(a, b);
	const byTokens = tokenJaccardSimilarity(a, b);
	return Math.max(byCharacters, byTokens);
}
|
|
930
|
+
function charPrefixSuffixSimilarity(a, b) {
|
|
744
931
|
let prefix = 0;
|
|
745
932
|
const minLen = Math.min(a.length, b.length);
|
|
746
933
|
while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
|
|
@@ -748,29 +935,20 @@ function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
|
|
|
748
935
|
while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
|
|
749
936
|
return (prefix + suffix) / Math.max(a.length, b.length);
|
|
750
937
|
}
|
|
938
|
+
/**
 * Jaccard similarity (intersection over union) of the distinct
 * whitespace-separated tokens of two strings.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} A value in [0, 1]; 1 when neither string contains
 *   a token.
 */
function tokenJaccardSimilarity(a, b) {
	const tokensA = new Set(a.split(/\s+/).filter(Boolean));
	const tokensB = new Set(b.split(/\s+/).filter(Boolean));
	if (tokensA.size === 0 && tokensB.size === 0) return 1;
	// Walk the smaller set and probe the larger one.
	const [smaller, larger] = tokensA.size <= tokensB.size ? [tokensA, tokensB] : [tokensB, tokensA];
	let intersection = 0;
	for (const token of smaller) if (larger.has(token)) intersection++;
	// At least one set is non-empty here, so the union is never 0.
	return intersection / (tokensA.size + tokensB.size - intersection);
}
|
|
751
947
|
/**
 * Extracts a row's comparable text: each cell's content with tags
 * replaced by a space, cells joined by a space, whitespace runs
 * collapsed to one space, trimmed and lower-cased.
 *
 * @param {string} html Source HTML that the cell offsets index into.
 * @param row Row whose `cells` carry `contentStart` / `contentEnd`.
 * @returns {string}
 */
function rowText(html, row) {
	const cellTexts = Array.from(row.cells, (cell) => {
		const content = html.slice(cell.contentStart, cell.contentEnd);
		return content.replace(/<[^>]+>/g, " ");
	});
	const joined = cellTexts.join(" ");
	const collapsed = joined.replace(/\s+/g, " ");
	return collapsed.trim().toLowerCase();
}
|
|
756
|
-
/**
|
|
757
|
-
* Character-level prefix+suffix similarity for a single cell's text
|
|
758
|
-
* content. Same metric as rowSimilarity, scoped to one cell so we can
|
|
759
|
-
* fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
|
|
760
|
-
* a column add in the same row).
|
|
761
|
-
*/
|
|
762
|
-
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
|
|
763
|
-
const a = cellText(oldHtml, oldCell);
|
|
764
|
-
const b = cellText(newHtml, newCell);
|
|
765
|
-
if (a === b) return 1;
|
|
766
|
-
if (a.length === 0 || b.length === 0) return 0;
|
|
767
|
-
let prefix = 0;
|
|
768
|
-
const minLen = Math.min(a.length, b.length);
|
|
769
|
-
while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
|
|
770
|
-
let suffix = 0;
|
|
771
|
-
while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
|
|
772
|
-
return (prefix + suffix) / Math.max(a.length, b.length);
|
|
773
|
-
}
|
|
774
952
|
/**
 * Extracts a cell's comparable text: its content with tags replaced by
 * a space, whitespace runs collapsed to one space, trimmed and
 * lower-cased.
 *
 * @param {string} html Source HTML that the cell offsets index into.
 * @param cell Cell carrying `contentStart` / `contentEnd`.
 * @returns {string}
 */
function cellText(html, cell) {
	const content = html.slice(cell.contentStart, cell.contentEnd);
	const withoutTags = content.replace(/<[^>]+>/g, " ");
	const collapsed = withoutTags.replace(/\s+/g, " ");
	return collapsed.trim().toLowerCase();
}
|