@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +171 -11
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.mjs +171 -11
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/TableDiff.ts +258 -26
- package/test/HtmlDiff.tables.matrix.spec.ts +327 -0
- package/test/HtmlDiff.tables.spec.ts +39 -0
package/dist/HtmlDiff.cjs
CHANGED
|
@@ -447,8 +447,108 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
|
447
447
|
if (oldRow.cells.length === newRow.cells.length) return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
448
448
|
const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
449
449
|
if (colspanAligned !== null) return colspanAligned;
|
|
450
|
+
const delta = newRow.cells.length - oldRow.cells.length;
|
|
451
|
+
const absDelta = Math.abs(delta);
|
|
452
|
+
if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
|
|
453
|
+
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell);
|
|
454
|
+
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell);
|
|
455
|
+
}
|
|
450
456
|
return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
451
457
|
}
|
|
458
|
+
/**
 * Largest absolute difference in cell count between the old and new row
 * for which the multi-column add/delete search is attempted; rows whose
 * counts differ by more fall through to structural alignment.
 * NOTE(review): presumably chosen to bound the C(n, k) combination scan — confirm.
 */
const MAX_COLUMN_DELTA = 6;
|
|
459
|
+
/**
 * Widest row (the larger of the old and new cell counts) for which the
 * multi-column add/delete search is attempted.
 * NOTE(review): presumably chosen to bound the C(n, k) combination scan — confirm.
 */
const MAX_COLUMN_SEARCH_WIDTH = 40;
|
|
460
|
+
/**
 * Diff a row in which the new side has `k` more cells than the old side.
 *
 * The `k` new-row positions judged to be inserted columns come from
 * findBestColumnInsertPositions. Those cells are emitted whole with an
 * "ins" marker; every other new cell is content-diffed against the next
 * not-yet-consumed old cell, walking both rows left to right.
 *
 * @param oldHtml  HTML the old row's offsets refer to.
 * @param newHtml  HTML the new row's offsets refer to.
 * @param oldRow   Old row; its `cells` are consumed in order.
 * @param newRow   New row; supplies the row header and the cell order.
 * @param k        Number of cells to treat as inserted.
 * @param diffCell Passed through unchanged to the cell emitters.
 * @returns The row's opening slice, the emitted cells and a closing `</tr>`, concatenated.
 */
function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
  const addedColumns = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml));
  const pieces = [rowHeaderSlice(newHtml, newRow)];
  // Index of the next old cell that has not been paired with a new cell yet.
  let nextOld = 0;
  for (let column = 0; column < newRow.cells.length; column += 1) {
    const newCell = newRow.cells[column];
    if (!addedColumns.has(column)) {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[nextOld], newCell, diffCell));
      nextOld += 1;
      continue;
    }
    pieces.push(emitFullCell(newHtml, newCell, "ins", diffCell));
  }
  pieces.push("</tr>");
  return pieces.join("");
}
|
|
481
|
+
/**
 * Diff a row in which the old side has `k` more cells than the new side.
 *
 * The `k` old-row positions judged to be removed columns come from
 * findBestColumnDeletePositions. Those cells are emitted whole with a
 * "del" marker; every other old cell is content-diffed against the next
 * not-yet-consumed new cell, walking both rows left to right.
 *
 * @param oldHtml  HTML the old row's offsets refer to.
 * @param newHtml  HTML the new row's offsets refer to.
 * @param oldRow   Old row; drives the iteration order.
 * @param newRow   New row; supplies the row header and the surviving cells.
 * @param k        Number of cells to treat as deleted.
 * @param diffCell Passed through unchanged to the cell emitters.
 * @returns The row's opening slice, the emitted cells and a closing `</tr>`, concatenated.
 */
function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
  const removedColumns = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml));
  const pieces = [rowHeaderSlice(newHtml, newRow)];
  // Index of the next new cell that has not been paired with an old cell yet.
  let nextNew = 0;
  let column = 0;
  while (column < oldRow.cells.length) {
    const oldCell = oldRow.cells[column];
    if (removedColumns.has(column)) {
      pieces.push(emitFullCell(oldHtml, oldCell, "del", diffCell));
    } else {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newRow.cells[nextNew], diffCell));
      nextNew += 1;
    }
    column += 1;
  }
  pieces.push("</tr>");
  return pieces.join("");
}
|
|
497
|
+
/**
 * Pick the `k` positions in `newRow` that are best explained as inserted
 * columns.
 *
 * Every combination produced by combinationsOfRange(newRow.cells.length, k)
 * is scored by pairing the remaining new cells, left to right, with the old
 * cells and summing cellSimilarity over the pairs. The first combination
 * that reaches the highest total wins.
 *
 * The same (old cell, new cell) pair recurs in many combinations, so each
 * pair's similarity is computed once and cached; totals and tie-breaking
 * are unchanged.
 *
 * @param oldRow  Old row (the side with fewer cells).
 * @param newRow  New row (the side with `k` extra cells).
 * @param k       Number of positions to select.
 * @param oldHtml HTML the old row's offsets refer to.
 * @param newHtml HTML the new row's offsets refer to.
 * @returns The winning combination of new-row indices, or `[]` when no
 *          combination was produced.
 */
function findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml) {
  const oldCells = oldRow.cells;
  const newCells = newRow.cells;
  // Flattened (oldIdx, newIdx) -> similarity cache; the stride exceeds any index used below.
  const stride = Math.max(oldCells.length, newCells.length) + 1;
  const cache = new Map();
  const pairSimilarity = (oldIdx, newIdx) => {
    const key = oldIdx * stride + newIdx;
    if (!cache.has(key)) {
      cache.set(key, cellSimilarity(oldCells[oldIdx], newCells[newIdx], oldHtml, newHtml));
    }
    return cache.get(key);
  };

  let bestPositions = [];
  // Starts below any achievable total so the first combination is always accepted.
  let bestScore = -1;
  for (const combo of combinationsOfRange(newCells.length, k)) {
    const inserted = new Set(combo);
    let score = 0;
    let oldIdx = 0;
    for (let newIdx = 0; newIdx < newCells.length; newIdx++) {
      if (inserted.has(newIdx)) continue;
      score += pairSimilarity(oldIdx, newIdx);
      oldIdx++;
    }
    // Strict comparison keeps the earliest combination on ties.
    if (score > bestScore) {
      bestScore = score;
      bestPositions = combo;
    }
  }
  return bestPositions;
}
|
|
516
|
+
/**
 * Pick the `k` positions in `oldRow` that are best explained as deleted
 * columns.
 *
 * Every combination produced by combinationsOfRange(oldRow.cells.length, k)
 * is scored by pairing the remaining old cells, left to right, with the new
 * cells and summing cellSimilarity over the pairs. The first combination
 * that reaches the highest total wins.
 *
 * The same (old cell, new cell) pair recurs in many combinations, so each
 * pair's similarity is computed once and cached; totals and tie-breaking
 * are unchanged.
 *
 * @param oldRow  Old row (the side with `k` extra cells).
 * @param newRow  New row (the side with fewer cells).
 * @param k       Number of positions to select.
 * @param oldHtml HTML the old row's offsets refer to.
 * @param newHtml HTML the new row's offsets refer to.
 * @returns The winning combination of old-row indices, or `[]` when no
 *          combination was produced.
 */
function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
  const oldCells = oldRow.cells;
  const newCells = newRow.cells;
  // Flattened (oldIdx, newIdx) -> similarity cache; the stride exceeds any index used below.
  const stride = Math.max(oldCells.length, newCells.length) + 1;
  const cache = new Map();
  const pairSimilarity = (oldIdx, newIdx) => {
    const key = oldIdx * stride + newIdx;
    if (!cache.has(key)) {
      cache.set(key, cellSimilarity(oldCells[oldIdx], newCells[newIdx], oldHtml, newHtml));
    }
    return cache.get(key);
  };

  let bestPositions = [];
  // Starts below any achievable total so the first combination is always accepted.
  let bestScore = -1;
  for (const combo of combinationsOfRange(oldCells.length, k)) {
    const deleted = new Set(combo);
    let score = 0;
    let newIdx = 0;
    for (let oldIdx = 0; oldIdx < oldCells.length; oldIdx++) {
      if (deleted.has(oldIdx)) continue;
      score += pairSimilarity(oldIdx, newIdx);
      newIdx++;
    }
    // Strict comparison keeps the earliest combination on ties.
    if (score > bestScore) {
      bestScore = score;
      bestPositions = combo;
    }
  }
  return bestPositions;
}
|
|
535
|
+
/**
 * Lazily yields every combination of `k` distinct integers drawn from
 * [0, n), each as a fresh ascending array, in lexicographic order.
 *
 * Implemented iteratively: the only working state is one `k`-element index
 * array, and each yielded value is a copy, so callers may keep or mutate it.
 *
 * Yields nothing when `k` is not a positive integer, when `n` is not an
 * integer, or when `k > n`. Without the integer checks a fractional
 * argument would never reach the termination condition and the generator
 * would not end.
 *
 * @param n Exclusive upper bound of the range to choose from.
 * @param k Size of each combination.
 */
function* combinationsOfRange(n, k) {
  if (!Number.isInteger(n) || !Number.isInteger(k) || k <= 0 || k > n) return;
  const indices = Array.from({ length: k }, (_, i) => i);
  while (true) {
    yield indices.slice();
    // Find the rightmost slot that has not reached its maximum value (n - k + i).
    let i = k - 1;
    while (i >= 0 && indices[i] === n - k + i) i--;
    // Every slot is at its maximum: the last combination has been emitted.
    if (i < 0) return;
    indices[i]++;
    // Reset the slots to its right to the smallest ascending continuation.
    for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1;
  }
}
|
|
452
552
|
/**
|
|
453
553
|
* Try to align cells by logical column position (sum of colspans). When
|
|
454
554
|
* one side has a colspan'd cell that absorbs multiple cells on the other
|
|
@@ -544,7 +644,7 @@ function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
|
544
644
|
return out.join("");
|
|
545
645
|
}
|
|
546
646
|
function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
547
|
-
const alignment = lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell)));
|
|
647
|
+
const alignment = pairSimilarUnmatchedCells(lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell))), oldRow, newRow, oldHtml, newHtml);
|
|
548
648
|
const out = [];
|
|
549
649
|
out.push(rowHeaderSlice(newHtml, newRow));
|
|
550
650
|
for (const align of alignment) if (align.oldIdx !== null && align.newIdx !== null) {
|
|
@@ -643,9 +743,16 @@ function rowHeaderSlice(html, row) {
|
|
|
643
743
|
if (row.cells.length === 0) return html.slice(row.rowStart, opening.end);
|
|
644
744
|
return html.slice(row.rowStart, row.cells[0].cellStart);
|
|
645
745
|
}
|
|
646
|
-
/**
|
|
746
|
+
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
647
747
|
const ROW_FUZZY_THRESHOLD = .5;
|
|
648
748
|
/**
|
|
749
|
+
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
750
|
+
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
751
|
+
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
752
|
+
* works for both granularities in practice.
|
|
753
|
+
*/
|
|
754
|
+
const CELL_FUZZY_THRESHOLD = .5;
|
|
755
|
+
/**
|
|
649
756
|
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
650
757
|
* inserted" (or vice versa) and pair entries whose content is similar
|
|
651
758
|
* enough to be treated as an edit rather than a delete+insert. This keeps
|
|
@@ -654,6 +761,26 @@ const ROW_FUZZY_THRESHOLD = .5;
|
|
|
654
761
|
* expect from a typical track-changes view.
|
|
655
762
|
*/
|
|
656
763
|
function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
  // Row-level scorer handed to the generic pairing routine: looks up both
  // rows by index and compares them with rowSimilarity.
  const scoreRowPair = (oldIdx, newIdx) => {
    const oldRow = oldTable.rows[oldIdx];
    const newRow = newTable.rows[newIdx];
    return rowSimilarity(oldRow, newRow, oldHtml, newHtml);
  };
  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRowPair);
}
|
|
766
|
+
/**
 * Cell-level counterpart of pairSimilarUnmatchedRows: runs the generic
 * pairing routine over a cell alignment, scoring candidate pairs with
 * cellSimilarity and using CELL_FUZZY_THRESHOLD as the threshold.
 *
 * @param alignment Cell alignment to post-process.
 * @param oldRow    Old row whose `cells` the alignment's old indices refer to.
 * @param newRow    New row whose `cells` the alignment's new indices refer to.
 * @param oldHtml   HTML the old cells' offsets refer to.
 * @param newHtml   HTML the new cells' offsets refer to.
 * @returns Whatever pairSimilarUnmatched returns for these inputs.
 */
function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
  const scoreCellPair = (oldIdx, newIdx) => {
    const oldCell = oldRow.cells[oldIdx];
    const newCell = newRow.cells[newIdx];
    return cellSimilarity(oldCell, newCell, oldHtml, newHtml);
  };
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCellPair);
}
|
|
769
|
+
/**
|
|
770
|
+
* Identify pairings inside each unmatched-only run, then build the output
|
|
771
|
+
* alignment by walking the original and substituting paired entries at
|
|
772
|
+
* the *ins position* (not the del position). This keeps the result
|
|
773
|
+
* monotonic in newIdx — critical because the cursor-based emission
|
|
774
|
+
* downstream walks new's html in order. Emitting at the del position
|
|
775
|
+
* would be fine when del<ins in the alignment array (the typical case),
|
|
776
|
+
* but can violate monotonicity when there are mixed unpaired entries in
|
|
777
|
+
* between (column-add + row-add together, content-edit + column-add,
|
|
778
|
+
* etc.).
|
|
779
|
+
*
|
|
780
|
+
* Generic over what's being paired — works for both rows (by full row
|
|
781
|
+
* content similarity) and cells (by per-cell content similarity).
|
|
782
|
+
*/
|
|
783
|
+
function pairSimilarUnmatched(alignment, threshold, similarity) {
|
|
657
784
|
const pairs = /* @__PURE__ */ new Map();
|
|
658
785
|
let i = 0;
|
|
659
786
|
while (i < alignment.length) {
|
|
@@ -671,10 +798,10 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
|
|
|
671
798
|
const usedIns = /* @__PURE__ */ new Set();
|
|
672
799
|
for (const di of delIndices) {
|
|
673
800
|
let bestIi = -1;
|
|
674
|
-
let bestSim =
|
|
801
|
+
let bestSim = threshold;
|
|
675
802
|
for (const ii of insIndices) {
|
|
676
803
|
if (usedIns.has(ii)) continue;
|
|
677
|
-
const sim =
|
|
804
|
+
const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
|
|
678
805
|
if (sim > bestSim) {
|
|
679
806
|
bestSim = sim;
|
|
680
807
|
bestIi = ii;
|
|
@@ -703,17 +830,38 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
|
|
|
703
830
|
return result;
|
|
704
831
|
}
|
|
705
832
|
/**
|
|
706
|
-
*
|
|
707
|
-
*
|
|
708
|
-
*
|
|
709
|
-
*
|
|
710
|
-
*
|
|
833
|
+
* Combined similarity metric used for both row-level and cell-level
|
|
834
|
+
* fuzzy pairing. Returns the MAX of two complementary metrics:
|
|
835
|
+
*
|
|
836
|
+
* 1. **Character prefix+suffix similarity** — fraction of the longer
|
|
837
|
+
* string covered by shared prefix + shared suffix. Catches small
|
|
838
|
+
* edits in the middle of a string (one word changed in a row).
|
|
839
|
+
* Misses cases where the bulk of common content is in the middle
|
|
840
|
+
* and the ends differ.
|
|
841
|
+
*
|
|
842
|
+
* 2. **Token Jaccard similarity** — intersection-over-union of the
|
|
843
|
+
* whitespace-split tokens. Catches "most of the content is the
|
|
844
|
+
* same but bookended by different bits" — e.g. a row whose only
|
|
845
|
+
* edit is a column added at the start and another at the end,
|
|
846
|
+
* where the ~50 chars in the middle that DO match would be
|
|
847
|
+
* invisible to prefix+suffix.
|
|
848
|
+
*
|
|
849
|
+
* Either metric exceeding the threshold means pair. Neither alone is
|
|
850
|
+
* sufficient for the full range of legal-doc edits we see in
|
|
851
|
+
* production tables.
|
|
711
852
|
*/
|
|
712
853
|
function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
  // Reduce each row to its normalised text, then score the two strings.
  const oldText = rowText(oldHtml, oldRow);
  const newText = rowText(newHtml, newRow);
  return textSimilarity(oldText, newText);
}
|
|
856
|
+
/**
 * Similarity of two cells, computed by textSimilarity on each cell's
 * normalised text (see cellText).
 *
 * @param oldCell Cell located in `oldHtml`.
 * @param newCell Cell located in `newHtml`.
 * @param oldHtml HTML the old cell's offsets refer to.
 * @param newHtml HTML the new cell's offsets refer to.
 * @returns The score returned by textSimilarity.
 */
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
  const oldText = cellText(oldHtml, oldCell);
  const newText = cellText(newHtml, newCell);
  return textSimilarity(oldText, newText);
}
|
|
859
|
+
/**
 * Score how alike two strings are.
 *
 * Identical strings score 1; if exactly one side is empty the score is 0.
 * Otherwise the result is the larger of charPrefixSuffixSimilarity and
 * tokenJaccardSimilarity for the pair.
 *
 * @param a First string.
 * @param b Second string.
 * @returns The similarity score.
 */
function textSimilarity(a, b) {
  if (a === b) return 1;
  const eitherEmpty = a.length === 0 || b.length === 0;
  if (eitherEmpty) return 0;
  const charScore = charPrefixSuffixSimilarity(a, b);
  const tokenScore = tokenJaccardSimilarity(a, b);
  return Math.max(charScore, tokenScore);
}
|
|
864
|
+
function charPrefixSuffixSimilarity(a, b) {
|
|
717
865
|
let prefix = 0;
|
|
718
866
|
const minLen = Math.min(a.length, b.length);
|
|
719
867
|
while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
|
|
@@ -721,11 +869,23 @@ function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
|
|
|
721
869
|
while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
|
|
722
870
|
return (prefix + suffix) / Math.max(a.length, b.length);
|
|
723
871
|
}
|
|
872
|
+
/**
 * Jaccard similarity of the two strings' whitespace-separated token sets:
 * |A ∩ B| / |A ∪ B|. Repeated tokens count once.
 *
 * Two strings with no tokens at all (empty or whitespace-only) score 1.
 *
 * @param a First string.
 * @param b Second string.
 * @returns A value in [0, 1].
 */
function tokenJaccardSimilarity(a, b) {
  const tokensA = new Set(a.split(/\s+/).filter(Boolean));
  const tokensB = new Set(b.split(/\s+/).filter(Boolean));
  if (tokensA.size === 0 && tokensB.size === 0) return 1;
  // Probe the larger set with the members of the smaller one.
  const [smaller, larger] = tokensA.size <= tokensB.size ? [tokensA, tokensB] : [tokensB, tokensA];
  let intersection = 0;
  for (const token of smaller) if (larger.has(token)) intersection++;
  // At least one set is non-empty here, so the union is never 0.
  return intersection / (tokensA.size + tokensB.size - intersection);
}
|
|
724
881
|
/**
 * Normalised plain text of a whole row.
 *
 * Each cell's content slice has its tags replaced by a space; the cells are
 * joined with single spaces, whitespace runs are collapsed to one space,
 * and the result is trimmed and lower-cased.
 *
 * @param html HTML the row's cell offsets refer to.
 * @param row  Row whose `cells` carry `contentStart` / `contentEnd` offsets.
 * @returns The normalised text.
 */
function rowText(html, row) {
  const strippedCells = Array.from(row.cells, (cell) => {
    const raw = html.slice(cell.contentStart, cell.contentEnd);
    return raw.replace(/<[^>]+>/g, " ");
  });
  const singleSpaced = strippedCells.join(" ").replace(/\s+/g, " ");
  return singleSpaced.trim().toLowerCase();
}
|
|
886
|
+
/**
 * Normalised plain text of a single cell.
 *
 * Takes the cell's content slice, replaces each tag with a space, collapses
 * whitespace runs to one space, then trims and lower-cases the result.
 *
 * @param html HTML the cell's offsets refer to.
 * @param cell Object carrying `contentStart` / `contentEnd` offsets into `html`.
 * @returns The normalised text.
 */
function cellText(html, cell) {
  const raw = html.slice(cell.contentStart, cell.contentEnd);
  const withoutTags = raw.replace(/<[^>]+>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return singleSpaced.trim().toLowerCase();
}
|
|
729
889
|
/**
|
|
730
890
|
* Standard LCS alignment: walks both sequences and emits a list of pairs
|
|
731
891
|
* where `(oldIdx, newIdx)` are both set for matching positions, and one
|