@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +52 -4
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.mjs +52 -4
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/TableDiff.ts +90 -17
- package/test/HtmlDiff.tables.spec.ts +39 -0
package/dist/HtmlDiff.cjs
CHANGED
|
@@ -544,7 +544,7 @@ function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
|
544
544
|
return out.join("");
|
|
545
545
|
}
|
|
546
546
|
function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
547
|
-
const alignment = lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell)));
|
|
547
|
+
const alignment = pairSimilarUnmatchedCells(lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell))), oldRow, newRow, oldHtml, newHtml);
|
|
548
548
|
const out = [];
|
|
549
549
|
out.push(rowHeaderSlice(newHtml, newRow));
|
|
550
550
|
for (const align of alignment) if (align.oldIdx !== null && align.newIdx !== null) {
|
|
@@ -643,9 +643,16 @@ function rowHeaderSlice(html, row) {
|
|
|
643
643
|
if (row.cells.length === 0) return html.slice(row.rowStart, opening.end);
|
|
644
644
|
return html.slice(row.rowStart, row.cells[0].cellStart);
|
|
645
645
|
}
|
|
646
|
-
/**
|
|
646
|
+
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
647
647
|
// Handed to pairSimilarUnmatched by pairSimilarUnmatchedRows as the starting "best" score to beat.
const ROW_FUZZY_THRESHOLD = 0.5;
|
|
648
648
|
/**
|
|
649
|
+
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
650
|
+
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
651
|
+
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
652
|
+
* works for both granularities in practice.
|
|
653
|
+
*/
|
|
654
|
+
// Handed to pairSimilarUnmatched by pairSimilarUnmatchedCells as the starting "best" score to beat.
const CELL_FUZZY_THRESHOLD = 0.5;
|
|
655
|
+
/**
|
|
649
656
|
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
650
657
|
* inserted" (or vice versa) and pair entries whose content is similar
|
|
651
658
|
* enough to be treated as an edit rather than a delete+insert. This keeps
|
|
@@ -654,6 +661,26 @@ const ROW_FUZZY_THRESHOLD = .5;
|
|
|
654
661
|
* expect from a typical track-changes view.
|
|
655
662
|
*/
|
|
656
663
|
function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
	// Row-level scorer: maps alignment indices to the actual rows and
	// delegates the comparison to rowSimilarity.
	const scoreRows = (oldIdx, newIdx) => {
		const before = oldTable.rows[oldIdx];
		const after = newTable.rows[newIdx];
		return rowSimilarity(before, after, oldHtml, newHtml);
	};
	return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRows);
}
|
|
666
|
+
/**
 * Cell-level counterpart of pairSimilarUnmatchedRows: runs the generic
 * pairing pass over a row's cell alignment, scoring candidates with
 * cellSimilarity and using CELL_FUZZY_THRESHOLD as the cut-off.
 */
function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
	// Cell-level scorer: maps alignment indices to the actual cells.
	const scoreCells = (oldIdx, newIdx) => {
		const before = oldRow.cells[oldIdx];
		const after = newRow.cells[newIdx];
		return cellSimilarity(before, after, oldHtml, newHtml);
	};
	return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCells);
}
|
|
669
|
+
/**
|
|
670
|
+
* Identify pairings inside each unmatched-only run, then build the output
|
|
671
|
+
* alignment by walking the original and substituting paired entries at
|
|
672
|
+
* the *ins position* (not the del position). This keeps the result
|
|
673
|
+
* monotonic in newIdx — critical because the cursor-based emission
|
|
674
|
+
* downstream walks new's html in order. Emitting at the del position
|
|
675
|
+
* would be fine when del<ins in the alignment array (the typical case),
|
|
676
|
+
* but can violate monotonicity when there are mixed unpaired entries in
|
|
677
|
+
* between (column-add + row-add together, content-edit + column-add,
|
|
678
|
+
* etc.).
|
|
679
|
+
*
|
|
680
|
+
* Generic over what's being paired — works for both rows (by full row
|
|
681
|
+
* content similarity) and cells (by per-cell content similarity).
|
|
682
|
+
*/
|
|
683
|
+
function pairSimilarUnmatched(alignment, threshold, similarity) {
|
|
657
684
|
const pairs = /* @__PURE__ */ new Map();
|
|
658
685
|
let i = 0;
|
|
659
686
|
while (i < alignment.length) {
|
|
@@ -671,10 +698,10 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
|
|
|
671
698
|
const usedIns = /* @__PURE__ */ new Set();
|
|
672
699
|
for (const di of delIndices) {
|
|
673
700
|
let bestIi = -1;
|
|
674
|
-
let bestSim = ROW_FUZZY_THRESHOLD;
|
|
701
|
+
let bestSim = threshold;
|
|
675
702
|
for (const ii of insIndices) {
|
|
676
703
|
if (usedIns.has(ii)) continue;
|
|
677
|
-
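const sim = rowSimilarity(oldTable.rows[alignment[di].oldIdx], newTable.rows[alignment[ii].newIdx], oldHtml, newHtml);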
|
|
704
|
+
const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
|
|
678
705
|
if (sim > bestSim) {
|
|
679
706
|
bestSim = sim;
|
|
680
707
|
bestIi = ii;
|
|
@@ -727,6 +754,27 @@ function rowText(html, row) {
|
|
|
727
754
|
return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
|
|
728
755
|
}
|
|
729
756
|
/**
|
|
757
|
+
* Character-level prefix+suffix similarity for a single cell's text
|
|
758
|
+
* content. Same metric as rowSimilarity, scoped to one cell so we can
|
|
759
|
+
* fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
|
|
760
|
+
* a column add in the same row).
|
|
761
|
+
*/
|
|
762
|
+
/**
 * Prefix+suffix similarity between the text of two cells.
 *
 * Both cells are reduced to normalised text via cellText, then the score is
 * (length of common prefix + length of common non-overlapping suffix)
 * divided by the longer text's length.
 *
 * Lengths and comparisons are measured in Unicode code points rather than
 * UTF-16 code units, so two different astral characters (emoji, rare CJK,
 * math symbols) that merely share a high surrogate are not counted as a
 * partial match.
 *
 * @param oldCell cell descriptor with `contentStart` / `contentEnd` offsets into `oldHtml`
 * @param newCell cell descriptor with `contentStart` / `contentEnd` offsets into `newHtml`
 * @param {string} oldHtml html the old cell's offsets refer to
 * @param {string} newHtml html the new cell's offsets refer to
 * @returns {number} 1 for identical text (including both empty), 0 when
 *   exactly one side is empty, otherwise a ratio in [0, 1).
 */
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
	const oldText = cellText(oldHtml, oldCell);
	const newText = cellText(newHtml, newCell);
	if (oldText === newText) return 1;
	if (oldText.length === 0 || newText.length === 0) return 0;
	// Array.from iterates strings by code point, keeping surrogate pairs intact.
	const a = Array.from(oldText);
	const b = Array.from(newText);
	const minLen = Math.min(a.length, b.length);
	let prefix = 0;
	while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
	// Cap the suffix at what the prefix left over on the shorter side so the
	// two matched regions can never overlap.
	let suffix = 0;
	while (suffix < minLen - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
	return (prefix + suffix) / Math.max(a.length, b.length);
}
/**
 * Extract a cell's comparable text: slices `html` between the cell's
 * `contentStart` and `contentEnd`, replaces every `<...>` tag with a space,
 * collapses whitespace runs to a single space, trims, and lower-cases.
 *
 * @param {string} html html the cell's offsets refer to
 * @param cell cell descriptor with `contentStart` / `contentEnd` offsets
 * @returns {string} normalised, lower-cased text of the cell
 */
function cellText(html, cell) {
	return html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
}
|
|
777
|
+
/**
|
|
730
778
|
* Standard LCS alignment: walks both sequences and emits a list of pairs
|
|
731
779
|
* where `(oldIdx, newIdx)` are both set for matching positions, and one
|
|
732
780
|
* side is null for an unmatched entry on the other side. Equality uses
|