@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +52 -4
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.mjs +52 -4
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/TableDiff.ts +90 -17
- package/test/HtmlDiff.tables.spec.ts +39 -0
package/package.json
CHANGED
package/src/TableDiff.ts
CHANGED
|
@@ -619,7 +619,12 @@ function diffStructurallyAlignedRow(
|
|
|
619
619
|
): string {
|
|
620
620
|
const oldKeys = oldRow.cells.map(cell => cellKey(oldHtml, cell))
|
|
621
621
|
const newKeys = newRow.cells.map(cell => cellKey(newHtml, cell))
|
|
622
|
-
const alignment = lcsAlign(oldKeys, newKeys)
|
|
622
|
+
const exactAlignment = lcsAlign(oldKeys, newKeys)
|
|
623
|
+
// After exact LCS, fuzzy-pair adjacent unmatched old/new cells whose
|
|
624
|
+
// content is similar enough — so a content-edit cell alongside a
|
|
625
|
+
// column-add in the same row produces a content diff for the edited
|
|
626
|
+
// cell rather than a phantom delete + insert + extra cell.
|
|
627
|
+
const alignment = pairSimilarUnmatchedCells(exactAlignment, oldRow, newRow, oldHtml, newHtml)
|
|
623
628
|
|
|
624
629
|
const out: string[] = []
|
|
625
630
|
// Use new's <tr> if it exists; otherwise old's.
|
|
@@ -765,9 +770,17 @@ interface Alignment {
|
|
|
765
770
|
newIdx: number | null
|
|
766
771
|
}
|
|
767
772
|
|
|
768
|
-
/**
|
|
773
|
+
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
769
774
|
const ROW_FUZZY_THRESHOLD = 0.5
|
|
770
775
|
|
|
776
|
+
/**
|
|
777
|
+
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
778
|
+
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
779
|
+
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
780
|
+
* works for both granularities in practice.
|
|
781
|
+
*/
|
|
782
|
+
const CELL_FUZZY_THRESHOLD = 0.5
|
|
783
|
+
|
|
771
784
|
/**
|
|
772
785
|
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
773
786
|
* inserted" (or vice versa) and pair entries whose content is similar
|
|
@@ -783,14 +796,42 @@ function pairSimilarUnmatchedRows(
|
|
|
783
796
|
oldHtml: string,
|
|
784
797
|
newHtml: string
|
|
785
798
|
): Alignment[] {
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
799
|
+
return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
800
|
+
rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
|
|
801
|
+
)
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
function pairSimilarUnmatchedCells(
|
|
805
|
+
alignment: Alignment[],
|
|
806
|
+
oldRow: RowRange,
|
|
807
|
+
newRow: RowRange,
|
|
808
|
+
oldHtml: string,
|
|
809
|
+
newHtml: string
|
|
810
|
+
): Alignment[] {
|
|
811
|
+
return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
812
|
+
cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
|
|
813
|
+
)
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
/**
|
|
817
|
+
* Identify pairings inside each unmatched-only run, then build the output
|
|
818
|
+
* alignment by walking the original and substituting paired entries at
|
|
819
|
+
* the *ins position* (not the del position). This keeps the result
|
|
820
|
+
* monotonic in newIdx — critical because the cursor-based emission
|
|
821
|
+
* downstream walks new's html in order. Emitting at the del position
|
|
822
|
+
* would be fine when del<ins in the alignment array (the typical case),
|
|
823
|
+
* but can violate monotonicity when there are mixed unpaired entries in
|
|
824
|
+
* between (column-add + row-add together, content-edit + column-add,
|
|
825
|
+
* etc.).
|
|
826
|
+
*
|
|
827
|
+
* Generic over what's being paired — works for both rows (by full row
|
|
828
|
+
* content similarity) and cells (by per-cell content similarity).
|
|
829
|
+
*/
|
|
830
|
+
function pairSimilarUnmatched(
|
|
831
|
+
alignment: Alignment[],
|
|
832
|
+
threshold: number,
|
|
833
|
+
similarity: (oldIdx: number, newIdx: number) => number
|
|
834
|
+
): Alignment[] {
|
|
794
835
|
const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
|
|
795
836
|
let i = 0
|
|
796
837
|
while (i < alignment.length) {
|
|
@@ -812,15 +853,10 @@ function pairSimilarUnmatchedRows(
|
|
|
812
853
|
const usedIns = new Set<number>()
|
|
813
854
|
for (const di of delIndices) {
|
|
814
855
|
let bestIi = -1
|
|
815
|
-
let bestSim = ROW_FUZZY_THRESHOLD
|
|
856
|
+
let bestSim = threshold
|
|
816
857
|
for (const ii of insIndices) {
|
|
817
858
|
if (usedIns.has(ii)) continue
|
|
818
|
-
const sim = rowSimilarity(
|
|
819
|
-
oldTable.rows[alignment[di].oldIdx as number],
|
|
820
|
-
newTable.rows[alignment[ii].newIdx as number],
|
|
821
|
-
oldHtml,
|
|
822
|
-
newHtml
|
|
823
|
-
)
|
|
859
|
+
const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
|
|
824
860
|
if (sim > bestSim) {
|
|
825
861
|
bestSim = sim
|
|
826
862
|
bestIi = ii
|
|
@@ -887,6 +923,43 @@ function rowText(html: string, row: RowRange): string {
|
|
|
887
923
|
return parts.join(' ').replace(/\s+/g, ' ').trim().toLowerCase()
|
|
888
924
|
}
|
|
889
925
|
|
|
926
|
+
/**
|
|
927
|
+
* Character-level prefix+suffix similarity for a single cell's text
|
|
928
|
+
* content. Same metric as rowSimilarity, scoped to one cell so we can
|
|
929
|
+
* fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
|
|
930
|
+
* a column add in the same row).
|
|
931
|
+
*/
|
|
932
|
+
function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
|
|
933
|
+
const a = cellText(oldHtml, oldCell)
|
|
934
|
+
const b = cellText(newHtml, newCell)
|
|
935
|
+
if (a === b) return 1
|
|
936
|
+
if (a.length === 0 || b.length === 0) return 0
|
|
937
|
+
|
|
938
|
+
let prefix = 0
|
|
939
|
+
const minLen = Math.min(a.length, b.length)
|
|
940
|
+
while (prefix < minLen && a[prefix] === b[prefix]) prefix++
|
|
941
|
+
|
|
942
|
+
let suffix = 0
|
|
943
|
+
while (
|
|
944
|
+
suffix < a.length - prefix &&
|
|
945
|
+
suffix < b.length - prefix &&
|
|
946
|
+
a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
|
|
947
|
+
) {
|
|
948
|
+
suffix++
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
return (prefix + suffix) / Math.max(a.length, b.length)
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
function cellText(html: string, cell: CellRange): string {
|
|
955
|
+
return html
|
|
956
|
+
.slice(cell.contentStart, cell.contentEnd)
|
|
957
|
+
.replace(/<[^>]+>/g, ' ')
|
|
958
|
+
.replace(/\s+/g, ' ')
|
|
959
|
+
.trim()
|
|
960
|
+
.toLowerCase()
|
|
961
|
+
}
|
|
962
|
+
|
|
890
963
|
/**
|
|
891
964
|
* Standard LCS alignment: walks both sequences and emits a list of pairs
|
|
892
965
|
* where `(oldIdx, newIdx)` are both set for matching positions, and one
|
|
@@ -1268,6 +1268,45 @@ describe('HtmlDiff — tables', () => {
|
|
|
1268
1268
|
)
|
|
1269
1269
|
})
|
|
1270
1270
|
|
|
1271
|
+
it('handles column-add alongside content edit in the SAME row (cell-level fuzzy matching)', () => {
|
|
1272
|
+
// Real-world scenario: a column was inserted at position 1 AND
|
|
1273
|
+
// one of the existing cells got new content appended. Without
|
|
1274
|
+
// cell-level fuzzy matching, the cell-LCS exact-match misses the
|
|
1275
|
+
// "IRS Forms…" pairing, producing a 5-cell row (phantom delete +
|
|
1276
|
+
// two inserts) instead of 4 cells with one inline content edit.
|
|
1277
|
+
const oldHtml =
|
|
1278
|
+
'<table>' +
|
|
1279
|
+
'<tr><th>Party</th><th>Form</th><th>Date</th></tr>' +
|
|
1280
|
+
'<tr><td>Party A</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto).</td><td>Upon execution.</td></tr>' +
|
|
1281
|
+
'<tr><td>Party B</td><td>IRS Form W-9, as applicable.</td><td>Upon execution.</td></tr>' +
|
|
1282
|
+
'</table>'
|
|
1283
|
+
const newHtml =
|
|
1284
|
+
'<table>' +
|
|
1285
|
+
'<tr><th>Party</th><th>Extra column</th><th>Form</th><th>Date</th></tr>' +
|
|
1286
|
+
"<tr><td>Party A</td><td>Yes</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto). Here's some extra content</td><td>Upon execution.</td></tr>" +
|
|
1287
|
+
'<tr><td>Party B</td><td>A</td><td>IRS Form W-9, as applicable.</td><td>Upon execution.</td></tr>' +
|
|
1288
|
+
'</table>'
|
|
1289
|
+
|
|
1290
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1291
|
+
'<table>' +
|
|
1292
|
+
// Header row: extra column inserted at position 1
|
|
1293
|
+
'<tr><th>Party</th>' +
|
|
1294
|
+
"<th class='diffins'><ins class='diffins'>Extra column</ins></th>" +
|
|
1295
|
+
'<th>Form</th><th>Date</th></tr>' +
|
|
1296
|
+
// Party A row: extra column cell + content edit on the IRS Forms cell
|
|
1297
|
+
'<tr><td>Party A</td>' +
|
|
1298
|
+
"<td class='diffins'><ins class='diffins'>Yes</ins></td>" +
|
|
1299
|
+
"<td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto).<ins class='diffins'> Here's some extra content</ins></td>" +
|
|
1300
|
+
'<td>Upon execution.</td></tr>' +
|
|
1301
|
+
// Party B row: extra column cell, IRS Form W-9 cell unchanged
|
|
1302
|
+
'<tr><td>Party B</td>' +
|
|
1303
|
+
"<td class='diffins'><ins class='diffins'>A</ins></td>" +
|
|
1304
|
+
'<td>IRS Form W-9, as applicable.</td>' +
|
|
1305
|
+
'<td>Upon execution.</td></tr>' +
|
|
1306
|
+
'</table>'
|
|
1307
|
+
)
|
|
1308
|
+
})
|
|
1309
|
+
|
|
1271
1310
|
it('handles a rowspan cell sharing a row with normal cells (column-add adjacency)', () => {
|
|
1272
1311
|
// The rowspan'd cell occupies row 0 col 0 and row 1's col 0 slot
|
|
1273
1312
|
// (absorbed). Old has rowspan=2 in col 0 + col 1 in row 0 + col
|