@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.0.5-beta.1",
3
+ "version": "1.0.5-beta.2",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/TableDiff.ts CHANGED
@@ -619,7 +619,12 @@ function diffStructurallyAlignedRow(
619
619
  ): string {
620
620
  const oldKeys = oldRow.cells.map(cell => cellKey(oldHtml, cell))
621
621
  const newKeys = newRow.cells.map(cell => cellKey(newHtml, cell))
622
- const alignment = lcsAlign(oldKeys, newKeys)
622
+ const exactAlignment = lcsAlign(oldKeys, newKeys)
623
+ // After exact LCS, fuzzy-pair adjacent unmatched old/new cells whose
624
+ // content is similar enough — so a content-edit cell alongside a
625
+ // column-add in the same row produces a content diff for the edited
626
+ // cell rather than a phantom delete + insert + extra cell.
627
+ const alignment = pairSimilarUnmatchedCells(exactAlignment, oldRow, newRow, oldHtml, newHtml)
623
628
 
624
629
  const out: string[] = []
625
630
  // Use new's <tr> if it exists; otherwise old's.
@@ -765,9 +770,17 @@ interface Alignment {
765
770
  newIdx: number | null
766
771
  }
767
772
 
768
- /** Jaccard similarity threshold above which we treat two rows as "the same row, edited". */
773
+ /** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
769
774
  const ROW_FUZZY_THRESHOLD = 0.5
770
775
 
776
+ /**
777
+ * Threshold for "this cell is a content-edit of that cell." Tuned the same
778
+ * as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
779
+ * content typically ARE the same logical cell with a body edit, so 0.5
780
+ * works for both granularities in practice.
781
+ */
782
+ const CELL_FUZZY_THRESHOLD = 0.5
783
+
771
784
  /**
772
785
  * After exact LCS, scan the alignment for runs of "old deleted, then new
773
786
  * inserted" (or vice versa) and pair entries whose content is similar
@@ -783,14 +796,42 @@ function pairSimilarUnmatchedRows(
783
796
  oldHtml: string,
784
797
  newHtml: string
785
798
  ): Alignment[] {
786
- // Identify pairings inside each unmatched-only run, then build the
787
- // output by walking the alignment and substituting paired entries at
788
- // the *ins position* (not the del position). This keeps the result
789
- // alignment monotonic in newIdx — critical because the cursor-based
790
- // emission downstream walks new's html in order. Emitting at the del
791
- // position would be fine when del<ins in the alignment array (the
792
- // typical case), but can violate monotonicity when there are mixed
793
- // unpaired entries in between (column-add + row-add together, etc.).
799
+ return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
800
+ rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
801
+ )
802
+ }
803
+
804
+ function pairSimilarUnmatchedCells(
805
+ alignment: Alignment[],
806
+ oldRow: RowRange,
807
+ newRow: RowRange,
808
+ oldHtml: string,
809
+ newHtml: string
810
+ ): Alignment[] {
811
+ return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
812
+ cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
813
+ )
814
+ }
815
+
816
+ /**
817
+ * Identify pairings inside each unmatched-only run, then build the output
818
+ * alignment by walking the original and substituting paired entries at
819
+ * the *ins position* (not the del position). This keeps the result
820
+ * monotonic in newIdx — critical because the cursor-based emission
821
+ * downstream walks new's html in order. Emitting at the del position
822
+ * would be fine when del<ins in the alignment array (the typical case),
823
+ * but can violate monotonicity when there are mixed unpaired entries in
824
+ * between (column-add + row-add together, content-edit + column-add,
825
+ * etc.).
826
+ *
827
+ * Generic over what's being paired — works for both rows (by full row
828
+ * content similarity) and cells (by per-cell content similarity).
829
+ */
830
+ function pairSimilarUnmatched(
831
+ alignment: Alignment[],
832
+ threshold: number,
833
+ similarity: (oldIdx: number, newIdx: number) => number
834
+ ): Alignment[] {
794
835
  const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
795
836
  let i = 0
796
837
  while (i < alignment.length) {
@@ -812,15 +853,10 @@ function pairSimilarUnmatchedRows(
812
853
  const usedIns = new Set<number>()
813
854
  for (const di of delIndices) {
814
855
  let bestIi = -1
815
- let bestSim = ROW_FUZZY_THRESHOLD
856
+ let bestSim = threshold
816
857
  for (const ii of insIndices) {
817
858
  if (usedIns.has(ii)) continue
818
- const sim = rowSimilarity(
819
- oldTable.rows[alignment[di].oldIdx as number],
820
- newTable.rows[alignment[ii].newIdx as number],
821
- oldHtml,
822
- newHtml
823
- )
859
+ const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
824
860
  if (sim > bestSim) {
825
861
  bestSim = sim
826
862
  bestIi = ii
@@ -887,6 +923,43 @@ function rowText(html: string, row: RowRange): string {
887
923
  return parts.join(' ').replace(/\s+/g, ' ').trim().toLowerCase()
888
924
  }
889
925
 
926
+ /**
927
+ * Character-level prefix+suffix similarity for a single cell's text
928
+ * content. Same metric as rowSimilarity, scoped to one cell so we can
929
+ * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
930
+ * a column add in the same row).
931
+ */
932
+ function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
933
+ const a = cellText(oldHtml, oldCell)
934
+ const b = cellText(newHtml, newCell)
935
+ if (a === b) return 1
936
+ if (a.length === 0 || b.length === 0) return 0
937
+
938
+ let prefix = 0
939
+ const minLen = Math.min(a.length, b.length)
940
+ while (prefix < minLen && a[prefix] === b[prefix]) prefix++
941
+
942
+ let suffix = 0
943
+ while (
944
+ suffix < a.length - prefix &&
945
+ suffix < b.length - prefix &&
946
+ a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
947
+ ) {
948
+ suffix++
949
+ }
950
+
951
+ return (prefix + suffix) / Math.max(a.length, b.length)
952
+ }
953
+
954
+ function cellText(html: string, cell: CellRange): string {
955
+ return html
956
+ .slice(cell.contentStart, cell.contentEnd)
957
+ .replace(/<[^>]+>/g, ' ')
958
+ .replace(/\s+/g, ' ')
959
+ .trim()
960
+ .toLowerCase()
961
+ }
962
+
890
963
  /**
891
964
  * Standard LCS alignment: walks both sequences and emits a list of pairs
892
965
  * where `(oldIdx, newIdx)` are both set for matching positions, and one
@@ -1268,6 +1268,45 @@ describe('HtmlDiff — tables', () => {
1268
1268
  )
1269
1269
  })
1270
1270
 
1271
+ it('handles column-add alongside content edit in the SAME row (cell-level fuzzy matching)', () => {
1272
+ // Real-world scenario: a column was inserted at position 1 AND
1273
+ // one of the existing cells got new content appended. Without
1274
+ // cell-level fuzzy matching, the cell-LCS exact-match misses the
1275
+ // "IRS Forms…" pairing, producing a 5-cell row (phantom delete +
1276
+ // two inserts) instead of 4 cells with one inline content edit.
1277
+ const oldHtml =
1278
+ '<table>' +
1279
+ '<tr><th>Party</th><th>Form</th><th>Date</th></tr>' +
1280
+ '<tr><td>Party A</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto).</td><td>Upon execution.</td></tr>' +
1281
+ '<tr><td>Party B</td><td>IRS Form W-9, as applicable.</td><td>Upon execution.</td></tr>' +
1282
+ '</table>'
1283
+ const newHtml =
1284
+ '<table>' +
1285
+ '<tr><th>Party</th><th>Extra column</th><th>Form</th><th>Date</th></tr>' +
1286
+ "<tr><td>Party A</td><td>Yes</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto). Here's some extra content</td><td>Upon execution.</td></tr>" +
1287
+ '<tr><td>Party B</td><td>A</td><td>IRS Form W-9, as applicable.</td><td>Upon execution.</td></tr>' +
1288
+ '</table>'
1289
+
1290
+ expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
1291
+ '<table>' +
1292
+ // Header row: extra column inserted at position 1
1293
+ '<tr><th>Party</th>' +
1294
+ "<th class='diffins'><ins class='diffins'>Extra column</ins></th>" +
1295
+ '<th>Form</th><th>Date</th></tr>' +
1296
+ // Party A row: extra column cell + content edit on the IRS Forms cell
1297
+ '<tr><td>Party A</td>' +
1298
+ "<td class='diffins'><ins class='diffins'>Yes</ins></td>" +
1299
+ "<td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto).<ins class='diffins'>&nbsp;Here's some extra content</ins></td>" +
1300
+ '<td>Upon execution.</td></tr>' +
1301
+ // Party B row: extra column cell, IRS Form W-9 cell unchanged
1302
+ '<tr><td>Party B</td>' +
1303
+ "<td class='diffins'><ins class='diffins'>A</ins></td>" +
1304
+ '<td>IRS Form W-9, as applicable.</td>' +
1305
+ '<td>Upon execution.</td></tr>' +
1306
+ '</table>'
1307
+ )
1308
+ })
1309
+
1271
1310
  it('handles a rowspan cell sharing a row with normal cells (column-add adjacency)', () => {
1272
1311
  // The rowspan'd cell occupies row 0 col 0 and row 1's col 0 slot
1273
1312
  // (absorbed). Old has rowspan=2 in col 0 + col 1 in row 0 + col