@createiq/htmldiff 1.1.0-beta.0 → 1.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.1.0-beta.0",
3
+ "version": "1.1.0",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
@@ -23,17 +23,17 @@
23
23
  "node": ">=20.0.0"
24
24
  },
25
25
  "devDependencies": {
26
- "@biomejs/biome": "2.4.14",
26
+ "@biomejs/biome": "2.4.15",
27
27
  "@cyclonedx/cyclonedx-npm": "4.2.1",
28
28
  "@tsconfig/recommended": "1.0.13",
29
- "@types/node": "24.12.3",
30
- "@vitest/coverage-v8": "4.1.5",
31
- "@vitest/ui": "4.1.5",
29
+ "@types/node": "24.12.4",
30
+ "@vitest/coverage-v8": "4.1.6",
31
+ "@vitest/ui": "4.1.6",
32
32
  "husky": "9.1.7",
33
- "lint-staged": "17.0.2",
33
+ "lint-staged": "17.0.4",
34
34
  "tsdown": "0.22.0",
35
35
  "typescript": "6.0.3",
36
- "vitest": "4.1.5"
36
+ "vitest": "4.1.6"
37
37
  },
38
38
  "scripts": {
39
39
  "prepare": "husky",
package/src/TableDiff.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import { wrapText } from './Utils'
2
+
1
3
  /**
2
4
  * Table-aware preprocessing for HtmlDiff.
3
5
  *
@@ -73,6 +75,13 @@ const PLACEHOLDER_SUFFIX = '-->'
73
75
  const MAX_TABLE_ROWS = 1500
74
76
  const MAX_TABLE_CELLS_PER_ROW = 200
75
77
 
78
+ // Caps for the per-row combinatorial column-position search in
79
+ // findBestColumnInsertPositions / findBestColumnDeletePositions. Worst
80
+ // case is C(MAX_COLUMN_SEARCH_WIDTH, MAX_COLUMN_DELTA) ≈ 3.8M combos at
81
+ // the caps below; wider or more-skewed rows fall through to cell-LCS.
82
+ const MAX_COLUMN_DELTA = 6
83
+ const MAX_COLUMN_SEARCH_WIDTH = 40
84
+
76
85
  function makePlaceholderPrefix(oldHtml: string, newHtml: string): string {
77
86
  // 4 random bytes → 8 hex chars → 16^8 ≈ 4.3 billion combinations. We
78
87
  // also retry if the generated nonce happens to occur in either input.
@@ -395,7 +404,7 @@ function diffStructurallyAlignedTable(
395
404
  // has no rows at all, fall back to a from-scratch reconstruction so
396
405
  // we still emit deleted rows.
397
406
  if (newTable.rows.length === 0) {
398
- return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
407
+ return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment)
399
408
  }
400
409
 
401
410
  const out: string[] = []
@@ -408,11 +417,11 @@ function diffStructurallyAlignedTable(
408
417
  if (align.oldIdx !== null) {
409
418
  out.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[align.oldIdx], newRow, diffCell))
410
419
  } else {
411
- out.push(emitFullRow(newHtml, newRow, 'ins', diffCell))
420
+ out.push(emitFullRow(newHtml, newRow, 'ins'))
412
421
  }
413
422
  cursor = newRow.rowEnd
414
423
  } else if (align.oldIdx !== null) {
415
- out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
424
+ out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
416
425
  }
417
426
  }
418
427
  out.push(newHtml.slice(cursor, newTable.tableEnd))
@@ -497,8 +506,7 @@ function rebuildStructurallyAlignedTable(
497
506
  newHtml: string,
498
507
  oldTable: TableRange,
499
508
  newTable: TableRange,
500
- alignment: Alignment[],
501
- diffCell: DiffCellFn
509
+ alignment: Alignment[]
502
510
  ): string {
503
511
  // Used when new has no rows but old does — we lose the per-row
504
512
  // wrappers from new (there are none), so reconstruct from old's frame.
@@ -506,9 +514,9 @@ function rebuildStructurallyAlignedTable(
506
514
  out.push(headerSlice(newHtml, newTable, oldHtml, oldTable))
507
515
  for (const align of alignment) {
508
516
  if (align.oldIdx !== null) {
509
- out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
517
+ out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del'))
510
518
  } else if (align.newIdx !== null) {
511
- out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins', diffCell))
519
+ out.push(emitFullRow(newHtml, newTable.rows[align.newIdx], 'ins'))
512
520
  }
513
521
  }
514
522
  out.push('</table>')
@@ -548,21 +556,15 @@ function diffPreservedRow(
548
556
  // on each affected cell.
549
557
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
550
558
  if (colspanAligned !== null) return colspanAligned
551
- // For a single-column add/delete (cell count differs by exactly 1),
552
- // detect the position via positional similarity scan and align the
553
- // remaining cells positionally. This handles the case where a column
554
- // was added AND a different cell got an unrelated content edit — the
555
- // edited cell still aligns by position rather than getting orphaned by
556
- // the cell-LCS exact-match.
557
- const delta = newRow.cells.length - oldRow.cells.length
558
559
  // For column add/delete (cell counts differ), find the best insertion
559
560
  // or deletion positions via positional similarity scan and align the
560
561
  // remaining cells positionally. This handles content-edit alongside
561
562
  // column-add by keeping the edited cell in its column position rather
562
563
  // than orphaning it via the cell-LCS exact match.
563
564
  // Guardrail: combinatorial search is C(newCount, k); we cap to avoid
564
- // explosion on very wide tables. The cap is generous for real legal
565
- // schedules; anything above falls through to cell-LCS.
565
+ // explosion on very wide tables. Worst case at the caps is C(40, 6)
566
+ // ≈ 3.8M combos; above that we fall through to cell-LCS.
567
+ const delta = newRow.cells.length - oldRow.cells.length
566
568
  const absDelta = Math.abs(delta)
567
569
  if (
568
570
  absDelta > 0 &&
@@ -575,9 +577,6 @@ function diffPreservedRow(
575
577
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
576
578
  }
577
579
 
578
- const MAX_COLUMN_DELTA = 6
579
- const MAX_COLUMN_SEARCH_WIDTH = 40
580
-
581
580
  /**
582
581
  * For a row where new has K more cells than old, find the K column
583
582
  * positions in new where cells were inserted by scanning all C(newCount,
@@ -600,7 +599,7 @@ function diffMultiColumnAddRow(
600
599
  let oldIdx = 0
601
600
  for (let c = 0; c < newRow.cells.length; c++) {
602
601
  if (inserted.has(c)) {
603
- out.push(emitFullCell(newHtml, newRow.cells[c], 'ins', diffCell))
602
+ out.push(emitFullCell(newHtml, newRow.cells[c], 'ins'))
604
603
  } else {
605
604
  out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell))
606
605
  oldIdx++
@@ -624,7 +623,7 @@ function diffMultiColumnDeleteRow(
624
623
  let newIdx = 0
625
624
  for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
626
625
  if (deleted.has(oldIdx)) {
627
- out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell))
626
+ out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del'))
628
627
  continue
629
628
  }
630
629
  out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
@@ -641,6 +640,11 @@ function findBestColumnInsertPositions(
641
640
  oldHtml: string,
642
641
  newHtml: string
643
642
  ): number[] {
643
+ // Pre-compute cell texts once instead of letting textSimilarity
644
+ // recompute them inside every combo iteration — C(N, K) combos times
645
+ // ~N text extractions each is a lot of wasted string work.
646
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
647
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
644
648
  let bestPositions: number[] = []
645
649
  let bestScore = -1
646
650
  for (const combo of combinationsOfRange(newRow.cells.length, k)) {
@@ -649,7 +653,7 @@ function findBestColumnInsertPositions(
649
653
  let oldIdx = 0
650
654
  for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
651
655
  if (inserted.has(newIdx)) continue
652
- score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
656
+ score += textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
653
657
  oldIdx++
654
658
  }
655
659
  if (score > bestScore) {
@@ -667,6 +671,8 @@ function findBestColumnDeletePositions(
667
671
  oldHtml: string,
668
672
  newHtml: string
669
673
  ): number[] {
674
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
675
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
670
676
  let bestPositions: number[] = []
671
677
  let bestScore = -1
672
678
  for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
@@ -675,7 +681,7 @@ function findBestColumnDeletePositions(
675
681
  let newIdx = 0
676
682
  for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
677
683
  if (deleted.has(oldIdx)) continue
678
- score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
684
+ score += textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
679
685
  newIdx++
680
686
  }
681
687
  if (score > bestScore) {
@@ -864,9 +870,9 @@ function diffStructurallyAlignedRow(
864
870
  const newCell = newRow.cells[align.newIdx]
865
871
  out.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell))
866
872
  } else if (align.newIdx !== null) {
867
- out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins', diffCell))
873
+ out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], 'ins'))
868
874
  } else if (align.oldIdx !== null) {
869
- out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del', diffCell))
875
+ out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], 'del'))
870
876
  }
871
877
  }
872
878
 
@@ -888,7 +894,7 @@ function cellKey(html: string, cell: CellRange): string {
888
894
  * each `<td>`, with an `<ins>`/`<del>` wrapper around any cell content
889
895
  * (empty cells get the class but no wrapper).
890
896
  */
891
- function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell: DiffCellFn): string {
897
+ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del'): string {
892
898
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'
893
899
  const trOpening = parseOpeningTagAt(html, row.rowStart)
894
900
  if (!trOpening) return html.slice(row.rowStart, row.rowEnd)
@@ -898,7 +904,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
898
904
  let cursor = trOpening.end
899
905
  for (const cell of row.cells) {
900
906
  out.push(html.slice(cursor, cell.cellStart))
901
- out.push(emitFullCell(html, cell, kind, diffCell))
907
+ out.push(emitFullCell(html, cell, kind))
902
908
  cursor = cell.cellEnd
903
909
  }
904
910
  out.push(html.slice(cursor, row.rowEnd))
@@ -913,7 +919,7 @@ function emitFullRow(html: string, row: RowRange, kind: 'ins' | 'del', diffCell:
913
919
  * the full recursive diff would produce for newly-inserted formatting.
914
920
  * Empty cells get the class on the `<td>` but no inner wrapping.
915
921
  */
916
- function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del', _diffCell: DiffCellFn): string {
922
+ function emitFullCell(html: string, cell: CellRange, kind: 'ins' | 'del'): string {
917
923
  const cls = kind === 'ins' ? 'diffins' : 'diffdel'
918
924
  const tdOpening = parseOpeningTagAt(html, cell.cellStart)
919
925
  if (!tdOpening) return html.slice(cell.cellStart, cell.cellEnd)
@@ -954,7 +960,7 @@ function wrapInlineTextRuns(content: string, kind: 'ins' | 'del'): string {
954
960
  while (j < content.length && content[j] !== '<') j++
955
961
  const text = content.slice(i, j)
956
962
  if (text.trim().length > 0) {
957
- out.push(`<${tag} class='${cls}'>${text}</${tag}>`)
963
+ out.push(wrapText(text, tag, cls))
958
964
  } else {
959
965
  out.push(text)
960
966
  }
@@ -1024,8 +1030,13 @@ function pairSimilarUnmatchedRows(
1024
1030
  oldHtml: string,
1025
1031
  newHtml: string
1026
1032
  ): Alignment[] {
1033
+ // Pre-compute row texts once; the similarity callback is invoked
1034
+ // O(D × I) times per unmatched run (every del × every ins), and
1035
+ // rowText walks every cell.
1036
+ const oldTexts = oldTable.rows.map(r => rowText(oldHtml, r))
1037
+ const newTexts = newTable.rows.map(r => rowText(newHtml, r))
1027
1038
  return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
1028
- rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
1039
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
1029
1040
  )
1030
1041
  }
1031
1042
 
@@ -1036,8 +1047,10 @@ function pairSimilarUnmatchedCells(
1036
1047
  oldHtml: string,
1037
1048
  newHtml: string
1038
1049
  ): Alignment[] {
1050
+ const oldTexts = oldRow.cells.map(c => cellText(oldHtml, c))
1051
+ const newTexts = newRow.cells.map(c => cellText(newHtml, c))
1039
1052
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
1040
- cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
1053
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
1041
1054
  )
1042
1055
  }
1043
1056
 
@@ -1135,14 +1148,6 @@ function pairSimilarUnmatched(
1135
1148
  * sufficient for the full range of legal-doc edits we see in
1136
1149
  * production tables.
1137
1150
  */
1138
- function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
1139
- return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow))
1140
- }
1141
-
1142
- function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
1143
- return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell))
1144
- }
1145
-
1146
1151
  function textSimilarity(a: string, b: string): number {
1147
1152
  if (a === b) return 1
1148
1153
  if (a.length === 0 || b.length === 0) return 0
@@ -1215,29 +1220,28 @@ function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
1215
1220
  }
1216
1221
  }
1217
1222
 
1223
+ // Backtrack and push; reverse at the end. `unshift` is O(n) per call
1224
+ // so the naive version was O(n²); push+reverse is O(n) total.
1218
1225
  const result: Alignment[] = []
1219
1226
  let i = m
1220
1227
  let j = n
1221
1228
  while (i > 0 || j > 0) {
1222
1229
  if (i > 0 && j > 0 && oldKeys[i - 1] === newKeys[j - 1]) {
1223
- result.unshift({ oldIdx: i - 1, newIdx: j - 1 })
1230
+ result.push({ oldIdx: i - 1, newIdx: j - 1 })
1224
1231
  i--
1225
1232
  j--
1226
1233
  } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
1227
- result.unshift({ oldIdx: null, newIdx: j - 1 })
1234
+ result.push({ oldIdx: null, newIdx: j - 1 })
1228
1235
  j--
1229
1236
  } else {
1230
- result.unshift({ oldIdx: i - 1, newIdx: null })
1237
+ result.push({ oldIdx: i - 1, newIdx: null })
1231
1238
  i--
1232
1239
  }
1233
1240
  }
1241
+ result.reverse()
1234
1242
  return result
1235
1243
  }
1236
1244
 
1237
- /**
1238
- * Returns the opening tag string with the given class injected. Existing
1239
- * `class` attributes are preserved and the new class appended.
1240
- */
1241
1245
  /**
1242
1246
  * Returns the opening tag with the given class injected. Locates the real
1243
1247
  * `class` attribute via attribute-aware walking (NOT a flat regex — that
@@ -108,9 +108,14 @@ describe('HtmlDiff — table operations matrix', () => {
108
108
  // still drifts.
109
109
  const rowCount = countMatches(result, /<tr[\s>]/g)
110
110
  expect(rowCount).toBe(4) // header + Party A + empty + Party B
111
- expect(result).toContain(
112
- "<tr class='diffins'><td class='diffins'></td><td class='diffins'></td><td class='diffins'></td><td class='diffins'></td></tr>"
113
- )
111
+ // The inserted empty row must be emitted with diffins on the <tr>
112
+ // and 4 empty diffins-marked cells. Asserted via regex (quote-
113
+ // agnostic, whitespace-tolerant) so an incidental change in
114
+ // attribute-quote style isn't flagged as a regression.
115
+ const emptyInsertedRow = result.match(/<tr class=['"]diffins['"]>(.*?)<\/tr>/)
116
+ expect(emptyInsertedRow).not.toBeNull()
117
+ const emptyCellCount = countMatches(emptyInsertedRow?.[1] ?? '', /<td class=['"]diffins['"]><\/td>/g)
118
+ expect(emptyCellCount).toBe(4)
114
119
  })
115
120
  })
116
121
  })