@createiq/htmldiff 1.0.5-beta.2 → 1.0.5-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.0.5-beta.2",
3
+ "version": "1.0.5-beta.4",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/TableDiff.ts CHANGED
@@ -379,20 +379,25 @@ function diffStructurallyAlignedTable(
379
379
  const oldKeys = oldTable.rows.map(row => rowKey(oldHtml, row))
380
380
  const newKeys = newTable.rows.map(row => rowKey(newHtml, row))
381
381
  const exactAlignment = lcsAlign(oldKeys, newKeys)
382
- const alignment = pairSimilarUnmatchedRows(exactAlignment, oldTable, newTable, oldHtml, newHtml)
382
+ const paired = pairSimilarUnmatchedRows(exactAlignment, oldTable, newTable, oldHtml, newHtml)
383
+ // Reorder so unpaired deleted rows appear at their *natural old-side
384
+ // position* — immediately after the preserved/paired row that came
385
+ // before them in old. Without this, runs of unpaired dels at low
386
+ // alignment indices end up emitted before any preserved row (the
387
+ // "deleted rows out of order" bug).
388
+ const alignment = orderAlignmentForEmission(paired)
383
389
 
384
390
  // Walk new's tableStart→tableEnd, substituting rows with their diffed
385
391
  // form so `<thead>`/`<tbody>` wrappers and inter-row whitespace are
386
392
  // preserved verbatim. Deleted rows (no position in new) are injected
387
- // inline at their alignment position. If new has no rows at all, fall
388
- // back to a from-scratch reconstruction so we still emit deleted rows.
393
+ // inline at the cursor's current position, which now corresponds to
394
+ // their natural old-side slot thanks to the reordering above. If new
395
+ // has no rows at all, fall back to a from-scratch reconstruction so
396
+ // we still emit deleted rows.
389
397
  if (newTable.rows.length === 0) {
390
398
  return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
391
399
  }
392
400
 
393
- // Emit the table header (`<table>` + any `<thead>`/`<tbody>` opening
394
- // text up to the first row) up-front so a leading run of deleted-only
395
- // alignments doesn't slip in before the table opens.
396
401
  const out: string[] = []
397
402
  out.push(newHtml.slice(newTable.tableStart, newTable.rows[0].rowStart))
398
403
  let cursor = newTable.rows[0].rowStart
@@ -407,8 +412,6 @@ function diffStructurallyAlignedTable(
407
412
  }
408
413
  cursor = newRow.rowEnd
409
414
  } else if (align.oldIdx !== null) {
410
- // Deleted row: inject inline at the current cursor (between the
411
- // previously emitted row and the next one in new).
412
415
  out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
413
416
  }
414
417
  }
@@ -416,6 +419,79 @@ function diffStructurallyAlignedTable(
416
419
  return out.join('')
417
420
  }
418
421
 
422
+ /**
423
+ * Reorders the alignment so emission produces rows in the visually-
424
+ * correct order. Each entry is assigned a fractional "position" in
425
+ * new's flow:
426
+ *
427
+ * • Preserved/paired (oldIdx, newIdx): position = newIdx.
428
+ * • Pure insert (null, newIdx): position = newIdx.
429
+ * • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
430
+ * before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
431
+ * they appear in old's row order. The +0.5 places dels BEFORE any
432
+ * insert at the same gap (insert at newIdx N1+1 has position N1+1
433
+ * which is > N1+0.5), giving the natural "delete first, insert
434
+ * second" reading order at a replaced position.
435
+ *
436
+ * This handles the full range:
437
+ * • Run of unpaired dels at the start (no preserved predecessor):
438
+ * position -0.5, sorted by oldIdx.
439
+ * • Dels in the middle: positioned right after their preceding
440
+ * preserved row.
441
+ * • Dels at the end (no preserved successor): positioned after the
442
+ * last preserved row.
443
+ *
444
+ * Without this reordering, a run of unpaired deletes at low alignment
445
+ * indices got emitted at cursor = first-new-row position — putting
446
+ * all deletes before any preserved row in the output, regardless of
447
+ * where they came from in old.
448
+ */
449
+ function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
450
+ const preserved: Array<{ oldIdx: number; newIdx: number }> = []
451
+ for (const a of alignment) {
452
+ if (a.oldIdx !== null && a.newIdx !== null) {
453
+ preserved.push({ oldIdx: a.oldIdx, newIdx: a.newIdx })
454
+ }
455
+ }
456
+ preserved.sort((a, b) => a.oldIdx - b.oldIdx)
457
+
458
+ // For a deleted row with oldIdx K, return the newIdx of the preserved
459
+ // entry with the largest oldIdx less than K, or -1 if none.
460
+ function newIdxOfPreservedBefore(oldIdx: number): number {
461
+ let result = -1
462
+ for (const p of preserved) {
463
+ if (p.oldIdx >= oldIdx) break
464
+ result = p.newIdx
465
+ }
466
+ return result
467
+ }
468
+
469
+ // Decorate each alignment with a fractional position. We use
470
+ // (primary, secondary) tuples so dels at the same gap sort by oldIdx
471
+ // (in old's row order) and inserts at the same newIdx stay stable.
472
+ const decorated = alignment.map((a, i) => {
473
+ let primary: number
474
+ let secondary: number
475
+ if (a.newIdx !== null) {
476
+ primary = a.newIdx
477
+ secondary = a.oldIdx === null ? 1 : 0 // preserved before pure-insert at same newIdx (rare)
478
+ } else {
479
+ // Pure delete
480
+ primary = newIdxOfPreservedBefore(a.oldIdx as number) + 0.5
481
+ secondary = a.oldIdx as number
482
+ }
483
+ return { entry: a, primary, secondary, originalIdx: i }
484
+ })
485
+
486
+ decorated.sort((a, b) => {
487
+ if (a.primary !== b.primary) return a.primary - b.primary
488
+ if (a.secondary !== b.secondary) return a.secondary - b.secondary
489
+ return a.originalIdx - b.originalIdx // stable
490
+ })
491
+
492
+ return decorated.map(d => d.entry)
493
+ }
494
+
419
495
  function rebuildStructurallyAlignedTable(
420
496
  oldHtml: string,
421
497
  newHtml: string,
@@ -469,13 +545,165 @@ function diffPreservedRow(
469
545
  }
470
546
  // Cell counts differ. Try to interpret it as a horizontal merge/split via
471
547
  // colspan first — preserving the new structure with `class='mod colspan'`
472
- // on each affected cell. Falls back to the cell-LCS path if the cells
473
- // don't align cleanly on logical column positions.
548
+ // on each affected cell.
474
549
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
475
550
  if (colspanAligned !== null) return colspanAligned
551
+ // For a column add/delete (cell counts differ by 1 to MAX_COLUMN_DELTA),
552
+ // detect the position(s) via positional similarity scan and align the
553
+ // remaining cells positionally. This handles the case where a column
554
+ // was added AND a different cell got an unrelated content edit — the
555
+ // edited cell still aligns by position rather than getting orphaned by
556
+ // the cell-LCS exact-match.
557
+ const delta = newRow.cells.length - oldRow.cells.length
558
+ // For column add/delete (cell counts differ), find the best insertion
559
+ // or deletion positions via positional similarity scan and align the
560
+ // remaining cells positionally. This handles content-edit alongside
561
+ // column-add by keeping the edited cell in its column position rather
562
+ // than orphaning it via the cell-LCS exact match.
563
+ // Guardrail: combinatorial search is C(max(oldCount, newCount), k); we cap to avoid
564
+ // explosion on very wide tables. The cap is generous for real legal
565
+ // schedules; anything above falls through to cell-LCS.
566
+ const absDelta = Math.abs(delta)
567
+ if (
568
+ absDelta > 0 &&
569
+ absDelta <= MAX_COLUMN_DELTA &&
570
+ Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
571
+ ) {
572
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
573
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
574
+ }
476
575
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
477
576
  }
478
577
 
578
+ const MAX_COLUMN_DELTA = 6
579
+ const MAX_COLUMN_SEARCH_WIDTH = 40
580
+
581
+ /**
582
+ * For a row where new has K more cells than old, find the K column
583
+ * positions in new where cells were inserted by scanning all C(newCount,
584
+ * K) combinations and picking the one that maximises positional content
585
+ * similarity with the remaining cells. The inserted cells are emitted
586
+ * with diff markers; the rest are aligned positionally with content
587
+ * diff for matched pairs.
588
+ */
589
+ function diffMultiColumnAddRow(
590
+ oldHtml: string,
591
+ newHtml: string,
592
+ oldRow: RowRange,
593
+ newRow: RowRange,
594
+ k: number,
595
+ diffCell: DiffCellFn
596
+ ): string {
597
+ const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml)
598
+ const inserted = new Set(insertedPositions)
599
+ const out: string[] = [rowHeaderSlice(newHtml, newRow)]
600
+ let oldIdx = 0
601
+ for (let c = 0; c < newRow.cells.length; c++) {
602
+ if (inserted.has(c)) {
603
+ out.push(emitFullCell(newHtml, newRow.cells[c], 'ins', diffCell))
604
+ } else {
605
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell))
606
+ oldIdx++
607
+ }
608
+ }
609
+ out.push('</tr>')
610
+ return out.join('')
611
+ }
612
+
613
+ function diffMultiColumnDeleteRow(
614
+ oldHtml: string,
615
+ newHtml: string,
616
+ oldRow: RowRange,
617
+ newRow: RowRange,
618
+ k: number,
619
+ diffCell: DiffCellFn
620
+ ): string {
621
+ const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml)
622
+ const deleted = new Set(deletedPositions)
623
+ const out: string[] = [rowHeaderSlice(newHtml, newRow)]
624
+ let newIdx = 0
625
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
626
+ if (deleted.has(oldIdx)) {
627
+ out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell))
628
+ continue
629
+ }
630
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
631
+ newIdx++
632
+ }
633
+ out.push('</tr>')
634
+ return out.join('')
635
+ }
636
+
637
+ function findBestColumnInsertPositions(
638
+ oldRow: RowRange,
639
+ newRow: RowRange,
640
+ k: number,
641
+ oldHtml: string,
642
+ newHtml: string
643
+ ): number[] {
644
+ let bestPositions: number[] = []
645
+ let bestScore = -1
646
+ for (const combo of combinationsOfRange(newRow.cells.length, k)) {
647
+ const inserted = new Set(combo)
648
+ let score = 0
649
+ let oldIdx = 0
650
+ for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
651
+ if (inserted.has(newIdx)) continue
652
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
653
+ oldIdx++
654
+ }
655
+ if (score > bestScore) {
656
+ bestScore = score
657
+ bestPositions = combo
658
+ }
659
+ }
660
+ return bestPositions
661
+ }
662
+
663
+ function findBestColumnDeletePositions(
664
+ oldRow: RowRange,
665
+ newRow: RowRange,
666
+ k: number,
667
+ oldHtml: string,
668
+ newHtml: string
669
+ ): number[] {
670
+ let bestPositions: number[] = []
671
+ let bestScore = -1
672
+ for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
673
+ const deleted = new Set(combo)
674
+ let score = 0
675
+ let newIdx = 0
676
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
677
+ if (deleted.has(oldIdx)) continue
678
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
679
+ newIdx++
680
+ }
681
+ if (score > bestScore) {
682
+ bestScore = score
683
+ bestPositions = combo
684
+ }
685
+ }
686
+ return bestPositions
687
+ }
688
+
689
+ /**
690
+ * Yields all sorted-ascending combinations of `k` distinct integers
691
+ * from [0, n); yields nothing when `k === 0` or `k > n`. Iterative
692
+ * implementation avoids recursion overhead and keeps memory at O(k).
693
+ */
694
+ function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
695
+ if (k === 0 || k > n) return
696
+ const indices = Array.from({ length: k }, (_, i) => i)
697
+ while (true) {
698
+ yield indices.slice()
699
+ let i = k - 1
700
+ while (i >= 0 && indices[i] === n - k + i) i--
701
+ if (i < 0) return
702
+ indices[i]++
703
+ for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1
704
+ }
705
+ }
706
+
479
707
  /**
480
708
  * Try to align cells by logical column position (sum of colspans). When
481
709
  * one side has a colspan'd cell that absorbs multiple cells on the other
@@ -887,18 +1115,41 @@ function pairSimilarUnmatched(
887
1115
  }
888
1116
 
889
1117
  /**
890
- * Character-level similarity using shared prefix + suffix as a fraction
891
- * of the longer string. Catches "single edit somewhere in a long row"
892
- * (which token-Jaccard misses on short rows) while still correctly
893
- * rejecting rows with no positional overlap. HTML tags are stripped to
894
- * keep the comparison content-focused.
1118
+ * Combined similarity metric used for both row-level and cell-level
1119
+ * fuzzy pairing. Returns the MAX of two complementary metrics:
1120
+ *
1121
+ * 1. **Character prefix+suffix similarity** — fraction of the longer
1122
+ * string covered by shared prefix + shared suffix. Catches small
1123
+ * edits in the middle of a string (one word changed in a row).
1124
+ * Misses cases where the bulk of common content is in the middle
1125
+ * and the ends differ.
1126
+ *
1127
+ * 2. **Token Jaccard similarity** — intersection-over-union of the
1128
+ * whitespace-split tokens. Catches "most of the content is the
1129
+ * same but bookended by different bits" — e.g. a row whose only
1130
+ * edit is a column added at the start and another at the end,
1131
+ * where the ~50 chars in the middle that DO match would be
1132
+ * invisible to prefix+suffix.
1133
+ *
1134
+ * Either metric exceeding the threshold means pair. Neither alone is
1135
+ * sufficient for the full range of legal-doc edits we see in
1136
+ * production tables.
895
1137
  */
896
1138
  function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
897
- const a = rowText(oldHtml, oldRow)
898
- const b = rowText(newHtml, newRow)
1139
+ return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow))
1140
+ }
1141
+
1142
+ function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
1143
+ return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell))
1144
+ }
1145
+
1146
+ function textSimilarity(a: string, b: string): number {
899
1147
  if (a === b) return 1
900
1148
  if (a.length === 0 || b.length === 0) return 0
1149
+ return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b))
1150
+ }
901
1151
 
1152
+ function charPrefixSuffixSimilarity(a: string, b: string): number {
902
1153
  let prefix = 0
903
1154
  const minLen = Math.min(a.length, b.length)
904
1155
  while (prefix < minLen && a[prefix] === b[prefix]) prefix++
@@ -915,6 +1166,18 @@ function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newH
915
1166
  return (prefix + suffix) / Math.max(a.length, b.length)
916
1167
  }
917
1168
 
1169
+ function tokenJaccardSimilarity(a: string, b: string): number {
1170
+ const tokensA = new Set(a.split(/\s+/).filter(Boolean))
1171
+ const tokensB = new Set(b.split(/\s+/).filter(Boolean))
1172
+ if (tokensA.size === 0 && tokensB.size === 0) return 1
1173
+ let intersection = 0
1174
+ for (const t of tokensA) {
1175
+ if (tokensB.has(t)) intersection++
1176
+ }
1177
+ const union = tokensA.size + tokensB.size - intersection
1178
+ return union === 0 ? 0 : intersection / union
1179
+ }
1180
+
918
1181
  function rowText(html: string, row: RowRange): string {
919
1182
  const parts: string[] = []
920
1183
  for (const cell of row.cells) {
@@ -923,34 +1186,6 @@ function rowText(html: string, row: RowRange): string {
923
1186
  return parts.join(' ').replace(/\s+/g, ' ').trim().toLowerCase()
924
1187
  }
925
1188
 
926
- /**
927
- * Character-level prefix+suffix similarity for a single cell's text
928
- * content. Same metric as rowSimilarity, scoped to one cell so we can
929
- * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
930
- * a column add in the same row).
931
- */
932
- function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
933
- const a = cellText(oldHtml, oldCell)
934
- const b = cellText(newHtml, newCell)
935
- if (a === b) return 1
936
- if (a.length === 0 || b.length === 0) return 0
937
-
938
- let prefix = 0
939
- const minLen = Math.min(a.length, b.length)
940
- while (prefix < minLen && a[prefix] === b[prefix]) prefix++
941
-
942
- let suffix = 0
943
- while (
944
- suffix < a.length - prefix &&
945
- suffix < b.length - prefix &&
946
- a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
947
- ) {
948
- suffix++
949
- }
950
-
951
- return (prefix + suffix) / Math.max(a.length, b.length)
952
- }
953
-
954
1189
  function cellText(html: string, cell: CellRange): string {
955
1190
  return html
956
1191
  .slice(cell.contentStart, cell.contentEnd)