@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +171 -11
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.mjs +171 -11
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/TableDiff.ts +258 -26
- package/test/HtmlDiff.tables.matrix.spec.ts +327 -0
- package/test/HtmlDiff.tables.spec.ts +39 -0
package/package.json
CHANGED
package/src/TableDiff.ts
CHANGED
|
@@ -469,13 +469,165 @@ function diffPreservedRow(
|
|
|
469
469
|
}
|
|
470
470
|
// Cell counts differ. Try to interpret it as a horizontal merge/split via
|
|
471
471
|
// colspan first — preserving the new structure with `class='mod colspan'`
|
|
472
|
-
// on each affected cell.
|
|
473
|
-
// don't align cleanly on logical column positions.
|
|
472
|
+
// on each affected cell.
|
|
474
473
|
const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
475
474
|
if (colspanAligned !== null) return colspanAligned
|
|
475
|
+
// For a column add/delete (cell count differs by at most MAX_COLUMN_DELTA),
|
|
476
|
+
// detect the position via positional similarity scan and align the
|
|
477
|
+
// remaining cells positionally. This handles the case where a column
|
|
478
|
+
// was added AND a different cell got an unrelated content edit — the
|
|
479
|
+
// edited cell still aligns by position rather than getting orphaned by
|
|
480
|
+
// the cell-LCS exact-match.
|
|
481
|
+
const delta = newRow.cells.length - oldRow.cells.length
|
|
482
|
+
// For column add/delete (cell counts differ), find the best insertion
|
|
483
|
+
// or deletion positions via positional similarity scan and align the
|
|
484
|
+
// remaining cells positionally. This handles content-edit alongside
|
|
485
|
+
// column-add by keeping the edited cell in its column position rather
|
|
486
|
+
// than orphaning it via the cell-LCS exact match.
|
|
487
|
+
// Guardrail: combinatorial search is C(newCount, k); we cap to avoid
|
|
488
|
+
// explosion on very wide tables. The cap is generous for real legal
|
|
489
|
+
// schedules; anything above falls through to cell-LCS.
|
|
490
|
+
const absDelta = Math.abs(delta)
|
|
491
|
+
if (
|
|
492
|
+
absDelta > 0 &&
|
|
493
|
+
absDelta <= MAX_COLUMN_DELTA &&
|
|
494
|
+
Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
|
|
495
|
+
) {
|
|
496
|
+
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
|
|
497
|
+
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
|
|
498
|
+
}
|
|
476
499
|
return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
477
500
|
}
|
|
478
501
|
|
|
502
|
+
// Largest cell-count difference between the old and new row for which diffPreservedRow attempts the combinatorial column add/delete search.
const MAX_COLUMN_DELTA = 6
|
|
503
|
+
// Widest row (max of old/new cell count) for which diffPreservedRow attempts the combinatorial search; wider rows fall through to diffStructurallyAlignedRow.
const MAX_COLUMN_SEARCH_WIDTH = 40
|
|
504
|
+
|
|
505
|
+
/**
 * Renders a row in which the new version has `k` more cells than the old
 * one.
 *
 * The `k` inserted column positions are chosen by
 * `findBestColumnInsertPositions`. Cells at those positions are emitted
 * whole with the `'ins'` marker; every other new cell is paired, in order,
 * with the next not-yet-consumed old cell and emitted as a content diff.
 *
 * @param oldHtml - html the old row's ranges index into
 * @param newHtml - html the new row's ranges index into
 * @param oldRow - the old row
 * @param newRow - the new row (supplies the `<tr>` header of the output)
 * @param k - number of inserted cells (new cell count minus old cell count)
 * @param diffCell - cell-content differ forwarded to the emit helpers
 * @returns the row's html, terminated by `</tr>`
 */
function diffMultiColumnAddRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const insertedColumns = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml))
  const rowOpen = rowHeaderSlice(newHtml, newRow)

  // Cursor over old cells; only advances when a new cell is paired with an old one.
  let oldCursor = 0
  const renderedCells = newRow.cells.map((newCell, column) => {
    if (insertedColumns.has(column)) {
      return emitFullCell(newHtml, newCell, 'ins', diffCell)
    }
    const oldCell = oldRow.cells[oldCursor]
    oldCursor += 1
    return emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell)
  })

  return rowOpen + renderedCells.join('') + '</tr>'
}
|
|
536
|
+
|
|
537
|
+
/**
 * Renders a row in which the old version has `k` more cells than the new
 * one.
 *
 * The `k` deleted column positions are chosen by
 * `findBestColumnDeletePositions`. Old cells at those positions are
 * emitted whole with the `'del'` marker; every other old cell is paired,
 * in order, with the next not-yet-consumed new cell and emitted as a
 * content diff.
 *
 * @param oldHtml - html the old row's ranges index into
 * @param newHtml - html the new row's ranges index into
 * @param oldRow - the old row
 * @param newRow - the new row (supplies the `<tr>` header of the output)
 * @param k - number of deleted cells (old cell count minus new cell count)
 * @param diffCell - cell-content differ forwarded to the emit helpers
 * @returns the row's html, terminated by `</tr>`
 */
function diffMultiColumnDeleteRow(
  oldHtml: string,
  newHtml: string,
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  diffCell: DiffCellFn
): string {
  const removedColumns = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml))
  const rowOpen = rowHeaderSlice(newHtml, newRow)

  // Cursor over new cells; only advances when an old cell survives into new.
  let newCursor = 0
  const renderedCells = oldRow.cells.map((oldCell, column) => {
    if (removedColumns.has(column)) {
      return emitFullCell(oldHtml, oldCell, 'del', diffCell)
    }
    const newCell = newRow.cells[newCursor]
    newCursor += 1
    return emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell)
  })

  return rowOpen + renderedCells.join('') + '</tr>'
}
|
|
560
|
+
|
|
561
|
+
/**
 * Chooses which `k` positions of `newRow` are most plausibly inserted
 * columns.
 *
 * Every sorted combination of `k` positions from `combinationsOfRange` is
 * scored by pairing the remaining new cells, in order, with the old cells
 * and summing `cellSimilarity` over the pairs. The first combination with
 * the strictly highest score wins, so ties resolve to the earliest
 * combination in iteration order.
 *
 * Many combinations share the same (old, new) pairs, and each
 * `cellSimilarity` call re-normalises both cells' text. Pair scores are
 * therefore memoised for the duration of the search; the summation order
 * is unchanged, so the chosen positions are identical to an unmemoised
 * scan.
 *
 * @param oldRow - the old row (fewer cells)
 * @param newRow - the new row (more cells)
 * @param k - number of inserted cells to locate
 * @param oldHtml - html the old row's ranges index into
 * @param newHtml - html the new row's ranges index into
 * @returns ascending new-row indices of the inserted cells, or `[]` when
 *   `combinationsOfRange` yields nothing
 */
function findBestColumnInsertPositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  const newCount = newRow.cells.length

  // Memo keyed by a flat (oldIdx, newIdx) index; newIdx < newCount keeps keys unique.
  const similarityCache = new Map<number, number>()
  const pairSimilarity = (oldIdx: number, newIdx: number): number => {
    const key = oldIdx * newCount + newIdx
    const cached = similarityCache.get(key)
    if (cached !== undefined) return cached
    const computed = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
    similarityCache.set(key, computed)
    return computed
  }

  let bestPositions: number[] = []
  let bestScore = -1
  for (const combo of combinationsOfRange(newCount, k)) {
    let score = 0
    let oldIdx = 0
    // `combo` is ascending, so a single pointer replaces a per-combination Set.
    let nextInserted = 0
    for (let newIdx = 0; newIdx < newCount; newIdx++) {
      if (nextInserted < combo.length && combo[nextInserted] === newIdx) {
        nextInserted++
        continue
      }
      score += pairSimilarity(oldIdx, newIdx)
      oldIdx++
    }
    if (score > bestScore) {
      bestScore = score
      bestPositions = combo
    }
  }
  return bestPositions
}
|
|
586
|
+
|
|
587
|
+
/**
 * Chooses which `k` positions of `oldRow` are most plausibly deleted
 * columns.
 *
 * Every sorted combination of `k` positions from `combinationsOfRange` is
 * scored by pairing the remaining old cells, in order, with the new cells
 * and summing `cellSimilarity` over the pairs. The first combination with
 * the strictly highest score wins, so ties resolve to the earliest
 * combination in iteration order.
 *
 * Many combinations share the same (old, new) pairs, and each
 * `cellSimilarity` call re-normalises both cells' text. Pair scores are
 * therefore memoised for the duration of the search; the summation order
 * is unchanged, so the chosen positions are identical to an unmemoised
 * scan.
 *
 * @param oldRow - the old row (more cells)
 * @param newRow - the new row (fewer cells)
 * @param k - number of deleted cells to locate
 * @param oldHtml - html the old row's ranges index into
 * @param newHtml - html the new row's ranges index into
 * @returns ascending old-row indices of the deleted cells, or `[]` when
 *   `combinationsOfRange` yields nothing
 */
function findBestColumnDeletePositions(
  oldRow: RowRange,
  newRow: RowRange,
  k: number,
  oldHtml: string,
  newHtml: string
): number[] {
  const oldCount = oldRow.cells.length

  // Memo keyed by a flat (oldIdx, newIdx) index; newIdx never exceeds oldIdx
  // (and so stays below oldCount), which keeps keys unique.
  const similarityCache = new Map<number, number>()
  const pairSimilarity = (oldIdx: number, newIdx: number): number => {
    const key = oldIdx * oldCount + newIdx
    const cached = similarityCache.get(key)
    if (cached !== undefined) return cached
    const computed = cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
    similarityCache.set(key, computed)
    return computed
  }

  let bestPositions: number[] = []
  let bestScore = -1
  for (const combo of combinationsOfRange(oldCount, k)) {
    let score = 0
    let newIdx = 0
    // `combo` is ascending, so a single pointer replaces a per-combination Set.
    let nextDeleted = 0
    for (let oldIdx = 0; oldIdx < oldCount; oldIdx++) {
      if (nextDeleted < combo.length && combo[nextDeleted] === oldIdx) {
        nextDeleted++
        continue
      }
      score += pairSimilarity(oldIdx, newIdx)
      newIdx++
    }
    if (score > bestScore) {
      bestScore = score
      bestPositions = combo
    }
  }
  return bestPositions
}
|
|
612
|
+
|
|
613
|
+
/**
 * Lazily enumerates every way of choosing `k` distinct integers from
 * `[0, n)`. Each combination is yielded as a fresh ascending array, and
 * combinations come out in lexicographic order.
 *
 * Yields nothing when `k` is 0 or when `k` exceeds `n`. The walk is
 * iterative: a single working array of `k` indices is advanced in place
 * and copied on each yield.
 */
function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
  if (k === 0 || k > n) return

  // Start from the lexicographically smallest combination: 0, 1, …, k-1.
  const current: number[] = []
  for (let slot = 0; slot < k; slot++) current.push(slot)

  for (;;) {
    yield [...current]

    // Find the rightmost slot that has not yet reached its maximum value.
    let pivot = k - 1
    while (pivot >= 0 && current[pivot] === n - k + pivot) pivot--
    if (pivot < 0) return

    // Advance it, then reset everything to its right to the tightest run.
    current[pivot]++
    for (let slot = pivot + 1; slot < k; slot++) {
      current[slot] = current[slot - 1] + 1
    }
  }
}
|
|
630
|
+
|
|
479
631
|
/**
|
|
480
632
|
* Try to align cells by logical column position (sum of colspans). When
|
|
481
633
|
* one side has a colspan'd cell that absorbs multiple cells on the other
|
|
@@ -619,7 +771,12 @@ function diffStructurallyAlignedRow(
|
|
|
619
771
|
): string {
|
|
620
772
|
const oldKeys = oldRow.cells.map(cell => cellKey(oldHtml, cell))
|
|
621
773
|
const newKeys = newRow.cells.map(cell => cellKey(newHtml, cell))
|
|
622
|
-
const
|
|
774
|
+
const exactAlignment = lcsAlign(oldKeys, newKeys)
|
|
775
|
+
// After exact LCS, fuzzy-pair adjacent unmatched old/new cells whose
|
|
776
|
+
// content is similar enough — so a content-edit cell alongside a
|
|
777
|
+
// column-add in the same row produces a content diff for the edited
|
|
778
|
+
// cell rather than a phantom delete + insert + extra cell.
|
|
779
|
+
const alignment = pairSimilarUnmatchedCells(exactAlignment, oldRow, newRow, oldHtml, newHtml)
|
|
623
780
|
|
|
624
781
|
const out: string[] = []
|
|
625
782
|
// Use new's <tr> if it exists; otherwise old's.
|
|
@@ -765,9 +922,17 @@ interface Alignment {
|
|
|
765
922
|
newIdx: number | null
|
|
766
923
|
}
|
|
767
924
|
|
|
768
|
-
/**
|
|
925
|
+
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
769
926
|
const ROW_FUZZY_THRESHOLD = 0.5
|
|
770
927
|
|
|
928
|
+
/**
|
|
929
|
+
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
930
|
+
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
931
|
+
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
932
|
+
* works for both granularities in practice.
|
|
933
|
+
*/
|
|
934
|
+
const CELL_FUZZY_THRESHOLD = 0.5
|
|
935
|
+
|
|
771
936
|
/**
|
|
772
937
|
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
773
938
|
* inserted" (or vice versa) and pair entries whose content is similar
|
|
@@ -783,14 +948,42 @@ function pairSimilarUnmatchedRows(
|
|
|
783
948
|
oldHtml: string,
|
|
784
949
|
newHtml: string
|
|
785
950
|
): Alignment[] {
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
951
|
+
return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
|
|
952
|
+
rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)
|
|
953
|
+
)
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
/**
 * Cell-level wrapper around `pairSimilarUnmatched`: runs the generic
 * pairing pass over `alignment` using `CELL_FUZZY_THRESHOLD` and
 * `cellSimilarity` between the referenced old and new cells.
 *
 * @param alignment - alignment entries whose indices refer to cells of the two rows
 * @param oldRow - row whose cells the old indices refer to
 * @param newRow - row whose cells the new indices refer to
 * @param oldHtml - html the old row's ranges index into
 * @param newHtml - html the new row's ranges index into
 * @returns the alignment produced by `pairSimilarUnmatched`
 */
function pairSimilarUnmatchedCells(
  alignment: Alignment[],
  oldRow: RowRange,
  newRow: RowRange,
  oldHtml: string,
  newHtml: string
): Alignment[] {
  const similarityOfCells = (oldIdx: number, newIdx: number): number => {
    const oldCell = oldRow.cells[oldIdx]
    const newCell = newRow.cells[newIdx]
    return cellSimilarity(oldCell, newCell, oldHtml, newHtml)
  }
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, similarityOfCells)
}
|
|
967
|
+
|
|
968
|
+
/**
|
|
969
|
+
* Identify pairings inside each unmatched-only run, then build the output
|
|
970
|
+
* alignment by walking the original and substituting paired entries at
|
|
971
|
+
* the *ins position* (not the del position). This keeps the result
|
|
972
|
+
* monotonic in newIdx — critical because the cursor-based emission
|
|
973
|
+
* downstream walks new's html in order. Emitting at the del position
|
|
974
|
+
* would be fine when del<ins in the alignment array (the typical case),
|
|
975
|
+
* but can violate monotonicity when there are mixed unpaired entries in
|
|
976
|
+
* between (column-add + row-add together, content-edit + column-add,
|
|
977
|
+
* etc.).
|
|
978
|
+
*
|
|
979
|
+
* Generic over what's being paired — works for both rows (by full row
|
|
980
|
+
* content similarity) and cells (by per-cell content similarity).
|
|
981
|
+
*/
|
|
982
|
+
function pairSimilarUnmatched(
|
|
983
|
+
alignment: Alignment[],
|
|
984
|
+
threshold: number,
|
|
985
|
+
similarity: (oldIdx: number, newIdx: number) => number
|
|
986
|
+
): Alignment[] {
|
|
794
987
|
const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
|
|
795
988
|
let i = 0
|
|
796
989
|
while (i < alignment.length) {
|
|
@@ -812,15 +1005,10 @@ function pairSimilarUnmatchedRows(
|
|
|
812
1005
|
const usedIns = new Set<number>()
|
|
813
1006
|
for (const di of delIndices) {
|
|
814
1007
|
let bestIi = -1
|
|
815
|
-
let bestSim =
|
|
1008
|
+
let bestSim = threshold
|
|
816
1009
|
for (const ii of insIndices) {
|
|
817
1010
|
if (usedIns.has(ii)) continue
|
|
818
|
-
const sim =
|
|
819
|
-
oldTable.rows[alignment[di].oldIdx as number],
|
|
820
|
-
newTable.rows[alignment[ii].newIdx as number],
|
|
821
|
-
oldHtml,
|
|
822
|
-
newHtml
|
|
823
|
-
)
|
|
1011
|
+
const sim = similarity(alignment[di].oldIdx as number, alignment[ii].newIdx as number)
|
|
824
1012
|
if (sim > bestSim) {
|
|
825
1013
|
bestSim = sim
|
|
826
1014
|
bestIi = ii
|
|
@@ -851,18 +1039,41 @@ function pairSimilarUnmatchedRows(
|
|
|
851
1039
|
}
|
|
852
1040
|
|
|
853
1041
|
/**
|
|
854
|
-
*
|
|
855
|
-
*
|
|
856
|
-
*
|
|
857
|
-
*
|
|
858
|
-
*
|
|
1042
|
+
* Combined similarity metric used for both row-level and cell-level
|
|
1043
|
+
* fuzzy pairing. Returns the MAX of two complementary metrics:
|
|
1044
|
+
*
|
|
1045
|
+
* 1. **Character prefix+suffix similarity** — fraction of the longer
|
|
1046
|
+
* string covered by shared prefix + shared suffix. Catches small
|
|
1047
|
+
* edits in the middle of a string (one word changed in a row).
|
|
1048
|
+
* Misses cases where the bulk of common content is in the middle
|
|
1049
|
+
* and the ends differ.
|
|
1050
|
+
*
|
|
1051
|
+
* 2. **Token Jaccard similarity** — intersection-over-union of the
|
|
1052
|
+
* whitespace-split tokens. Catches "most of the content is the
|
|
1053
|
+
* same but bookended by different bits" — e.g. a row whose only
|
|
1054
|
+
* edit is a column added at the start and another at the end,
|
|
1055
|
+
* where the ~50 chars in the middle that DO match would be
|
|
1056
|
+
* invisible to prefix+suffix.
|
|
1057
|
+
*
|
|
1058
|
+
* Either metric exceeding the threshold means pair. Neither alone is
|
|
1059
|
+
* sufficient for the full range of legal-doc edits we see in
|
|
1060
|
+
* production tables.
|
|
859
1061
|
*/
|
|
860
1062
|
function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
  // Reduce each row to its text via rowText, then score with the shared text metric.
  const oldText = rowText(oldHtml, oldRow)
  const newText = rowText(newHtml, newRow)
  return textSimilarity(oldText, newText)
}
|
|
1065
|
+
|
|
1066
|
+
/**
 * Similarity of two cells: each cell is reduced to its text via `cellText`
 * and the pair is scored with `textSimilarity`.
 */
function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
  const oldText = cellText(oldHtml, oldCell)
  const newText = cellText(newHtml, newCell)
  return textSimilarity(oldText, newText)
}
|
|
1069
|
+
|
|
1070
|
+
/**
 * Scores how alike two strings are. Identical strings score 1; if exactly
 * one side is empty the score is 0. Otherwise the score is the larger of
 * the character prefix+suffix metric and the token Jaccard metric.
 */
function textSimilarity(a: string, b: string): number {
  if (a === b) return 1

  const eitherEmpty = a.length === 0 || b.length === 0
  if (eitherEmpty) return 0

  const byCharacters = charPrefixSuffixSimilarity(a, b)
  const byTokens = tokenJaccardSimilarity(a, b)
  return Math.max(byCharacters, byTokens)
}
|
|
865
1075
|
|
|
1076
|
+
function charPrefixSuffixSimilarity(a: string, b: string): number {
|
|
866
1077
|
let prefix = 0
|
|
867
1078
|
const minLen = Math.min(a.length, b.length)
|
|
868
1079
|
while (prefix < minLen && a[prefix] === b[prefix]) prefix++
|
|
@@ -879,6 +1090,18 @@ function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newH
|
|
|
879
1090
|
return (prefix + suffix) / Math.max(a.length, b.length)
|
|
880
1091
|
}
|
|
881
1092
|
|
|
1093
|
+
/**
 * Jaccard index of the two strings' whitespace-separated token sets:
 * |A ∩ B| / |A ∪ B|. Duplicate tokens count once. Two strings with no
 * tokens at all score 1.
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokenize = (text: string): Set<string> => new Set(text.split(/\s+/).filter(token => token.length > 0))
  const left = tokenize(a)
  const right = tokenize(b)
  if (left.size === 0 && right.size === 0) return 1

  let shared = 0
  left.forEach(token => {
    if (right.has(token)) shared += 1
  })

  // |A ∪ B| by inclusion–exclusion.
  const unionSize = left.size + right.size - shared
  return unionSize === 0 ? 0 : shared / unionSize
}
|
|
1104
|
+
|
|
882
1105
|
function rowText(html: string, row: RowRange): string {
|
|
883
1106
|
const parts: string[] = []
|
|
884
1107
|
for (const cell of row.cells) {
|
|
@@ -887,6 +1110,15 @@ function rowText(html: string, row: RowRange): string {
|
|
|
887
1110
|
return parts.join(' ').replace(/\s+/g, ' ').trim().toLowerCase()
|
|
888
1111
|
}
|
|
889
1112
|
|
|
1113
|
+
/**
 * Extracts a cell's comparable text: takes the html between the cell's
 * `contentStart` and `contentEnd`, replaces each tag with a space,
 * collapses whitespace runs to single spaces, trims, and lower-cases.
 */
function cellText(html: string, cell: CellRange): string {
  const inner = html.slice(cell.contentStart, cell.contentEnd)
  const withoutTags = inner.replace(/<[^>]+>/g, ' ')
  const collapsed = withoutTags.replace(/\s+/g, ' ').trim()
  return collapsed.toLowerCase()
}
|
|
1121
|
+
|
|
890
1122
|
/**
|
|
891
1123
|
* Standard LCS alignment: walks both sequences and emits a list of pairs
|
|
892
1124
|
* where `(oldIdx, newIdx)` are both set for matching positions, and one
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest'
|
|
2
|
+
|
|
3
|
+
import HtmlDiff from '../src/HtmlDiff'
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Exhaustive matrix of common table operations and their pairwise
|
|
7
|
+
* combinations. Each case runs HtmlDiff.execute and asserts structural
|
|
8
|
+
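 * invariants on the output (note: `assertStructurallyValid` currently enforces only tag balance, the per-row cell-count cap and non-empty output; the class-marker and new-cell-content bullets below are not yet checked):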
|
|
9
|
+
*
|
|
10
|
+
* • Every `<tr>` opens and closes
|
|
11
|
+
* • No row has more cells than max(old-row, new-row) cell count
|
|
12
|
+
* (accounting for colspan)
|
|
13
|
+
* • All `<ins>`/`<del>` tags balance
|
|
14
|
+
* • Class markers are coherent (a `<tr class='diffins'>` row's cells
|
|
15
|
+
* all have ins-marked content or are empty; a deleted row's cells
|
|
16
|
+
* all have del-marked content or are empty)
|
|
17
|
+
* • The cell content of every `<td>` from new appears somewhere in
|
|
18
|
+
* the output (we don't silently drop cells)
|
|
19
|
+
*
|
|
20
|
+
* The matrix is built combinatorially — single operations × single
|
|
21
|
+
* operations — so a regression in any pairwise combination surfaces
|
|
22
|
+
* here even if no test was added for that exact pair.
|
|
23
|
+
*/
|
|
24
|
+
describe('HtmlDiff — table operations matrix', () => {
  describe('single operations on a 3×3 base', () => {
    // One test per operation, each applied to a fresh copy of the base table.
    allSingleOperations().forEach(operation => {
      it(`${operation.name} produces structurally valid output`, () => {
        const original = baseTable3x3()
        const after = operation.apply(original)
        const before = renderTable(original)
        const diff = HtmlDiff.execute(before, after)
        assertStructurallyValid(diff, before, after, operation.name)
      })
    })
  })

  describe('pairwise combinations on a 3×3 base', () => {
    const operations = allSingleOperations()
    for (const first of operations) {
      for (const second of operations) {
        // Skip an operation paired with itself, and any pair canCompose rejects
        // (e.g. two operations that would mutate overlapping ranges).
        const composable = first !== second && canCompose(first, second)
        if (!composable) continue

        const label = `${first.name} + ${second.name}`
        it(`${label} produces structurally valid output`, () => {
          const original = baseTable3x3()
          // Apply the first op, re-parse its rendered html, then apply the second.
          const afterFirst = parseTable(first.apply(original))
          const after = second.apply(afterFirst)
          const before = renderTable(original)
          const diff = HtmlDiff.execute(before, after)
          assertStructurallyValid(diff, before, after, label)
        })
      }
    }
  })

  describe('user-reported regression scenarios', () => {
    it('column added + empty row inserted in middle (regression for issue with blank row)', () => {
      const oldHtml = [
        '<table>',
        '<tr><th>Party</th><th>Form</th><th>Date</th></tr>',
        '<tr><td>Party A</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto).</td><td>(i) Upon execution.</td></tr>',
        '<tr><td>Party B</td><td>IRS Form W-9, as applicable (or any successor thereto).</td><td>(i) Upon execution.</td></tr>',
        '</table>',
      ].join('')
      const newHtml = [
        '<table>',
        '<tr><th>Party</th><th>Extra column</th><th>Form</th><th>Date</th></tr>',
        "<tr><td>Party A</td><td>Yes</td><td>IRS Forms W-8BEN-E and W-8ECI (or any successors thereto). Here's some extra content</td><td>(i) Upon execution.</td></tr>",
        '<tr><td></td><td></td><td></td><td></td></tr>',
        '<tr><td>Party B</td><td>A</td><td>IRS Form W-9, as applicable (or any successor thereto).</td><td>(i) Upon execution.</td></tr>',
        '</table>',
      ].join('')

      const result = HtmlDiff.execute(oldHtml, newHtml)
      assertStructurallyValid(result, oldHtml, newHtml, 'column-add + empty row insert')

      // Targeted checks for this scenario, so a drift that still satisfies the
      // generic structural invariants is caught and easy to diagnose.
      const expectedBlankRow =
        "<tr class='diffins'><td class='diffins'></td><td class='diffins'></td><td class='diffins'></td><td class='diffins'></td></tr>"
      const rowCount = countMatches(result, /<tr[\s>]/g)
      expect(rowCount).toBe(4) // header + Party A + empty + Party B
      expect(result).toContain(expectedBlankRow)
    })
  })
})
|
|
87
|
+
|
|
88
|
+
// ──────────────────────── operations ────────────────────────
|
|
89
|
+
|
|
90
|
+
/** A named table operation used to build the test matrix. */
interface Op {
  /** Label used in generated test titles and assertion messages. */
  name: string
  /** Applies the operation to `table` and returns the resulting table html. */
  apply: (table: TableData) => string
}
|
|
94
|
+
|
|
95
|
+
/**
 * Builds the catalogue of single table operations exercised by the matrix:
 * a no-op, a cell edit, row adds/deletes, column adds/deletes and a
 * content shift. Every operation renders its result to html.
 *
 * Indices that depend on table size (`t.rows.length`, `t.rows[0].length`)
 * are read from the table passed to `apply`, not from the mutated clone.
 */
function allSingleOperations(): Op[] {
  // Fresh arrays on every call so no operation shares row/column content.
  const newRow = (): string[] => ['NEW1', 'NEW2', 'NEW3']
  const newColumn = (): string[] => ['NewHeader', 'newA', 'newB', 'newC']

  // Wraps a mutator script into an Op that clones, mutates and renders.
  const mutation = (name: string, edit: (m: Mutator, t: TableData) => void): Op => ({
    name,
    apply: t => renderTable(mutate(t, m => edit(m, t))),
  })

  return [
    { name: 'no-op', apply: t => renderTable(t) },
    mutation('edit-cell', m => m.editCell(1, 1, 'EDITED')),
    mutation('add-row-start', m => m.addRowAt(0, newRow())),
    mutation('add-row-middle', m => m.addRowAt(2, newRow())),
    mutation('add-row-end', (m, t) => m.addRowAt(t.rows.length, newRow())),
    mutation('add-empty-row-middle', m => m.addRowAt(2, ['', '', ''])),
    mutation('add-multiple-rows', (m, t) => {
      m.addRowAt(t.rows.length, ['X1', 'X2', 'X3'])
      m.addRowAt(t.rows.length + 1, ['Y1', 'Y2', 'Y3'])
    }),
    mutation('delete-row-start', m => m.deleteRow(1)), // row 0 is the header, so skip it
    mutation('delete-row-middle', m => m.deleteRow(2)),
    mutation('delete-row-end', (m, t) => m.deleteRow(t.rows.length - 1)),
    mutation('delete-multiple-rows', (m, t) => {
      m.deleteRow(t.rows.length - 1)
      m.deleteRow(1)
    }),
    mutation('add-column-start', m => m.addColumnAt(0, newColumn())),
    mutation('add-column-middle', m => m.addColumnAt(1, newColumn())),
    mutation('add-column-end', (m, t) => m.addColumnAt(t.rows[0].length, newColumn())),
    mutation('add-multiple-columns', (m, t) => {
      m.addColumnAt(t.rows[0].length, ['H1', 'a1', 'b1', 'c1'])
      m.addColumnAt(t.rows[0].length + 1, ['H2', 'a2', 'b2', 'c2'])
    }),
    mutation('delete-column-start', m => m.deleteColumn(0)),
    mutation('delete-column-middle', m => m.deleteColumn(1)),
    mutation('delete-column-end', (m, t) => m.deleteColumn(t.rows[0].length - 1)),
    mutation('shift-content-right', m => m.shiftContentRight(1)),
  ]
}
|
|
157
|
+
|
|
158
|
+
/**
 * Decides whether operation `_b` may be applied after operation `_a` in
 * the pairwise matrix.
 *
 * Placeholder: currently every pair is accepted. The second operation's
 * indices are based on the original table dimensions, but the mutate
 * helpers clamp or ignore out-of-range indices, so no pair needs to be
 * excluded yet.
 */
function canCompose(_a: Op, _b: Op): boolean {
  // Both arguments are intentionally unused until a real exclusion rule exists.
  return true
}
|
|
169
|
+
|
|
170
|
+
// ──────────────────────── table model ────────────────────────
|
|
171
|
+
|
|
172
|
+
/** Minimal table model for the matrix: a list of rows, each a list of cell contents. */
interface TableData {
  /** Cell contents by row; `renderTable` renders row 0 with `<th>` cells. */
  rows: string[][]
}
|
|
175
|
+
|
|
176
|
+
/**
 * Returns a fresh base table: a header row (`Header1`–`Header3`) followed
 * by three body rows `A1`–`A3`, `B1`–`B3` and `C1`–`C3`.
 */
function baseTable3x3(): TableData {
  const columns = [1, 2, 3]
  const header = columns.map(column => `Header${column}`)
  const body = ['A', 'B', 'C'].map(letter => columns.map(column => `${letter}${column}`))
  return { rows: [header, ...body] }
}
|
|
186
|
+
|
|
187
|
+
/**
 * Serialises a table model to html with no whitespace between tags. Cells
 * of row 0 are rendered as `<th>`, all other rows as `<td>`. Cell content
 * is inserted verbatim (no escaping).
 */
function renderTable(t: TableData): string {
  const renderedRows = t.rows.map((cells, rowIndex) => {
    const tag = rowIndex === 0 ? 'th' : 'td'
    const renderedCells = cells.map(cell => `<${tag}>${cell}</${tag}>`)
    return '<tr>' + renderedCells.join('') + '</tr>'
  })
  return '<table>' + renderedRows.join('') + '</table>'
}
|
|
200
|
+
|
|
201
|
+
/**
 * Regex-based inverse of `renderTable`, adequate only for the tables this
 * matrix generates — NOT a general html parser. Each `<tr>…</tr>` becomes
 * a row; each `<td>`/`<th>` inside it contributes its inner html as a cell.
 */
function parseTable(html: string): TableData {
  const rowPattern = /<tr[^>]*>(.*?)<\/tr>/gs
  const cellPattern = /<t[dh][^>]*>(.*?)<\/t[dh]>/gs
  const rows: string[][] = Array.from(html.matchAll(rowPattern), rowMatch =>
    Array.from(rowMatch[1].matchAll(cellPattern), cellMatch => cellMatch[1])
  )
  return { rows }
}
|
|
214
|
+
|
|
215
|
+
/** Editing operations that `mutate` exposes over its cloned table. */
interface Mutator {
  /** Replaces the content of the cell at (`row`, `col`). */
  editCell(row: number, col: number, content: string): void
  /** Inserts `content` as a new row at index `at`. */
  addRowAt(at: number, content: string[]): void
  /** Removes the row at index `at`. */
  deleteRow(at: number): void
  /** Inserts a column at index `at`; `columnContent[r]` supplies row r's new cell. */
  addColumnAt(at: number, columnContent: string[]): void
  /** Removes the column at index `at`. */
  deleteColumn(at: number): void
  /** Shifts the contents of row `rowIdx` one cell to the right. */
  shiftContentRight(rowIdx: number): void
}
|
|
223
|
+
|
|
224
|
+
/**
 * Runs `fn` against a mutator bound to a row-by-row copy of `t` and
 * returns that copy; `t` itself is never modified.
 *
 * Mutator behaviour for out-of-range arguments:
 *  - `editCell` / `deleteRow` / `deleteColumn` / `shiftContentRight` do
 *    nothing when the target does not exist (`deleteColumn` decides per row);
 *  - `addRowAt` / `addColumnAt` clamp the insertion index into range;
 *  - `addColumnAt` uses `''` for rows that `columnContent` has no entry for.
 */
function mutate(t: TableData, fn: (m: Mutator) => void): TableData {
  const copy: TableData = { rows: t.rows.map(cells => cells.slice()) }
  const clamp = (value: number, upper: number): number => Math.max(0, Math.min(value, upper))

  const mutator: Mutator = {
    editCell(row, col, content) {
      const target = copy.rows[row]
      if (target?.[col] === undefined) return
      target[col] = content
    },
    addRowAt(at, content) {
      copy.rows.splice(clamp(at, copy.rows.length), 0, content)
    },
    deleteRow(at) {
      const inRange = at >= 0 && at < copy.rows.length
      if (inRange) copy.rows.splice(at, 1)
    },
    addColumnAt(at, columnContent) {
      copy.rows.forEach((cells, r) => {
        cells.splice(clamp(at, cells.length), 0, columnContent[r] ?? '')
      })
    },
    deleteColumn(at) {
      copy.rows.forEach(cells => {
        if (at >= 0 && at < cells.length) cells.splice(at, 1)
      })
    },
    shiftContentRight(rowIdx) {
      const cells = copy.rows[rowIdx]
      if (!cells) return
      // Drop the last cell's content and prepend an empty first cell.
      cells.pop()
      cells.unshift('')
    },
  }

  fn(mutator)
  return copy
}
|
|
260
|
+
|
|
261
|
+
// ──────────────────────── invariant checks ────────────────────────
|
|
262
|
+
|
|
263
|
+
/**
 * Checks structural invariants of a diff `output`, failing the current
 * test on the first violation:
 *   • opening and closing counts match for `<tr>`, `<td>`, `<th>`,
 *     `<ins>` and `<del>` (checked in that order)
 *   • no output row contains more cells than the widest row found in
 *     either input — i.e. no phantom cells
 *   • the output is non-empty whenever the inputs differ
 *
 * `label` is prefixed (in brackets) to every assertion message.
 */
function assertStructurallyValid(output: string, oldHtml: string, newHtml: string, label: string) {
  const ctx = `[${label}]`

  // Tag balance: every opening tag must have a matching closing tag.
  const balancedTags = [
    { tag: 'tr', open: /<tr[\s>]/g, close: /<\/tr>/g },
    { tag: 'td', open: /<td[\s>]/g, close: /<\/td>/g },
    { tag: 'th', open: /<th[\s>]/g, close: /<\/th>/g },
    { tag: 'ins', open: /<ins[\s>]/g, close: /<\/ins>/g },
    { tag: 'del', open: /<del[\s>]/g, close: /<\/del>/g },
  ]
  for (const { tag, open, close } of balancedTags) {
    const opened = countMatches(output, open)
    const closed = countMatches(output, close)
    expect(opened, `${ctx} <${tag}> tag balance`).toBe(closed)
  }

  // Cell-count cap: the widest row across both inputs bounds every output row.
  const limit = Math.max(maxRowCellCount(oldHtml), maxRowCellCount(newHtml))
  for (const [, rowBody] of output.matchAll(/<tr[^>]*>(.*?)<\/tr>/gs)) {
    const cellsInRow = countMatches(rowBody, /<t[dh][\s>]/g)
    expect(cellsInRow, `${ctx} row has too many cells (${cellsInRow} > ${limit})`).toBeLessThanOrEqual(limit)
  }

  // Differing inputs must produce some output.
  if (oldHtml !== newHtml) {
    expect(output.length, `${ctx} output is empty`).toBeGreaterThan(0)
  }
}
|
|
313
|
+
|
|
314
|
+
/**
 * Returns the largest number of `<td>`/`<th>` opening tags found in any
 * single `<tr>…</tr>` of `html`, or 0 when there are no rows. Colspans are
 * not taken into account.
 */
function maxRowCellCount(html: string): number {
  let widest = 0
  for (const [, rowBody] of html.matchAll(/<tr[^>]*>(.*?)<\/tr>/gs)) {
    widest = Math.max(widest, countMatches(rowBody, /<t[dh][\s>]/g))
  }
  return widest
}
|
|
323
|
+
|
|
324
|
+
/**
 * Returns the length of the array produced by `s.match(re)`, or 0 when
 * there is no match. For a global regex this is the number of matches.
 */
function countMatches(s: string, re: RegExp): number {
  return (s.match(re) ?? []).length
}
|