@createiq/htmldiff 1.1.0-beta.0 → 1.2.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -0
- package/dist/HtmlDiff.cjs +1259 -498
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +141 -7
- package/dist/HtmlDiff.d.mts +140 -7
- package/dist/HtmlDiff.mjs +1259 -498
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +7 -7
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +323 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +99 -550
- package/src/ThreeWayDiff.ts +223 -0
- package/src/ThreeWayTable.ts +701 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.matrix.spec.ts +8 -3
- package/test/HtmlDiff.tables.spec.ts +368 -19
- package/test/HtmlDiff.threeWay.spec.ts +175 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +407 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/src/Utils.ts
CHANGED
|
@@ -32,8 +32,39 @@ export function stripTagAttributes(word: string): string {
|
|
|
32
32
|
return word
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
|
|
35
|
+
/**
 * Optional metadata attached to a wrapped tag. Used by `executeThreeWay`
 * to colour diff segments with their author (CP vs Me) via extra classes
 * and `data-*` attributes; the two-way path passes nothing and gets the
 * unchanged historical output.
 */
export interface WrapMetadata {
  /** Space-separated classes appended after `cssClass`. */
  extraClasses?: string
  /** `data-*` attribute map, keyed by the attribute name *without* the `data-` prefix. */
  dataAttrs?: Readonly<Record<string, string>>
}

/**
 * Wrap `text` in a `<tagName>` element carrying `cssClass`.
 *
 * @param text - Inner content, emitted verbatim (it may already contain markup).
 * @param tagName - Element name to wrap with, e.g. `ins` or `del`.
 * @param cssClass - Base class written into the `class` attribute.
 * @param metadata - Optional extra classes / `data-*` attributes. When omitted
 *   the output is exactly `<tag class='cssClass'>text</tag>`.
 * @returns The wrapped HTML fragment.
 */
export function wrapText(text: string, tagName: string, cssClass: string, metadata?: WrapMetadata): string {
  // No metadata: keep the historical two-way output byte-for-byte.
  if (!metadata) return `<${tagName} class='${cssClass}'>${text}</${tagName}>`
  return `<${tagName}${composeTagAttributes(cssClass, metadata)}>${text}</${tagName}>`
}

/** Characters that must not appear raw inside a single-quoted attribute value. */
const ATTRIBUTE_ESCAPE_PATTERN = /[&<>']/g

/** Entity replacements for the characters matched by `ATTRIBUTE_ESCAPE_PATTERN`. */
const ATTRIBUTE_ESCAPES: Readonly<Record<string, string>> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  "'": '&#39;',
}

/**
 * Escape a raw string for use inside a single-quoted HTML attribute, so a
 * stray quote or angle bracket cannot terminate the attribute or open a tag.
 */
function escapeAttributeValue(value: string): string {
  return value.replace(ATTRIBUTE_ESCAPE_PATTERN, ch => ATTRIBUTE_ESCAPES[ch] ?? ch)
}

/**
 * Build the attribute portion of an opening tag from a base class plus
 * optional metadata. Exposed so emission paths that build opening-tag
 * fragments by hand (e.g. the formatting-tag special-case in
 * `HtmlDiff.insertTag`) can stay consistent with `wrapText`.
 *
 * Metadata-supplied values (`extraClasses` and every `dataAttrs` value) are
 * HTML-escaped before interpolation. `cssClass` is written as-is so it matches
 * what the no-metadata path of `wrapText` emits.
 *
 * @param cssClass - Base class, placed first in the `class` attribute.
 * @param metadata - Extra classes and `data-*` attributes to append.
 * @returns A string beginning with a space, e.g. ` class='diffins cp' data-author='cp'`.
 */
export function composeTagAttributes(cssClass: string, metadata: WrapMetadata): string {
  const classes = metadata.extraClasses ? `${cssClass} ${escapeAttributeValue(metadata.extraClasses)}` : cssClass
  let out = ` class='${classes}'`
  // NOTE(review): keys are interpolated as attribute names without validation;
  // callers are assumed to pass plain identifiers — confirm against call sites.
  for (const [key, value] of Object.entries(metadata.dataAttrs ?? {})) {
    out += ` data-${key}='${escapeAttributeValue(value)}'`
  }
  return out
}
|
|
38
69
|
|
|
39
70
|
export function isStartOfTag(val: string): boolean {
|
|
@@ -85,6 +116,7 @@ export default {
|
|
|
85
116
|
isTag,
|
|
86
117
|
stripTagAttributes,
|
|
87
118
|
wrapText,
|
|
119
|
+
composeTagAttributes,
|
|
88
120
|
isStartOfTag,
|
|
89
121
|
isEndOfTag,
|
|
90
122
|
isStartOfEntity,
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest'
|
|
2
|
+
|
|
3
|
+
import Action from '../src/Action'
|
|
4
|
+
import HtmlDiff from '../src/HtmlDiff'
|
|
5
|
+
|
|
6
|
+
describe('HtmlDiff.analyze', () => {
|
|
7
|
+
describe('return shape', () => {
|
|
8
|
+
it('returns operations indexed into oldDiffWords / newDiffWords', () => {
|
|
9
|
+
const result = HtmlDiff.analyze('a b c', 'a x c')
|
|
10
|
+
expect(result.oldDiffWords).toBeInstanceOf(Array)
|
|
11
|
+
expect(result.newDiffWords).toBeInstanceOf(Array)
|
|
12
|
+
expect(result.operations).toBeInstanceOf(Array)
|
|
13
|
+
// Every op's endInOld must be ≤ oldDiffWords.length, etc.
|
|
14
|
+
for (const op of result.operations) {
|
|
15
|
+
expect(op.endInOld).toBeLessThanOrEqual(result.oldDiffWords.length)
|
|
16
|
+
expect(op.endInNew).toBeLessThanOrEqual(result.newDiffWords.length)
|
|
17
|
+
}
|
|
18
|
+
})
|
|
19
|
+
|
|
20
|
+
it('returns original word arrays alongside the diff arrays', () => {
|
|
21
|
+
const result = HtmlDiff.analyze('hello world', 'hello there world')
|
|
22
|
+
expect(result.oldOriginalWords).toBeInstanceOf(Array)
|
|
23
|
+
expect(result.newOriginalWords).toBeInstanceOf(Array)
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
it('returns null contentToOriginal maps when projections are inactive', () => {
|
|
27
|
+
// Plain text with no structural tags → projection inactive.
|
|
28
|
+
const result = HtmlDiff.analyze('a b c', 'a x c')
|
|
29
|
+
expect(result.oldContentToOriginal).toBeNull()
|
|
30
|
+
expect(result.newContentToOriginal).toBeNull()
|
|
31
|
+
})
|
|
32
|
+
|
|
33
|
+
it('returns non-null contentToOriginal maps when projections are active', () => {
|
|
34
|
+
// Different wrapper tags → projection kicks in.
|
|
35
|
+
const result = HtmlDiff.analyze('<p>hello world</p>', '<div>hello world</div>')
|
|
36
|
+
expect(result.oldContentToOriginal).not.toBeNull()
|
|
37
|
+
expect(result.newContentToOriginal).not.toBeNull()
|
|
38
|
+
})
|
|
39
|
+
})
|
|
40
|
+
|
|
41
|
+
describe('useProjections option', () => {
|
|
42
|
+
it('honours useProjections=false even when the heuristic would project', () => {
|
|
43
|
+
const result = HtmlDiff.analyze('<p>hello world</p>', '<div>hello world</div>', { useProjections: false })
|
|
44
|
+
expect(result.oldContentToOriginal).toBeNull()
|
|
45
|
+
expect(result.newContentToOriginal).toBeNull()
|
|
46
|
+
// Structural tags appear as diff tokens — observable consequence.
|
|
47
|
+
expect(result.oldDiffWords).toContain('<p>')
|
|
48
|
+
expect(result.newDiffWords).toContain('<div>')
|
|
49
|
+
})
|
|
50
|
+
|
|
51
|
+
it('honours useProjections=true even when the heuristic would skip', () => {
|
|
52
|
+
// Same structural tags on both sides → heuristic skips projection.
|
|
53
|
+
// Forcing it should still project (strip the <p> tags from diff space).
|
|
54
|
+
const result = HtmlDiff.analyze('<p>a b c</p>', '<p>a x c</p>', { useProjections: true })
|
|
55
|
+
expect(result.oldContentToOriginal).not.toBeNull()
|
|
56
|
+
expect(result.newContentToOriginal).not.toBeNull()
|
|
57
|
+
// Structural tags removed from diff arrays.
|
|
58
|
+
expect(result.oldDiffWords).not.toContain('<p>')
|
|
59
|
+
})
|
|
60
|
+
|
|
61
|
+
it('keeps projections off when useProjections=true but one side has no content', () => {
|
|
62
|
+
const result = HtmlDiff.analyze('<p></p>', '<p>added</p>', { useProjections: true })
|
|
63
|
+
// Empty-content side disables the forced projection.
|
|
64
|
+
expect(result.oldContentToOriginal).toBeNull()
|
|
65
|
+
expect(result.newContentToOriginal).toBeNull()
|
|
66
|
+
})
|
|
67
|
+
})
|
|
68
|
+
|
|
69
|
+
describe('symmetric V2 tokenisation', () => {
|
|
70
|
+
it('produces an identical V2 diff array across two calls when useProjections matches', () => {
|
|
71
|
+
const v1 = '<p>Hello world.</p>'
|
|
72
|
+
const v2 = '<p>Hello cruel world.</p>'
|
|
73
|
+
const v3 = '<p>Hello cruel world today.</p>'
|
|
74
|
+
const useProjections = false // Force off — both calls agree.
|
|
75
|
+
const d1 = HtmlDiff.analyze(v1, v2, { useProjections })
|
|
76
|
+
const d2 = HtmlDiff.analyze(v2, v3, { useProjections })
|
|
77
|
+
expect(d1.newDiffWords).toEqual(d2.oldDiffWords)
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
it('the V2 arrays diverge when one call projects and the other does not (motivates D1)', () => {
|
|
81
|
+
// Asymmetric structural patterns: V1 has <p>, V3 has <div>; V2 has <p>.
|
|
82
|
+
// V1↔V2 heuristic: no structural diff → no projection.
|
|
83
|
+
// V2↔V3 heuristic: structural diff → project.
|
|
84
|
+
// Result: d1.newDiffWords (raw V2) ≠ d2.oldDiffWords (projected V2).
|
|
85
|
+
const v1 = '<p>Hello world.</p>'
|
|
86
|
+
const v2 = '<p>Hello cruel world.</p>'
|
|
87
|
+
const v3 = '<div>Hello cruel world today.</div>'
|
|
88
|
+
const d1 = HtmlDiff.analyze(v1, v2)
|
|
89
|
+
const d2 = HtmlDiff.analyze(v2, v3)
|
|
90
|
+
// This is the bug that D1's symmetric-decision design exists to prevent.
|
|
91
|
+
expect(d1.newDiffWords).not.toEqual(d2.oldDiffWords)
|
|
92
|
+
})
|
|
93
|
+
})
|
|
94
|
+
|
|
95
|
+
describe('options pass-through', () => {
|
|
96
|
+
it('respects ignoreWhitespaceDifferences', () => {
  // Old has a two-space gap, new has a single space. With the flag on,
  // the matcher should consider the two as equivalent; without it, the
  // whitespace difference is reported as a Replace. The inputs MUST
  // differ in whitespace, otherwise the flag-off assertion cannot hold.
  const withoutFlag = HtmlDiff.analyze('a  b', 'a b')
  const withFlag = HtmlDiff.analyze('a  b', 'a b', { ignoreWhitespaceDifferences: true })
  const replaceCount = (r: typeof withFlag) => r.operations.filter(op => op.action === Action.Replace).length
  // Flag off: whitespace difference shows up as a Replace.
  expect(replaceCount(withoutFlag)).toBeGreaterThan(0)
  // Flag on: no Replace, only Equals.
  expect(replaceCount(withFlag)).toBe(0)
})
|
|
107
|
+
|
|
108
|
+
it('respects blockExpressions', () => {
|
|
109
|
+
// Without the block expression, "01/01/2026" is split into multiple
|
|
110
|
+
// tokens; with it, the whole date is one token (per WordSplitter's
|
|
111
|
+
// BlockFinder contract — uses the `g` flag).
|
|
112
|
+
const dateExpr = /\d{2}\/\d{2}\/\d{4}/g
|
|
113
|
+
const without = HtmlDiff.analyze('on 01/01/2026 here', 'on 02/02/2027 here')
|
|
114
|
+
const withExpr = HtmlDiff.analyze('on 01/01/2026 here', 'on 02/02/2027 here', { blockExpressions: [dateExpr] })
|
|
115
|
+
expect(withExpr.oldDiffWords.length).toBeLessThan(without.oldDiffWords.length)
|
|
116
|
+
})
|
|
117
|
+
})
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
describe('HtmlDiff.evaluateProjectionApplicability', () => {
|
|
121
|
+
it('returns false when structures match', () => {
|
|
122
|
+
expect(HtmlDiff.evaluateProjectionApplicability('<p>a</p>', '<p>b</p>')).toBe(false)
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
it('returns true when structures differ (wrapper rename)', () => {
|
|
126
|
+
expect(HtmlDiff.evaluateProjectionApplicability('<p>a b c</p>', '<div>a b c</div>')).toBe(true)
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
it('returns false when one side has no structural tags at all', () => {
|
|
130
|
+
// Plain text vs wrapped HTML: shouldUseContentProjections bails.
|
|
131
|
+
expect(HtmlDiff.evaluateProjectionApplicability('plain text', '<p>plain text</p>')).toBe(false)
|
|
132
|
+
})
|
|
133
|
+
|
|
134
|
+
it('returns false when projection would empty one side', () => {
|
|
135
|
+
expect(HtmlDiff.evaluateProjectionApplicability('<p></p>', '<div>content</div>')).toBe(false)
|
|
136
|
+
})
|
|
137
|
+
|
|
138
|
+
it('lets a composer compute a symmetric decision across three inputs', () => {
|
|
139
|
+
const v1 = '<p>Hello world.</p>'
|
|
140
|
+
const v2 = '<p>Hello cruel world.</p>'
|
|
141
|
+
const v3 = '<div>Hello cruel world today.</div>'
|
|
142
|
+
const proj12 = HtmlDiff.evaluateProjectionApplicability(v1, v2)
|
|
143
|
+
const proj23 = HtmlDiff.evaluateProjectionApplicability(v2, v3)
|
|
144
|
+
// The symmetric decision is the conjunction — project iff both pairs would.
|
|
145
|
+
const symmetric = proj12 && proj23
|
|
146
|
+
expect(symmetric).toBe(false) // V1↔V2 has no structural diff.
|
|
147
|
+
// Both calls then use useProjections=false and V2 tokenises identically.
|
|
148
|
+
const d1 = HtmlDiff.analyze(v1, v2, { useProjections: symmetric })
|
|
149
|
+
const d2 = HtmlDiff.analyze(v2, v3, { useProjections: symmetric })
|
|
150
|
+
expect(d1.newDiffWords).toEqual(d2.oldDiffWords)
|
|
151
|
+
})
|
|
152
|
+
})
|
|
@@ -108,9 +108,14 @@ describe('HtmlDiff — table operations matrix', () => {
|
|
|
108
108
|
// still drifts.
|
|
109
109
|
const rowCount = countMatches(result, /<tr[\s>]/g)
|
|
110
110
|
expect(rowCount).toBe(4) // header + Party A + empty + Party B
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
)
|
|
111
|
+
// The inserted empty row must be emitted with diffins on the <tr>
|
|
112
|
+
// and 4 empty diffins-marked cells. Asserted via regex (quote-
|
|
113
|
+
// agnostic, whitespace-tolerant) so an incidental change in
|
|
114
|
+
// attribute-quote style isn't flagged as a regression.
|
|
115
|
+
const emptyInsertedRow = result.match(/<tr class=['"]diffins['"]>(.*?)<\/tr>/)
|
|
116
|
+
expect(emptyInsertedRow).not.toBeNull()
|
|
117
|
+
const emptyCellCount = countMatches(emptyInsertedRow?.[1] ?? '', /<td class=['"]diffins['"]><\/td>/g)
|
|
118
|
+
expect(emptyCellCount).toBe(4)
|
|
114
119
|
})
|
|
115
120
|
})
|
|
116
121
|
})
|
|
@@ -859,7 +859,15 @@ describe('HtmlDiff — tables', () => {
|
|
|
859
859
|
'<table><tr><td>A</td><td>X</td></tr><tr data-behaviour="data"><td>B</td><td>EXTRA</td><td>C</td></tr></table>'
|
|
860
860
|
|
|
861
861
|
const result = HtmlDiff.execute(oldHtml, newHtml)
|
|
862
|
-
|
|
862
|
+
// The <tr> attribute must survive AND the row's cells must be
|
|
863
|
+
// intact: B preserved, EXTRA inserted, C preserved. A bare
|
|
864
|
+
// `toContain('<tr data-behaviour="data">')` would pass even if
|
|
865
|
+
// the cells were dropped or duplicated downstream.
|
|
866
|
+
expect(result).toContain(
|
|
867
|
+
'<tr data-behaviour="data"><td>B</td>' +
|
|
868
|
+
"<td class='diffins'><ins class='diffins'>EXTRA</ins></td>" +
|
|
869
|
+
'<td>C</td></tr>'
|
|
870
|
+
)
|
|
863
871
|
})
|
|
864
872
|
|
|
865
873
|
it('preserves data-* attribute on a <tr> when the row passes through fuzzy-matching with content edit', () => {
|
|
@@ -879,7 +887,12 @@ describe('HtmlDiff — tables', () => {
|
|
|
879
887
|
'</table>'
|
|
880
888
|
|
|
881
889
|
const result = HtmlDiff.execute(oldHtml, newHtml)
|
|
882
|
-
|
|
890
|
+
// Same intent as above: attribute must survive AND the row's
|
|
891
|
+
// content-edit must be present on the same <tr>.
|
|
892
|
+
expect(result).toContain(
|
|
893
|
+
'<tr data-behaviour="data"><td>The quick brown fox jumps over' +
|
|
894
|
+
"<ins class='diffins'> the lazy dog</ins>.</td></tr>"
|
|
895
|
+
)
|
|
883
896
|
})
|
|
884
897
|
|
|
885
898
|
it('preserves <table> attributes verbatim from new (no diff marker on attribute changes)', () => {
|
|
@@ -1573,12 +1586,13 @@ describe('HtmlDiff — tables', () => {
|
|
|
1573
1586
|
})
|
|
1574
1587
|
})
|
|
1575
1588
|
|
|
1576
|
-
//
|
|
1577
|
-
//
|
|
1578
|
-
//
|
|
1579
|
-
//
|
|
1580
|
-
//
|
|
1581
|
-
//
|
|
1589
|
+
// The cell-LCS fallback path (diffStructurallyAlignedRow + cellKey +
|
|
1590
|
+
// pairSimilarUnmatchedCells) is now entered only when the per-row
|
|
1591
|
+
// column delta exceeds MAX_COLUMN_DELTA (6) — the semantic "this is a
|
|
1592
|
+
// row rewrite, not a column add" guard. The row-width guard
|
|
1593
|
+
// (MAX_COLUMN_SEARCH_WIDTH) is now defensive only since the DP is
|
|
1594
|
+
// O(M × N). These tests pin the fallback's behaviour for the
|
|
1595
|
+
// delta > 6 path.
|
|
1582
1596
|
describe('cell-LCS fallback for very-wide column changes', () => {
|
|
1583
1597
|
it('handles 8 columns inserted alongside existing cells (delta > MAX_COLUMN_DELTA)', () => {
|
|
1584
1598
|
// Old: 3 cells. New: 11 cells (8 columns added). Exact-LCS finds
|
|
@@ -1689,22 +1703,25 @@ describe('HtmlDiff — tables', () => {
|
|
|
1689
1703
|
const newHtml =
|
|
1690
1704
|
'<table><tr>' + '<td>FirstB</td>' + '<td colspan="2">Merged AB</td>' + '<td>LastB</td>' + '</tr></table>'
|
|
1691
1705
|
|
|
1692
|
-
const result = HtmlDiff.execute(oldHtml, newHtml)
|
|
1693
1706
|
// First and last cells should diff content cell-by-cell (matching
|
|
1694
1707
|
// colspans = 1 on both sides); middle two old cells merge into
|
|
1695
|
-
// one colspan=2 cell tagged 'mod colspan'.
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
expect(
|
|
1699
|
-
|
|
1700
|
-
|
|
1708
|
+
// one colspan=2 cell tagged 'mod colspan'. Asserted as an exact
|
|
1709
|
+
// string so that a positional swap of first/last is caught — a
|
|
1710
|
+
// swap-blind set of `toContain` assertions would not flag it.
|
|
1711
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1712
|
+
'<table><tr>' +
|
|
1713
|
+
"<td><del class='diffmod'>FirstA</del><ins class='diffmod'>FirstB</ins></td>" +
|
|
1714
|
+
`<td colspan="2" class='mod colspan'>Merged AB</td>` +
|
|
1715
|
+
"<td><del class='diffmod'>LastA</del><ins class='diffmod'>LastB</ins></td>" +
|
|
1716
|
+
'</tr></table>'
|
|
1717
|
+
)
|
|
1701
1718
|
})
|
|
1702
1719
|
|
|
1703
|
-
it('handles
|
|
1720
|
+
it('handles a 50-cell row with a single column inserted at start', () => {
|
|
1704
1721
|
// 50-cell row in old, 51-cell row in new (1 column added at
|
|
1705
|
-
// start).
|
|
1706
|
-
//
|
|
1707
|
-
//
|
|
1722
|
+
// start). Now stays on the DP path (MAX_COLUMN_SEARCH_WIDTH=200);
|
|
1723
|
+
// produces the same output as the prior cell-LCS fallback would
|
|
1724
|
+
// have: 1 inserted cell, 50 preserved.
|
|
1708
1725
|
const oldCells = Array.from({ length: 50 }, (_, i) => `<td>c${i}</td>`).join('')
|
|
1709
1726
|
const newCells = `<td>NEW</td>${oldCells}`
|
|
1710
1727
|
const oldHtml = `<table><tr>${oldCells}</tr></table>`
|
|
@@ -1741,4 +1758,336 @@ describe('HtmlDiff — tables', () => {
|
|
|
1741
1758
|
)
|
|
1742
1759
|
})
|
|
1743
1760
|
})
|
|
1761
|
+
|
|
1762
|
+
// The column-position search can encounter score ties when inserted
|
|
1763
|
+
// cells have content that is similar both to each other and to
|
|
1764
|
+
// existing cells (e.g. boilerplate "N/A" in a legal schedule). The
|
|
1765
|
+
// algorithm's tie-breaking resolves to skipping EARLIER positions in
|
|
1766
|
+
// the longer side — the lex-first-combo behaviour of the original
|
|
1767
|
+
// combinatorial path, now matched by "prefer pair on ties" in the DP
|
|
1768
|
+
// backtrack. These tests pin both the structural shape AND the exact
|
|
1769
|
+
// positions the diffins markers land on, so a silent shift of the
|
|
1770
|
+
// tie-breaking rule would fail loudly.
|
|
1771
|
+
describe('column-position search — score-tied inputs', () => {
|
|
1772
|
+
it('handles delta=2 with content-similar inserts (N/A boilerplate)', () => {
|
|
1773
|
+
const oldHtml = '<table><tr><td>N/A</td><td>Term</td><td>Amount</td><td>N/A</td></tr></table>'
|
|
1774
|
+
const newHtml =
|
|
1775
|
+
'<table><tr><td>N/A</td><td>N/A</td><td>Term</td><td>N/A</td><td>Amount</td><td>N/A</td></tr></table>'
|
|
1776
|
+
|
|
1777
|
+
// Exact match locks in tie-breaking: the diffins markers MUST land
|
|
1778
|
+
// on the earliest positions that produce the optimal score (here:
|
|
1779
|
+
// positions 0 and 3).
|
|
1780
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1781
|
+
'<table><tr>' +
|
|
1782
|
+
"<td class='diffins'><ins class='diffins'>N/A</ins></td>" +
|
|
1783
|
+
'<td>N/A</td>' +
|
|
1784
|
+
'<td>Term</td>' +
|
|
1785
|
+
"<td class='diffins'><ins class='diffins'>N/A</ins></td>" +
|
|
1786
|
+
'<td>Amount</td>' +
|
|
1787
|
+
'<td>N/A</td>' +
|
|
1788
|
+
'</tr></table>'
|
|
1789
|
+
)
|
|
1790
|
+
})
|
|
1791
|
+
|
|
1792
|
+
it('still passes the loose structural checks for the same inputs', () => {
|
|
1793
|
+
// Kept alongside the exact-match assertion above as a structural
|
|
1794
|
+
// safety net: if the exact form ever shifts (e.g. quote style),
|
|
1795
|
+
// these structural invariants still apply.
|
|
1796
|
+
const oldHtml = '<table><tr><td>N/A</td><td>Term</td><td>Amount</td><td>N/A</td></tr></table>'
|
|
1797
|
+
const newHtml =
|
|
1798
|
+
'<table><tr><td>N/A</td><td>N/A</td><td>Term</td><td>N/A</td><td>Amount</td><td>N/A</td></tr></table>'
|
|
1799
|
+
|
|
1800
|
+
const result = HtmlDiff.execute(oldHtml, newHtml)
|
|
1801
|
+
// Both inserted N/A cells must be marked diffins.
|
|
1802
|
+
const insMarkers = (result.match(/<td class='diffins'>/g) || []).length
|
|
1803
|
+
expect(insMarkers).toBe(2)
|
|
1804
|
+
// Total td count must be 6 (no phantoms).
|
|
1805
|
+
const tdCount = (result.match(/<td[\s>]/g) || []).length
|
|
1806
|
+
expect(tdCount).toBe(6)
|
|
1807
|
+
// Term and Amount must appear as unmarked preserved cells exactly
|
|
1808
|
+
// once each — they're not similar to N/A, so the algorithm has no
|
|
1809
|
+
// ambiguity around them.
|
|
1810
|
+
expect(result).toContain('<td>Term</td>')
|
|
1811
|
+
expect(result).toContain('<td>Amount</td>')
|
|
1812
|
+
})
|
|
1813
|
+
|
|
1814
|
+
it('handles delta=6 (the MAX_COLUMN_DELTA cap) without misalignment', () => {
|
|
1815
|
+
const oldHtml = '<table><tr><td>A</td><td>B</td><td>C</td><td>D</td></tr></table>'
|
|
1816
|
+
const newHtml =
|
|
1817
|
+
'<table><tr>' +
|
|
1818
|
+
'<td>A</td><td>X1</td><td>X2</td><td>B</td><td>X3</td>' +
|
|
1819
|
+
'<td>X4</td><td>C</td><td>X5</td><td>X6</td><td>D</td>' +
|
|
1820
|
+
'</tr></table>'
|
|
1821
|
+
|
|
1822
|
+
const result = HtmlDiff.execute(oldHtml, newHtml)
|
|
1823
|
+
// Exactly 6 cells inserted, 4 preserved.
|
|
1824
|
+
const insMarkers = (result.match(/<td class='diffins'>/g) || []).length
|
|
1825
|
+
expect(insMarkers).toBe(6)
|
|
1826
|
+
const tdCount = (result.match(/<td[\s>]/g) || []).length
|
|
1827
|
+
expect(tdCount).toBe(10)
|
|
1828
|
+
// All four original cells survive in order without diff markers.
|
|
1829
|
+
expect(result).toContain('<td>A</td>')
|
|
1830
|
+
expect(result).toContain('<td>B</td>')
|
|
1831
|
+
expect(result).toContain('<td>C</td>')
|
|
1832
|
+
expect(result).toContain('<td>D</td>')
|
|
1833
|
+
})
|
|
1834
|
+
})
|
|
1835
|
+
|
|
1836
|
+
// orderAlignmentForEmission's `preserved` list is empty when no rows
|
|
1837
|
+
// survive across the diff. The "delete every row" and "insert every
|
|
1838
|
+
// row" cases are corner cases where the float-positioning logic must
|
|
1839
|
+
// degenerate cleanly: every del's primary becomes `-0.5` (predecessor
|
|
1840
|
+
// index -1), every ins's primary is its own newIdx. Both should emit
|
|
1841
|
+
// in oldIdx / newIdx order respectively, with no preserved rows
|
|
1842
|
+
// sandwiched between them.
|
|
1843
|
+
describe('orderAlignmentForEmission — empty preserved list', () => {
|
|
1844
|
+
it('emits every row as diffdel when new is empty (no preserved rows)', () => {
|
|
1845
|
+
const oldHtml =
|
|
1846
|
+
'<table>' +
|
|
1847
|
+
'<tr><td>r1</td></tr>' +
|
|
1848
|
+
'<tr><td>r2</td></tr>' +
|
|
1849
|
+
'<tr><td>r3</td></tr>' +
|
|
1850
|
+
'<tr><td>r4</td></tr>' +
|
|
1851
|
+
'<tr><td>r5</td></tr>' +
|
|
1852
|
+
'</table>'
|
|
1853
|
+
const newHtml = '<table></table>'
|
|
1854
|
+
|
|
1855
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1856
|
+
'<table>' +
|
|
1857
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>r1</del></td></tr>" +
|
|
1858
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>r2</del></td></tr>" +
|
|
1859
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>r3</del></td></tr>" +
|
|
1860
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>r4</del></td></tr>" +
|
|
1861
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>r5</del></td></tr>" +
|
|
1862
|
+
'</table>'
|
|
1863
|
+
)
|
|
1864
|
+
})
|
|
1865
|
+
|
|
1866
|
+
it('emits every row as diffins when old is empty (no preserved rows)', () => {
|
|
1867
|
+
const oldHtml = '<table></table>'
|
|
1868
|
+
const newHtml = '<table>' + '<tr><td>r1</td></tr>' + '<tr><td>r2</td></tr>' + '<tr><td>r3</td></tr>' + '</table>'
|
|
1869
|
+
|
|
1870
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1871
|
+
'<table>' +
|
|
1872
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>r1</ins></td></tr>" +
|
|
1873
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>r2</ins></td></tr>" +
|
|
1874
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>r3</ins></td></tr>" +
|
|
1875
|
+
'</table>'
|
|
1876
|
+
)
|
|
1877
|
+
})
|
|
1878
|
+
|
|
1879
|
+
it('emits all-deletes in order when only the header is preserved', () => {
|
|
1880
|
+
// The header is the only preserved row, so 'preserved' has 1 entry
|
|
1881
|
+
// and every del's newIdxOfPreservedBefore returns -1 — exercising
|
|
1882
|
+
// the boundary between empty-preserved and a single anchoring row.
|
|
1883
|
+
const oldHtml =
|
|
1884
|
+
'<table>' +
|
|
1885
|
+
'<tr><td>Header</td></tr>' +
|
|
1886
|
+
'<tr><td>row a</td></tr>' +
|
|
1887
|
+
'<tr><td>row b</td></tr>' +
|
|
1888
|
+
'<tr><td>row c</td></tr>' +
|
|
1889
|
+
'</table>'
|
|
1890
|
+
const newHtml = '<table><tr><td>Header</td></tr></table>'
|
|
1891
|
+
|
|
1892
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1893
|
+
'<table>' +
|
|
1894
|
+
'<tr><td>Header</td></tr>' +
|
|
1895
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row a</del></td></tr>" +
|
|
1896
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row b</del></td></tr>" +
|
|
1897
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row c</del></td></tr>" +
|
|
1898
|
+
'</table>'
|
|
1899
|
+
)
|
|
1900
|
+
})
|
|
1901
|
+
})
|
|
1902
|
+
|
|
1903
|
+
// Fuzzy row-pairing threshold (0.5) calibrations on ISDA-flavoured
|
|
1904
|
+
// content. The interesting cases are at the edges: rows that share a
|
|
1905
|
+
// little (an enumeration prefix only) — must NOT pair (otherwise an
|
|
1906
|
+
// unrelated rewrite shows up as a single-row content edit); rows that
|
|
1907
|
+
// share a lot of boilerplate text but differ in the meaningful body —
|
|
1908
|
+
// must pair (otherwise the user sees del+ins instead of an edit).
|
|
1909
|
+
describe('fuzzy row pairing — enumerated clauses and shared boilerplate', () => {
|
|
1910
|
+
it('does NOT pair rows that share only an enumeration prefix (different bodies)', () => {
|
|
1911
|
+
// Old has 2 rows, new has 3 rows. The "1." and "2." prefixes are
|
|
1912
|
+
// the only commonality — bodies are completely unrelated.
|
|
1913
|
+
// textSimilarity falls below 0.5 (prefix is 3 chars in 60+; jaccard
|
|
1914
|
+
// is also tiny), so fuzzy pairing must NOT fire — each pair should
|
|
1915
|
+
// emit as a clean del + ins, not a noisy intra-row diff.
|
|
1916
|
+
const oldHtml =
|
|
1917
|
+
'<table>' +
|
|
1918
|
+
'<tr><td>1. Party A shall pay the gross amount on each Payment Date.</td></tr>' +
|
|
1919
|
+
'<tr><td>2. Party B shall deliver collateral on each Calculation Date.</td></tr>' +
|
|
1920
|
+
'</table>'
|
|
1921
|
+
const newHtml =
|
|
1922
|
+
'<table>' +
|
|
1923
|
+
'<tr><td>1. Section intentionally left blank.</td></tr>' +
|
|
1924
|
+
'<tr><td>2. Different boilerplate entirely.</td></tr>' +
|
|
1925
|
+
'<tr><td>3. Brand new clause added here.</td></tr>' +
|
|
1926
|
+
'</table>'
|
|
1927
|
+
|
|
1928
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1929
|
+
'<table>' +
|
|
1930
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>1. Party A shall pay the gross amount on each Payment Date.</del></td></tr>" +
|
|
1931
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>2. Party B shall deliver collateral on each Calculation Date.</del></td></tr>" +
|
|
1932
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>1. Section intentionally left blank.</ins></td></tr>" +
|
|
1933
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>2. Different boilerplate entirely.</ins></td></tr>" +
|
|
1934
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>3. Brand new clause added here.</ins></td></tr>" +
|
|
1935
|
+
'</table>'
|
|
1936
|
+
)
|
|
1937
|
+
})
|
|
1938
|
+
|
|
1939
|
+
it('DOES pair rows that share a long boilerplate footer (Jaccard kicks in)', () => {
|
|
1940
|
+
// The 9-char body differs, but the 50-char trailing footer is
|
|
1941
|
+
// identical. Prefix-suffix similarity is low, but token Jaccard is
|
|
1942
|
+
// very high because shared footer tokens dominate the token set.
|
|
1943
|
+
// textSimilarity = Math.max(prefix_suffix, jaccard) → must pair.
|
|
1944
|
+
const footer = ' subject to the terms of the Master Agreement.'
|
|
1945
|
+
const oldHtml = `<table><tr><td>Anchor row</td></tr><tr><td>Alpha now${footer}</td></tr></table>`
|
|
1946
|
+
const newHtml =
|
|
1947
|
+
`<table><tr><td>Anchor row</td></tr><tr><td>Bravo new${footer}</td></tr>` +
|
|
1948
|
+
'<tr><td>Extra row appended</td></tr></table>'
|
|
1949
|
+
|
|
1950
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1951
|
+
'<table>' +
|
|
1952
|
+
'<tr><td>Anchor row</td></tr>' +
|
|
1953
|
+
"<tr><td><del class='diffmod'>Alpha</del><ins class='diffmod'>Bravo</ins> " +
|
|
1954
|
+
"<del class='diffmod'>now</del><ins class='diffmod'>new</ins>" +
|
|
1955
|
+
' subject to the terms of the Master Agreement.</td></tr>' +
|
|
1956
|
+
"<tr class='diffins'><td class='diffins'><ins class='diffins'>Extra row appended</ins></td></tr>" +
|
|
1957
|
+
'</table>'
|
|
1958
|
+
)
|
|
1959
|
+
})
|
|
1960
|
+
})
|
|
1961
|
+
|
|
1962
|
+
// orderAlignmentForEmission must keep unpaired dels in their correct
|
|
1963
|
+
// positions even when the run contains a mix of preserved rows, fuzzy-
|
|
1964
|
+
// paired rows, and unpaired dels. A regression here would make trailing
|
|
1965
|
+
// dels appear before the row they came after — the same family of bug
|
|
1966
|
+
// as the "deleted rows out of order" report that motivated the
|
|
1967
|
+
// function in the first place.
|
|
1968
|
+
describe('orderAlignmentForEmission — mixed paired and unpaired rows', () => {
|
|
1969
|
+
it('emits trailing unpaired dels after a fuzzy-paired content edit', () => {
|
|
1970
|
+
// Old: [Aaaaa, Bbbbb, Cccc, Dddd]. New: [Aaaaa, Bbbbb+NEW].
|
|
1971
|
+
// After LCS: A is preserved. After pairSimilarUnmatchedRows: B↔B'
|
|
1972
|
+
// via fuzzy. C and D are unpaired dels. The output order must be:
|
|
1973
|
+
// preserved(A) → paired(B,B') → del(C) → del(D).
|
|
1974
|
+
const oldHtml =
|
|
1975
|
+
'<table>' +
|
|
1976
|
+
'<tr><td>Aaaaa shared content here</td></tr>' +
|
|
1977
|
+
'<tr><td>Bbbbb shared content here</td></tr>' +
|
|
1978
|
+
'<tr><td>Cccc deleted row</td></tr>' +
|
|
1979
|
+
'<tr><td>Dddd deleted row</td></tr>' +
|
|
1980
|
+
'</table>'
|
|
1981
|
+
const newHtml =
|
|
1982
|
+
'<table>' +
|
|
1983
|
+
'<tr><td>Aaaaa shared content here</td></tr>' +
|
|
1984
|
+
'<tr><td>Bbbbb shared content here NEW</td></tr>' +
|
|
1985
|
+
'</table>'
|
|
1986
|
+
|
|
1987
|
+
expect(HtmlDiff.execute(oldHtml, newHtml)).toEqual(
|
|
1988
|
+
'<table>' +
|
|
1989
|
+
'<tr><td>Aaaaa shared content here</td></tr>' +
|
|
1990
|
+
"<tr><td>Bbbbb shared content here<ins class='diffins'> NEW</ins></td></tr>" +
|
|
1991
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>Cccc deleted row</del></td></tr>" +
|
|
1992
|
+
"<tr class='diffdel'><td class='diffdel'><del class='diffdel'>Dddd deleted row</del></td></tr>" +
|
|
1993
|
+
'</table>'
|
|
1994
|
+
)
|
|
1995
|
+
})
|
|
1996
|
+
|
|
1997
|
+
it('emits dels at the end of the table in old-document order', () => {
  // Guards the regression that motivated orderAlignmentForEmission:
  // when the final two rows are deleted together they must come out in
  // the order they had in old (second-last, then last) — never reversed
  // or jumbled.
  const before = [
    '<table>',
    '<tr><td>kept1</td></tr>',
    '<tr><td>kept2</td></tr>',
    '<tr><td>second-last</td></tr>',
    '<tr><td>last</td></tr>',
    '</table>',
  ].join('')

  const after = '<table><tr><td>kept1</td></tr><tr><td>kept2</td></tr></table>'

  const expected = [
    '<table>',
    '<tr><td>kept1</td></tr>',
    '<tr><td>kept2</td></tr>',
    "<tr class='diffdel'><td class='diffdel'><del class='diffdel'>second-last</del></td></tr>",
    "<tr class='diffdel'><td class='diffdel'><del class='diffdel'>last</del></td></tr>",
    '</table>',
  ].join('')

  expect(HtmlDiff.execute(before, after)).toEqual(expected)
})
|
|
2020
|
+
})
|
|
2021
|
+
|
|
2022
|
+
// pairSimilarUnmatched pairs greedily on purpose (it is not
// Hungarian-optimal). If two unmatched-old entries both clear the
// threshold for the same unmatched-new entry, the first del only wins
// the pairing when document-order iteration makes "first" line up with
// the better candidate. Whichever del wins, the output has to remain
// structurally valid: no phantom cells and no dropped content.
describe('pairSimilarUnmatchedCells — competing dels for the same ins', () => {
  it('keeps both candidate cells intact when two old cells could pair with one new cell', () => {
    // Two identical old cells compete for a single similar new cell.
    // The greedy assignment pairs one of them as a content edit and the
    // other is emitted as a full diffdel. Both must be present in the
    // output; neither may silently vanish.
    const before = [
      '<table><tr>',
      '<td>Preserved</td>',
      '<td>Old content alpha to be edited</td>',
      '<td>Old content alpha to be edited</td>',
      '</tr></table>',
    ].join('')

    const after = ['<table><tr>', '<td>Preserved</td>', '<td>Old content alpha CHANGED</td>', '</tr></table>'].join('')

    const result = HtmlDiff.execute(before, after)

    const requiredFragments = [
      // The losing del is emitted as a full diffdel cell.
      "<td class='diffdel'><del class='diffdel'>Old content alpha to be edited</del></td>",
      // The winning pair is emitted as a partial content edit.
      "<del class='diffmod'>to be edited</del>",
      "<ins class='diffmod'>CHANGED</ins>",
    ]
    for (const fragment of requiredFragments) {
      expect(result).toContain(fragment)
    }

    // Structural check: exactly 3 tds in the output (1 preserved,
    // 1 full-del, 1 paired-edit) — no phantoms.
    const tdTotal = result.match(/<td[\s>]/g)?.length ?? 0
    expect(tdTotal).toBe(3)
  })
})
|
|
2054
|
+
|
|
2055
|
+
// Runs row-LCS over a non-trivial 7-row table in which only every other
// row matches. The existing named tests top out at about 4 rows, and the
// matrix never yields a row-count drop this large with this much
// interleaving, so this covers the LCS DP itself rather than just the
// diff emission.
describe('row-LCS on larger tables', () => {
  it('finds 4 preserved rows interleaved with 3 dropped rows in a 7-row old table', () => {
    const before = [
      '<table>',
      '<tr><td>row1 preserved</td></tr>',
      '<tr><td>row2 old body</td></tr>',
      '<tr><td>row3 preserved</td></tr>',
      '<tr><td>row4 old body</td></tr>',
      '<tr><td>row5 preserved</td></tr>',
      '<tr><td>row6 old body</td></tr>',
      '<tr><td>row7 preserved</td></tr>',
      '</table>',
    ].join('')

    const after = [
      '<table>',
      '<tr><td>row1 preserved</td></tr>',
      '<tr><td>row3 preserved</td></tr>',
      '<tr><td>row5 preserved</td></tr>',
      '<tr><td>row7 preserved</td></tr>',
      '</table>',
    ].join('')

    // Each dropped row must be emitted in place, between its preserved
    // neighbours, as a full diffdel row.
    const expected = [
      '<table>',
      '<tr><td>row1 preserved</td></tr>',
      "<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row2 old body</del></td></tr>",
      '<tr><td>row3 preserved</td></tr>',
      "<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row4 old body</del></td></tr>",
      '<tr><td>row5 preserved</td></tr>',
      "<tr class='diffdel'><td class='diffdel'><del class='diffdel'>row6 old body</del></td></tr>",
      '<tr><td>row7 preserved</td></tr>',
      '</table>',
    ].join('')

    expect(HtmlDiff.execute(before, after)).toEqual(expected)
  })
})
|
|
1744
2093
|
})
|