@createiq/htmldiff 1.1.0 → 1.2.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/README.md +67 -0
- package/dist/HtmlDiff.cjs +1192 -456
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +160 -7
- package/dist/HtmlDiff.d.mts +159 -7
- package/dist/HtmlDiff.mjs +1192 -456
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +343 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +67 -522
- package/src/ThreeWayDiff.ts +269 -0
- package/src/ThreeWayTable.ts +625 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.spec.ts +43 -19
- package/test/HtmlDiff.threeWay.spec.ts +173 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +301 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/package.json
CHANGED
package/src/Alignment.ts
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generic sequence-alignment primitives used by the table-aware diff and
|
|
3
|
+
* potentially other granularities (rows, cells, list items, …). Nothing
|
|
4
|
+
* here knows about tables or HTML — the caller passes string keys for
|
|
5
|
+
* exact matching and a similarity callback for fuzzy pairing.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * One step of an alignment between an "old" and a "new" sequence.
 *
 * - both indices set: the old item is matched with the new item;
 * - only `oldIdx` set: the old item has no counterpart on the new side;
 * - only `newIdx` set: the new item has no counterpart on the old side.
 */
export interface Alignment {
  /** Position in the old sequence, or `null` when this step has no old item. */
  oldIdx: number | null
  /** Position in the new sequence, or `null` when this step has no new item. */
  newIdx: number | null
}
|
|
12
|
+
|
|
13
|
+
/**
 * Aligns two key sequences along their longest common subsequence.
 *
 * Keys are compared with strict `===`. The returned steps cover every
 * position of both inputs exactly once, in order:
 *
 *   • `{ oldIdx, newIdx }` — the keys at those positions are equal;
 *   • `{ oldIdx, newIdx: null }` — an old key with no match;
 *   • `{ oldIdx: null, newIdx }` — a new key with no match.
 *
 * @param oldKeys keys of the old sequence
 * @param newKeys keys of the new sequence
 * @returns the alignment steps, front to back
 */
export function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const oldCount = oldKeys.length
  const newCount = newKeys.length

  // lengths[r][c] = LCS length of the first r old keys and first c new keys.
  const lengths: number[][] = []
  for (let row = 0; row <= oldCount; row++) {
    lengths.push(new Array<number>(newCount + 1).fill(0))
  }
  for (let row = 1; row <= oldCount; row++) {
    const above = lengths[row - 1]
    const current = lengths[row]
    for (let col = 1; col <= newCount; col++) {
      current[col] =
        oldKeys[row - 1] === newKeys[col - 1] ? above[col - 1] + 1 : Math.max(above[col], current[col - 1])
    }
  }

  // Walk back from the bottom-right corner, collecting steps last-to-first.
  // Appending and reversing once is linear overall, whereas prepending
  // each step with `unshift` would be quadratic.
  const steps: Alignment[] = []
  let oldPos = oldCount
  let newPos = newCount
  while (oldPos > 0 || newPos > 0) {
    const keysMatch = oldPos > 0 && newPos > 0 && oldKeys[oldPos - 1] === newKeys[newPos - 1]
    if (keysMatch) {
      oldPos--
      newPos--
      steps.push({ oldIdx: oldPos, newIdx: newPos })
      continue
    }

    // On equal LCS lengths the new-side step is taken first during the
    // backward walk, so in the final order the old-only step comes first.
    const consumeNew = newPos > 0 && (oldPos === 0 || lengths[oldPos][newPos - 1] >= lengths[oldPos - 1][newPos])
    if (consumeNew) {
      newPos--
      steps.push({ oldIdx: null, newIdx: newPos })
    } else {
      oldPos--
      steps.push({ oldIdx: oldPos, newIdx: null })
    }
  }

  steps.reverse()
  return steps
}
|
|
54
|
+
|
|
55
|
+
/**
 * Given a shorter sequence (M items) and a longer sequence (N items, with
 * N > M), find the K = N - M positions in the longer sequence that should
 * be "skipped" so the unskipped longer items, aligned positionally with
 * the shorter items, maximise the sum of pairwise similarity.
 *
 * Solves the same problem as enumerating C(N, K) skip combinations and
 * picking the highest-scoring one, but in O(M × N) time via DP:
 *
 *   f(i, j) = max similarity from consuming i shorter and j longer items
 *             (defined for j >= i; entries below the diagonal are never
 *             written or read).
 *   f(0, j) = 0
 *   f(i, j) = max(
 *     f(i-1, j-1) + similarity(i-1, j-1),  // pair
 *     f(i,   j-1)                          // skip longer[j-1]
 *   )
 *
 * Tie-breaking prefers pairing over skipping. Because the backtrack walks
 * from the END of both sequences, preferring a pair there pushes the
 * skips towards EARLIER positions — matching the lex-first-combo
 * behaviour of a full combinatorial enumeration over which K positions
 * to skip.
 *
 * Only the lengths of `shorterTexts` / `longerTexts` are used here; the
 * content is reached exclusively through `similarity`.
 *
 * @param shorterTexts the sequence with fewer items
 * @param longerTexts the sequence with more items
 * @param similarity score for pairing `shorter[shorterIdx]` with
 *   `longer[longerIdx]`
 * @returns the skipped indices of the longer sequence, ascending. Empty
 *   when `longerTexts` is not strictly longer than `shorterTexts` (there
 *   is no slack to distribute).
 */
export function findOptimalAlignmentSkips(
  shorterTexts: string[],
  longerTexts: string[],
  similarity: (shorterIdx: number, longerIdx: number) => number
): number[] {
  const m = shorterTexts.length
  const n = longerTexts.length

  // No slack, nothing to skip. For n === m every move is forced to be a
  // pair, so the table would be filled for nothing. For n < m (a caller
  // contract violation) the table's valid region j >= i does not contain
  // the backtrack start (m, n), so bail out rather than read unwritten
  // cells.
  if (n <= m) return []

  // dp[i][j] is valid only for j >= i; entries below the diagonal are
  // allocated (uniform-shaped matrix keeps the indexing straight) but
  // never written or read. The wasted (M choose 2)-ish cells are not
  // worth "optimising" — a triangular layout would complicate the
  // backtrack's `dp[i][j-1]` reads and the (j > i ? skip : NEG_INF)
  // boundary handling, with no measurable win at the sizes this runs
  // on (capped by MAX_COLUMN_SEARCH_WIDTH).
  const dp: number[][] = Array.from({ length: m + 1 }, () => new Array<number>(n + 1).fill(0))
  for (let i = 1; i <= m; i++) {
    for (let j = i; j <= n; j++) {
      const pair = dp[i - 1][j - 1] + similarity(i - 1, j - 1)
      // On the diagonal there is no slack left, so skipping is impossible.
      const skip = j > i ? dp[i][j - 1] : Number.NEGATIVE_INFINITY
      dp[i][j] = pair >= skip ? pair : skip
    }
  }

  // Backtrack from (m, n). To preserve the fill's "prefer pair on ties"
  // direction we have to ask the same question the fill asked:
  //   pair = dp[i-1][j-1] + similarity(i-1, j-1)
  //   skip = dp[i][j-1]
  // and choose pair iff pair >= skip. A `dp[i][j] > dp[i][j-1]` shortcut
  // would invert the tie-breaking: on a tie it would skip the current
  // (LATER) position instead of pairing it, shifting outputs for
  // score-tied scenarios — see the
  // `column-position search — score-tied inputs` regression tests in
  // `HtmlDiff.tables.spec.ts`. The extra similarity calls during
  // backtrack run O(M+N) times total, dwarfed by the O(M × N) fill.
  const skipped: number[] = []
  let i = m
  let j = n
  while (j > 0) {
    if (i === 0) {
      // Shorter sequence exhausted — every remaining longer item is a skip.
      skipped.push(j - 1)
      j--
      continue
    }
    if (j === i) {
      // No slack left — every remaining move is a pair.
      i--
      j--
      continue
    }
    const pair = dp[i - 1][j - 1] + similarity(i - 1, j - 1)
    const skip = dp[i][j - 1]
    if (pair >= skip) {
      i--
      j--
    } else {
      skipped.push(j - 1)
      j--
    }
  }

  // Skips were collected back-to-front; return them ascending.
  skipped.reverse()
  return skipped
}
|
|
143
|
+
|
|
144
|
+
/**
 * Identifies pairings inside each unmatched-only run, then builds the
 * output alignment by walking the original and substituting paired
 * entries at the *ins position* (not the del position). This keeps the
 * result monotonically non-decreasing in newIdx — required by any
 * downstream emission that walks the new sequence in order. Emitting at
 * the del position would be safe when del<ins in the alignment array
 * (the typical case), but can violate monotonicity when unpaired
 * entries interleave with paired ones in the same run.
 *
 * A "run" is a maximal stretch of consecutive entries that have exactly
 * one side set (pure deletes and pure inserts). Entries with both sides
 * set — and degenerate entries with neither side set — end a run and are
 * copied through unchanged.
 *
 * Greedy assignment: the first del in document order wins its best ins.
 * Suboptimal vs Hungarian on edge cases (two dels above threshold for
 * the same ins), but bounded — a losing del just emits as a full delete
 * rather than a content edit.
 *
 * @param alignment the alignment to refine; not mutated
 * @param threshold a del/ins pair is formed only when its similarity is
 *   strictly greater than this value
 * @param similarity score for pairing old item `oldIdx` with new item
 *   `newIdx`
 * @returns a new alignment array with similar del/ins entries merged
 */
export function pairSimilarUnmatched(
  alignment: Alignment[],
  threshold: number,
  similarity: (oldIdx: number, newIdx: number) => number
): Alignment[] {
  // True for a pure delete or a pure insert (exactly one side is null).
  const isUnmatched = (entry: Alignment): boolean => (entry.oldIdx === null) !== (entry.newIdx === null)

  const pairs = new Map<number, number>() // del-alignment-idx → ins-alignment-idx
  let i = 0
  while (i < alignment.length) {
    // Skip anything that is not part of an unmatched run. Using the same
    // predicate here and in the run scanner below guarantees progress:
    // an entry with BOTH sides null would otherwise open a zero-length
    // run and spin forever.
    if (!isUnmatched(alignment[i])) {
      i++
      continue
    }
    const runStart = i
    while (i < alignment.length && isUnmatched(alignment[i])) i++
    const runEnd = i

    // Split the run into deletes and inserts, remembering both the
    // position in `alignment` and the (now non-null) sequence index.
    const dels: Array<{ at: number; oldIdx: number }> = []
    const inss: Array<{ at: number; newIdx: number }> = []
    for (let k = runStart; k < runEnd; k++) {
      const { oldIdx, newIdx } = alignment[k]
      if (oldIdx !== null) dels.push({ at: k, oldIdx })
      else if (newIdx !== null) inss.push({ at: k, newIdx })
    }

    const usedIns = new Set<number>()
    for (const del of dels) {
      let bestAt = -1
      let bestSim = threshold // strict `>` below: must EXCEED the threshold
      for (const ins of inss) {
        if (usedIns.has(ins.at)) continue
        const sim = similarity(del.oldIdx, ins.newIdx)
        if (sim > bestSim) {
          bestSim = sim
          bestAt = ins.at
        }
      }
      if (bestAt >= 0) {
        pairs.set(del.at, bestAt)
        usedIns.add(bestAt)
      }
    }
  }

  const insToDel = new Map<number, number>() // ins-alignment-idx → del-alignment-idx
  for (const [delAi, insAi] of pairs) insToDel.set(insAi, delAi)

  const result: Alignment[] = []
  for (let k = 0; k < alignment.length; k++) {
    if (pairs.has(k)) continue // paired del — emitted when we reach its ins
    const delAi = insToDel.get(k)
    if (delAi !== undefined) {
      result.push({ oldIdx: alignment[delAi].oldIdx, newIdx: alignment[k].newIdx })
    } else {
      result.push(alignment[k])
    }
  }
  return result
}
|
|
217
|
+
|
|
218
|
+
/**
 * Sorts an alignment into the order in which a cursor walking the new
 * sequence front-to-back should emit it, so that every entry lands in
 * its visually-correct place. Each entry gets a (possibly fractional)
 * slot in new's flow:
 *
 *   • Preserved/paired `(oldIdx, newIdx)`: slot = newIdx.
 *   • Pure insert `(null, newIdx)`: slot = newIdx.
 *   • Pure delete `(oldIdx, null)`: slot = (newIdx of the preserved
 *     entry with the largest oldIdx below this one) + 0.5, or -0.5 when
 *     no such preserved entry exists.
 *
 * Consequences of the +0.5:
 *   • deletes ahead of every preserved entry sit at -0.5, i.e. first;
 *   • deletes in the middle follow their preceding preserved entry;
 *   • trailing deletes follow the last preserved entry;
 *   • a delete precedes an insert in the same gap (the insert's slot is
 *     the next integer), giving "delete first, insert second" reading
 *     order at a replaced position.
 *
 * Deletes sharing a slot are ordered by oldIdx (old's source order).
 * At an integer slot a preserved entry precedes a pure insert with the
 * same newIdx. Any remaining ties keep their input order.
 *
 * Without this reordering, a run of unpaired deletes that sits ahead of
 * any preserved entry in the input would be emitted before the first
 * preserved entry, regardless of where those deletes originated in old.
 *
 * NB: `0.5` is the ONLY fractional offset in use. If another kind of
 * entry ever needs a fractional slot, replace this scheme with a
 * discrete key (e.g. an `(integerSlot, kind, secondary)` triple) rather
 * than introducing a second magic offset that might collide.
 *
 * @param alignment entries to order; the input array is not mutated
 * @returns a new array holding the same entry objects in emission order
 */
export function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
  // Entries present on both sides, ordered by their position in old.
  const anchors: Array<{ oldIdx: number; newIdx: number }> = []
  alignment.forEach(({ oldIdx, newIdx }) => {
    if (oldIdx !== null && newIdx !== null) anchors.push({ oldIdx, newIdx })
  })
  anchors.sort((x, y) => x.oldIdx - y.oldIdx)

  // newIdx of the anchor with the largest oldIdx strictly below `oldIdx`,
  // or -1 when every anchor is at or beyond it.
  const anchorNewIdxBefore = (oldIdx: number): number => {
    let found = -1
    for (let k = 0; k < anchors.length && anchors[k].oldIdx < oldIdx; k++) {
      found = anchors[k].newIdx
    }
    return found
  }

  // Attach the sort key to every entry: slot, then tie-break, then the
  // original position as the final, order-preserving discriminator.
  const keyed = alignment.map((entry, seq) => {
    if (entry.newIdx === null) {
      // Pure delete: park it half a step after its preceding anchor.
      const oldIdx = entry.oldIdx as number
      return { entry, slot: anchorNewIdxBefore(oldIdx) + 0.5, tieBreak: oldIdx, seq }
    }
    // Present in new: preserved (0) sorts ahead of a pure insert (1)
    // should both ever claim the same newIdx (rare).
    return { entry, slot: entry.newIdx, tieBreak: entry.oldIdx === null ? 1 : 0, seq }
  })

  const sortKeys = ['slot', 'tieBreak', 'seq'] as const
  keyed.sort((x, y) => {
    for (const key of sortKeys) {
      if (x[key] !== y[key]) return x[key] - y[key]
    }
    return 0
  })

  return keyed.map(item => item.entry)
}
|
|
295
|
+
|
|
296
|
+
/**
 * Similarity score used for fuzzy pairing: the LARGER of two
 * complementary measures.
 *
 *   1. **Character prefix+suffix similarity** — the share of the longer
 *      string covered by the common prefix plus the common suffix.
 *      Strong when a small edit sits in the middle of a string (one
 *      word changed); weak when the shared content is in the middle and
 *      both ends differ.
 *
 *   2. **Token Jaccard similarity** — intersection-over-union of the
 *      whitespace-separated tokens. Strong in exactly the case the first
 *      measure misses: most of the content is shared but bookended by
 *      different text.
 *
 * Taking the max means a pair qualifies as soon as EITHER measure
 * exceeds the caller's threshold. Neither measure alone covers the full
 * range of legal-document edits seen in production tables.
 *
 * @param a first text
 * @param b second text
 * @returns 1 for identical strings, 0 when exactly one of them is
 *   empty, otherwise the larger of the two measures
 */
export function textSimilarity(a: string, b: string): number {
  if (a === b) {
    return 1
  }

  // They differ, so at most one can be empty — and an empty string
  // shares nothing with a non-empty one.
  const eitherEmpty = a.length === 0 || b.length === 0
  if (eitherEmpty) {
    return 0
  }

  const byCharacters = charPrefixSuffixSimilarity(a, b)
  const byTokens = tokenJaccardSimilarity(a, b)
  return Math.max(byCharacters, byTokens)
}
|
|
321
|
+
|
|
322
|
+
/**
 * Fraction of the longer string that is covered by the prefix and the
 * suffix the two strings have in common. Comparison is per UTF-16 code
 * unit (string indexing).
 *
 * @param a first text
 * @param b second text
 * @returns a value in [0, 1]; 1 when both strings are empty (they are
 *   identical), so the result is never `NaN`
 */
function charPrefixSuffixSimilarity(a: string, b: string): number {
  const longest = Math.max(a.length, b.length)
  // Two empty strings are identical; without this guard the ratio below
  // would be 0 / 0 = NaN.
  if (longest === 0) return 1

  let prefix = 0
  const minLen = Math.min(a.length, b.length)
  while (prefix < minLen && a[prefix] === b[prefix]) prefix++

  // The suffix scan stops before it reaches characters already counted
  // in the prefix of either string, so nothing is counted twice.
  let suffix = 0
  while (
    suffix < a.length - prefix &&
    suffix < b.length - prefix &&
    a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
  ) {
    suffix++
  }

  return (prefix + suffix) / longest
}
|
|
338
|
+
|
|
339
|
+
/**
 * Jaccard similarity (|A ∩ B| / |A ∪ B|) of the two texts' token sets.
 * Tokens are the non-empty pieces obtained by splitting on runs of
 * whitespace; duplicates within a text count once.
 *
 * @param a first text
 * @param b second text
 * @returns a value in [0, 1]; 1 when neither text contains any token
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  const tokensA = new Set(a.split(/\s+/).filter(Boolean))
  const tokensB = new Set(b.split(/\s+/).filter(Boolean))
  if (tokensA.size === 0 && tokensB.size === 0) return 1

  // Probe the larger set with the members of the smaller one.
  const [smaller, larger] = tokensA.size <= tokensB.size ? [tokensA, tokensB] : [tokensB, tokensA]
  let intersection = 0
  for (const token of smaller) {
    if (larger.has(token)) intersection++
  }

  // At least one set is non-empty here and the intersection cannot
  // exceed the smaller set, so the union is always positive.
  const union = tokensA.size + tokensB.size - intersection
  return intersection / union
}
|