@createiq/htmldiff 1.1.0 → 1.2.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.1.0",
3
+ "version": "1.2.0-beta.0",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
@@ -0,0 +1,349 @@
1
+ /**
2
+ * Generic sequence-alignment primitives used by the table-aware diff and
3
+ * potentially other granularities (rows, cells, list items, …). Nothing
4
+ * here knows about tables or HTML — the caller passes string keys for
5
+ * exact matching and a similarity callback for fuzzy pairing.
6
+ */
7
+
8
/**
 * One step of an alignment between an "old" and a "new" sequence.
 *
 * Both indices set means the two positions are matched/paired; exactly one
 * index set means the entry exists on that side only.
 */
export interface Alignment {
  /** Index into the old sequence, or null when the entry exists only in the new sequence. */
  oldIdx: number | null
  /** Index into the new sequence, or null when the entry exists only in the old sequence. */
  newIdx: number | null
}
12
+
13
/**
 * Aligns two key sequences using a longest-common-subsequence table.
 *
 * The returned list is in sequence order and contains one entry per step:
 * both indices set where `oldKeys[oldIdx] === newKeys[newIdx]` was matched,
 * `newIdx` null for an old key left unmatched, and `oldIdx` null for a new
 * key left unmatched. Keys are compared with strict `===`.
 *
 * @param oldKeys Keys of the old sequence.
 * @param newKeys Keys of the new sequence.
 * @returns Alignment entries that together consume every index of both inputs.
 */
export function lcsAlign(oldKeys: string[], newKeys: string[]): Alignment[] {
  const oldCount = oldKeys.length
  const newCount = newKeys.length

  // table[r][c] = LCS length of oldKeys[0..r) and newKeys[0..c).
  // Row 0 and column 0 stay at 0 (empty prefix).
  const table: number[][] = []
  for (let r = 0; r <= oldCount; r++) {
    table.push(new Array<number>(newCount + 1).fill(0))
  }
  for (let r = 1; r <= oldCount; r++) {
    const row = table[r]
    const above = table[r - 1]
    for (let c = 1; c <= newCount; c++) {
      row[c] = oldKeys[r - 1] === newKeys[c - 1] ? above[c - 1] + 1 : Math.max(above[c], row[c - 1])
    }
  }

  // Walk back from the bottom-right corner, collecting steps last-to-first,
  // then flip once at the end (cheaper than unshifting every step).
  const steps: Alignment[] = []
  let r = oldCount
  let c = newCount
  while (r > 0 || c > 0) {
    const keysMatch = r > 0 && c > 0 && oldKeys[r - 1] === newKeys[c - 1]
    if (keysMatch) {
      r -= 1
      c -= 1
      steps.push({ oldIdx: r, newIdx: c })
      continue
    }
    // On equal table values the new-side entry is consumed first while
    // walking backwards, so after the reversal deletes precede inserts.
    const consumeNew = c > 0 && (r === 0 || table[r][c - 1] >= table[r - 1][c])
    if (consumeNew) {
      c -= 1
      steps.push({ oldIdx: null, newIdx: c })
    } else {
      r -= 1
      steps.push({ oldIdx: r, newIdx: null })
    }
  }
  return steps.reverse()
}
54
+
55
/**
 * Chooses which positions of the longer sequence to leave out so that the
 * remaining longer items, paired positionally with the shorter items,
 * maximise the summed pairwise similarity.
 *
 * Dynamic programme over (shorter consumed, longer consumed):
 *
 *   best(0, l) = 0
 *   best(s, l) = max(best(s-1, l-1) + similarity(s-1, l-1),   // pair
 *                    best(s, l-1))                            // skip longer[l-1]
 *
 * Only cells with l >= s are meaningful; on the diagonal (l === s) there is
 * no slack, so skipping is not an option. Ties go to pairing, both in the
 * fill and in the backtrack, which means ties end up skipping EARLIER
 * positions of the longer sequence. The backtrack deliberately recomputes
 * the same pair-vs-skip comparison as the fill so that this tie direction
 * cannot drift.
 *
 * Only the lengths of the two text arrays are used; the contents are reached
 * through `similarity`.
 *
 * @param shorterTexts The shorter sequence (M items).
 * @param longerTexts The longer sequence (N items). The caller must ensure N >= M.
 * @param similarity Scores pairing shorter item `shorterIdx` with longer item `longerIdx`.
 * @returns Ascending indices into `longerTexts` that are skipped (N - M of them when N >= M).
 */
export function findOptimalAlignmentSkips(
  shorterTexts: string[],
  longerTexts: string[],
  similarity: (shorterIdx: number, longerIdx: number) => number
): number[] {
  const shortCount = shorterTexts.length
  const longCount = longerTexts.length

  // Full rectangular table for simple indexing; cells below the diagonal
  // are allocated but not filled.
  const best: number[][] = []
  for (let row = 0; row <= shortCount; row++) {
    best.push(new Array<number>(longCount + 1).fill(0))
  }

  for (let s = 1; s <= shortCount; s++) {
    for (let l = s; l <= longCount; l++) {
      const pairScore = best[s - 1][l - 1] + similarity(s - 1, l - 1)
      // No slack on the diagonal: skipping is impossible there.
      const skipScore = l > s ? best[s][l - 1] : Number.NEGATIVE_INFINITY
      best[s][l] = pairScore >= skipScore ? pairScore : skipScore
    }
  }

  // Backtrack from the full problem, deciding for each longer position
  // (last to first) whether it was skipped or paired.
  const skippedDescending: number[] = []
  let s = shortCount
  for (let l = longCount; l > 0; l--) {
    let skipThis: boolean
    if (s === 0) {
      // Every shorter item is already placed; the rest of longer is skipped.
      skipThis = true
    } else if (l === s) {
      // No slack left: every remaining move is a pair.
      skipThis = false
    } else {
      // Same question the fill asked, so "pair wins ties" is preserved.
      const pairScore = best[s - 1][l - 1] + similarity(s - 1, l - 1)
      const skipScore = best[s][l - 1]
      skipThis = !(pairScore >= skipScore)
    }
    if (skipThis) {
      skippedDescending.push(l - 1)
    } else {
      s--
    }
  }
  return skippedDescending.reverse()
}
143
+
144
/**
 * Upgrades delete/insert entries that look alike into paired entries.
 *
 * The alignment is scanned for maximal runs of one-sided entries (exactly
 * one of `oldIdx` / `newIdx` is null). Inside each run, every delete — in
 * alignment order — greedily claims the not-yet-claimed insert with the
 * highest similarity, provided that similarity is strictly greater than
 * `threshold`. On equal similarity the earlier insert wins.
 *
 * A paired entry is emitted at the position of its INSERT, not its delete,
 * and the paired delete is dropped. This keeps `newIdx` in the same relative
 * order as in the input, which downstream emission walking the new sequence
 * in order relies on.
 *
 * The greedy choice is not globally optimal (two deletes may both prefer the
 * same insert); the losing delete simply stays a plain delete.
 *
 * Entries that are not one-sided (both indices set, or both null) are never
 * paired: they act as run boundaries and are passed through unchanged. A
 * both-null entry is degenerate, but it must not stall the scan.
 *
 * @param alignment Alignment to refine; not mutated.
 * @param threshold Similarity a pair must strictly exceed.
 * @param similarity Scores old item `oldIdx` against new item `newIdx`.
 * @returns A new alignment array; unpaired entries are the original objects.
 */
export function pairSimilarUnmatched(
  alignment: Alignment[],
  threshold: number,
  similarity: (oldIdx: number, newIdx: number) => number
): Alignment[] {
  // insert's alignment position → oldIdx of the delete paired with it
  const pairedOldIdxByIns = new Map<number, number>()
  // alignment positions of deletes that were absorbed into a pair
  const pairedDels = new Set<number>()

  let cursor = 0
  while (cursor < alignment.length) {
    const dels: Array<{ at: number; oldIdx: number }> = []
    const inss: Array<{ at: number; newIdx: number }> = []

    // Collect the maximal run of one-sided entries starting at `cursor`.
    for (; cursor < alignment.length; cursor++) {
      const { oldIdx, newIdx } = alignment[cursor]
      if (oldIdx !== null && newIdx === null) {
        dels.push({ at: cursor, oldIdx })
      } else if (oldIdx === null && newIdx !== null) {
        inss.push({ at: cursor, newIdx })
      } else {
        break
      }
    }

    if (dels.length === 0 && inss.length === 0) {
      // The entry at `cursor` is not one-sided (matched, or degenerate
      // both-null). Step over it so the scan always makes progress.
      cursor++
      continue
    }

    const claimed = new Set<number>()
    for (const del of dels) {
      let bestAt = -1
      let bestSim = threshold
      for (const ins of inss) {
        if (claimed.has(ins.at)) continue
        const sim = similarity(del.oldIdx, ins.newIdx)
        if (sim > bestSim) {
          bestSim = sim
          bestAt = ins.at
        }
      }
      if (bestAt >= 0) {
        claimed.add(bestAt)
        pairedDels.add(del.at)
        pairedOldIdxByIns.set(bestAt, del.oldIdx)
      }
    }
  }

  const result: Alignment[] = []
  alignment.forEach((entry, at) => {
    if (pairedDels.has(at)) return // emitted together with its insert
    const pairedOldIdx = pairedOldIdxByIns.get(at)
    if (pairedOldIdx !== undefined) {
      result.push({ oldIdx: pairedOldIdx, newIdx: entry.newIdx })
    } else {
      result.push(entry)
    }
  })
  return result
}
217
+
218
/**
 * Sorts an alignment into the order in which a cursor walking the NEW
 * sequence should emit it, slotting pure deletes into the gap where they
 * visually belong.
 *
 * Every entry gets a two-part sort key:
 *
 *   • Entry with a newIdx (preserved/paired or pure insert):
 *       primary   = newIdx
 *       secondary = 0 when oldIdx is set, 1 for a pure insert
 *   • Pure delete (newIdx null):
 *       primary   = A + 0.5, where A is the newIdx of the preserved/paired
 *                   entry with the greatest oldIdx below this delete's
 *                   oldIdx, or -1 when there is none
 *       secondary = oldIdx, so deletes sharing a gap keep old's order
 *
 * The half-step puts a delete after its preceding preserved entry and
 * before whatever sits at the next newIdx — "delete first, insert second"
 * at a replaced position. Deletes with no preserved predecessor get -0.5 and
 * therefore lead the output. Remaining ties fall back to the entry's
 * position in the input.
 *
 * NB: 0.5 is the only fractional offset in this scheme. If another kind of
 * entry ever needs a fractional slot, switch to an explicit discrete key
 * rather than inventing a second offset.
 *
 * @param alignment Alignment to reorder; not mutated.
 * @returns A new array holding the same entry objects in emission order.
 */
export function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
  // Entries present on both sides, ordered by their position in old.
  const anchors: Array<{ oldIdx: number; newIdx: number }> = []
  for (const { oldIdx, newIdx } of alignment) {
    if (oldIdx !== null && newIdx !== null) anchors.push({ oldIdx, newIdx })
  }
  anchors.sort((x, y) => x.oldIdx - y.oldIdx)

  // newIdx of the last anchor whose oldIdx is below `oldIdx`, or -1 if none.
  const anchorNewIdxBefore = (oldIdx: number): number => {
    const firstAtOrAfter = anchors.findIndex(anchor => anchor.oldIdx >= oldIdx)
    const lastBefore = (firstAtOrAfter === -1 ? anchors.length : firstAtOrAfter) - 1
    return lastBefore < 0 ? -1 : anchors[lastBefore].newIdx
  }

  const keyed = alignment.map((entry, at) => {
    const { oldIdx, newIdx } = entry
    const key: [number, number] =
      newIdx !== null
        ? [newIdx, oldIdx === null ? 1 : 0]
        : [anchorNewIdxBefore(oldIdx as number) + 0.5, oldIdx as number]
    return { entry, key, at }
  })

  keyed.sort((x, y) => {
    for (let part = 0; part < 2; part++) {
      if (x.key[part] !== y.key[part]) return x.key[part] - y.key[part]
    }
    return x.at - y.at // keep input order for full ties
  })

  return keyed.map(item => item.entry)
}
295
+
296
/**
 * Similarity score used for fuzzy pairing: the larger of two complementary
 * measures.
 *
 *  1. Character prefix+suffix similarity — how much of the longer string is
 *     covered by the shared prefix plus the shared suffix. Good at spotting
 *     a small edit in the middle of otherwise identical text; blind when
 *     the common content sits in the middle and both ends differ.
 *
 *  2. Token Jaccard similarity — intersection-over-union of the
 *     whitespace-separated tokens. Good at "mostly the same words, but
 *     bookended by different bits", which measure 1 cannot see.
 *
 * Taking the max means that either measure clearing the caller's threshold
 * is enough to pair.
 *
 * @param a First text.
 * @param b Second text.
 * @returns 1 for identical strings, 0 when exactly one of them is empty,
 *          otherwise the larger of the two measures.
 */
export function textSimilarity(a: string, b: string): number {
  if (a === b) return 1
  // Reached only when the strings differ, so at most one side is empty here.
  if (Math.min(a.length, b.length) === 0) return 0

  const byEnds = charPrefixSuffixSimilarity(a, b)
  const byTokens = tokenJaccardSimilarity(a, b)
  return Math.max(byEnds, byTokens)
}
321
+
322
/**
 * Fraction of the longer string that is covered by the prefix and suffix the
 * two strings share. Comparison is per UTF-16 code unit.
 *
 * The suffix scan is capped so it never overlaps the already-counted prefix
 * in the shorter string, which keeps the result within [0, 1].
 *
 * Two empty strings are identical and score 1; without that guard the
 * division below would be 0 / 0 and yield NaN.
 *
 * @param a First text.
 * @param b Second text.
 * @returns (shared prefix length + shared suffix length) / longer length.
 */
function charPrefixSuffixSimilarity(a: string, b: string): number {
  const longer = Math.max(a.length, b.length)
  if (longer === 0) return 1

  const shorter = Math.min(a.length, b.length)

  let prefix = 0
  while (prefix < shorter && a[prefix] === b[prefix]) prefix++

  // Characters of the shorter string still available to the suffix.
  const suffixBudget = shorter - prefix
  let suffix = 0
  while (suffix < suffixBudget && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) {
    suffix++
  }

  return (prefix + suffix) / longer
}
338
+
339
/**
 * Jaccard similarity (intersection over union) of the distinct
 * whitespace-separated tokens of two strings.
 *
 * @param a First text.
 * @param b Second text.
 * @returns 1 when neither string contains a token, otherwise
 *          |tokens(a) ∩ tokens(b)| / |tokens(a) ∪ tokens(b)|.
 */
function tokenJaccardSimilarity(a: string, b: string): number {
  // Distinct non-empty tokens; leading/trailing whitespace yields empty
  // pieces from split(), which are dropped.
  const distinctTokens = (text: string): Set<string> =>
    new Set(text.split(/\s+/).filter(piece => piece.length > 0))

  const left = distinctTokens(a)
  const right = distinctTokens(b)
  if (left.size + right.size === 0) return 1

  let shared = 0
  left.forEach(token => {
    if (right.has(token)) shared++
  })

  // At least one set is non-empty here and shared <= min(sizes), so the
  // union size is always positive.
  return shared / (left.size + right.size - shared)
}