@createiq/htmldiff 1.1.0-beta.0 → 1.2.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -0
- package/dist/HtmlDiff.cjs +1259 -498
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +141 -7
- package/dist/HtmlDiff.d.mts +140 -7
- package/dist/HtmlDiff.mjs +1259 -498
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +7 -7
- package/src/Alignment.ts +349 -0
- package/src/HtmlDiff.ts +323 -33
- package/src/HtmlScanner.ts +200 -0
- package/src/TableDiff.ts +99 -550
- package/src/ThreeWayDiff.ts +223 -0
- package/src/ThreeWayTable.ts +701 -0
- package/src/Utils.ts +34 -2
- package/test/HtmlDiff.analyze.spec.ts +152 -0
- package/test/HtmlDiff.tables.matrix.spec.ts +8 -3
- package/test/HtmlDiff.tables.spec.ts +368 -19
- package/test/HtmlDiff.threeWay.spec.ts +175 -0
- package/test/HtmlDiff.threeWay.tables.spec.ts +407 -0
- package/test/TableDiff.bench.ts +39 -0
- package/test/Utils.spec.ts +48 -0
package/dist/HtmlDiff.cjs
CHANGED
|
@@ -48,8 +48,20 @@ function stripTagAttributes(word) {
|
|
|
48
48
|
if (match) return `${match[0]}${word.endsWith("/>") ? "/>" : ">"}`;
|
|
49
49
|
return word;
|
|
50
50
|
}
|
|
51
|
-
function wrapText(text, tagName, cssClass) {
|
|
52
|
-
return `<${tagName} class='${cssClass}'>${text}</${tagName}>`;
|
|
51
|
+
/**
 * Wraps `text` in a `<tagName>` element that carries `cssClass`.
 *
 * Without `metadata` the element gets a plain `class='…'` attribute. With
 * `metadata` the attribute portion is produced by `composeTagAttributes`.
 *
 * @param {string} text - Inner content, emitted verbatim.
 * @param {string} tagName - Element name used for both opening and closing tag.
 * @param {string} cssClass - Base class for the wrapper.
 * @param {object} [metadata] - Optional metadata forwarded to `composeTagAttributes`.
 * @returns {string} The wrapped markup.
 */
function wrapText(text, tagName, cssClass, metadata) {
  const attributes = metadata
    ? composeTagAttributes(cssClass, metadata)
    : ` class='${cssClass}'`;
  return `<${tagName}${attributes}>${text}</${tagName}>`;
}
|
|
55
|
+
/**
 * Build the attribute portion of an opening tag from a base class plus
 * optional metadata. Exposed so emission paths that build opening-tag
 * fragments by hand can stay consistent with `wrapText`.
 *
 * Attribute values are emitted inside single quotes, so any `'` in a class
 * or data value is written as `&#39;` to keep the attribute from being
 * terminated early.
 *
 * @param {string} cssClass - Base class; always emitted first.
 * @param {{extraClasses?: string, dataAttrs?: Object<string, *>}} [metadata]
 *   Optional extra classes (appended after the base class, space separated)
 *   and `data-*` attributes (one per own enumerable key, in key order).
 *   A missing `metadata` yields just the class attribute.
 * @returns {string} Attribute text with a leading space, e.g. ` class='a b' data-k='v'`.
 */
function composeTagAttributes(cssClass, metadata) {
  const escapeSingleQuotes = (value) => String(value).replace(/'/g, "&#39;");
  const classes = metadata?.extraClasses ? `${cssClass} ${metadata.extraClasses}` : cssClass;
  let out = ` class='${escapeSingleQuotes(classes)}'`;
  for (const [key, value] of Object.entries(metadata?.dataAttrs ?? {})) {
    out += ` data-${key}='${escapeSingleQuotes(value)}'`;
  }
  return out;
}
|
|
54
66
|
function isStartOfTag(val) {
|
|
55
67
|
return val === "<";
|
|
@@ -83,6 +95,7 @@ var Utils_default = {
|
|
|
83
95
|
isTag,
|
|
84
96
|
stripTagAttributes,
|
|
85
97
|
wrapText,
|
|
98
|
+
composeTagAttributes,
|
|
86
99
|
isStartOfTag,
|
|
87
100
|
isEndOfTag,
|
|
88
101
|
isStartOfEntity,
|
|
@@ -171,40 +184,454 @@ var MatchFinder = class MatchFinder {
|
|
|
171
184
|
}
|
|
172
185
|
matchLengthAt = newMatchLengthAt;
|
|
173
186
|
}
|
|
174
|
-
return bestMatchSize !== 0 ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize + this.options.blockSize - 1) : null;
|
|
175
|
-
}
|
|
176
|
-
/**
|
|
177
|
-
* This method removes words that occur too many times. This way it reduces total count of comparison operations
|
|
178
|
-
* and as result the diff algorithm takes less time. But the side effect is that it may detect false differences of
|
|
179
|
-
* the repeating words.
|
|
180
|
-
* @private
|
|
181
|
-
*/
|
|
182
|
-
removeRepeatingWords() {
|
|
183
|
-
const threshold = this.newWords.length * this.options.repeatingWordsAccuracy;
|
|
184
|
-
const repeatingWords = Object.entries(this.wordIndices).filter(([, indices]) => indices.length > threshold).map(([word]) => word);
|
|
185
|
-
for (const w of repeatingWords) delete this.wordIndices[w];
|
|
186
|
-
}
|
|
187
|
-
};
|
|
188
|
-
//#endregion
|
|
189
|
-
//#region src/Operation.ts
|
|
190
|
-
var Operation = class {
|
|
191
|
-
action;
|
|
192
|
-
startInOld;
|
|
193
|
-
endInOld;
|
|
194
|
-
startInNew;
|
|
195
|
-
endInNew;
|
|
196
|
-
constructor(action, startInOld, endInOld, startInNew, endInNew) {
|
|
197
|
-
this.action = action;
|
|
198
|
-
this.startInOld = startInOld;
|
|
199
|
-
this.endInOld = endInOld;
|
|
200
|
-
this.startInNew = startInNew;
|
|
201
|
-
this.endInNew = endInNew;
|
|
187
|
+
return bestMatchSize !== 0 ? new Match(bestMatchInOld, bestMatchInNew, bestMatchSize + this.options.blockSize - 1) : null;
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* This method removes words that occur too many times. This way it reduces total count of comparison operations
|
|
191
|
+
* and as result the diff algorithm takes less time. But the side effect is that it may detect false differences of
|
|
192
|
+
* the repeating words.
|
|
193
|
+
* @private
|
|
194
|
+
*/
|
|
195
|
+
removeRepeatingWords() {
|
|
196
|
+
const threshold = this.newWords.length * this.options.repeatingWordsAccuracy;
|
|
197
|
+
const repeatingWords = Object.entries(this.wordIndices).filter(([, indices]) => indices.length > threshold).map(([word]) => word);
|
|
198
|
+
for (const w of repeatingWords) delete this.wordIndices[w];
|
|
199
|
+
}
|
|
200
|
+
};
|
|
201
|
+
//#endregion
|
|
202
|
+
//#region src/Operation.ts
|
|
203
|
+
/**
 * Plain record describing one diff operation: the `action` plus the
 * start/end positions it covers in the old and the new sequence.
 */
var Operation = class {
  action;
  startInOld;
  endInOld;
  startInNew;
  endInNew;
  constructor(action, startInOld, endInOld, startInNew, endInNew) {
    Object.assign(this, { action, startInOld, endInOld, startInNew, endInNew });
  }
};
|
|
217
|
+
//#endregion
|
|
218
|
+
//#region src/Alignment.ts
|
|
219
|
+
/**
 * Longest-common-subsequence alignment of two key sequences.
 *
 * Returns one entry per aligned step, in sequence order:
 *   - `{ oldIdx, newIdx }` with both set for keys that match (strict `===`),
 *   - `{ oldIdx: null, newIdx }` for a key only present on the new side,
 *   - `{ oldIdx, newIdx: null }` for a key only present on the old side.
 *
 * @param {Array} oldKeys
 * @param {Array} newKeys
 * @returns {Array<{oldIdx: number|null, newIdx: number|null}>}
 */
function lcsAlign(oldKeys, newKeys) {
  const oldCount = oldKeys.length;
  const newCount = newKeys.length;

  // lengths[r][c] = LCS length of the first r old keys and first c new keys.
  const lengths = [];
  for (let r = 0; r <= oldCount; r++) lengths.push(new Array(newCount + 1).fill(0));
  for (let r = 1; r <= oldCount; r++) {
    for (let c = 1; c <= newCount; c++) {
      lengths[r][c] = oldKeys[r - 1] === newKeys[c - 1]
        ? lengths[r - 1][c - 1] + 1
        : Math.max(lengths[r - 1][c], lengths[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner; entries are collected in
  // reverse and flipped at the end.
  const steps = [];
  let oldPos = oldCount;
  let newPos = newCount;
  while (oldPos > 0 || newPos > 0) {
    const keysMatch = oldPos > 0 && newPos > 0 && oldKeys[oldPos - 1] === newKeys[newPos - 1];
    if (keysMatch) {
      steps.push({ oldIdx: oldPos - 1, newIdx: newPos - 1 });
      oldPos--;
      newPos--;
      continue;
    }
    // On a tie the new-side entry is consumed first while walking backwards.
    const consumeNew = newPos > 0 && (oldPos === 0 || lengths[oldPos][newPos - 1] >= lengths[oldPos - 1][newPos]);
    if (consumeNew) {
      steps.push({ oldIdx: null, newIdx: newPos - 1 });
      newPos--;
    } else {
      steps.push({ oldIdx: oldPos - 1, newIdx: null });
      oldPos--;
    }
  }
  return steps.reverse();
}
|
|
257
|
+
/**
 * Chooses which positions of the longer sequence to skip so that the
 * remaining longer items, paired positionally with the shorter items,
 * maximise the summed pairwise similarity.
 *
 * Dynamic programme over (shorter consumed, longer consumed):
 *
 *   score(0, l) = 0
 *   score(s, l) = max(score(s-1, l-1) + similarity(s-1, l-1),   // pair
 *                     score(s, l-1))                            // skip longer[l-1]
 *
 * Only cells with l >= s are filled; on the diagonal skipping is impossible.
 * Pairing wins ties, both in the fill and in the backtrack, which makes
 * ties resolve towards skipping earlier positions.
 *
 * Callers must ensure `longerTexts.length >= shorterTexts.length`.
 *
 * @param {Array} shorterTexts
 * @param {Array} longerTexts
 * @param {(shorterIdx: number, longerIdx: number) => number} similarity
 * @returns {number[]} Ascending indices into `longerTexts` that are skipped.
 */
function findOptimalAlignmentSkips(shorterTexts, longerTexts, similarity) {
  const shortCount = shorterTexts.length;
  const longCount = longerTexts.length;
  const score = Array.from({ length: shortCount + 1 }, () => new Array(longCount + 1).fill(0));

  for (let s = 1; s <= shortCount; s++) {
    for (let l = s; l <= longCount; l++) {
      const pairScore = score[s - 1][l - 1] + similarity(s - 1, l - 1);
      let skipScore = Number.NEGATIVE_INFINITY;
      if (l > s) skipScore = score[s][l - 1];
      score[s][l] = pairScore >= skipScore ? pairScore : skipScore;
    }
  }

  const skippedDescending = [];
  let s = shortCount;
  let l = longCount;
  while (l > 0) {
    const nothingLeftToPair = s === 0;
    const onDiagonal = !nothingLeftToPair && l === s;
    // Re-ask the same pair-vs-skip question as the fill so ties break identically.
    const pair = onDiagonal
      || (!nothingLeftToPair && score[s - 1][l - 1] + similarity(s - 1, l - 1) >= score[s][l - 1]);
    if (pair) {
      s--;
    } else {
      skippedDescending.push(l - 1);
    }
    l--;
  }
  return skippedDescending.reverse();
}
|
|
318
|
+
/**
 * Pairs up similar deletes and inserts inside each run of unmatched
 * alignment entries, turning "delete + insert" into a single
 * `(oldIdx, newIdx)` entry.
 *
 * A run is a maximal stretch of entries where exactly one of
 * `oldIdx`/`newIdx` is null. Within a run, pairing is greedy: deletes are
 * visited in order and each takes the not-yet-used insert with the highest
 * similarity strictly above `threshold` (the first such insert wins ties).
 * A delete that finds no partner is left untouched.
 *
 * The paired entry is emitted at the position of the *insert*, and the
 * paired delete is dropped.
 *
 * Entries that are not one-sided (both indices set, or both null) are
 * passed through unchanged and never belong to a run; this also guarantees
 * the scan always advances.
 *
 * @param {Array<{oldIdx: number|null, newIdx: number|null}>} alignment
 * @param {number} threshold - Similarity must be strictly greater than this to pair.
 * @param {(oldIdx: number, newIdx: number) => number} similarity
 * @returns {Array<{oldIdx: number|null, newIdx: number|null}>} New alignment array.
 */
function pairSimilarUnmatched(alignment, threshold, similarity) {
  const isOneSided = (entry) => (entry.oldIdx === null) !== (entry.newIdx === null);

  // Alignment-array positions: insert position -> delete position it absorbs.
  const delForIns = new Map();
  const pairedDels = new Set();

  let i = 0;
  while (i < alignment.length) {
    if (!isOneSided(alignment[i])) {
      i++;
      continue;
    }
    const runStart = i;
    while (i < alignment.length && isOneSided(alignment[i])) i++;
    const runEnd = i;

    const delPositions = [];
    const insPositions = [];
    for (let k = runStart; k < runEnd; k++) {
      (alignment[k].oldIdx !== null ? delPositions : insPositions).push(k);
    }

    const usedIns = new Set();
    for (const delPos of delPositions) {
      let bestIns = -1;
      let bestSim = threshold;
      for (const insPos of insPositions) {
        if (usedIns.has(insPos)) continue;
        const sim = similarity(alignment[delPos].oldIdx, alignment[insPos].newIdx);
        if (sim > bestSim) {
          bestSim = sim;
          bestIns = insPos;
        }
      }
      if (bestIns >= 0) {
        delForIns.set(bestIns, delPos);
        pairedDels.add(delPos);
        usedIns.add(bestIns);
      }
    }
  }

  const result = [];
  alignment.forEach((entry, k) => {
    if (pairedDels.has(k)) return;
    if (delForIns.has(k)) {
      result.push({ oldIdx: alignment[delForIns.get(k)].oldIdx, newIdx: entry.newIdx });
    } else {
      result.push(entry);
    }
  });
  return result;
}
|
|
382
|
+
/**
 * Reorders an alignment so that a cursor walking the new sequence in order
 * emits every entry at its visually-correct position.
 *
 * Each entry gets a sort key `(primary, secondary, originalIdx)`:
 *
 *   - Preserved/paired `(oldIdx, newIdx)`: primary = newIdx, secondary = 0.
 *   - Pure insert `(null, newIdx)`:        primary = newIdx, secondary = 1.
 *   - Pure delete `(oldIdx, null)`:        primary = (newIdx of the preserved
 *     entry with the largest oldIdx below this one, or -1 if none) + 0.5,
 *     secondary = oldIdx.
 *
 * So deletes land right after the preserved entry that precedes them in
 * old (or before everything when there is none), deletes in the same gap
 * keep old's order, and a delete sorts before an insert at the next slot.
 * `0.5` is the only fractional offset in use; a second decoration kind
 * would need a different scheme rather than another magic offset.
 *
 * The preceding preserved entry is found by binary search over the
 * preserved list sorted by oldIdx.
 *
 * @param {Array<{oldIdx: number|null, newIdx: number|null}>} alignment
 * @returns {Array<{oldIdx: number|null, newIdx: number|null}>} The same entry
 *   objects in emission order (the input array is not mutated).
 */
function orderAlignmentForEmission(alignment) {
  const preserved = alignment
    .filter((a) => a.oldIdx !== null && a.newIdx !== null)
    .map(({ oldIdx, newIdx }) => ({ oldIdx, newIdx }))
    .sort((a, b) => a.oldIdx - b.oldIdx);

  // newIdx of the last preserved entry whose oldIdx is < the given one, or -1.
  function newIdxOfPreservedBefore(oldIdx) {
    let lo = 0;
    let hi = preserved.length;
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (preserved[mid].oldIdx < oldIdx) lo = mid + 1;
      else hi = mid;
    }
    // `lo` is now the number of preserved entries strictly before `oldIdx`.
    return lo === 0 ? -1 : preserved[lo - 1].newIdx;
  }

  const decorated = alignment.map((entry, originalIdx) => {
    if (entry.newIdx !== null) {
      return { entry, primary: entry.newIdx, secondary: entry.oldIdx === null ? 1 : 0, originalIdx };
    }
    return {
      entry,
      primary: newIdxOfPreservedBefore(entry.oldIdx) + 0.5,
      secondary: entry.oldIdx,
      originalIdx,
    };
  });

  decorated.sort((a, b) => {
    if (a.primary !== b.primary) return a.primary - b.primary;
    if (a.secondary !== b.secondary) return a.secondary - b.secondary;
    return a.originalIdx - b.originalIdx;
  });
  return decorated.map((d) => d.entry);
}
|
|
453
|
+
/**
 * Similarity score used for fuzzy pairing: the larger of two complementary
 * metrics.
 *
 *  1. Character prefix+suffix similarity — share of the longer string
 *     covered by the common prefix plus common suffix. Good at spotting a
 *     small edit in the middle of otherwise identical text.
 *  2. Token Jaccard similarity — intersection-over-union of the
 *     whitespace-separated tokens. Good at spotting text whose middle is
 *     shared but whose ends differ.
 *
 * Identical strings score 1; if exactly one side is empty the score is 0.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function textSimilarity(a, b) {
  if (a === b) return 1;
  if (a.length === 0 || b.length === 0) return 0;
  const byCharacters = charPrefixSuffixSimilarity(a, b);
  const byTokens = tokenJaccardSimilarity(a, b);
  return Math.max(byCharacters, byTokens);
}
|
|
478
|
+
/**
 * Fraction of the longer string that is covered by the prefix and suffix
 * the two strings have in common. The suffix scan never overlaps the
 * prefix, so the result is at most 1.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function charPrefixSuffixSimilarity(a, b) {
  const shorterLength = Math.min(a.length, b.length);
  const longerLength = Math.max(a.length, b.length);

  let prefix = 0;
  for (; prefix < shorterLength; prefix++) {
    if (a[prefix] !== b[prefix]) break;
  }

  // Characters already claimed by the prefix are not available to the suffix.
  const suffixBudget = shorterLength - prefix;
  let suffix = 0;
  for (; suffix < suffixBudget; suffix++) {
    if (a[a.length - 1 - suffix] !== b[b.length - 1 - suffix]) break;
  }

  return (prefix + suffix) / longerLength;
}
|
|
486
|
+
/**
 * Jaccard index (intersection size over union size) of the distinct
 * whitespace-separated tokens of the two strings. Two inputs without any
 * tokens score 1.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function tokenJaccardSimilarity(a, b) {
  const distinctTokens = (text) => new Set(text.split(/\s+/).filter(Boolean));
  const left = distinctTokens(a);
  const right = distinctTokens(b);
  if (left.size === 0 && right.size === 0) return 1;

  const shared = [...left].filter((token) => right.has(token)).length;
  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) return 0;
  return shared / unionSize;
}
|
|
495
|
+
//#endregion
|
|
496
|
+
//#region src/HtmlScanner.ts
|
|
497
|
+
/**
 * Finds the end of the markup construct that starts at index `i`.
 *
 * Comments (`<!-- … -->`), CDATA sections (`<![CDATA[ … ]]>`) and
 * processing instructions (`<? … ?>`) are closed by their own terminators,
 * so a `>` inside them does not end the construct. Anything else is
 * treated as a tag and ends at the first `>` that is not inside a single-
 * or double-quoted attribute value.
 *
 * @param {string} html
 * @param {number} i - Index of the opening `<`.
 * @returns {{end: number}|null} `end` is the index just past the closing
 *   delimiter; null when the construct is unterminated.
 */
function parseOpeningTagAt(html, i) {
  const delimitedConstructs = [
    ["<!--", "-->"],
    ["<![CDATA[", "]]>"],
    ["<?", "?>"],
  ];
  for (const [opener, closer] of delimitedConstructs) {
    if (!html.startsWith(opener, i)) continue;
    const closeAt = html.indexOf(closer, i + opener.length);
    return closeAt === -1 ? null : { end: closeAt + closer.length };
  }

  let openQuote = null;
  for (let pos = i + 1; pos < html.length; pos++) {
    const ch = html[pos];
    if (openQuote !== null) {
      // Inside a quoted value only the matching quote is significant.
      if (ch === openQuote) openQuote = null;
      continue;
    }
    if (ch === "\"" || ch === "'") openQuote = ch;
    else if (ch === ">") return { end: pos + 1 };
  }
  return null;
}
|
|
532
|
+
/**
 * Tests whether an opening tag named `tagName` starts at index `i`.
 *
 * The name comparison lower-cases the input, so `tagName` must be given in
 * lower case. The character after the name must end the tag name: `>`,
 * `/`, or whitespace (space, tab, line feed, carriage return, form feed).
 * That keeps `<td` from matching `<tdx>`.
 *
 * @param {string} html
 * @param {number} i
 * @param {string} tagName - Lower-case element name.
 * @returns {boolean}
 */
function matchesTagAt(html, i, tagName) {
  if (html[i] !== "<") return false;
  const nameEnd = i + 1 + tagName.length;
  if (html.slice(i + 1, nameEnd).toLowerCase() !== tagName) return false;
  const after = html[nameEnd];
  return after === ">" || after === "/" || after === " " || after === "\t" || after === "\n" || after === "\r" || after === "\f";
}
|
|
538
|
+
/**
 * Tests whether a closing tag `</tagName` starts at index `i`.
 *
 * The name comparison lower-cases the input, so `tagName` must be given in
 * lower case. The character after the name must be `>` or whitespace
 * (space, tab, line feed, carriage return, form feed), so `</td` does not
 * match `</tdx>`.
 *
 * @param {string} html
 * @param {number} i
 * @param {string} tagName - Lower-case element name.
 * @returns {boolean}
 */
function matchesClosingTagAt(html, i, tagName) {
  if (html[i] !== "<" || html[i + 1] !== "/") return false;
  const nameEnd = i + 2 + tagName.length;
  if (html.slice(i + 2, nameEnd).toLowerCase() !== tagName) return false;
  const after = html[nameEnd];
  return after === ">" || after === " " || after === "\t" || after === "\n" || after === "\r" || after === "\f";
}
|
|
544
|
+
/**
 * Finds the closing tag that matches an already-opened `<tagName>` and
 * returns the index just past it, honouring nested elements of the same
 * name.
 *
 * Scanning starts at `from` (just after the opening tag) with a nesting
 * depth of 1. Each further non-self-closing `<tagName …>` increases the
 * depth, each `</tagName>` decreases it, and the tag that brings it to 0
 * is the match. HTML comments are skipped as a whole, so tag-like text
 * inside `<!-- … -->` does not affect the depth.
 *
 * @param {string} html
 * @param {number} from - Index at which to start scanning.
 * @param {string} tagName - Lower-case element name.
 * @param {number} [limit=html.length] - Scanning stops before this index.
 * @returns {number} Index just past the matching closing tag, or -1 if
 *   none starts before `limit`.
 */
function findMatchingClosingTag(html, from, tagName, limit = html.length) {
  let depth = 1;
  let i = from;
  while (i < limit) {
    if (html.startsWith("<!--", i)) {
      // Comment content is not markup; jump past it (or step over an
      // unterminated opener).
      const comment = parseOpeningTagAt(html, i);
      i = comment ? comment.end : i + 1;
      continue;
    }
    if (matchesTagAt(html, i, tagName)) {
      const opening = parseOpeningTagAt(html, i);
      if (!opening) {
        // Unterminated tag: treat the `<` as text and keep scanning.
        i++;
        continue;
      }
      // A self-closing `<tagName … />` does not open a nesting level.
      if (!html.slice(i, opening.end).endsWith("/>")) depth++;
      i = opening.end;
    } else if (matchesClosingTagAt(html, i, tagName)) {
      depth--;
      // Fall back to the canonical `</tagName>` length when the closing tag
      // has no terminating `>`.
      const closingEnd = parseOpeningTagAt(html, i)?.end ?? i + `</${tagName}>`.length;
      if (depth === 0) return closingEnd;
      i = closingEnd;
    } else {
      i++;
    }
  }
  return -1;
}
|
|
567
|
+
/**
 * Returns `openingTag` with the class token(s) in `cls` added.
 *
 * The existing `class` attribute is located with `findClassAttribute`
 * (attribute-aware, so text such as `title="see class='x'"` is not
 * mistaken for it). When one exists, only tokens not already present are
 * appended and the tag is returned untouched if nothing is missing. When
 * none exists, a new ` class='…'` attribute is inserted just before the
 * closing `>` or `/>`, after trimming whitespace in front of it.
 *
 * @param {string} openingTag - Complete opening tag, e.g. `<td colspan="2">`.
 * @param {string} cls - One or more whitespace-separated class names.
 * @returns {string}
 */
function injectClass(openingTag, cls) {
  const wanted = cls.split(/\s+/).filter(Boolean);
  if (wanted.length === 0) return openingTag;

  const existing = findClassAttribute(openingTag);
  if (!existing) {
    const closerLength = openingTag.endsWith("/>") ? 2 : 1;
    const cut = openingTag.length - closerLength;
    const head = openingTag.slice(0, cut).replace(/\s*$/, "");
    return `${head} class='${cls}'${openingTag.slice(cut)}`;
  }

  const present = existing.value.split(/\s+/).filter(Boolean);
  const toAdd = wanted.filter((token) => !present.includes(token));
  if (toAdd.length === 0) return openingTag;

  const merged = [...present, ...toAdd].join(" ");
  return openingTag.slice(0, existing.valueStart) + merged + openingTag.slice(existing.valueEnd);
}
|
|
590
|
+
/**
 * Locates the real `class` attribute of an opening tag by walking its
 * attributes one at a time, stepping over quoted values so that text
 * inside another attribute's value is never mistaken for it.
 *
 * The attribute name is compared case-insensitively. Attributes without a
 * value are skipped. Values may be double-quoted, single-quoted or
 * unquoted; an unquoted value ends at whitespace, `>` or `/`.
 *
 * @param {string} openingTag - Complete opening tag starting with `<`.
 * @returns {{valueStart: number, valueEnd: number, value: string}|null}
 *   Offsets of the value content within `openingTag` (quotes excluded) and
 *   the value itself, or null when the tag has no `class` attribute.
 */
function findClassAttribute(openingTag) {
  const length = openingTag.length;
  let pos = 1;
  const advanceWhile = (pattern) => {
    while (pos < length && pattern.test(openingTag[pos])) pos++;
  };

  // Step over the tag name.
  advanceWhile(/[A-Za-z0-9_:-]/);

  while (pos < length) {
    advanceWhile(/\s/);
    if (pos >= length) break;
    const current = openingTag[pos];
    if (current === ">" || current === "/") break;

    const nameStart = pos;
    advanceWhile(/[^\s=>/]/);
    const name = openingTag.slice(nameStart, pos);

    advanceWhile(/\s/);
    // No `=`: a valueless attribute, move on to the next one.
    if (openingTag[pos] !== "=") continue;
    pos++;
    advanceWhile(/\s/);

    let valueStart;
    let valueEnd;
    const opener = openingTag[pos];
    if (opener === "\"" || opener === "'") {
      valueStart = pos + 1;
      const closeAt = openingTag.indexOf(opener, valueStart);
      // An unterminated quote runs to the end of the tag text.
      valueEnd = closeAt === -1 ? length : closeAt;
      pos = closeAt === -1 ? length : closeAt + 1;
    } else {
      valueStart = pos;
      advanceWhile(/[^\s>/]/);
      valueEnd = pos;
    }

    if (name.toLowerCase() === "class") {
      return { valueStart, valueEnd, value: openingTag.slice(valueStart, valueEnd) };
    }
  }
  return null;
}
|
|
204
632
|
//#endregion
|
|
205
633
|
//#region src/TableDiff.ts
|
|
206
634
|
const PLACEHOLDER_PREFIX_BASE = "<!--HTMLDIFF_TABLE_";
|
|
207
|
-
const PLACEHOLDER_SUFFIX = "-->";
|
|
208
635
|
/**
|
|
209
636
|
* Hard cap on table dimensions handled by the structural-aware path.
|
|
210
637
|
* The row-LCS is O(rows²), the per-row cell-LCS is O(cells²), and each
|
|
@@ -216,10 +643,17 @@ const PLACEHOLDER_SUFFIX = "-->";
|
|
|
216
643
|
*/
|
|
217
644
|
const MAX_TABLE_ROWS = 1500;
|
|
218
645
|
const MAX_TABLE_CELLS_PER_ROW = 200;
|
|
219
|
-
|
|
646
|
+
const MAX_COLUMN_DELTA = 6;
|
|
647
|
+
const MAX_COLUMN_SEARCH_WIDTH = 200;
|
|
648
|
+
/**
 * Produces a placeholder prefix that does not occur in any of the given
 * inputs. Variadic, so a caller with several documents can check all of
 * them at once.
 *
 * Up to 8 random candidates of the form
 * `${PLACEHOLDER_PREFIX_BASE}<8 hex digits>_` are tried; the first one no
 * input contains is returned. If all are taken, a timestamp-based prefix
 * is returned without a further collision check.
 *
 * @param {...string} inputs
 * @returns {string}
 */
function makePlaceholderPrefix(...inputs) {
  const occursInInputs = (candidate) => inputs.some((input) => input.includes(candidate));

  let attemptsLeft = 8;
  while (attemptsLeft-- > 0) {
    const nonce = Math.floor(Math.random() * 4294967295).toString(16).padStart(8, "0");
    const candidate = `${PLACEHOLDER_PREFIX_BASE}${nonce}_`;
    if (!occursInInputs(candidate)) return candidate;
  }
  return `${PLACEHOLDER_PREFIX_BASE}fallback_${Date.now()}_`;
}
|
|
@@ -246,7 +680,7 @@ function preprocessTables(oldHtml, newHtml, diffCell) {
|
|
|
246
680
|
const placeholderPrefix = makePlaceholderPrefix(oldHtml, newHtml);
|
|
247
681
|
const placeholderToDiff = /* @__PURE__ */ new Map();
|
|
248
682
|
for (let i = pairs.length - 1; i >= 0; i--) {
|
|
249
|
-
const placeholder = `${placeholderPrefix}${i}
|
|
683
|
+
const placeholder = `${placeholderPrefix}${i}-->`;
|
|
250
684
|
placeholderToDiff.set(placeholder, pairs[i].diffed);
|
|
251
685
|
modifiedOld = spliceString(modifiedOld, pairs[i].oldTable.tableStart, pairs[i].oldTable.tableEnd, placeholder);
|
|
252
686
|
modifiedNew = spliceString(modifiedNew, pairs[i].newTable.tableStart, pairs[i].newTable.tableEnd, placeholder);
|
|
@@ -412,7 +846,7 @@ function diffPositionalTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
|
|
|
412
846
|
*/
|
|
413
847
|
function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
|
|
414
848
|
const alignment = orderAlignmentForEmission(pairSimilarUnmatchedRows(lcsAlign(oldTable.rows.map((row) => rowKey(oldHtml, row)), newTable.rows.map((row) => rowKey(newHtml, row))), oldTable, newTable, oldHtml, newHtml));
|
|
415
|
-
if (newTable.rows.length === 0) return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment
|
|
849
|
+
if (newTable.rows.length === 0) return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment);
|
|
416
850
|
const out = [];
|
|
417
851
|
out.push(newHtml.slice(newTable.tableStart, newTable.rows[0].rowStart));
|
|
418
852
|
let cursor = newTable.rows[0].rowStart;
|
|
@@ -420,83 +854,17 @@ function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diff
|
|
|
420
854
|
const newRow = newTable.rows[align.newIdx];
|
|
421
855
|
out.push(newHtml.slice(cursor, newRow.rowStart));
|
|
422
856
|
if (align.oldIdx !== null) out.push(diffPreservedRow(oldHtml, newHtml, oldTable.rows[align.oldIdx], newRow, diffCell));
|
|
423
|
-
else out.push(emitFullRow(newHtml, newRow, "ins"
|
|
857
|
+
else out.push(emitFullRow(newHtml, newRow, "ins"));
|
|
424
858
|
cursor = newRow.rowEnd;
|
|
425
|
-
} else if (align.oldIdx !== null) out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], "del"
|
|
859
|
+
} else if (align.oldIdx !== null) out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], "del"));
|
|
426
860
|
out.push(newHtml.slice(cursor, newTable.tableEnd));
|
|
427
861
|
return out.join("");
|
|
428
862
|
}
|
|
429
|
-
|
|
430
|
-
* Reorders the alignment so emission produces rows in the visually-
|
|
431
|
-
* correct order. Each entry is assigned a fractional "position" in
|
|
432
|
-
* new's flow:
|
|
433
|
-
*
|
|
434
|
-
* • Preserved/paired (oldIdx, newIdx): position = newIdx.
|
|
435
|
-
* • Pure insert (null, newIdx): position = newIdx.
|
|
436
|
-
* • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
|
|
437
|
-
* before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
|
|
438
|
-
* they appear in old's row order. The +0.5 places dels BEFORE any
|
|
439
|
-
* insert at the same gap (insert at newIdx N1+1 has position N1+1
|
|
440
|
-
* which is > N1+0.5), giving the natural "delete first, insert
|
|
441
|
-
* second" reading order at a replaced position.
|
|
442
|
-
*
|
|
443
|
-
* This handles the full range:
|
|
444
|
-
* • Run of unpaired dels at the start (no preserved predecessor):
|
|
445
|
-
* position -0.5, sorted by oldIdx.
|
|
446
|
-
* • Dels in the middle: positioned right after their preceding
|
|
447
|
-
* preserved row.
|
|
448
|
-
* • Dels at the end (no preserved successor): positioned after the
|
|
449
|
-
* last preserved row.
|
|
450
|
-
*
|
|
451
|
-
* Without this reordering, a run of unpaired deletes at low alignment
|
|
452
|
-
* indices got emitted at cursor = first-new-row position — putting
|
|
453
|
-
* all deletes before any preserved row in the output, regardless of
|
|
454
|
-
* where they came from in old.
|
|
455
|
-
*/
|
|
456
|
-
function orderAlignmentForEmission(alignment) {
|
|
457
|
-
const preserved = [];
|
|
458
|
-
for (const a of alignment) if (a.oldIdx !== null && a.newIdx !== null) preserved.push({
|
|
459
|
-
oldIdx: a.oldIdx,
|
|
460
|
-
newIdx: a.newIdx
|
|
461
|
-
});
|
|
462
|
-
preserved.sort((a, b) => a.oldIdx - b.oldIdx);
|
|
463
|
-
function newIdxOfPreservedBefore(oldIdx) {
|
|
464
|
-
let result = -1;
|
|
465
|
-
for (const p of preserved) {
|
|
466
|
-
if (p.oldIdx >= oldIdx) break;
|
|
467
|
-
result = p.newIdx;
|
|
468
|
-
}
|
|
469
|
-
return result;
|
|
470
|
-
}
|
|
471
|
-
const decorated = alignment.map((a, i) => {
|
|
472
|
-
let primary;
|
|
473
|
-
let secondary;
|
|
474
|
-
if (a.newIdx !== null) {
|
|
475
|
-
primary = a.newIdx;
|
|
476
|
-
secondary = a.oldIdx === null ? 1 : 0;
|
|
477
|
-
} else {
|
|
478
|
-
primary = newIdxOfPreservedBefore(a.oldIdx) + .5;
|
|
479
|
-
secondary = a.oldIdx;
|
|
480
|
-
}
|
|
481
|
-
return {
|
|
482
|
-
entry: a,
|
|
483
|
-
primary,
|
|
484
|
-
secondary,
|
|
485
|
-
originalIdx: i
|
|
486
|
-
};
|
|
487
|
-
});
|
|
488
|
-
decorated.sort((a, b) => {
|
|
489
|
-
if (a.primary !== b.primary) return a.primary - b.primary;
|
|
490
|
-
if (a.secondary !== b.secondary) return a.secondary - b.secondary;
|
|
491
|
-
return a.originalIdx - b.originalIdx;
|
|
492
|
-
});
|
|
493
|
-
return decorated.map((d) => d.entry);
|
|
494
|
-
}
|
|
495
|
-
function rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell) {
|
|
863
|
+
/**
 * Rebuild a table as a header slice followed by whole rows: every alignment
 * entry with an `oldIdx` is emitted as a full "del" row taken from old,
 * every other entry with a `newIdx` as a full "ins" row taken from new.
 * The result is closed with `</table>`.
 *
 * @returns {string} the concatenated table html.
 */
function rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment) {
	const pieces = [headerSlice(newHtml, newTable, oldHtml, oldTable)];
	for (const { oldIdx, newIdx } of alignment) {
		if (oldIdx !== null) pieces.push(emitFullRow(oldHtml, oldTable.rows[oldIdx], "del"));
		else if (newIdx !== null) pieces.push(emitFullRow(newHtml, newTable.rows[newIdx], "ins"));
	}
	pieces.push("</table>");
	return pieces.join("");
}
|
|
@@ -516,27 +884,25 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
|
516
884
|
const delta = newRow.cells.length - oldRow.cells.length;
|
|
517
885
|
const absDelta = Math.abs(delta);
|
|
518
886
|
if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
|
|
519
|
-
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow,
|
|
520
|
-
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow,
|
|
887
|
+
if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
888
|
+
return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
521
889
|
}
|
|
522
890
|
return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
|
|
523
891
|
}
|
|
524
|
-
const MAX_COLUMN_DELTA = 6;
|
|
525
|
-
const MAX_COLUMN_SEARCH_WIDTH = 40;
|
|
526
892
|
/**
|
|
527
|
-
* For a row where new has
|
|
528
|
-
*
|
|
529
|
-
*
|
|
530
|
-
*
|
|
531
|
-
*
|
|
532
|
-
* diff for matched pairs.
|
|
893
|
+
* For a row where new has more cells than old, find the column positions
|
|
894
|
+
* in new where cells were inserted by running a monotonic-alignment DP
|
|
895
|
+
* over the cell texts: pick the skip positions that maximise the sum-of-
|
|
896
|
+
* similarities of the unskipped new cells aligned positionally against
|
|
897
|
+
* the old cells. The inserted cells are emitted with diff markers; the
|
|
898
|
+
* rest are aligned positionally with content diff for matched pairs.
|
|
533
899
|
*/
|
|
534
|
-
function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow,
|
|
535
|
-
const insertedPositions = findBestColumnInsertPositions(oldRow, newRow,
|
|
900
|
+
function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
901
|
+
const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, oldHtml, newHtml);
|
|
536
902
|
const inserted = new Set(insertedPositions);
|
|
537
903
|
const out = [rowHeaderSlice(newHtml, newRow)];
|
|
538
904
|
let oldIdx = 0;
|
|
539
|
-
for (let c = 0; c < newRow.cells.length; c++) if (inserted.has(c)) out.push(emitFullCell(newHtml, newRow.cells[c], "ins"
|
|
905
|
+
for (let c = 0; c < newRow.cells.length; c++) if (inserted.has(c)) out.push(emitFullCell(newHtml, newRow.cells[c], "ins"));
|
|
540
906
|
else {
|
|
541
907
|
out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell));
|
|
542
908
|
oldIdx++;
|
|
@@ -544,14 +910,14 @@ function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
|
|
|
544
910
|
out.push("</tr>");
|
|
545
911
|
return out.join("");
|
|
546
912
|
}
|
|
547
|
-
function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow,
|
|
548
|
-
const deletedPositions = findBestColumnDeletePositions(oldRow, newRow,
|
|
913
|
+
function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
|
|
914
|
+
const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, oldHtml, newHtml);
|
|
549
915
|
const deleted = new Set(deletedPositions);
|
|
550
916
|
const out = [rowHeaderSlice(newHtml, newRow)];
|
|
551
917
|
let newIdx = 0;
|
|
552
918
|
for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
|
|
553
919
|
if (deleted.has(oldIdx)) {
|
|
554
|
-
out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], "del"
|
|
920
|
+
out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], "del"));
|
|
555
921
|
continue;
|
|
556
922
|
}
|
|
557
923
|
out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell));
|
|
@@ -560,60 +926,15 @@ function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell)
|
|
|
560
926
|
out.push("</tr>");
|
|
561
927
|
return out.join("");
|
|
562
928
|
}
|
|
563
|
-
function findBestColumnInsertPositions(oldRow, newRow,
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
const inserted = new Set(combo);
|
|
568
|
-
let score = 0;
|
|
569
|
-
let oldIdx = 0;
|
|
570
|
-
for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
|
|
571
|
-
if (inserted.has(newIdx)) continue;
|
|
572
|
-
score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
|
|
573
|
-
oldIdx++;
|
|
574
|
-
}
|
|
575
|
-
if (score > bestScore) {
|
|
576
|
-
bestScore = score;
|
|
577
|
-
bestPositions = combo;
|
|
578
|
-
}
|
|
579
|
-
}
|
|
580
|
-
return bestPositions;
|
|
581
|
-
}
|
|
582
|
-
/**
 * Exhaustively try every way of dropping `k` cells from the old row and
 * return the set of dropped positions whose remaining cells, paired
 * positionally with the new row's cells, have the highest summed
 * similarity. The first best-scoring combination wins.
 *
 * @returns {number[]} old-row cell indices to treat as deleted (empty when
 *   no combination is produced).
 */
function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
	const best = {
		positions: [],
		score: -1
	};
	for (const candidate of combinationsOfRange(oldRow.cells.length, k)) {
		const dropped = new Set(candidate);
		let total = 0;
		let newIdx = 0;
		oldRow.cells.forEach((oldCell, oldIdx) => {
			if (dropped.has(oldIdx)) return;
			total += cellSimilarity(oldCell, newRow.cells[newIdx], oldHtml, newHtml);
			newIdx += 1;
		});
		if (total > best.score) {
			best.score = total;
			best.positions = candidate;
		}
	}
	return best.positions;
}
|
|
929
|
+
/**
 * Extract the text of every old and new cell once, then ask
 * `findOptimalAlignmentSkips` which new-row positions to skip, scoring each
 * (old, new) pair with `textSimilarity`.
 *
 * @returns whatever `findOptimalAlignmentSkips` returns; callers treat it
 *   as the list of inserted new-row column positions.
 */
function findBestColumnInsertPositions(oldRow, newRow, oldHtml, newHtml) {
	const textsOf = (row, html) => row.cells.map((cell) => cellText(html, cell));
	const oldTexts = textsOf(oldRow, oldHtml);
	const newTexts = textsOf(newRow, newHtml);
	const score = (oldIdx, newIdx) => textSimilarity(oldTexts[oldIdx], newTexts[newIdx]);
	return findOptimalAlignmentSkips(oldTexts, newTexts, score);
}
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
*/
|
|
606
|
-
/**
 * Lazily yield every k-element combination of the indices 0..n-1 in
 * lexicographic order. Each yielded array is a fresh copy. Yields nothing
 * when `k` is 0 or larger than `n`.
 *
 * @param {number} n size of the index range
 * @param {number} k number of indices per combination
 */
function* combinationsOfRange(n, k) {
	if (k === 0 || k > n) return;
	const picks = Array.from({ length: k }, (_, slot) => slot);
	for (;;) {
		yield [...picks];
		// Find the rightmost slot that has not yet reached its maximum (n - k + slot).
		let slot = k - 1;
		while (slot >= 0 && picks[slot] === n - k + slot) slot--;
		if (slot < 0) return;
		picks[slot] += 1;
		// Everything to the right restarts immediately after the bumped slot.
		for (let next = slot + 1; next < k; next++) picks[next] = picks[next - 1] + 1;
	}
}
|
|
934
|
+
/**
 * Mirror of the insert case with the roles swapped: the new row's texts are
 * passed as the shorter sequence and the old row's texts as the longer one,
 * so the skips reported by `findOptimalAlignmentSkips` index into the old
 * row. Pairs are scored with `textSimilarity`.
 *
 * @returns whatever `findOptimalAlignmentSkips` returns; callers treat it
 *   as the list of deleted old-row column positions.
 */
function findBestColumnDeletePositions(oldRow, newRow, oldHtml, newHtml) {
	const textsOf = (row, html) => row.cells.map((cell) => cellText(html, cell));
	const oldTexts = textsOf(oldRow, oldHtml);
	const newTexts = textsOf(newRow, newHtml);
	// The scorer receives (newIdx, oldIdx) because the sequences are passed swapped.
	const score = (newIdx, oldIdx) => textSimilarity(oldTexts[oldIdx], newTexts[newIdx]);
	return findOptimalAlignmentSkips(newTexts, oldTexts, score);
}
|
|
618
939
|
/**
|
|
619
940
|
* Try to align cells by logical column position (sum of colspans). When
|
|
@@ -717,8 +1038,8 @@ function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
|
|
|
717
1038
|
const oldCell = oldRow.cells[align.oldIdx];
|
|
718
1039
|
const newCell = newRow.cells[align.newIdx];
|
|
719
1040
|
out.push(emitDiffedCell(oldHtml, newHtml, oldCell, newCell, diffCell));
|
|
720
|
-
} else if (align.newIdx !== null) out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], "ins"
|
|
721
|
-
else if (align.oldIdx !== null) out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], "del"
|
|
1041
|
+
} else if (align.newIdx !== null) out.push(emitFullCell(newHtml, newRow.cells[align.newIdx], "ins"));
|
|
1042
|
+
else if (align.oldIdx !== null) out.push(emitFullCell(oldHtml, oldRow.cells[align.oldIdx], "del"));
|
|
722
1043
|
out.push("</tr>");
|
|
723
1044
|
return out.join("");
|
|
724
1045
|
}
|
|
@@ -731,7 +1052,7 @@ function cellKey(html, cell) {
|
|
|
731
1052
|
* each `<td>`, with an `<ins>`/`<del>` wrapper around any cell content
|
|
732
1053
|
* (empty cells get the class but no wrapper).
|
|
733
1054
|
*/
|
|
734
|
-
function emitFullRow(html, row, kind
|
|
1055
|
+
function emitFullRow(html, row, kind) {
|
|
735
1056
|
const cls = kind === "ins" ? "diffins" : "diffdel";
|
|
736
1057
|
const trOpening = parseOpeningTagAt(html, row.rowStart);
|
|
737
1058
|
if (!trOpening) return html.slice(row.rowStart, row.rowEnd);
|
|
@@ -739,7 +1060,7 @@ function emitFullRow(html, row, kind, diffCell) {
|
|
|
739
1060
|
let cursor = trOpening.end;
|
|
740
1061
|
for (const cell of row.cells) {
|
|
741
1062
|
out.push(html.slice(cursor, cell.cellStart));
|
|
742
|
-
out.push(emitFullCell(html, cell, kind
|
|
1063
|
+
out.push(emitFullCell(html, cell, kind));
|
|
743
1064
|
cursor = cell.cellEnd;
|
|
744
1065
|
}
|
|
745
1066
|
out.push(html.slice(cursor, row.rowEnd));
|
|
@@ -753,7 +1074,7 @@ function emitFullRow(html, row, kind, diffCell) {
|
|
|
753
1074
|
* the full recursive diff would produce for newly-inserted formatting.
|
|
754
1075
|
* Empty cells get the class on the `<td>` but no inner wrapping.
|
|
755
1076
|
*/
|
|
756
|
-
function emitFullCell(html, cell, kind
|
|
1077
|
+
function emitFullCell(html, cell, kind) {
|
|
757
1078
|
const cls = kind === "ins" ? "diffins" : "diffdel";
|
|
758
1079
|
const tdOpening = parseOpeningTagAt(html, cell.cellStart);
|
|
759
1080
|
if (!tdOpening) return html.slice(cell.cellStart, cell.cellEnd);
|
|
@@ -789,7 +1110,7 @@ function wrapInlineTextRuns(content, kind) {
|
|
|
789
1110
|
let j = i;
|
|
790
1111
|
while (j < content.length && content[j] !== "<") j++;
|
|
791
1112
|
const text = content.slice(i, j);
|
|
792
|
-
if (text.trim().length > 0) out.push(
|
|
1113
|
+
if (text.trim().length > 0) out.push(wrapText(text, tag, cls));
|
|
793
1114
|
else out.push(text);
|
|
794
1115
|
i = j;
|
|
795
1116
|
}
|
|
@@ -810,253 +1131,39 @@ function rowHeaderSlice(html, row) {
|
|
|
810
1131
|
return html.slice(row.rowStart, row.cells[0].cellStart);
|
|
811
1132
|
}
|
|
812
1133
|
/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
|
|
813
|
-
const ROW_FUZZY_THRESHOLD = .5;
|
|
814
|
-
/**
|
|
815
|
-
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
816
|
-
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
817
|
-
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
818
|
-
* works for both granularities in practice.
|
|
819
|
-
*/
|
|
820
|
-
const CELL_FUZZY_THRESHOLD = .5;
|
|
821
|
-
/**
|
|
822
|
-
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
823
|
-
* inserted" (or vice versa) and pair entries whose content is similar
|
|
824
|
-
* enough to be treated as an edit rather than a delete+insert. This keeps
|
|
825
|
-
* row-level edits (a typo fix, a single word change) from being shown as
|
|
826
|
-
* an entire row vanishing and a new one appearing — matching what users
|
|
827
|
-
* expect from a typical track-changes view.
|
|
828
|
-
*/
|
|
829
|
-
/**
 * Row-level fuzzy pairing: delegates to `pairSimilarUnmatched` with
 * `ROW_FUZZY_THRESHOLD`, scoring candidates with `rowSimilarity`.
 */
function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
	const scoreRows = (oldIdx, newIdx) => rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml);
	return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRows);
}
|
|
832
|
-
/**
 * Cell-level fuzzy pairing: delegates to `pairSimilarUnmatched` with
 * `CELL_FUZZY_THRESHOLD`, scoring candidates with `cellSimilarity`.
 */
function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
	const scoreCells = (oldIdx, newIdx) => cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
	return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCells);
}
|
|
835
|
-
/**
|
|
836
|
-
* Identify pairings inside each unmatched-only run, then build the output
|
|
837
|
-
* alignment by walking the original and substituting paired entries at
|
|
838
|
-
* the *ins position* (not the del position). This keeps the result
|
|
839
|
-
* monotonic in newIdx — critical because the cursor-based emission
|
|
840
|
-
* downstream walks new's html in order. Emitting at the del position
|
|
841
|
-
* would be fine when del<ins in the alignment array (the typical case),
|
|
842
|
-
* but can violate monotonicity when there are mixed unpaired entries in
|
|
843
|
-
* between (column-add + row-add together, content-edit + column-add,
|
|
844
|
-
* etc.).
|
|
845
|
-
*
|
|
846
|
-
* Generic over what's being paired — works for both rows (by full row
|
|
847
|
-
* content similarity) and cells (by per-cell content similarity).
|
|
848
|
-
*/
|
|
849
|
-
/**
 * Pair unmatched deletes with unmatched inserts inside each run of
 * one-sided alignment entries, then rebuild the alignment with every pair
 * emitted at the insert's position (the paired delete entry is dropped).
 *
 * Within a run, deletes are visited in order and each greedily takes the
 * not-yet-used insert with the highest similarity strictly above
 * `threshold`.
 *
 * Entries that are not one-sided — fully matched, or malformed with both
 * indices null — end a run and are passed through unchanged. (Previously a
 * both-null entry was neither skipped nor consumed, so the scan never
 * advanced and the function hung.)
 *
 * @param {{oldIdx: number|null, newIdx: number|null}[]} alignment
 * @param {number} threshold similarity a pair must exceed
 * @param {(oldIdx: number, newIdx: number) => number} similarity
 * @returns {{oldIdx: number|null, newIdx: number|null}[]} new alignment array
 */
function pairSimilarUnmatched(alignment, threshold, similarity) {
	// Exactly one of the two indices is null.
	const isOneSided = (entry) => (entry.oldIdx === null) !== (entry.newIdx === null);
	// alignment position of a delete -> alignment position of its paired insert
	const pairs = new Map();
	let i = 0;
	while (i < alignment.length) {
		if (!isOneSided(alignment[i])) {
			i++;
			continue;
		}
		const delIndices = [];
		const insIndices = [];
		for (; i < alignment.length && isOneSided(alignment[i]); i++) {
			if (alignment[i].oldIdx !== null) delIndices.push(i);
			else insIndices.push(i);
		}
		const usedIns = new Set();
		for (const di of delIndices) {
			let bestIi = -1;
			let bestSim = threshold;
			for (const ii of insIndices) {
				if (usedIns.has(ii)) continue;
				const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
				if (sim > bestSim) {
					bestSim = sim;
					bestIi = ii;
				}
			}
			if (bestIi >= 0) {
				pairs.set(di, bestIi);
				usedIns.add(bestIi);
			}
		}
	}
	const insToDel = new Map();
	for (const [delAi, insAi] of pairs) insToDel.set(insAi, delAi);
	const result = [];
	for (let k = 0; k < alignment.length; k++) {
		// A paired delete is represented by the merged entry at its insert's position.
		if (pairs.has(k)) continue;
		const delAi = insToDel.get(k);
		if (delAi === void 0) result.push(alignment[k]);
		else result.push({
			oldIdx: alignment[delAi].oldIdx,
			newIdx: alignment[k].newIdx
		});
	}
	return result;
}
|
|
898
|
-
/**
|
|
899
|
-
* Combined similarity metric used for both row-level and cell-level
|
|
900
|
-
* fuzzy pairing. Returns the MAX of two complementary metrics:
|
|
901
|
-
*
|
|
902
|
-
* 1. **Character prefix+suffix similarity** — fraction of the longer
|
|
903
|
-
* string covered by shared prefix + shared suffix. Catches small
|
|
904
|
-
* edits in the middle of a string (one word changed in a row).
|
|
905
|
-
* Misses cases where the bulk of common content is in the middle
|
|
906
|
-
* and the ends differ.
|
|
907
|
-
*
|
|
908
|
-
* 2. **Token Jaccard similarity** — intersection-over-union of the
|
|
909
|
-
* whitespace-split tokens. Catches "most of the content is the
|
|
910
|
-
* same but bookended by different bits" — e.g. a row whose only
|
|
911
|
-
* edit is a column added at the start and another at the end,
|
|
912
|
-
* where the ~50 chars in the middle that DO match would be
|
|
913
|
-
* invisible to prefix+suffix.
|
|
914
|
-
*
|
|
915
|
-
* Either metric exceeding the threshold means pair. Neither alone is
|
|
916
|
-
* sufficient for the full range of legal-doc edits we see in
|
|
917
|
-
* production tables.
|
|
918
|
-
*/
|
|
919
|
-
/** Similarity of two rows, computed by `textSimilarity` over their `rowText`. */
function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
	const before = rowText(oldHtml, oldRow);
	const after = rowText(newHtml, newRow);
	return textSimilarity(before, after);
}
|
|
922
|
-
/** Similarity of two cells, computed by `textSimilarity` over their `cellText`. */
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
	const before = cellText(oldHtml, oldCell);
	const after = cellText(newHtml, newCell);
	return textSimilarity(before, after);
}
|
|
925
|
-
/**
 * Similarity of two strings: 1 when identical, 0 when exactly one side is
 * empty, otherwise the larger of the character prefix+suffix score and the
 * token Jaccard score.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number}
 */
function textSimilarity(a, b) {
	if (a === b) return 1;
	if (a.length === 0 || b.length === 0) return 0;
	const byCharacters = charPrefixSuffixSimilarity(a, b);
	const byTokens = tokenJaccardSimilarity(a, b);
	return Math.max(byCharacters, byTokens);
}
|
|
930
|
-
/**
 * Fraction of the longer string that is covered by the prefix and suffix
 * the two strings share. The suffix scan stops where the shared prefix
 * ends, so no character is counted twice.
 *
 * Two empty strings are identical and score 1; without that guard the
 * result would be 0/0 = NaN.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} value in [0, 1]
 */
function charPrefixSuffixSimilarity(a, b) {
	const longest = Math.max(a.length, b.length);
	if (longest === 0) return 1;
	const shortest = Math.min(a.length, b.length);
	let prefix = 0;
	while (prefix < shortest && a[prefix] === b[prefix]) prefix++;
	let suffix = 0;
	while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
	return (prefix + suffix) / longest;
}
|
|
938
|
-
/**
 * Jaccard index (intersection over union) of the distinct
 * whitespace-separated tokens of two strings. Two token-less strings
 * score 1.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} value in [0, 1]
 */
function tokenJaccardSimilarity(a, b) {
	const tokenize = (text) => new Set(text.split(/\s+/).filter(Boolean));
	const left = tokenize(a);
	const right = tokenize(b);
	if (left.size === 0 && right.size === 0) return 1;
	let shared = 0;
	left.forEach((token) => {
		if (right.has(token)) shared++;
	});
	const union = left.size + right.size - shared;
	return union === 0 ? 0 : shared / union;
}
|
|
947
|
-
/**
 * Comparable text of a row: each cell's content with tags replaced by a
 * space, cells joined with a space, whitespace runs collapsed, trimmed and
 * lower-cased.
 *
 * @param {string} html source the cell offsets index into
 * @param {{cells: {contentStart: number, contentEnd: number}[]}} row
 * @returns {string}
 */
function rowText(html, row) {
	const perCell = [];
	for (const { contentStart, contentEnd } of row.cells) {
		const content = html.slice(contentStart, contentEnd);
		perCell.push(content.replace(/<[^>]+>/g, " "));
	}
	const joined = perCell.join(" ");
	return joined.replace(/\s+/g, " ").trim().toLowerCase();
}
|
|
952
|
-
/**
 * Comparable text of a single cell: tags replaced by a space, whitespace
 * runs collapsed, trimmed and lower-cased.
 *
 * @param {string} html source the cell offsets index into
 * @param {{contentStart: number, contentEnd: number}} cell
 * @returns {string}
 */
function cellText(html, cell) {
	const content = html.slice(cell.contentStart, cell.contentEnd);
	const withoutTags = content.replace(/<[^>]+>/g, " ");
	return withoutTags.replace(/\s+/g, " ").trim().toLowerCase();
}
|
|
955
|
-
/**
|
|
956
|
-
* Standard LCS alignment: walks both sequences and emits a list of pairs
|
|
957
|
-
* where `(oldIdx, newIdx)` are both set for matching positions, and one
|
|
958
|
-
* side is null for an unmatched entry on the other side. Equality uses
|
|
959
|
-
* strict ===.
|
|
960
|
-
*/
|
|
961
|
-
/**
 * Align two key sequences along a longest common subsequence.
 *
 * Keys are compared with strict `===`. Matching positions produce an entry
 * with both indices set; an unmatched old key produces `{oldIdx, newIdx:
 * null}` and an unmatched new key `{oldIdx: null, newIdx}`.
 *
 * @param {unknown[]} oldKeys
 * @param {unknown[]} newKeys
 * @returns {{oldIdx: number|null, newIdx: number|null}[]} entries in
 *   forward order
 */
function lcsAlign(oldKeys, newKeys) {
	const rows = oldKeys.length;
	const cols = newKeys.length;
	// table[r][c] = LCS length of the first r old keys and first c new keys.
	const table = Array.from({ length: rows + 1 }, () => new Array(cols + 1).fill(0));
	for (let r = 1; r <= rows; r++) {
		for (let c = 1; c <= cols; c++) {
			table[r][c] = oldKeys[r - 1] === newKeys[c - 1] ? table[r - 1][c - 1] + 1 : Math.max(table[r - 1][c], table[r][c - 1]);
		}
	}
	// Backtrack from the bottom-right corner, collecting entries back-to-front.
	const reversed = [];
	let r = rows;
	let c = cols;
	while (r > 0 || c > 0) {
		if (r > 0 && c > 0 && oldKeys[r - 1] === newKeys[c - 1]) {
			reversed.push({
				oldIdx: r - 1,
				newIdx: c - 1
			});
			r--;
			c--;
		} else if (c > 0 && (r === 0 || table[r][c - 1] >= table[r - 1][c])) {
			reversed.push({
				oldIdx: null,
				newIdx: c - 1
			});
			c--;
		} else {
			reversed.push({
				oldIdx: r - 1,
				newIdx: null
			});
			r--;
		}
	}
	return reversed.reverse();
}
|
|
1134
|
+
const ROW_FUZZY_THRESHOLD = .5;
|
|
992
1135
|
/**
|
|
993
|
-
*
|
|
994
|
-
*
|
|
1136
|
+
* Threshold for "this cell is a content-edit of that cell." Tuned the same
|
|
1137
|
+
* as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
|
|
1138
|
+
* content typically ARE the same logical cell with a body edit, so 0.5
|
|
1139
|
+
* works for both granularities in practice.
|
|
995
1140
|
*/
|
|
1141
|
+
const CELL_FUZZY_THRESHOLD = .5;
|
|
996
1142
|
/**
|
|
997
|
-
*
|
|
998
|
-
*
|
|
999
|
-
*
|
|
1000
|
-
*
|
|
1001
|
-
*
|
|
1002
|
-
*
|
|
1003
|
-
* `class="mod mod colspan"`.
|
|
1143
|
+
* After exact LCS, scan the alignment for runs of "old deleted, then new
|
|
1144
|
+
* inserted" (or vice versa) and pair entries whose content is similar
|
|
1145
|
+
* enough to be treated as an edit rather than a delete+insert. This keeps
|
|
1146
|
+
* row-level edits (a typo fix, a single word change) from being shown as
|
|
1147
|
+
* an entire row vanishing and a new one appearing — matching what users
|
|
1148
|
+
* expect from a typical track-changes view.
|
|
1004
1149
|
*/
|
|
1005
|
-
function
|
|
1006
|
-
const
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
if (classAttr) {
|
|
1010
|
-
const existingTokens = classAttr.value.split(/\s+/).filter(Boolean);
|
|
1011
|
-
const missing = clsTokens.filter((t) => !existingTokens.includes(t));
|
|
1012
|
-
if (missing.length === 0) return openingTag;
|
|
1013
|
-
const updatedValue = existingTokens.length === 0 ? missing.join(" ") : `${existingTokens.join(" ")} ${missing.join(" ")}`;
|
|
1014
|
-
return openingTag.slice(0, classAttr.valueStart) + updatedValue + openingTag.slice(classAttr.valueEnd);
|
|
1015
|
-
}
|
|
1016
|
-
const insertAt = openingTag.endsWith("/>") ? openingTag.length - 2 : openingTag.length - 1;
|
|
1017
|
-
return `${openingTag.slice(0, insertAt).replace(/\s*$/, "")} class='${cls}'${openingTag.slice(insertAt)}`;
|
|
1150
|
+
/**
 * Row-level fuzzy pairing. The text of every old and new row is extracted
 * once up front so the similarity callback only compares cached strings;
 * pairing itself is delegated to `pairSimilarUnmatched` with
 * `ROW_FUZZY_THRESHOLD`.
 */
function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
	const textsOf = (table, html) => table.rows.map((row) => rowText(html, row));
	const oldTexts = textsOf(oldTable, oldHtml);
	const newTexts = textsOf(newTable, newHtml);
	const scoreRows = (oldIdx, newIdx) => textSimilarity(oldTexts[oldIdx], newTexts[newIdx]);
	return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, scoreRows);
}
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
if (openingTag[i] === ">" || openingTag[i] === "/") break;
|
|
1032
|
-
const nameStart = i;
|
|
1033
|
-
while (i < openingTag.length && !/[\s=>/]/.test(openingTag[i])) i++;
|
|
1034
|
-
const name = openingTag.slice(nameStart, i);
|
|
1035
|
-
while (i < openingTag.length && /\s/.test(openingTag[i])) i++;
|
|
1036
|
-
if (openingTag[i] !== "=") continue;
|
|
1037
|
-
i++;
|
|
1038
|
-
while (i < openingTag.length && /\s/.test(openingTag[i])) i++;
|
|
1039
|
-
let valueStart;
|
|
1040
|
-
let valueEnd;
|
|
1041
|
-
if (openingTag[i] === "\"" || openingTag[i] === "'") {
|
|
1042
|
-
const quote = openingTag[i];
|
|
1043
|
-
i++;
|
|
1044
|
-
valueStart = i;
|
|
1045
|
-
while (i < openingTag.length && openingTag[i] !== quote) i++;
|
|
1046
|
-
valueEnd = i;
|
|
1047
|
-
if (i < openingTag.length) i++;
|
|
1048
|
-
} else {
|
|
1049
|
-
valueStart = i;
|
|
1050
|
-
while (i < openingTag.length && !/[\s>/]/.test(openingTag[i])) i++;
|
|
1051
|
-
valueEnd = i;
|
|
1052
|
-
}
|
|
1053
|
-
if (name.toLowerCase() === "class") return {
|
|
1054
|
-
valueStart,
|
|
1055
|
-
valueEnd,
|
|
1056
|
-
value: openingTag.slice(valueStart, valueEnd)
|
|
1057
|
-
};
|
|
1058
|
-
}
|
|
1059
|
-
return null;
|
|
1155
|
+
/**
 * Cell-level fuzzy pairing. The text of every old and new cell is
 * extracted once up front so the similarity callback only compares cached
 * strings; pairing itself is delegated to `pairSimilarUnmatched` with
 * `CELL_FUZZY_THRESHOLD`.
 */
function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
	const textsOf = (row, html) => row.cells.map((cell) => cellText(html, cell));
	const oldTexts = textsOf(oldRow, oldHtml);
	const newTexts = textsOf(newRow, newHtml);
	const scoreCells = (oldIdx, newIdx) => textSimilarity(oldTexts[oldIdx], newTexts[newIdx]);
	return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCells);
}
|
|
1160
|
+
/**
 * Comparable text of a row: every cell's content with tags replaced by a
 * space, the cells joined with a space, whitespace runs collapsed, then
 * trimmed and lower-cased.
 *
 * @param {string} html source the cell offsets index into
 * @param {{cells: {contentStart: number, contentEnd: number}[]}} row
 * @returns {string}
 */
function rowText(html, row) {
	return row.cells
		.map(({ contentStart, contentEnd }) => html.slice(contentStart, contentEnd).replace(/<[^>]+>/g, " "))
		.join(" ")
		.replace(/\s+/g, " ")
		.trim()
		.toLowerCase();
}
|
|
1165
|
+
/**
 * Comparable text of one cell: tags replaced by a space, whitespace runs
 * collapsed, then trimmed and lower-cased.
 *
 * @param {string} html source the cell offsets index into
 * @param {{contentStart: number, contentEnd: number}} cell
 * @returns {string}
 */
function cellText(html, cell) {
	const { contentStart, contentEnd } = cell;
	const tagless = html.slice(contentStart, contentEnd).replace(/<[^>]+>/g, " ");
	const collapsed = tagless.replace(/\s+/g, " ");
	return collapsed.trim().toLowerCase();
}
|
|
1061
1168
|
/**
|
|
1062
1169
|
* Walks html and returns ranges for every top-level `<table>...</table>`
|
|
@@ -1143,65 +1250,574 @@ function findTopLevelCells(html, start, end) {
|
|
|
1143
1250
|
else i++;
|
|
1144
1251
|
return cells;
|
|
1145
1252
|
}
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
const
|
|
1150
|
-
|
|
1253
|
+
//#endregion
|
|
1254
|
+
//#region src/ThreeWayDiff.ts
|
|
1255
|
+
/**
 * Merge two word-level diffs that share a middle version into one list of
 * attributed segments.
 *
 * `d1`'s "new" side and `d2`'s "old" side are walked in lockstep (both are
 * indexed by `d1.newDiffWords` positions). For each position, in order:
 *   1. tokens `d1` deleted at that boundary are emitted as del/"cp";
 *   2. tokens `d2` inserted at that boundary are emitted as ins/"me" —
 *      before the middle-version token normally, but after it when that
 *      token is itself a deletion or rejection;
 *   3. the middle-version words up to and including this token's original
 *      index are emitted with the attribution from `combine`.
 * Trailing boundary deletions/insertions follow, then any remaining
 * original words as an "equal" segment.
 *
 * @returns {{attr: object, words: string[]}[]} segments; adjacent runs with
 *   the same attribution are merged by `appendSegment`.
 */
function buildSegments(d1, d2) {
	const v2DiffLen = d1.newDiffWords.length;
	const origins = buildOriginMap(d1.operations, v2DiffLen);
	const fates = buildFateMap(d2.operations, v2DiffLen);
	const cpDeletionsAt = collectDeletionsAtBoundary(d1);
	const meInsertionsAt = collectInsertionsAtBoundary(d2);
	// Without an explicit mapping, diff indices are taken to equal original indices.
	const diffToOriginal = d1.newContentToOriginal ?? Array.from({ length: v2DiffLen }, (_, i) => i);
	const v2OriginalLen = d1.newOriginalWords.length;
	// Fresh attribution objects per segment, so segments never share an attr instance.
	const cpDeletion = () => ({
		kind: "del",
		author: "cp"
	});
	const meInsertion = () => ({
		kind: "ins",
		author: "me"
	});
	const segments = [];
	let originalCursor = 0;
	for (let i = 0; i < v2DiffLen; i++) {
		const deletedHere = cpDeletionsAt.get(i);
		if (deletedHere?.length) appendSegment(segments, cpDeletion(), deletedHere);
		const attr = combine(origins[i], fates[i]);
		const nextCursor = diffToOriginal[i] + 1;
		const slice = d1.newOriginalWords.slice(originalCursor, nextCursor);
		originalCursor = nextCursor;
		const insertedHere = meInsertionsAt.get(i);
		const hasInsertion = Boolean(insertedHere?.length);
		const insertAfterToken = hasInsertion && isDeletion(attr);
		if (hasInsertion && !insertAfterToken) appendSegment(segments, meInsertion(), insertedHere);
		appendSegment(segments, attr, slice);
		if (insertAfterToken) appendSegment(segments, meInsertion(), insertedHere);
	}
	const tailDeleted = cpDeletionsAt.get(v2DiffLen);
	if (tailDeleted?.length) appendSegment(segments, cpDeletion(), tailDeleted);
	const tailInserted = meInsertionsAt.get(v2DiffLen);
	if (tailInserted?.length) appendSegment(segments, meInsertion(), tailInserted);
	if (originalCursor < v2OriginalLen) appendSegment(segments, { kind: "equal" }, d1.newOriginalWords.slice(originalCursor));
	return segments;
}
|
|
1300
|
+
/**
 * Label every position 0..v2Len-1 with where it came from. Positions
 * default to "preserved-from-v1"; those covered by the new-side range of an
 * action-2 op become "inserted-by-cp" and those covered by an action-4 op
 * "replaced-into-by-cp". Out-of-range indices are ignored.
 *
 * @param {{action: number, startInNew: number, endInNew: number}[]} ops
 * @param {number} v2Len
 * @returns {string[]}
 */
function buildOriginMap(ops, v2Len) {
	const origins = new Array(v2Len).fill("preserved-from-v1");
	for (const op of ops) {
		let label;
		if (op.action === 2) label = "inserted-by-cp";
		else if (op.action === 4) label = "replaced-into-by-cp";
		else continue;
		for (let idx = op.startInNew; idx < op.endInNew; idx++) {
			if (idx >= 0 && idx < v2Len) origins[idx] = label;
		}
	}
	return origins;
}
|
|
1309
|
+
/**
 * Label every position 0..v2Len-1 with what happens to it next. Positions
 * default to "preserved-to-v3"; those covered by the old-side range of an
 * action-1 op become "deleted-by-me" and those covered by an action-4 op
 * "replaced-out-by-me". Out-of-range indices are ignored.
 *
 * @param {{action: number, startInOld: number, endInOld: number}[]} ops
 * @param {number} v2Len
 * @returns {string[]}
 */
function buildFateMap(ops, v2Len) {
	const fates = new Array(v2Len).fill("preserved-to-v3");
	for (const op of ops) {
		let label;
		if (op.action === 1) label = "deleted-by-me";
		else if (op.action === 4) label = "replaced-out-by-me";
		else continue;
		for (let idx = op.startInOld; idx < op.endInOld; idx++) {
			if (idx >= 0 && idx < v2Len) fates[idx] = label;
		}
	}
	return fates;
}
|
|
1318
|
+
/** True when the attribution removes content: kind "del" or "reject". */
function isDeletion(attr) {
	const { kind } = attr;
	if (kind === "del") return true;
	return kind === "reject";
}
|
|
1321
|
+
/**
 * Combine a token's origin label and fate label into one attribution:
 *   - not inserted, not deleted -> equal
 *   - inserted only             -> ins by "cp"
 *   - deleted only              -> del by "me"
 *   - inserted and deleted      -> reject (by "me", rejected "cp")
 *
 * @param {string} origin label as produced by `buildOriginMap`
 * @param {string} fate label as produced by `buildFateMap`
 */
function combine(origin, fate) {
	const cpInserted = ["inserted-by-cp", "replaced-into-by-cp"].includes(origin);
	const meDeleted = ["deleted-by-me", "replaced-out-by-me"].includes(fate);
	if (cpInserted) {
		return meDeleted ? {
			kind: "reject",
			by: "me",
			rejected: "cp"
		} : {
			kind: "ins",
			author: "cp"
		};
	}
	return meDeleted ? {
		kind: "del",
		author: "me"
	} : { kind: "equal" };
}
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1339
|
+
/**
|
|
1340
|
+
* Map V2-diff-boundary → CP-deleted V1 tokens at that boundary. Includes
|
|
1341
|
+
* both pure Delete ops and the V1-side of Replace ops (semantically a
|
|
1342
|
+
* Delete+Insert; the Insert half is picked up by the V2-token walk).
|
|
1343
|
+
*/
|
|
1344
|
+
/**
 * Group the old-side words removed by action-1 and action-4 operations by
 * the new-side boundary (`startInNew`) at which they were removed. Words
 * from several operations at the same boundary are concatenated in
 * operation order; operations that remove no words are skipped.
 *
 * @param {{operations: object[], oldDiffWords: string[]}} d
 * @returns {Map<number, string[]>}
 */
function collectDeletionsAtBoundary(d) {
	const byBoundary = new Map();
	for (const op of d.operations) {
		const removesOldWords = op.action === 1 || op.action === 4;
		if (!removesOldWords) continue;
		const removed = d.oldDiffWords.slice(op.startInOld, op.endInOld);
		if (removed.length === 0) continue;
		const bucket = byBoundary.get(op.startInNew);
		if (bucket) bucket.push(...removed);
		else byBoundary.set(op.startInNew, removed);
	}
	return byBoundary;
}
|
|
1356
|
+
/**
 * Group the new-side words added by action-2 and action-4 operations by the
 * old-side boundary (`startInOld`) at which they were added. Words from
 * several operations at the same boundary are concatenated in operation
 * order; operations that add no words are skipped.
 *
 * @param {{operations: object[], newDiffWords: string[]}} d
 * @returns {Map<number, string[]>}
 */
function collectInsertionsAtBoundary(d) {
	const byBoundary = new Map();
	for (const op of d.operations) {
		const addsNewWords = op.action === 2 || op.action === 4;
		if (!addsNewWords) continue;
		const added = d.newDiffWords.slice(op.startInNew, op.endInNew);
		if (added.length === 0) continue;
		const bucket = byBoundary.get(op.startInOld);
		if (bucket) bucket.push(...added);
		else byBoundary.set(op.startInOld, added);
	}
	return byBoundary;
}
|
|
1368
|
+
/**
 * Append `words` to the segment list. Empty word lists are ignored. When
 * the last segment has the same attribution (per `sameAttribution`) the
 * words are added to it in place; otherwise a new segment holding a copy of
 * `words` is pushed.
 *
 * @param {{attr: object, words: string[]}[]} segments mutated in place
 * @param {object} attr attribution for the words
 * @param {string[]} words
 */
function appendSegment(segments, attr, words) {
	if (words.length === 0) return;
	const tail = segments[segments.length - 1];
	const canExtendTail = Boolean(tail) && sameAttribution(tail.attr, attr);
	if (!canExtendTail) {
		segments.push({
			attr,
			words: [...words]
		});
		return;
	}
	tail.words.push(...words);
}
|
|
1158
|
-
function
|
|
1159
|
-
if (
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1380
|
+
/**
 * Whether two attributions can share one segment: the kinds must match,
 * and for "ins"/"del" the authors must match too. Two "equal" or two
 * "reject" attributions always match.
 */
function sameAttribution(a, b) {
	if (a.kind !== b.kind) return false;
	switch (a.kind) {
		case "equal":
		case "reject": return true;
		case "ins":
		case "del": return a.author === b.author;
		default: return false;
	}
}
|
|
1387
|
+
/**
|
|
1388
|
+
* Build the `WrapMetadata` for an attribution. Single source of truth
|
|
1389
|
+
* for author-class / data-attr shape so the three emission paths
|
|
1390
|
+
* (word-level, table-level full-row/cell, multi-table whole-table
|
|
1391
|
+
* pre-wrap) stay consistent. A change here propagates to every author
|
|
1392
|
+
* marker in the output.
|
|
1393
|
+
*/
|
|
1394
|
+
/**
 * Build the wrap metadata for an author marker.
 *
 * @param {string} author goes into `dataAttrs.author` and the extra classes
 * @param {string} [rejects] when given, adds `dataAttrs.rejects` and a
 *   `rejects-<value>` class
 * @returns {{extraClasses: string, dataAttrs: object}}
 */
function authorAttribution(author, rejects) {
	if (rejects === void 0) return {
		extraClasses: author,
		dataAttrs: { author }
	};
	return {
		extraClasses: `${author} rejects-${rejects}`,
		dataAttrs: {
			author,
			rejects
		}
	};
}
|
|
1402
|
+
/**
 * Map a segment attribution onto the pieces needed to emit it: the
 * wrapper tag name, the base CSS class, and the author metadata.
 *
 * - `ins`    -> `<ins class='diffins'>` attributed to `attr.author`
 * - `del`    -> `<del class='diffdel'>` attributed to `attr.author`
 * - `reject` -> `<del class='diffdel'>` attributed to `attr.by`, marked
 *   as rejecting `attr.rejected`
 *
 * Any other kind (including `equal`) yields `undefined`.
 */
function segmentEmissionShape(attr) {
	const { kind } = attr;
	if (kind === "ins") {
		return {
			tag: "ins",
			baseClass: "diffins",
			metadata: authorAttribution(attr.author)
		};
	}
	if (kind === "del") {
		return {
			tag: "del",
			baseClass: "diffdel",
			metadata: authorAttribution(attr.author)
		};
	}
	if (kind === "reject") {
		return {
			tag: "del",
			baseClass: "diffdel",
			metadata: authorAttribution(attr.by, attr.rejected)
		};
	}
	return void 0;
}
|
|
1426
|
+
//#endregion
|
|
1427
|
+
//#region src/ThreeWayTable.ts
|
|
1428
|
+
/**
 * Entry point for three-way table preprocessing.
 *
 * Locates the top-level tables in each version. Returns `null` (meaning
 * "no table preprocessing") when none of the inputs contains a table or
 * when any table exceeds the size limit. Otherwise delegates to the
 * positional strategy if the tables line up 1:1, or to the content-based
 * strategy if they do not, and returns that strategy's result.
 */
function preprocessTablesThreeWay(v1, v2, v3, cellDiff) {
	const perVersion = [
		findTopLevelTables(v1),
		findTopLevelTables(v2),
		findTopLevelTables(v3)
	];
	if (perVersion.every((tables) => tables.length === 0)) return null;
	// Checked version by version, stopping at the first oversized table.
	if (perVersion.some((tables) => tables.some((t) => exceedsSizeLimit(t)))) return null;
	const [t1s, t2s, t3s] = perVersion;
	const placeholderPrefix = makePlaceholderPrefix(v1, v2, v3);
	const strategy = positionallyAligned(v1, v2, v3, t1s, t2s, t3s)
		? preprocessAlignedByPosition
		: preprocessMisalignedByContent;
	return strategy(v1, v2, v3, t1s, t2s, t3s, cellDiff, placeholderPrefix);
}
|
|
1440
|
+
/**
 * Positional strategy: the i-th table of V1, V2 and V3 are treated as
 * the same table.
 *
 * Each triple is diffed (in document order) with `diffTableThreeWay`,
 * then every table span in all three inputs is replaced by a shared
 * placeholder `${placeholderPrefix}${i}-->`. Replacement runs from the
 * last table to the first so earlier offsets stay valid.
 *
 * @returns the three placeholder-substituted inputs plus a map from
 *   placeholder to the diffed table HTML.
 */
function preprocessAlignedByPosition(v1, v2, v3, t1s, t2s, t3s, cellDiff, placeholderPrefix) {
	const triples = t1s.map((t1, i) => ({
		t1,
		t2: t2s[i],
		t3: t3s[i],
		diffed: diffTableThreeWay(v1, v2, v3, t1, t2s[i], t3s[i], cellDiff)
	}));
	const placeholderToDiff = new Map();
	const substitute = (html, table, placeholder) => spliceString(html, table.tableStart, table.tableEnd, placeholder);
	let modifiedV1 = v1;
	let modifiedV2 = v2;
	let modifiedV3 = v3;
	let i = triples.length;
	while (i-- > 0) {
		const { t1, t2, t3, diffed } = triples[i];
		const placeholder = `${placeholderPrefix}${i}-->`;
		placeholderToDiff.set(placeholder, diffed);
		modifiedV1 = substitute(modifiedV1, t1, placeholder);
		modifiedV2 = substitute(modifiedV2, t2, placeholder);
		modifiedV3 = substitute(modifiedV3, t3, placeholder);
	}
	return {
		modifiedV1,
		modifiedV2,
		modifiedV3,
		placeholderToDiff
	};
}
|
|
1466
|
+
/**
 * Content-based strategy for inputs whose tables do not line up 1:1.
 *
 * Tables are keyed by their whitespace-normalised HTML and paired
 * V1<->V2 and V2<->V3 with `lcsAlign`. Placeholders are then allocated
 * in a fixed sequence of passes (ids are handed out in this order):
 *   1. V2 tables paired in both V1 and V3 -> three-way table diff,
 *      placeholder placed in all three inputs
 *   2. V2 tables paired only with V3      -> whole table wrapped `ins` by cp
 *   3. V2 tables paired only with V1      -> whole table wrapped `del` by me
 *   4. remaining V2 tables                -> `del` by me, rejecting cp
 *   5. remaining V1 tables                -> `del` by cp
 *   6. remaining V3 tables                -> `ins` by me
 * A placeholder is substituted only into the inputs where its table
 * exists. Substitution walks each input from the last table to the
 * first so earlier offsets remain valid.
 *
 * @returns the three placeholder-substituted inputs plus a map from
 *   placeholder to replacement HTML.
 */
function preprocessMisalignedByContent(v1, v2, v3, t1s, t2s, t3s, cellDiff, placeholderPrefix) {
	const keysOf = (html, tables) => tables.map((t) => tableKey(html, t));
	const k1 = keysOf(v1, t1s);
	const k2 = keysOf(v2, t2s);
	const k3 = keysOf(v3, t3s);
	const align12 = lcsAlign(k1, k2);
	const align23 = lcsAlign(k2, k3);

	// For each "new"-side index, the paired "old"-side index (or -1), and vice versa.
	const pairingOf = (alignment, oldCount, newCount) => {
		const oldToNew = new Array(oldCount).fill(-1);
		const newToOld = new Array(newCount).fill(-1);
		for (const a of alignment) {
			if (a.oldIdx === null || a.newIdx === null) continue;
			oldToNew[a.oldIdx] = a.newIdx;
			newToOld[a.newIdx] = a.oldIdx;
		}
		return { oldToNew, newToOld };
	};
	const v2ToV1 = pairingOf(align12, t1s.length, t2s.length).newToOld;
	const v2ToV3 = pairingOf(align23, t2s.length, t3s.length).oldToNew;

	const placeholderToDiff = new Map();
	const placeholders = {
		v1: new Array(t1s.length).fill(null),
		v2: new Array(t2s.length).fill(null),
		v3: new Array(t3s.length).fill(null)
	};
	let nextId = 0;
	// Allocate the next placeholder and bind it to its replacement HTML.
	const register = (replacement) => {
		const placeholder = `${placeholderPrefix}${nextId++}-->`;
		placeholderToDiff.set(placeholder, replacement);
		return placeholder;
	};
	const rawTable = (html, table) => html.slice(table.tableStart, table.tableEnd);
	const wrapWhole = (tag, author, tableHtml, rejects) => Utils_default.wrapText(tableHtml, tag, `diff${tag}`, authorAttribution(author, rejects));

	// Pass 1: present in all three versions.
	t2s.forEach((t2, v2Idx) => {
		const v1Idx = v2ToV1[v2Idx];
		const v3Idx = v2ToV3[v2Idx];
		if (v1Idx === -1 || v3Idx === -1) return;
		const placeholder = register(diffTableThreeWay(v1, v2, v3, t1s[v1Idx], t2, t3s[v3Idx], cellDiff));
		placeholders.v1[v1Idx] = placeholder;
		placeholders.v2[v2Idx] = placeholder;
		placeholders.v3[v3Idx] = placeholder;
	});
	// Pass 2: V2 + V3 only (CP inserted, Me kept).
	t2s.forEach((t2, v2Idx) => {
		if (placeholders.v2[v2Idx] !== null) return;
		const v3Idx = v2ToV3[v2Idx];
		if (v3Idx === -1) return;
		const placeholder = register(wrapWhole("ins", "cp", rawTable(v2, t2)));
		placeholders.v2[v2Idx] = placeholder;
		placeholders.v3[v3Idx] = placeholder;
	});
	// Pass 3: V1 + V2 only (Me deleted).
	t2s.forEach((t2, v2Idx) => {
		if (placeholders.v2[v2Idx] !== null) return;
		const v1Idx = v2ToV1[v2Idx];
		if (v1Idx === -1) return;
		const placeholder = register(wrapWhole("del", "me", rawTable(v2, t2)));
		placeholders.v1[v1Idx] = placeholder;
		placeholders.v2[v2Idx] = placeholder;
	});
	// Pass 4: V2 only (CP inserted, Me rejected).
	t2s.forEach((t2, v2Idx) => {
		if (placeholders.v2[v2Idx] !== null) return;
		placeholders.v2[v2Idx] = register(wrapWhole("del", "me", rawTable(v2, t2), "cp"));
	});
	// Pass 5: V1 only (CP deleted).
	t1s.forEach((t1, v1Idx) => {
		if (placeholders.v1[v1Idx] !== null) return;
		placeholders.v1[v1Idx] = register(wrapWhole("del", "cp", rawTable(v1, t1)));
	});
	// Pass 6: V3 only (Me inserted).
	t3s.forEach((t3, v3Idx) => {
		if (placeholders.v3[v3Idx] !== null) return;
		placeholders.v3[v3Idx] = register(wrapWhole("ins", "me", rawTable(v3, t3)));
	});

	// Replace table spans back-to-front so untouched offsets stay valid.
	const substituteAll = (html, tables, assigned) => {
		let result = html;
		for (let i = tables.length - 1; i >= 0; i--) {
			const p = assigned[i];
			if (p === null) continue;
			result = spliceString(result, tables[i].tableStart, tables[i].tableEnd, p);
		}
		return result;
	};
	return {
		modifiedV1: substituteAll(v1, t1s, placeholders.v1),
		modifiedV2: substituteAll(v2, t2s, placeholders.v2),
		modifiedV3: substituteAll(v3, t3s, placeholders.v3),
		placeholderToDiff
	};
}
|
|
1183
1581
|
/**
 * Minimum text similarity for two same-position tables to be accepted
 * as "the same table". `positionallyAligned` rejects positional pairing
 * as soon as any adjacent pair (V1/V2 or V2/V3) scores below this value,
 * in which case pairing falls back to content-based alignment. The bar
 * is intentionally loose: edited-but-same tables are expected to score
 * well above 0.5 and unrelated tables well below it.
 */
const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.5;
|
|
1590
|
+
/**
 * Decide whether the tables of V1, V2 and V3 may be paired purely by
 * position.
 *
 * Requires all three lists to have the same length, and for every index
 * both the V1/V2 and the V2/V3 table keys to reach
 * `POSITIONAL_PAIR_SIMILARITY_THRESHOLD`. Returns `false` at the first
 * pair that falls short, `true` otherwise (including when there are no
 * tables at all).
 */
function positionallyAligned(v1, v2, v3, t1s, t2s, t3s) {
	const count = t1s.length;
	if (count !== t2s.length || t2s.length !== t3s.length) return false;
	for (let i = 0; i < count; i++) {
		const middleKey = tableKey(v2, t2s[i]);
		const firstKey = tableKey(v1, t1s[i]);
		const lastKey = tableKey(v3, t3s[i]);
		const pairs = [[firstKey, middleKey], [middleKey, lastKey]];
		for (const [left, right] of pairs) {
			if (textSimilarity(left, right) < POSITIONAL_PAIR_SIMILARITY_THRESHOLD) return false;
		}
	}
	return true;
}
|
|
1608
|
+
/**
 * Comparison key for a table: its source HTML with every whitespace run
 * collapsed to a single space and the ends trimmed.
 */
function tableKey(html, table) {
	const { tableStart, tableEnd } = table;
	const rawHtml = html.slice(tableStart, tableEnd);
	const collapsed = rawHtml.replace(/\s+/g, " ");
	return collapsed.trim();
}
|
|
1611
|
+
/**
 * Three-way diff of one table triple. Uses the cell-by-cell positional
 * diff when `sameDimensions` holds for both V1/V2 and V2/V3, and the
 * row-aligning structural diff otherwise.
 */
function diffTableThreeWay(v1, v2, v3, t1, t2, t3, cellDiff) {
	const shapeUnchanged = sameDimensions(t1, t2) && sameDimensions(t2, t3);
	const strategy = shapeUnchanged ? diffTablePositional : diffTableStructural;
	return strategy(v1, v2, v3, t1, t2, t3, cellDiff);
}
|
|
1615
|
+
/**
 * Positional three-way table diff for tables of identical dimensions.
 *
 * Re-emits V2's table markup verbatim, but replaces the content of each
 * cell with `cellDiff(v1Content, v2Content, v3Content)` for the cell at
 * the same row/column in all three versions.
 */
function diffTablePositional(v1, v2, v3, t1, t2, t3, cellDiff) {
	const pieces = [];
	// Offset in v2 up to which markup has already been copied out.
	let copiedUpTo = t2.tableStart;
	for (const [r, row2] of t2.rows.entries()) {
		const row1 = t1.rows[r];
		const row3 = t3.rows[r];
		for (const [c, cell2] of row2.cells.entries()) {
			const cell1 = row1.cells[c];
			const cell3 = row3.cells[c];
			pieces.push(v2.slice(copiedUpTo, cell2.contentStart));
			const before = v1.slice(cell1.contentStart, cell1.contentEnd);
			const middle = v2.slice(cell2.contentStart, cell2.contentEnd);
			const after = v3.slice(cell3.contentStart, cell3.contentEnd);
			pieces.push(cellDiff(before, middle, after));
			copiedUpTo = cell2.contentEnd;
		}
	}
	pieces.push(v2.slice(copiedUpTo, t2.tableEnd));
	return pieces.join("");
}
|
|
1634
|
+
/**
 * Structural three-way table diff, used when row or cell counts differ
 * between versions.
 *
 * Steps:
 *   1. Key every row with `rowKey` and align V1<->V2 and V2<->V3 with
 *      `lcsAlign`.
 *   2. For every V2 row record its origin (paired with a V1 row, else
 *      "cp-inserted") and its fate (paired with a V3 row, else
 *      "me-deleted").
 *   3. Collect V1 rows missing from V2 and V3 rows missing from V2,
 *      grouped by the V2 row boundary at which they belong.
 *   4. Emit V2's table header, then for each V2 row: the boundary rows
 *      (V1-only as `del` by cp, then V3-only as `ins` by me) followed by
 *      the V2 row itself via `emitV2Row`; finally the trailing boundary
 *      and V2's table footer.
 *
 * The two alignments are used independently; no attempt is made to
 * reconcile them when they disagree about a row's history.
 */
function diffTableStructural(v1, v2, v3, t1, t2, t3, cellDiff) {
	const rowKeysOf = (html, table) => table.rows.map((r) => rowKey(html, r));
	const v1Keys = rowKeysOf(v1, t1);
	const v2Keys = rowKeysOf(v2, t2);
	const v3Keys = rowKeysOf(v3, t3);
	const align1 = lcsAlign(v1Keys, v2Keys);
	const align2 = lcsAlign(v2Keys, v3Keys);
	const v2RowCount = t2.rows.length;

	const v2Origin = Array.from({ length: v2RowCount }, () => ({ kind: "cp-inserted" }));
	for (const a of align1) {
		if (a.newIdx === null || a.oldIdx === null) continue;
		v2Origin[a.newIdx] = {
			kind: "preserved",
			v1Idx: a.oldIdx
		};
	}
	const v2Fate = Array.from({ length: v2RowCount }, () => ({ kind: "me-deleted" }));
	for (const a of align2) {
		if (a.oldIdx === null || a.newIdx === null) continue;
		v2Fate[a.oldIdx] = {
			kind: "preserved",
			v3Idx: a.newIdx
		};
	}

	const cpDelRowsAt = collectCpDelRowsAtBoundary(align1, v2RowCount);
	const meInsRowsAt = collectMeInsRowsAtBoundary(align2, v2RowCount);
	const pieces = [tableHeaderSlice(v2, t2)];
	// Emit rows that exist only in V1 / only in V3 and belong just before V2 row `boundary`.
	const emitBoundary = (boundary) => {
		for (const v1RowIdx of cpDelRowsAt.get(boundary) ?? []) {
			pieces.push(emitFullRowAttributed(v1, t1.rows[v1RowIdx], "del", "cp"));
		}
		for (const v3RowIdx of meInsRowsAt.get(boundary) ?? []) {
			pieces.push(emitFullRowAttributed(v3, t3.rows[v3RowIdx], "ins", "me"));
		}
	};
	t2.rows.forEach((v2Row, r) => {
		emitBoundary(r);
		pieces.push(emitV2Row(v1, v2, v3, v2Row, t1, t3, v2Origin[r], v2Fate[r], cellDiff));
	});
	emitBoundary(v2RowCount);
	pieces.push(tableFooterSlice(v2, t2));
	return pieces.join("");
}
|
|
1694
|
+
/**
 * Emit one V2 row according to where it came from (`origin`) and what
 * happened to it (`fate`):
 *
 * - inserted by CP and deleted by Me -> whole row `del` by me, rejecting cp
 * - inserted by CP, kept             -> whole row `ins` by cp
 * - from V1, deleted by Me           -> whole row `del` by me
 * - present in all three versions    -> cell-by-cell diff when the three
 *   rows have the same number of cells; otherwise the V2 row is emitted
 *   as `del` by me followed by the V3 row as `ins` by me.
 */
function emitV2Row(v1, v2, v3, v2Row, t1, t3, origin, fate, cellDiff) {
	const insertedByCp = origin.kind === "cp-inserted";
	const deletedByMe = fate.kind === "me-deleted";
	if (insertedByCp) {
		return deletedByMe
			? emitFullRowAttributed(v2, v2Row, "del", "me", "cp")
			: emitFullRowAttributed(v2, v2Row, "ins", "cp");
	}
	if (deletedByMe) return emitFullRowAttributed(v2, v2Row, "del", "me");

	const v1Row = t1.rows[origin.v1Idx];
	const v3Row = t3.rows[fate.v3Idx];
	const width = v2Row.cells.length;
	const sameWidth = v1Row.cells.length === width && width === v3Row.cells.length;
	if (sameWidth) return diffRowPositional(v1, v2, v3, v1Row, v2Row, v3Row, cellDiff);
	// Cell counts differ: fall back to replacing the whole row.
	return [
		emitFullRowAttributed(v2, v2Row, "del", "me"),
		emitFullRowAttributed(v3, v3Row, "ins", "me")
	].join("");
}
|
|
1706
|
+
/**
 * Cell-by-cell three-way diff of a single row whose three versions have
 * the same number of cells.
 *
 * Copies V2's row markup verbatim and replaces each cell's content with
 * `cellDiff(v1Content, v2Content, v3Content)` for the same column.
 */
function diffRowPositional(v1, v2, v3, v1Row, v2Row, v3Row, cellDiff) {
	const contentOf = (html, cell) => html.slice(cell.contentStart, cell.contentEnd);
	const pieces = [];
	// Offset in v2 up to which markup has already been copied out.
	let copiedUpTo = v2Row.rowStart;
	v2Row.cells.forEach((cell2, c) => {
		const cell1 = v1Row.cells[c];
		const cell3 = v3Row.cells[c];
		pieces.push(v2.slice(copiedUpTo, cell2.contentStart));
		pieces.push(cellDiff(contentOf(v1, cell1), contentOf(v2, cell2), contentOf(v3, cell3)));
		copiedUpTo = cell2.contentEnd;
	});
	pieces.push(v2.slice(copiedUpTo, v2Row.rowEnd));
	return pieces.join("");
}
|
|
1720
|
+
/**
 * Group V1-only rows (alignment entries with `newIdx === null`) by the
 * V2 row boundary they precede.
 *
 * The alignment is walked from the end. Every entry that has a V2 index
 * closes the current group: rows gathered since the previous such entry
 * are filed under the boundary of that previous entry (`v2RowCount` for
 * the trailing group). Within a group, rows keep alignment order.
 *
 * @returns Map from V2 boundary index to the V1 row indices to emit there.
 */
function collectCpDelRowsAtBoundary(align, v2RowCount) {
	const rowsByBoundary = new Map();
	const deferred = [];
	let boundary = v2RowCount;
	// File the gathered rows (collected back-to-front) under the current boundary.
	const flush = () => {
		if (deferred.length === 0) return;
		const bucket = rowsByBoundary.get(boundary) ?? [];
		bucket.unshift(...deferred.reverse());
		rowsByBoundary.set(boundary, bucket);
		deferred.length = 0;
	};
	let i = align.length;
	while (i-- > 0) {
		const entry = align[i];
		if (entry.newIdx !== null) {
			flush();
			boundary = entry.newIdx;
		} else if (entry.oldIdx !== null) {
			deferred.push(entry.oldIdx);
		}
	}
	flush();
	return rowsByBoundary;
}
|
|
1743
|
+
/**
 * Group V3-only rows (alignment entries with `oldIdx === null`) by the
 * V2 row boundary they precede. In this alignment `oldIdx` is the V2
 * row index and `newIdx` the V3 row index.
 *
 * The alignment is walked from the end. Every entry that has a V2 index
 * closes the current group: rows gathered since the previous such entry
 * are filed under the boundary of that previous entry (`v2RowCount` for
 * the trailing group). Within a group, rows keep alignment order.
 *
 * @returns Map from V2 boundary index to the V3 row indices to emit there.
 */
function collectMeInsRowsAtBoundary(align, v2RowCount) {
	const rowsByBoundary = new Map();
	const deferred = [];
	let boundary = v2RowCount;
	// File the gathered rows (collected back-to-front) under the current boundary.
	const flush = () => {
		if (deferred.length === 0) return;
		const bucket = rowsByBoundary.get(boundary) ?? [];
		bucket.unshift(...deferred.reverse());
		rowsByBoundary.set(boundary, bucket);
		deferred.length = 0;
	};
	let i = align.length;
	while (i-- > 0) {
		const entry = align[i];
		if (entry.oldIdx !== null) {
			flush();
			boundary = entry.oldIdx;
		} else if (entry.newIdx !== null) {
			deferred.push(entry.newIdx);
		}
	}
	flush();
	return rowsByBoundary;
}
|
|
1766
|
+
/**
 * Everything in a table's source that precedes its first row.
 *
 * For a table with no rows this is the whole table span minus its last
 * 8 characters (the length of a closing `</table>` tag).
 */
function tableHeaderSlice(html, table) {
	const { tableStart, tableEnd, rows } = table;
	const firstRow = rows[0];
	const headerEnd = firstRow ? firstRow.rowStart : tableEnd - 8;
	return html.slice(tableStart, headerEnd);
}
|
|
1771
|
+
/**
 * Everything in a table's source that follows its last row, up to the
 * end of the table. A table with no rows yields the literal
 * `</table>`.
 */
function tableFooterSlice(html, table) {
	const lastRow = table.rows.at(-1);
	return lastRow ? html.slice(lastRow.rowEnd, table.tableEnd) : "</table>";
}
|
|
1776
|
+
/**
 * Emit a table row that is attributed, as a whole, to one author.
 *
 * The row's opening `<tr>` tag receives the author classes / data
 * attributes via `injectAuthorAttribution`, every cell is emitted via
 * `emitFullCellAttributed`, and all markup between and after the cells
 * is copied through unchanged.
 *
 * @param html           Source document the row offsets refer to.
 * @param row            Row range (`rowStart`, `rowEnd`, `cells`).
 * @param kind           `"ins"` or `"del"`.
 * @param author         Author the change is attributed to.
 * @param rejectsAuthor  Optional author whose insertion is being
 *                       rejected by this deletion.
 * @returns The attributed row HTML. If no opening tag can be parsed at
 *   `row.rowStart`, the row's source is returned unchanged so content
 *   is never dropped (mirrors `emitFullCellAttributed`).
 */
function emitFullRowAttributed(html, row, kind, author, rejectsAuthor) {
	const trOpening = parseOpeningTagAt(html, row.rowStart);
	// Unparseable opening tag: pass the row through verbatim rather than
	// emitting nothing, which would silently delete the row from the output.
	if (!trOpening) return html.slice(row.rowStart, row.rowEnd);
	const out = [injectAuthorAttribution(html.slice(row.rowStart, trOpening.end), kind, author, rejectsAuthor)];
	let cursor = trOpening.end;
	for (const cell of row.cells) {
		// Markup between the previous cell (or the <tr> tag) and this cell.
		out.push(html.slice(cursor, cell.cellStart));
		out.push(emitFullCellAttributed(html, cell, kind, author, rejectsAuthor));
		cursor = cell.cellEnd;
	}
	out.push(html.slice(cursor, row.rowEnd));
	return out.join("");
}
|
|
1796
|
+
/**
 * Emit a table cell that is attributed, as a whole, to one author.
 *
 * The cell's opening tag receives the author classes / data attributes,
 * and its content is wrapped in an `<ins>`/`<del>` element carrying the
 * same attribution. Content that is empty or whitespace-only is left
 * unwrapped. If no opening tag can be parsed at `cell.cellStart`, the
 * cell's source is returned unchanged.
 */
function emitFullCellAttributed(html, cell, kind, author, rejectsAuthor) {
	const { cellStart, contentStart, contentEnd, cellEnd } = cell;
	const tdOpening = parseOpeningTagAt(html, cellStart);
	if (!tdOpening) return html.slice(cellStart, cellEnd);
	const openingTag = injectAuthorAttribution(html.slice(cellStart, tdOpening.end), kind, author, rejectsAuthor);
	const body = html.slice(contentStart, contentEnd);
	const isBlank = body.trim().length === 0;
	const wrappedBody = isBlank
		? body
		: Utils_default.wrapText(body, kind, `diff${kind}`, authorAttribution(author, rejectsAuthor));
	const closingTag = html.slice(contentEnd, cellEnd);
	return `${openingTag}${wrappedBody}${closingTag}`;
}
|
|
1805
|
+
/**
 * Add author attribution to an opening tag taken from the source HTML
 * (for example a `<tr>` or `<td>`).
 *
 * The class `diff<kind>` plus the author classes from
 * `authorAttribution` are added through `injectClass`, then the
 * matching `data-*` attributes through `injectDataAttrs`, so tags
 * patched in place carry the same markers as freshly wrapped text.
 */
function injectAuthorAttribution(openingTag, kind, author, rejectsAuthor) {
	const { extraClasses, dataAttrs } = authorAttribution(author, rejectsAuthor);
	const withClasses = injectClass(openingTag, `diff${kind} ${extraClasses}`);
	return injectDataAttrs(withClasses, dataAttrs ?? {});
}
|
|
1815
|
+
/**
 * Append ` data-<key>='<value>'` attributes to an opening tag, one per
 * own enumerable key of `dataAttrs`, in key order.
 *
 * Self-closing tags (`.../>`) stay self-closing. With no keys the tag
 * is returned untouched. Values are inserted as-is (no escaping).
 */
function injectDataAttrs(openingTag, dataAttrs) {
	const entries = Object.entries(dataAttrs);
	if (entries.length === 0) return openingTag;
	const rendered = entries.map(([key, value]) => ` data-${key}='${value}'`).join("");
	const selfClosing = openingTag.endsWith("/>");
	const terminator = selfClosing ? "/>" : ">";
	// Strip the terminator ("/>" is two characters, ">" one) before re-adding it.
	const head = openingTag.slice(0, selfClosing ? -2 : -1);
	return `${head}${rendered}${terminator}`;
}
|
|
1206
1822
|
//#endregion
|
|
1207
1823
|
//#region src/WordSplitter.ts
|
|
@@ -1459,10 +2075,20 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1459
2075
|
* pathological input.
|
|
1460
2076
|
*/
|
|
1461
2077
|
static MaxTablePreprocessDepth = 8;
|
|
2078
|
+
/**
|
|
2079
|
+
* Mirror cap for the three-way path. The 2-way `MaxTablePreprocessDepth`
|
|
2080
|
+
* guards the recursion inside `executeWithContext`; the 3-way path has
|
|
2081
|
+
* its own recursion (`executeThreeWay` → `preprocessTablesThreeWay` →
|
|
2082
|
+
* `cellDiff` → `executeThreeWay`) which needs its own guard. Once the
|
|
2083
|
+
* cap is reached, `executeThreeWay` skips table preprocessing and
|
|
2084
|
+
* falls back to the word-level merge — same bail-out semantics as the
|
|
2085
|
+
* 2-way path.
|
|
2086
|
+
*/
|
|
2087
|
+
static MaxThreeWayDepth = 8;
|
|
1462
2088
|
content = [];
|
|
1463
2089
|
newText;
|
|
1464
2090
|
oldText;
|
|
1465
|
-
tablePreprocessDepth;
|
|
2091
|
+
tablePreprocessDepth = 0;
|
|
1466
2092
|
specialTagDiffStack = [];
|
|
1467
2093
|
newWords = [];
|
|
1468
2094
|
oldWords = [];
|
|
@@ -1525,17 +2151,153 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1525
2151
|
* Initializes a new instance of the class.
|
|
1526
2152
|
* @param oldText The old text.
|
|
1527
2153
|
* @param newText The new text.
|
|
1528
|
-
* @param tablePreprocessDepth Internal: nested-call depth for table
|
|
1529
|
-
* preprocessing. Callers should leave at default (0); the recursive
|
|
1530
|
-
* `diffCell` callback in TableDiff bumps it.
|
|
1531
2154
|
*/
|
|
1532
|
-
constructor(oldText, newText
|
|
2155
|
+
constructor(oldText, newText) {
|
|
1533
2156
|
this.oldText = oldText;
|
|
1534
2157
|
this.newText = newText;
|
|
1535
|
-
this.tablePreprocessDepth = tablePreprocessDepth;
|
|
1536
2158
|
}
|
|
1537
|
-
static execute(oldText, newText
|
|
1538
|
-
return new HtmlDiff(oldText, newText
|
|
2159
|
+
static execute(oldText, newText) {
|
|
2160
|
+
return new HtmlDiff(oldText, newText).build();
|
|
2161
|
+
}
|
|
2162
|
+
/**
|
|
2163
|
+
* Analyse a two-way diff and return its raw building blocks: the word
|
|
2164
|
+
* arrays the diff ran against, the operations produced, the original
|
|
2165
|
+
* (pre-projection) word arrays, and the mappings from diff-index back
|
|
2166
|
+
* to original-word index when structural projection is active.
|
|
2167
|
+
* Consumed by `executeThreeWay` so it can compose two diffs by walking
|
|
2168
|
+
* their Operation streams.
|
|
2169
|
+
*
|
|
2170
|
+
* The caller is expected to coordinate `useProjections` symmetrically
|
|
2171
|
+
* across composed analyses — if V1↔V2 projects but V2↔V3 doesn't,
|
|
2172
|
+
* V2's "new" array in the first analysis won't equal V2's "old" array
|
|
2173
|
+
* in the second. `evaluateProjectionApplicability` exposes the same
|
|
2174
|
+
* heuristic `build()` uses internally, so the orchestrator can compute
|
|
2175
|
+
* a single decision and pass it into every `analyze` call.
|
|
2176
|
+
*
|
|
2177
|
+
* Table preprocessing is skipped here. Placeholders mutate the input
|
|
2178
|
+
* in ways that don't compose across two independent analyses; the
|
|
2179
|
+
* 3-way orchestrator handles tables explicitly before calling analyze.
|
|
2180
|
+
*/
|
|
2181
|
+
static analyze(oldText, newText, options = {}) {
|
|
2182
|
+
const inner = new HtmlDiff(oldText, newText);
|
|
2183
|
+
inner.tablePreprocessDepth = HtmlDiff.MaxTablePreprocessDepth;
|
|
2184
|
+
if (options.blockExpressions) for (const expr of options.blockExpressions) inner.addBlockExpression(expr);
|
|
2185
|
+
if (options.repeatingWordsAccuracy !== void 0) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy;
|
|
2186
|
+
if (options.orphanMatchThreshold !== void 0) inner.orphanMatchThreshold = options.orphanMatchThreshold;
|
|
2187
|
+
if (options.ignoreWhitespaceDifferences !== void 0) inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences;
|
|
2188
|
+
inner.splitInputsToWords();
|
|
2189
|
+
if (options.useProjections === void 0) inner.buildContentProjections();
|
|
2190
|
+
else if (options.useProjections) {
|
|
2191
|
+
const oldProj = HtmlDiff.createContentProjection(inner.oldWords);
|
|
2192
|
+
const newProj = HtmlDiff.createContentProjection(inner.newWords);
|
|
2193
|
+
if (oldProj.contentWords.length > 0 && newProj.contentWords.length > 0) {
|
|
2194
|
+
inner.oldContentWords = oldProj.contentWords;
|
|
2195
|
+
inner.oldContentToOriginal = oldProj.contentToOriginal;
|
|
2196
|
+
inner.newContentWords = newProj.contentWords;
|
|
2197
|
+
inner.newContentToOriginal = newProj.contentToOriginal;
|
|
2198
|
+
}
|
|
2199
|
+
}
|
|
2200
|
+
const wordsForDiffOld = inner.oldContentWords ?? inner.oldWords;
|
|
2201
|
+
const wordsForDiffNew = inner.newContentWords ?? inner.newWords;
|
|
2202
|
+
inner.matchGranularity = Math.min(HtmlDiff.MatchGranularityMaximum, Math.min(wordsForDiffOld.length, wordsForDiffNew.length));
|
|
2203
|
+
return {
|
|
2204
|
+
oldDiffWords: wordsForDiffOld,
|
|
2205
|
+
newDiffWords: wordsForDiffNew,
|
|
2206
|
+
operations: inner.operations(),
|
|
2207
|
+
oldOriginalWords: inner.oldWords,
|
|
2208
|
+
newOriginalWords: inner.newWords,
|
|
2209
|
+
oldContentToOriginal: inner.oldContentToOriginal,
|
|
2210
|
+
newContentToOriginal: inner.newContentToOriginal
|
|
2211
|
+
};
|
|
2212
|
+
}
|
|
2213
|
+
/**
|
|
2214
|
+
* Whether content-projection (structural-tag normalisation) would
|
|
2215
|
+
* apply to this pair of inputs under `build()`'s default heuristic.
|
|
2216
|
+
* Exposed so composers of multiple analyses can compute a symmetric
|
|
2217
|
+
* decision before calling `analyze` — see `analyze`'s docstring for
|
|
2218
|
+
* why symmetry matters.
|
|
2219
|
+
*/
|
|
2220
|
+
static evaluateProjectionApplicability(oldText, newText) {
|
|
2221
|
+
const oldWords = WordSplitter.convertHtmlToListOfWords(oldText, []);
|
|
2222
|
+
const newWords = WordSplitter.convertHtmlToListOfWords(newText, []);
|
|
2223
|
+
if (!HtmlDiff.hasStructuralDifferences(oldWords, newWords)) return false;
|
|
2224
|
+
const oldProj = HtmlDiff.createContentProjection(oldWords);
|
|
2225
|
+
const newProj = HtmlDiff.createContentProjection(newWords);
|
|
2226
|
+
return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj);
|
|
2227
|
+
}
|
|
2228
|
+
/**
|
|
2229
|
+
* Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
|
|
2230
|
+
* version CP sent back), and V3 (Me's current draft), produces a
|
|
2231
|
+
* single attributed HTML output where CP's and Me's changes are
|
|
2232
|
+
* distinguished by `data-author` ('cp' or 'me') and matching
|
|
2233
|
+
* `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
|
|
2234
|
+
* CP's proposal" case (Me deleted text CP had inserted) gets a
|
|
2235
|
+
* dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
|
|
2236
|
+
*
|
|
2237
|
+
* Coordinates the symmetric-projection decision (D1) across both
|
|
2238
|
+
* internal `analyze` calls so V2 tokenises identically on each side
|
|
2239
|
+
* of the spine. When `useProjections` is left undefined, the decision
|
|
2240
|
+
* is the conjunction of both pair-wise heuristics — project iff both
|
|
2241
|
+
* pairs would project on their own. Pass an explicit boolean to
|
|
2242
|
+
* override.
|
|
2243
|
+
*/
|
|
2244
|
+
static executeThreeWay(v1, v2, v3, options = {}) {
|
|
2245
|
+
return HtmlDiff.executeThreeWayWithDepth(v1, v2, v3, options, 0);
|
|
2246
|
+
}
|
|
2247
|
+
static executeThreeWayWithDepth(v1, v2, v3, options, depth) {
|
|
2248
|
+
const tablePreprocess = depth < HtmlDiff.MaxThreeWayDepth ? preprocessTablesThreeWay(v1, v2, v3, (c1, c2, c3) => HtmlDiff.executeThreeWayWithDepth(c1, c2, c3, options, depth + 1)) : null;
|
|
2249
|
+
const inV1 = tablePreprocess?.modifiedV1 ?? v1;
|
|
2250
|
+
const inV2 = tablePreprocess?.modifiedV2 ?? v2;
|
|
2251
|
+
const inV3 = tablePreprocess?.modifiedV3 ?? v3;
|
|
2252
|
+
const analyzeOpts = {
|
|
2253
|
+
useProjections: options.useProjections ?? (HtmlDiff.evaluateProjectionApplicability(inV1, inV2) && HtmlDiff.evaluateProjectionApplicability(inV2, inV3)),
|
|
2254
|
+
blockExpressions: options.blockExpressions,
|
|
2255
|
+
repeatingWordsAccuracy: options.repeatingWordsAccuracy,
|
|
2256
|
+
orphanMatchThreshold: options.orphanMatchThreshold,
|
|
2257
|
+
ignoreWhitespaceDifferences: options.ignoreWhitespaceDifferences
|
|
2258
|
+
};
|
|
2259
|
+
const d1 = HtmlDiff.analyze(inV1, inV2, analyzeOpts);
|
|
2260
|
+
const d2 = HtmlDiff.analyze(inV2, inV3, analyzeOpts);
|
|
2261
|
+
if (d1.newDiffWords.length !== d2.oldDiffWords.length) throw new Error(`HtmlDiff.executeThreeWay: V2 tokenisation diverged across pair-wise analyses (${d1.newDiffWords.length} vs ${d2.oldDiffWords.length}). This indicates the symmetric-projection coordination has a bug.`);
|
|
2262
|
+
const segments = buildSegments(d1, d2);
|
|
2263
|
+
const merged = HtmlDiff.emitSegments(segments);
|
|
2264
|
+
return tablePreprocess ? restoreTablePlaceholders(merged, tablePreprocess.placeholderToDiff) : merged;
|
|
2265
|
+
}
|
|
2266
|
+
/**
|
|
2267
|
+
* Drives a fresh `HtmlDiff` instance through `insertTag` for ins/del
|
|
2268
|
+
* segments and pushes equal segments straight to its `content`
|
|
2269
|
+
* buffer. Reusing the instance keeps the formatting-tag stack
|
|
2270
|
+
* (`specialTagDiffStack`) coherent across segments — a `<strong>`
|
|
2271
|
+
* opened in one segment and closed in another stays balanced.
|
|
2272
|
+
*/
|
|
2273
|
+
static emitSegments(segments) {
|
|
2274
|
+
const emitter = new HtmlDiff("", "");
|
|
2275
|
+
for (const seg of segments) {
|
|
2276
|
+
if (seg.attr.kind === "equal") {
|
|
2277
|
+
emitter.content.push(seg.words.join(""));
|
|
2278
|
+
continue;
|
|
2279
|
+
}
|
|
2280
|
+
const { tag, baseClass, metadata } = segmentEmissionShape(seg.attr);
|
|
2281
|
+
emitter.insertTag(tag, baseClass, [...seg.words], metadata);
|
|
2282
|
+
}
|
|
2283
|
+
if (emitter.specialTagDiffStack.length > 0) throw new Error(`HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} unclosed formatting tag(s) on the stack — input may have unbalanced <strong>/<em>/etc. or there is a bug in segment emission.`);
|
|
2284
|
+
return emitter.content.join("");
|
|
2285
|
+
}
|
|
2286
|
+
/**
|
|
2287
|
+
* Internal entry point used by the table-cell recursion. Constructs an
|
|
2288
|
+
* inner `HtmlDiff`, applies the caller's settings, and bumps the
|
|
2289
|
+
* recursion depth — keeping the public constructor signature clean
|
|
2290
|
+
* while still threading the configuration that's required for cell-
|
|
2291
|
+
* level output to match the top-level call's behaviour.
|
|
2292
|
+
*/
|
|
2293
|
+
static executeWithContext(oldText, newText, ctx) {
|
|
2294
|
+
const inner = new HtmlDiff(oldText, newText);
|
|
2295
|
+
inner.tablePreprocessDepth = ctx.depth;
|
|
2296
|
+
for (const expr of ctx.blockExpressions) inner.addBlockExpression(expr);
|
|
2297
|
+
inner.repeatingWordsAccuracy = ctx.repeatingWordsAccuracy;
|
|
2298
|
+
inner.orphanMatchThreshold = ctx.orphanMatchThreshold;
|
|
2299
|
+
inner.ignoreWhitespaceDifferences = ctx.ignoreWhitespaceDifferences;
|
|
2300
|
+
return inner.build();
|
|
1539
2301
|
}
|
|
1540
2302
|
/**
|
|
1541
2303
|
* Builds the HTML diff output
|
|
@@ -1543,18 +2305,17 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1543
2305
|
*/
|
|
1544
2306
|
build() {
|
|
1545
2307
|
if (this.oldText === this.newText) return this.newText;
|
|
1546
|
-
|
|
1547
|
-
|
|
1548
|
-
|
|
1549
|
-
|
|
1550
|
-
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
});
|
|
2308
|
+
let tablePreprocess = null;
|
|
2309
|
+
if (this.tablePreprocessDepth < HtmlDiff.MaxTablePreprocessDepth) {
|
|
2310
|
+
const ctx = {
|
|
2311
|
+
depth: this.tablePreprocessDepth + 1,
|
|
2312
|
+
blockExpressions: this.blockExpressions,
|
|
2313
|
+
repeatingWordsAccuracy: this.repeatingWordsAccuracy,
|
|
2314
|
+
orphanMatchThreshold: this.orphanMatchThreshold,
|
|
2315
|
+
ignoreWhitespaceDifferences: this.ignoreWhitespaceDifferences
|
|
2316
|
+
};
|
|
2317
|
+
tablePreprocess = preprocessTables(this.oldText, this.newText, (oldCell, newCell) => HtmlDiff.executeWithContext(oldCell, newCell, ctx));
|
|
2318
|
+
}
|
|
1558
2319
|
if (tablePreprocess) {
|
|
1559
2320
|
this.oldText = tablePreprocess.modifiedOld;
|
|
1560
2321
|
this.newText = tablePreprocess.modifiedNew;
|
|
@@ -1764,12 +2525,12 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1764
2525
|
* @param words
|
|
1765
2526
|
* @private
|
|
1766
2527
|
*/
|
|
1767
|
-
insertTag(tag, cssClass, words) {
|
|
2528
|
+
insertTag(tag, cssClass, words, metadata) {
|
|
1768
2529
|
while (true) {
|
|
1769
2530
|
if (words.length === 0) break;
|
|
1770
2531
|
const allWordsUntilFirstTag = this.extractConsecutiveWords(words, (x) => !Utils_default.isTag(x));
|
|
1771
2532
|
if (allWordsUntilFirstTag.length > 0) {
|
|
1772
|
-
const text = Utils_default.wrapText(allWordsUntilFirstTag.join(""), tag, cssClass);
|
|
2533
|
+
const text = Utils_default.wrapText(allWordsUntilFirstTag.join(""), tag, cssClass, metadata);
|
|
1773
2534
|
this.content.push(text);
|
|
1774
2535
|
}
|
|
1775
2536
|
if (words.length === 0) break;
|
|
@@ -1782,7 +2543,7 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1782
2543
|
for (const word of words) if (Utils_default.isTag(word)) tagNames.add(Utils_default.getTagName(word));
|
|
1783
2544
|
const styledTagNames = Array.from(tagNames).join(" ");
|
|
1784
2545
|
this.specialTagDiffStack.push(words[0]);
|
|
1785
|
-
specialCaseTagInjection = `<ins
|
|
2546
|
+
specialCaseTagInjection = `<ins${Utils_default.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`;
|
|
1786
2547
|
if (tag === HtmlDiff.DelTag) {
|
|
1787
2548
|
words.shift();
|
|
1788
2549
|
while (words.length > 0 && HtmlDiff.SpecialCaseOpeningTagRegex.test(words[0])) words.shift();
|
|
@@ -1808,7 +2569,7 @@ var HtmlDiff = class HtmlDiff {
|
|
|
1808
2569
|
if (specialCaseTagInjectionIsBefore) this.content.push(specialCaseTagInjection + this.extractConsecutiveWords(words, isTagForExtraction).join(""));
|
|
1809
2570
|
else this.content.push(this.extractConsecutiveWords(words, isTagForExtraction).join("") + specialCaseTagInjection);
|
|
1810
2571
|
if (words.length === 0) continue;
|
|
1811
|
-
this.insertTag(tag, cssClass, words);
|
|
2572
|
+
this.insertTag(tag, cssClass, words, metadata);
|
|
1812
2573
|
break;
|
|
1813
2574
|
}
|
|
1814
2575
|
}
|