@createiq/htmldiff 1.0.4 → 1.0.5-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/dist/HtmlDiff.cjs +881 -46
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +55 -19
- package/dist/HtmlDiff.d.mts +55 -19
- package/dist/HtmlDiff.mjs +881 -46
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +8 -8
- package/src/HtmlDiff.ts +156 -69
- package/src/TableDiff.ts +1196 -0
- package/test/HtmlDiff.spec.ts +119 -1
- package/test/HtmlDiff.tables.spec.ts +1419 -0
- package/test/TableDiff.bench.ts +244 -0
package/src/HtmlDiff.ts
CHANGED
|
@@ -2,6 +2,7 @@ import Action from './Action'
|
|
|
2
2
|
import Match from './Match'
|
|
3
3
|
import MatchFinder from './MatchFinder'
|
|
4
4
|
import Operation from './Operation'
|
|
5
|
+
import { preprocessTables, restoreTablePlaceholders } from './TableDiff'
|
|
5
6
|
import Utils from './Utils'
|
|
6
7
|
import WordSplitter from './WordSplitter'
|
|
7
8
|
|
|
@@ -64,9 +65,21 @@ export default class HtmlDiff {
|
|
|
64
65
|
'span',
|
|
65
66
|
])
|
|
66
67
|
|
|
68
|
+
/**
|
|
69
|
+
* Hard cap on nested `HtmlDiff.execute` calls (table preprocessing
|
|
70
|
+
* recurses through `diffCell` for cell content). Each level allocates
|
|
71
|
+
* fresh DP matrices and word arrays; without a guard a maliciously
|
|
72
|
+
* nested table-in-cell-in-table-in-cell input could blow stack and
|
|
73
|
+
* memory. Set high enough to comfortably handle real legal documents
|
|
74
|
+
* (tables nested 2-3 deep at most), low enough to short-circuit
|
|
75
|
+
* pathological input.
|
|
76
|
+
*/
|
|
77
|
+
private static MaxTablePreprocessDepth = 8
|
|
78
|
+
|
|
67
79
|
private content: string[] = []
|
|
68
80
|
private newText: string
|
|
69
81
|
private oldText: string
|
|
82
|
+
private readonly tablePreprocessDepth: number
|
|
70
83
|
|
|
71
84
|
private specialTagDiffStack: string[] = []
|
|
72
85
|
private newWords: string[] = []
|
|
@@ -80,8 +93,17 @@ export default class HtmlDiff {
|
|
|
80
93
|
/** Maps content-word index → original word index */
|
|
81
94
|
private oldContentToOriginal: number[] | null = null
|
|
82
95
|
private newContentToOriginal: number[] | null = null
|
|
83
|
-
/**
|
|
96
|
+
/**
|
|
97
|
+
* Tracks the next unwritten word index in oldWords/newWords. Mutated only by
|
|
98
|
+
* {@link sliceOriginalWordsForOp} (each op reads a slice and advances its cursor).
|
|
99
|
+
* Advances monotonically. Used so:
|
|
100
|
+
* - subsequent equal/delete ops know where in old to resume from
|
|
101
|
+
* - subsequent insert ops know where in new to resume from
|
|
102
|
+
* The two cursors are independent: equal/delete output from old and advance the old
|
|
103
|
+
* cursor; insert outputs from new and advances the new cursor.
|
|
104
|
+
*/
|
|
84
105
|
private lastOriginalOldOutputIndex = 0
|
|
106
|
+
private lastOriginalNewOutputIndex = 0
|
|
85
107
|
private matchGranularity = 0
|
|
86
108
|
private blockExpressions: RegExp[] = []
|
|
87
109
|
|
|
@@ -125,14 +147,18 @@ export default class HtmlDiff {
|
|
|
125
147
|
* Initializes a new instance of the class.
|
|
126
148
|
* @param oldText The old text.
|
|
127
149
|
* @param newText The new text.
|
|
150
|
+
* @param tablePreprocessDepth Internal: nested-call depth for table
|
|
151
|
+
* preprocessing. Callers should leave at default (0); the recursive
|
|
152
|
+
* `diffCell` callback in TableDiff bumps it.
|
|
128
153
|
*/
|
|
129
|
-
constructor(oldText: string, newText: string) {
|
|
154
|
+
constructor(oldText: string, newText: string, tablePreprocessDepth = 0) {
|
|
130
155
|
this.oldText = oldText
|
|
131
156
|
this.newText = newText
|
|
157
|
+
this.tablePreprocessDepth = tablePreprocessDepth
|
|
132
158
|
}
|
|
133
159
|
|
|
134
|
-
static execute(oldText: string, newText: string) {
|
|
135
|
-
return new HtmlDiff(oldText, newText).build()
|
|
160
|
+
static execute(oldText: string, newText: string, tablePreprocessDepth = 0) {
|
|
161
|
+
return new HtmlDiff(oldText, newText, tablePreprocessDepth).build()
|
|
136
162
|
}
|
|
137
163
|
|
|
138
164
|
/**
|
|
@@ -145,6 +171,34 @@ export default class HtmlDiff {
|
|
|
145
171
|
return this.newText
|
|
146
172
|
}
|
|
147
173
|
|
|
174
|
+
// Table preprocessing: when both sides have matching `<table>` structures,
|
|
175
|
+
// diff cells positionally so cross-cell content shifts produce one
|
|
176
|
+
// independent del/ins per cell rather than cell-misaligned output.
|
|
177
|
+
// Recursion guarded by MaxTablePreprocessDepth to bound work on
|
|
178
|
+
// deeply-nested table-in-cell-in-table inputs. Caller-configured
|
|
179
|
+
// settings (block expressions, accuracy thresholds) are propagated to
|
|
180
|
+
// the recursive cell diff so cell-level output is consistent with the
|
|
181
|
+
// top-level configuration.
|
|
182
|
+
const blockExpressions = this.blockExpressions
|
|
183
|
+
const repeatingWordsAccuracy = this.repeatingWordsAccuracy
|
|
184
|
+
const orphanMatchThreshold = this.orphanMatchThreshold
|
|
185
|
+
const ignoreWhitespaceDifferences = this.ignoreWhitespaceDifferences
|
|
186
|
+
const tablePreprocess =
|
|
187
|
+
this.tablePreprocessDepth >= HtmlDiff.MaxTablePreprocessDepth
|
|
188
|
+
? null
|
|
189
|
+
: preprocessTables(this.oldText, this.newText, (oldCell, newCell) => {
|
|
190
|
+
const inner = new HtmlDiff(oldCell, newCell, this.tablePreprocessDepth + 1)
|
|
191
|
+
for (const expr of blockExpressions) inner.addBlockExpression(expr)
|
|
192
|
+
inner.repeatingWordsAccuracy = repeatingWordsAccuracy
|
|
193
|
+
inner.orphanMatchThreshold = orphanMatchThreshold
|
|
194
|
+
inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences
|
|
195
|
+
return inner.build()
|
|
196
|
+
})
|
|
197
|
+
if (tablePreprocess) {
|
|
198
|
+
this.oldText = tablePreprocess.modifiedOld
|
|
199
|
+
this.newText = tablePreprocess.modifiedNew
|
|
200
|
+
}
|
|
201
|
+
|
|
148
202
|
this.splitInputsToWords()
|
|
149
203
|
this.buildContentProjections()
|
|
150
204
|
|
|
@@ -161,7 +215,8 @@ export default class HtmlDiff {
|
|
|
161
215
|
this.performOperation(op)
|
|
162
216
|
}
|
|
163
217
|
|
|
164
|
-
|
|
218
|
+
const result = this.content.join('')
|
|
219
|
+
return tablePreprocess ? restoreTablePlaceholders(result, tablePreprocess.placeholderToDiff) : result
|
|
165
220
|
}
|
|
166
221
|
|
|
167
222
|
/**
|
|
@@ -185,23 +240,18 @@ export default class HtmlDiff {
|
|
|
185
240
|
}
|
|
186
241
|
|
|
187
242
|
/**
|
|
188
|
-
*
|
|
189
|
-
*
|
|
190
|
-
*
|
|
243
|
+
* Builds "content projections" — word arrays with structural wrapper tags stripped — when
|
|
244
|
+
* structural normalization is appropriate for these inputs. The diff algorithm operates on
|
|
245
|
+
* the projections so wrapper-tag differences (e.g. `<p>` vs `<div>`) don't appear as content
|
|
246
|
+
* changes; structural tags are then folded back in at output time.
|
|
191
247
|
*/
|
|
192
248
|
private buildContentProjections() {
|
|
193
|
-
|
|
194
|
-
// If structural tags are the same, the normal diff works fine and is simpler.
|
|
195
|
-
if (!HtmlDiff.hasStructuralDifferences(this.oldWords, this.newWords)) {
|
|
196
|
-
return
|
|
197
|
-
}
|
|
249
|
+
if (!HtmlDiff.hasStructuralDifferences(this.oldWords, this.newWords)) return
|
|
198
250
|
|
|
199
251
|
const oldProjection = HtmlDiff.createContentProjection(this.oldWords)
|
|
200
252
|
const newProjection = HtmlDiff.createContentProjection(this.newWords)
|
|
201
253
|
|
|
202
|
-
|
|
203
|
-
// that's a genuine addition/deletion, not a re-wrapping scenario.
|
|
204
|
-
if (oldProjection.contentWords.length === 0 || newProjection.contentWords.length === 0) {
|
|
254
|
+
if (!HtmlDiff.shouldUseContentProjections(this.oldWords, this.newWords, oldProjection, newProjection)) {
|
|
205
255
|
return
|
|
206
256
|
}
|
|
207
257
|
|
|
@@ -211,6 +261,32 @@ export default class HtmlDiff {
|
|
|
211
261
|
this.newContentToOriginal = newProjection.contentToOriginal
|
|
212
262
|
}
|
|
213
263
|
|
|
264
|
+
/**
|
|
265
|
+
* Decides whether structural normalization should be activated for this pair of inputs.
|
|
266
|
+
* Each clause is a distinct correctness or fitness check — extend by adding a named
|
|
267
|
+
* sub-predicate rather than chaining ad-hoc conditions.
|
|
268
|
+
*/
|
|
269
|
+
private static shouldUseContentProjections(
|
|
270
|
+
oldWords: string[],
|
|
271
|
+
newWords: string[],
|
|
272
|
+
oldProjection: { contentWords: string[]; contentToOriginal: number[] },
|
|
273
|
+
newProjection: { contentWords: string[]; contentToOriginal: number[] }
|
|
274
|
+
): boolean {
|
|
275
|
+
// One side has no content at all: that's a genuine addition/deletion, not a wrapper rename.
|
|
276
|
+
// Normalization would mis-attribute the wrappers as part of the diff.
|
|
277
|
+
if (oldProjection.contentWords.length === 0 || newProjection.contentWords.length === 0) return false
|
|
278
|
+
|
|
279
|
+
// Asymmetric structural state: one side has no structural wrappers at all (e.g. plain text
|
|
280
|
+
// vs. wrapped HTML). Normalization would force the equal output to use the unwrapped side's
|
|
281
|
+
// (missing) structure and emit dangling closing tags from the wrapped side. The plain
|
|
282
|
+
// word-level diff handles this correctly without normalization.
|
|
283
|
+
const oldHasStructuralTags = oldProjection.contentWords.length < oldWords.length
|
|
284
|
+
const newHasStructuralTags = newProjection.contentWords.length < newWords.length
|
|
285
|
+
if (oldHasStructuralTags !== newHasStructuralTags) return false
|
|
286
|
+
|
|
287
|
+
return true
|
|
288
|
+
}
|
|
289
|
+
|
|
214
290
|
/**
|
|
215
291
|
* Tags that commonly serve as content wrappers and may change structurally
|
|
216
292
|
* without affecting the actual content. Only these tags are stripped during
|
|
@@ -224,6 +300,11 @@ export default class HtmlDiff {
|
|
|
224
300
|
return HtmlDiff.WrapperTags.has(tagName)
|
|
225
301
|
}
|
|
226
302
|
|
|
303
|
+
/** True when the word is a structural opening tag (e.g. `<p>`, `<div>`). */
|
|
304
|
+
private static isOpeningStructuralTag(word: string): boolean {
|
|
305
|
+
return HtmlDiff.isStructuralTag(word) && !word.startsWith('</')
|
|
306
|
+
}
|
|
307
|
+
|
|
227
308
|
/**
|
|
228
309
|
* Returns true if words between structural tags are just whitespace (indentation).
|
|
229
310
|
*/
|
|
@@ -303,84 +384,90 @@ export default class HtmlDiff {
|
|
|
303
384
|
}
|
|
304
385
|
|
|
305
386
|
private processInsertOperation(operation: Operation, cssClass: string) {
|
|
306
|
-
const words = this.
|
|
307
|
-
? this.
|
|
387
|
+
const words = this.usingContentProjections()
|
|
388
|
+
? this.sliceOriginalWordsForOp('new', operation.startInNew, operation.endInNew)
|
|
308
389
|
: this.newWords.slice(operation.startInNew, operation.endInNew)
|
|
309
390
|
this.insertTag(HtmlDiff.InsTag, cssClass, words)
|
|
310
391
|
}
|
|
311
392
|
|
|
312
393
|
private processDeleteOperation(operation: Operation, cssClass: string) {
|
|
313
|
-
const words = this.
|
|
314
|
-
? this.
|
|
394
|
+
const words = this.usingContentProjections()
|
|
395
|
+
? this.sliceOriginalWordsForOp('old', operation.startInOld, operation.endInOld)
|
|
315
396
|
: this.oldWords.slice(operation.startInOld, operation.endInOld)
|
|
316
397
|
this.insertTag(HtmlDiff.DelTag, cssClass, words)
|
|
317
|
-
|
|
318
|
-
// Advance the tracking index past the deleted range so subsequent equal operations
|
|
319
|
-
// don't re-include structural tags from the deleted section.
|
|
320
|
-
if (this.oldContentToOriginal && operation.endInOld > 0) {
|
|
321
|
-
const lastDeletedOrigIdx = this.oldContentToOriginal[operation.endInOld - 1]
|
|
322
|
-
this.lastOriginalOldOutputIndex = Math.max(this.lastOriginalOldOutputIndex, lastDeletedOrigIdx + 1)
|
|
323
|
-
}
|
|
324
398
|
}
|
|
325
399
|
|
|
326
400
|
private processEqualOperation(operation: Operation) {
|
|
327
|
-
if (this.
|
|
328
|
-
//
|
|
329
|
-
const result = this.
|
|
401
|
+
if (this.usingContentProjections()) {
|
|
402
|
+
// Output from old to preserve old's HTML structure for the matched content.
|
|
403
|
+
const result = this.sliceOriginalWordsForOp('old', operation.startInOld, operation.endInOld)
|
|
330
404
|
this.content.push(result.join(''))
|
|
405
|
+
|
|
406
|
+
// Advance new-side tracking past the equivalent range in new so the next insert op
|
|
407
|
+
// resumes from the correct position. We compute new's range with the same rule used
|
|
408
|
+
// for old (rather than mirroring old's count) so the two sides are independently sound
|
|
409
|
+
// when their structural tags don't perfectly parallel each other.
|
|
410
|
+
this.sliceOriginalWordsForOp('new', operation.startInNew, operation.endInNew)
|
|
331
411
|
} else {
|
|
332
412
|
const result = this.newWords.slice(operation.startInNew, operation.endInNew)
|
|
333
413
|
this.content.push(result.join(''))
|
|
334
414
|
}
|
|
335
415
|
}
|
|
336
416
|
|
|
337
|
-
/**
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
*/
|
|
341
|
-
private getOriginalOldWords(contentStart: number, contentEnd: number): string[] {
|
|
342
|
-
if (!this.oldContentToOriginal) return this.oldWords.slice(contentStart, contentEnd)
|
|
343
|
-
const result: string[] = []
|
|
344
|
-
for (let i = contentStart; i < contentEnd; i++) {
|
|
345
|
-
result.push(this.oldWords[this.oldContentToOriginal[i]])
|
|
346
|
-
}
|
|
347
|
-
return result
|
|
417
|
+
/** True when content projections are active for both sides — i.e. structural normalization is in effect. */
|
|
418
|
+
private usingContentProjections(): boolean {
|
|
419
|
+
return this.oldContentToOriginal !== null && this.newContentToOriginal !== null
|
|
348
420
|
}
|
|
349
421
|
|
|
350
422
|
/**
|
|
351
|
-
*
|
|
352
|
-
*
|
|
423
|
+
* Returns the slice of original (old or new) words covering a content-index range,
|
|
424
|
+
* including the structural tags that surround the content. Advances the side's cursor
|
|
425
|
+
* past the slice so the next op resumes correctly.
|
|
426
|
+
*
|
|
427
|
+
* The slice extends:
|
|
428
|
+
* - LEADING: from the side's cursor (or the first content word's original index,
|
|
429
|
+
* whichever is smaller) so structural tags that precede the first content word
|
|
430
|
+
* are picked up by this op rather than left orphaned.
|
|
431
|
+
* - TRAILING (non-last range): from just after the last content word, including
|
|
432
|
+
* closing structural tags that close *this* op's paragraphs, but stopping at
|
|
433
|
+
* the first opening structural tag — that opening tag belongs to the next
|
|
434
|
+
* op's paragraph and would otherwise be emitted twice.
|
|
435
|
+
* - TRAILING (last range): all the way to the end of words, since there is no next
|
|
436
|
+
* op to claim the trailing tags.
|
|
353
437
|
*/
|
|
354
|
-
private
|
|
355
|
-
|
|
356
|
-
const
|
|
357
|
-
for (let i = contentStart; i < contentEnd; i++) {
|
|
358
|
-
result.push(this.newWords[this.newContentToOriginal[i]])
|
|
359
|
-
}
|
|
360
|
-
return result
|
|
361
|
-
}
|
|
438
|
+
private sliceOriginalWordsForOp(side: 'old' | 'new', contentStart: number, contentEnd: number): string[] {
|
|
439
|
+
const words = side === 'old' ? this.oldWords : this.newWords
|
|
440
|
+
const contentToOriginal = side === 'old' ? this.oldContentToOriginal : this.newContentToOriginal
|
|
362
441
|
|
|
363
|
-
|
|
364
|
-
* Gets original old words for a content-index range, INCLUDING structural tags and whitespace
|
|
365
|
-
* between the content words (used for equal operations to preserve old HTML structure).
|
|
366
|
-
*/
|
|
367
|
-
private getOriginalOldWordsWithStructure(contentStart: number, contentEnd: number): string[] {
|
|
368
|
-
if (!this.oldContentToOriginal) return this.oldWords.slice(contentStart, contentEnd)
|
|
442
|
+
if (!contentToOriginal) return words.slice(contentStart, contentEnd)
|
|
369
443
|
if (contentStart >= contentEnd) return []
|
|
370
444
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
const
|
|
375
|
-
|
|
445
|
+
const firstContentOrigIdx = contentToOriginal[contentStart]
|
|
446
|
+
const lastContentOrigIdx = contentToOriginal[contentEnd - 1]
|
|
447
|
+
const cursor = side === 'old' ? this.lastOriginalOldOutputIndex : this.lastOriginalNewOutputIndex
|
|
448
|
+
const origStart = Math.min(cursor, firstContentOrigIdx)
|
|
449
|
+
|
|
450
|
+
let origEnd: number
|
|
451
|
+
if (contentEnd < contentToOriginal.length) {
|
|
452
|
+
// Non-last range: walk trailing tags after the last content word, stopping at the
|
|
453
|
+
// first opening structural tag so it can be emitted by the next op.
|
|
454
|
+
const limit = contentToOriginal[contentEnd]
|
|
455
|
+
origEnd = lastContentOrigIdx + 1
|
|
456
|
+
while (origEnd < limit && !HtmlDiff.isOpeningStructuralTag(words[origEnd])) {
|
|
457
|
+
origEnd++
|
|
458
|
+
}
|
|
459
|
+
} else {
|
|
460
|
+
// Last range: include everything to the end.
|
|
461
|
+
origEnd = words.length
|
|
462
|
+
}
|
|
376
463
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
464
|
+
if (side === 'old') {
|
|
465
|
+
this.lastOriginalOldOutputIndex = origEnd
|
|
466
|
+
} else {
|
|
467
|
+
this.lastOriginalNewOutputIndex = origEnd
|
|
468
|
+
}
|
|
381
469
|
|
|
382
|
-
|
|
383
|
-
return this.oldWords.slice(origStart, origEnd)
|
|
470
|
+
return words.slice(origStart, origEnd)
|
|
384
471
|
}
|
|
385
472
|
|
|
386
473
|
/**
|
|
@@ -476,7 +563,7 @@ export default class HtmlDiff {
|
|
|
476
563
|
const openingAndClosingTagsMatch =
|
|
477
564
|
!!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
|
|
478
565
|
|
|
479
|
-
if (
|
|
566
|
+
if (openingTag && openingAndClosingTagsMatch) {
|
|
480
567
|
specialCaseTagInjection = '</ins>'
|
|
481
568
|
specialCaseTagInjectionIsBefore = true
|
|
482
569
|
}
|