@createiq/htmldiff 1.0.4 → 1.0.5-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/HtmlDiff.ts CHANGED
@@ -2,6 +2,7 @@ import Action from './Action'
2
2
  import Match from './Match'
3
3
  import MatchFinder from './MatchFinder'
4
4
  import Operation from './Operation'
5
+ import { preprocessTables, restoreTablePlaceholders } from './TableDiff'
5
6
  import Utils from './Utils'
6
7
  import WordSplitter from './WordSplitter'
7
8
 
@@ -64,9 +65,21 @@ export default class HtmlDiff {
64
65
  'span',
65
66
  ])
66
67
 
68
+ /**
69
+ * Hard cap on nested `HtmlDiff.execute` calls (table preprocessing
70
+ * recurses through `diffCell` for cell content). Each level allocates
71
+ * fresh DP matrices and word arrays; without a guard a maliciously
72
+ * nested table-in-cell-in-table-in-cell input could blow stack and
73
+ * memory. Set high enough to comfortably handle real legal documents
74
+ * (tables nested 2-3 deep at most), low enough to short-circuit
75
+ * pathological input.
76
+ */
77
+ private static MaxTablePreprocessDepth = 8
78
+
67
79
  private content: string[] = []
68
80
  private newText: string
69
81
  private oldText: string
82
+ private readonly tablePreprocessDepth: number
70
83
 
71
84
  private specialTagDiffStack: string[] = []
72
85
  private newWords: string[] = []
@@ -80,8 +93,17 @@ export default class HtmlDiff {
80
93
  /** Maps content-word index → original word index */
81
94
  private oldContentToOriginal: number[] | null = null
82
95
  private newContentToOriginal: number[] | null = null
83
- /** Tracks the last original old word index output, so equal operations can include leading structural tags */
96
+ /**
97
+ * Tracks the next unwritten word index in oldWords/newWords. Mutated only by
98
+ * {@link sliceOriginalWordsForOp} (each op reads a slice and advances its cursor).
99
+ * Advances monotonically. Used so:
100
+ * - subsequent equal/delete ops know where in old to resume from
101
+ * - subsequent insert ops know where in new to resume from
102
+ * The two cursors are independent: equal/delete output from old and advance the old
103
+ * cursor; insert outputs from new and advances the new cursor.
104
+ */
84
105
  private lastOriginalOldOutputIndex = 0
106
+ private lastOriginalNewOutputIndex = 0
85
107
  private matchGranularity = 0
86
108
  private blockExpressions: RegExp[] = []
87
109
 
@@ -125,14 +147,18 @@ export default class HtmlDiff {
125
147
  * Initializes a new instance of the class.
126
148
  * @param oldText The old text.
127
149
  * @param newText The new text.
150
+ * @param tablePreprocessDepth Internal: nested-call depth for table
151
+ * preprocessing. Callers should leave it at the default (0); the recursive
152
+ * `diffCell` callback in TableDiff bumps it.
128
153
  */
129
- constructor(oldText: string, newText: string) {
154
+ constructor(oldText: string, newText: string, tablePreprocessDepth = 0) {
130
155
  this.oldText = oldText
131
156
  this.newText = newText
157
+ this.tablePreprocessDepth = tablePreprocessDepth
132
158
  }
133
159
 
134
- static execute(oldText: string, newText: string) {
135
- return new HtmlDiff(oldText, newText).build()
160
+ static execute(oldText: string, newText: string, tablePreprocessDepth = 0) {
161
+ return new HtmlDiff(oldText, newText, tablePreprocessDepth).build()
136
162
  }
137
163
 
138
164
  /**
@@ -145,6 +171,34 @@ export default class HtmlDiff {
145
171
  return this.newText
146
172
  }
147
173
 
174
+ // Table preprocessing: when both sides have matching `<table>` structures,
175
+ // diff cells positionally so cross-cell content shifts produce one
176
+ // independent del/ins per cell rather than cell-misaligned output.
177
+ // Recursion guarded by MaxTablePreprocessDepth to bound work on
178
+ // deeply-nested table-in-cell-in-table inputs. Caller-configured
179
+ // settings (block expressions, accuracy thresholds) are propagated to
180
+ // the recursive cell diff so cell-level output is consistent with the
181
+ // top-level configuration.
182
+ const blockExpressions = this.blockExpressions
183
+ const repeatingWordsAccuracy = this.repeatingWordsAccuracy
184
+ const orphanMatchThreshold = this.orphanMatchThreshold
185
+ const ignoreWhitespaceDifferences = this.ignoreWhitespaceDifferences
186
+ const tablePreprocess =
187
+ this.tablePreprocessDepth >= HtmlDiff.MaxTablePreprocessDepth
188
+ ? null
189
+ : preprocessTables(this.oldText, this.newText, (oldCell, newCell) => {
190
+ const inner = new HtmlDiff(oldCell, newCell, this.tablePreprocessDepth + 1)
191
+ for (const expr of blockExpressions) inner.addBlockExpression(expr)
192
+ inner.repeatingWordsAccuracy = repeatingWordsAccuracy
193
+ inner.orphanMatchThreshold = orphanMatchThreshold
194
+ inner.ignoreWhitespaceDifferences = ignoreWhitespaceDifferences
195
+ return inner.build()
196
+ })
197
+ if (tablePreprocess) {
198
+ this.oldText = tablePreprocess.modifiedOld
199
+ this.newText = tablePreprocess.modifiedNew
200
+ }
201
+
148
202
  this.splitInputsToWords()
149
203
  this.buildContentProjections()
150
204
 
@@ -161,7 +215,8 @@ export default class HtmlDiff {
161
215
  this.performOperation(op)
162
216
  }
163
217
 
164
- return this.content.join('')
218
+ const result = this.content.join('')
219
+ return tablePreprocess ? restoreTablePlaceholders(result, tablePreprocess.placeholderToDiff) : result
165
220
  }
166
221
 
167
222
  /**
@@ -185,23 +240,18 @@ export default class HtmlDiff {
185
240
  }
186
241
 
187
242
  /**
188
- * Checks whether the two word arrays have structural HTML differences (different non-formatting tags
189
- * or different whitespace between structural tags). When they do, builds "content projections" that
190
- * strip structural noise so the diff algorithm only sees meaningful content and formatting changes.
243
+ * Builds "content projections" (word arrays with structural wrapper tags stripped) when
244
+ * structural normalization is appropriate for these inputs. The diff algorithm operates on
245
+ * the projections so wrapper-tag differences (e.g. `<p>` vs `<div>`) don't appear as content
246
+ * changes; structural tags are then folded back in at output time.
191
247
  */
192
248
  private buildContentProjections() {
193
- // Only use projections if the structural tags actually differ.
194
- // If structural tags are the same, the normal diff works fine and is simpler.
195
- if (!HtmlDiff.hasStructuralDifferences(this.oldWords, this.newWords)) {
196
- return
197
- }
249
+ if (!HtmlDiff.hasStructuralDifferences(this.oldWords, this.newWords)) return
198
250
 
199
251
  const oldProjection = HtmlDiff.createContentProjection(this.oldWords)
200
252
  const newProjection = HtmlDiff.createContentProjection(this.newWords)
201
253
 
202
- // Don't activate structural normalization when one side has no content —
203
- // that's a genuine addition/deletion, not a re-wrapping scenario.
204
- if (oldProjection.contentWords.length === 0 || newProjection.contentWords.length === 0) {
254
+ if (!HtmlDiff.shouldUseContentProjections(this.oldWords, this.newWords, oldProjection, newProjection)) {
205
255
  return
206
256
  }
207
257
 
@@ -211,6 +261,32 @@ export default class HtmlDiff {
211
261
  this.newContentToOriginal = newProjection.contentToOriginal
212
262
  }
213
263
 
264
+ /**
265
+ * Decides whether structural normalization should be activated for this pair of inputs.
266
+ * Each clause is a distinct correctness or fitness check — extend by adding a named
267
+ * sub-predicate rather than chaining ad-hoc conditions.
268
+ */
269
+ private static shouldUseContentProjections(
270
+ oldWords: string[],
271
+ newWords: string[],
272
+ oldProjection: { contentWords: string[]; contentToOriginal: number[] },
273
+ newProjection: { contentWords: string[]; contentToOriginal: number[] }
274
+ ): boolean {
275
+ // One side has no content at all: that's a genuine addition/deletion, not a wrapper rename.
276
+ // Normalization would mis-attribute the wrappers as part of the diff.
277
+ if (oldProjection.contentWords.length === 0 || newProjection.contentWords.length === 0) return false
278
+
279
+ // Asymmetric structural state: one side has no structural wrappers at all (e.g. plain text
280
+ // vs. wrapped HTML). Normalization would force the equal output to use the unwrapped side's
281
+ // (missing) structure and emit dangling closing tags from the wrapped side. The plain
282
+ // word-level diff handles this correctly without normalization.
283
+ const oldHasStructuralTags = oldProjection.contentWords.length < oldWords.length
284
+ const newHasStructuralTags = newProjection.contentWords.length < newWords.length
285
+ if (oldHasStructuralTags !== newHasStructuralTags) return false
286
+
287
+ return true
288
+ }
289
+
214
290
  /**
215
291
  * Tags that commonly serve as content wrappers and may change structurally
216
292
  * without affecting the actual content. Only these tags are stripped during
@@ -224,6 +300,11 @@ export default class HtmlDiff {
224
300
  return HtmlDiff.WrapperTags.has(tagName)
225
301
  }
226
302
 
303
+ /** True when the word is a structural opening tag (e.g. `<p>`, `<div>`). */
304
+ private static isOpeningStructuralTag(word: string): boolean {
305
+ return HtmlDiff.isStructuralTag(word) && !word.startsWith('</')
306
+ }
307
+
227
308
  /**
228
309
  * Returns true if words between structural tags are just whitespace (indentation).
229
310
  */
@@ -303,84 +384,90 @@ export default class HtmlDiff {
303
384
  }
304
385
 
305
386
  private processInsertOperation(operation: Operation, cssClass: string) {
306
- const words = this.oldContentWords
307
- ? this.getOriginalNewWords(operation.startInNew, operation.endInNew)
387
+ const words = this.usingContentProjections()
388
+ ? this.sliceOriginalWordsForOp('new', operation.startInNew, operation.endInNew)
308
389
  : this.newWords.slice(operation.startInNew, operation.endInNew)
309
390
  this.insertTag(HtmlDiff.InsTag, cssClass, words)
310
391
  }
311
392
 
312
393
  private processDeleteOperation(operation: Operation, cssClass: string) {
313
- const words = this.oldContentWords
314
- ? this.getOriginalOldWords(operation.startInOld, operation.endInOld)
394
+ const words = this.usingContentProjections()
395
+ ? this.sliceOriginalWordsForOp('old', operation.startInOld, operation.endInOld)
315
396
  : this.oldWords.slice(operation.startInOld, operation.endInOld)
316
397
  this.insertTag(HtmlDiff.DelTag, cssClass, words)
317
-
318
- // Advance the tracking index past the deleted range so subsequent equal operations
319
- // don't re-include structural tags from the deleted section.
320
- if (this.oldContentToOriginal && operation.endInOld > 0) {
321
- const lastDeletedOrigIdx = this.oldContentToOriginal[operation.endInOld - 1]
322
- this.lastOriginalOldOutputIndex = Math.max(this.lastOriginalOldOutputIndex, lastDeletedOrigIdx + 1)
323
- }
324
398
  }
325
399
 
326
400
  private processEqualOperation(operation: Operation) {
327
- if (this.oldContentWords) {
328
- // When using content projections, output from old original words to preserve old structure
329
- const result = this.getOriginalOldWordsWithStructure(operation.startInOld, operation.endInOld)
401
+ if (this.usingContentProjections()) {
402
+ // Output from old to preserve old's HTML structure for the matched content.
403
+ const result = this.sliceOriginalWordsForOp('old', operation.startInOld, operation.endInOld)
330
404
  this.content.push(result.join(''))
405
+
406
+ // Advance new-side tracking past the equivalent range in new so the next insert op
407
+ // resumes from the correct position. We compute new's range with the same rule used
408
+ // for old (rather than mirroring old's count) so the two sides are independently sound
409
+ // when their structural tags don't perfectly parallel each other.
410
+ this.sliceOriginalWordsForOp('new', operation.startInNew, operation.endInNew)
331
411
  } else {
332
412
  const result = this.newWords.slice(operation.startInNew, operation.endInNew)
333
413
  this.content.push(result.join(''))
334
414
  }
335
415
  }
336
416
 
337
- /**
338
- * Gets original old words for a content-index range, including only content and formatting tags
339
- * (used for delete/replace operations where we don't want structural tags).
340
- */
341
- private getOriginalOldWords(contentStart: number, contentEnd: number): string[] {
342
- if (!this.oldContentToOriginal) return this.oldWords.slice(contentStart, contentEnd)
343
- const result: string[] = []
344
- for (let i = contentStart; i < contentEnd; i++) {
345
- result.push(this.oldWords[this.oldContentToOriginal[i]])
346
- }
347
- return result
417
+ /** True when content projections are active for both sides — i.e. structural normalization is in effect. */
418
+ private usingContentProjections(): boolean {
419
+ return this.oldContentToOriginal !== null && this.newContentToOriginal !== null
348
420
  }
349
421
 
350
422
  /**
351
- * Gets original new words for a content-index range, including only content and formatting tags
352
- * (used for insert/replace operations where we don't want structural tags).
423
+ * Returns the slice of original (old or new) words covering a content-index range,
424
+ * including the structural tags that surround the content. Advances the side's cursor
425
+ * past the slice so the next op resumes correctly.
426
+ *
427
+ * The slice extends:
428
+ * - LEADING: from the side's cursor (or the first content word's original index,
429
+ * whichever is smaller) so structural tags that precede the first content word
430
+ * are picked up by this op rather than left orphaned.
431
+ * - TRAILING (non-last range): from just after the last content word, including
432
+ * closing structural tags that close *this* op's paragraphs, but stopping at
433
+ * the first opening structural tag — that opening tag belongs to the next
434
+ * op's paragraph and would otherwise be emitted twice.
435
+ * - TRAILING (last range): all the way to the end of words, since there is no next
436
+ * op to claim the trailing tags.
353
437
  */
354
- private getOriginalNewWords(contentStart: number, contentEnd: number): string[] {
355
- if (!this.newContentToOriginal) return this.newWords.slice(contentStart, contentEnd)
356
- const result: string[] = []
357
- for (let i = contentStart; i < contentEnd; i++) {
358
- result.push(this.newWords[this.newContentToOriginal[i]])
359
- }
360
- return result
361
- }
438
+ private sliceOriginalWordsForOp(side: 'old' | 'new', contentStart: number, contentEnd: number): string[] {
439
+ const words = side === 'old' ? this.oldWords : this.newWords
440
+ const contentToOriginal = side === 'old' ? this.oldContentToOriginal : this.newContentToOriginal
362
441
 
363
- /**
364
- * Gets original old words for a content-index range, INCLUDING structural tags and whitespace
365
- * between the content words (used for equal operations to preserve old HTML structure).
366
- */
367
- private getOriginalOldWordsWithStructure(contentStart: number, contentEnd: number): string[] {
368
- if (!this.oldContentToOriginal) return this.oldWords.slice(contentStart, contentEnd)
442
+ if (!contentToOriginal) return words.slice(contentStart, contentEnd)
369
443
  if (contentStart >= contentEnd) return []
370
444
 
371
- // Start from where we last left off in the original array (or the first content word's
372
- // original index, whichever is smaller) to include any structural tags that precede
373
- // the content words in this range.
374
- const firstContentOrigIdx = this.oldContentToOriginal[contentStart]
375
- const origStart = Math.min(this.lastOriginalOldOutputIndex, firstContentOrigIdx)
445
+ const firstContentOrigIdx = contentToOriginal[contentStart]
446
+ const lastContentOrigIdx = contentToOriginal[contentEnd - 1]
447
+ const cursor = side === 'old' ? this.lastOriginalOldOutputIndex : this.lastOriginalNewOutputIndex
448
+ const origStart = Math.min(cursor, firstContentOrigIdx)
449
+
450
+ let origEnd: number
451
+ if (contentEnd < contentToOriginal.length) {
452
+ // Non-last range: walk trailing tags after the last content word, stopping at the
453
+ // first opening structural tag so it can be emitted by the next op.
454
+ const limit = contentToOriginal[contentEnd]
455
+ origEnd = lastContentOrigIdx + 1
456
+ while (origEnd < limit && !HtmlDiff.isOpeningStructuralTag(words[origEnd])) {
457
+ origEnd++
458
+ }
459
+ } else {
460
+ // Last range: include everything to the end.
461
+ origEnd = words.length
462
+ }
376
463
 
377
- // Include up to (but not including) the next content word's original index,
378
- // or to the end of oldWords if this is the last content range
379
- const origEnd =
380
- contentEnd < this.oldContentToOriginal.length ? this.oldContentToOriginal[contentEnd] : this.oldWords.length
464
+ if (side === 'old') {
465
+ this.lastOriginalOldOutputIndex = origEnd
466
+ } else {
467
+ this.lastOriginalNewOutputIndex = origEnd
468
+ }
381
469
 
382
- this.lastOriginalOldOutputIndex = origEnd
383
- return this.oldWords.slice(origStart, origEnd)
470
+ return words.slice(origStart, origEnd)
384
471
  }
385
472
 
386
473
  /**
@@ -476,7 +563,7 @@ export default class HtmlDiff {
476
563
  const openingAndClosingTagsMatch =
477
564
  !!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
478
565
 
479
- if (!!openingTag && openingAndClosingTagsMatch) {
566
+ if (openingTag && openingAndClosingTagsMatch) {
480
567
  specialCaseTagInjection = '</ins>'
481
568
  specialCaseTagInjectionIsBefore = true
482
569
  }