@createiq/htmldiff 1.2.0-beta.0 → 1.2.0-beta.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.0",
3
+ "version": "1.2.0-beta.10",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
71
71
  */
72
72
  export type ThreeWayOptions = AnalyzeOptions
73
73
 
74
+ /**
75
+ * Opinionated options that align htmldiff's output with Microsoft Word's
76
+ * track-changes rendering for legal-document rewrites.
77
+ *
78
+ * The library's bare default (`orphanMatchThreshold = 0`) keeps every
79
+ * LCS match, however small — which fragments long sentence rewrites
80
+ * into many tiny ins/del pairs around stray word matches ("of", "the",
81
+ * "shall"). Word collapses those into a single coarse del+ins, which is
82
+ * dramatically more readable for legal text.
83
+ *
84
+ * 0.25 was tuned empirically against a customer Word reference (US
85
+ * Commercial One CP, May 2026):
86
+ * - short edits (typo / one-word insert): output identical to
87
+ * threshold=0 — inter-match distances are tiny so every match
88
+ * trivially clears the bar;
89
+ * - long rewrites (the "Specified Indebtedness" rewrite in the
90
+ * reference): previously produced 6 dels + 5 ins fragmented around
91
+ * stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
92
+ * Word's 1+1 and a major readability win;
93
+ * - higher values (0.3+) collapsed short edits containing inline
94
+ * formatting changes into a single block — too aggressive.
95
+ *
96
+ * Consumers rendering legal documents should spread this into their
97
+ * options:
98
+ * `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
99
+ * `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
100
+ *
101
+ * Other consumers (machine-readable diff, exact-token alignment) can
102
+ * keep the bare default.
103
+ */
104
+ export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
105
+ orphanMatchThreshold: 0.25,
106
+ }
107
+
74
108
  export default class HtmlDiff {
75
109
  /**
76
110
  * This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
@@ -162,7 +196,22 @@ export default class HtmlDiff {
162
196
  // constructor overload that would re-leak the parameter we just hid.
163
197
  private tablePreprocessDepth = 0
164
198
 
165
- private specialTagDiffStack: string[] = []
199
+ /**
200
+ * Tracks currently-open formatting-tag wraps. Each entry pairs the
201
+ * opening tag (so a later closing tag can find its match) with the
202
+ * styling info needed to RE-OPEN the wrap if an overlapping
203
+ * formatting-tag close forces it to split. Without the styling info,
204
+ * an overlap like `<strong>X</strong>` ↔ `<u>X</u>` produces an
205
+ * unclosable wrap (the closing tag for the outer wrap arrives while
206
+ * an inner wrap is still on the stack); see `insertTag`'s closing
207
+ * handler for the split logic.
208
+ */
209
+ private specialTagDiffStack: Array<{
210
+ tag: string
211
+ styledTagNames: string
212
+ cssClass: string
213
+ metadata: WrapMetadata | undefined
214
+ }> = []
166
215
  private newWords: string[] = []
167
216
  private oldWords: string[] = []
168
217
  /**
@@ -234,8 +283,31 @@ export default class HtmlDiff {
234
283
  this.newText = newText
235
284
  }
236
285
 
237
- static execute(oldText: string, newText: string): string {
238
- return new HtmlDiff(oldText, newText).build()
286
+ /**
287
+ * Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
288
+ * `executeThreeWay`, with two intentional exceptions documented
289
+ * inline below. Consumers wanting Word-aligned output should spread
290
+ * `WORD_ALIGNED_OPTIONS` into the third argument.
291
+ *
292
+ * Note: unlike `analyze`, `execute` runs `build()` which performs
293
+ * full table preprocessing — `tablePreprocessDepth` stays at 0 so
294
+ * the recursive cell diff can happen. Callers can't override that.
295
+ */
296
+ static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
297
+ const inner = new HtmlDiff(oldText, newText)
298
+ if (options.blockExpressions) {
299
+ for (const expr of options.blockExpressions) inner.addBlockExpression(expr)
300
+ }
301
+ if (options.repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy
302
+ if (options.orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = options.orphanMatchThreshold
303
+ if (options.ignoreWhitespaceDifferences !== undefined) {
304
+ inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
305
+ }
306
+ // `useProjections` is intentionally NOT plumbed here — the 2-way
307
+ // path's build() runs its own heuristic. `analyze` honours it; if
308
+ // you need to force it for a 2-way result, route through `analyze`
309
+ // and consume the operations directly.
310
+ return inner.build()
239
311
  }
240
312
 
241
313
  /**
@@ -321,56 +393,60 @@ export default class HtmlDiff {
321
393
  }
322
394
 
323
395
  /**
324
- * Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
325
- * version CP sent back), and V3 (Me's current draft), produces a
326
- * single attributed HTML output where CP's and Me's changes are
327
- * distinguished by `data-author` ('cp' or 'me') and matching
328
- * `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
329
- * CP's proposal" case (Me deleted text CP had inserted) gets a
330
- * dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
396
+ * Three-way HTML diff against a shared genesis. Produces attributed
397
+ * HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
398
+ * from Me's accumulated changes (genesis → meCurrent). Use this for
399
+ * blackline UX where the negotiation has gone through multiple turns
400
+ * and the reader wants to see "who proposed what" across the whole
401
+ * history, not just the most recent round.
402
+ *
403
+ * When both parties happen to have made the same change (e.g. CP
404
+ * proposed a wording change in turn N, Me adopted it in turn N+1),
405
+ * the change reads as "settled" and is emitted unmarked — only
406
+ * disagreements and pending proposals carry author attribution.
331
407
  *
332
- * Coordinates the symmetric-projection decision (D1) across both
333
- * internal `analyze` calls so V2 tokenises identically on each side
334
- * of the spine. When `useProjections` is left undefined, the decision
335
- * is the conjunction of both pair-wise heuristics — project iff both
336
- * pairs would project on their own. Pass an explicit boolean to
337
- * override.
408
+ * @param genesis the shared common ancestor (per user, the FE
409
+ * picks between V1.0 and /preview/initialAnswers
410
+ * based on `prefillReceiverAnswers`)
411
+ * @param cpLatest the counterparty's current published version
412
+ * @param meCurrent Me's current draft (the document on screen)
338
413
  */
339
- static executeThreeWay(v1: string, v2: string, v3: string, options: ThreeWayOptions = {}): string {
340
- return HtmlDiff.executeThreeWayWithDepth(v1, v2, v3, options, 0)
414
+ static executeThreeWay(genesis: string, cpLatest: string, meCurrent: string, options: ThreeWayOptions = {}): string {
415
+ return HtmlDiff.executeThreeWayWithDepth(genesis, cpLatest, meCurrent, options, 0)
341
416
  }
342
417
 
343
418
  private static executeThreeWayWithDepth(
344
- v1: string,
345
- v2: string,
346
- v3: string,
419
+ genesis: string,
420
+ cpLatest: string,
421
+ meCurrent: string,
347
422
  options: ThreeWayOptions,
348
423
  depth: number
349
424
  ): string {
350
- // Table preprocessing first — replaces each V1/V2/V3 table with a
425
+ // Table preprocessing first — replaces each genesis/cp/me table with a
351
426
  // shared-nonce placeholder, then the word-level merge runs over the
352
427
  // table-free inputs. Cells are diffed recursively via executeThreeWay
353
- // so the cell content is itself three-way attributed. Restoration
354
- // happens at the end.
428
+ // so the cell content is itself three-way attributed.
355
429
  //
356
- // Depth-cap the recursion. Each level recurses cellDiff → executeThreeWay,
357
- // which would otherwise run unbounded on adversarially-nested input.
358
- // Beyond the cap, skip table preprocessing entirely and let the
359
- // word-level merge handle the raw HTML — same bail-out semantics as
360
- // the 2-way `MaxTablePreprocessDepth` cap.
430
+ // Depth-cap the recursion so adversarially-nested input can't blow
431
+ // stack/memory.
361
432
  const tablePreprocess =
362
433
  depth < HtmlDiff.MaxThreeWayDepth
363
- ? preprocessTablesThreeWay(v1, v2, v3, (c1, c2, c3) =>
364
- HtmlDiff.executeThreeWayWithDepth(c1, c2, c3, options, depth + 1)
434
+ ? preprocessTablesThreeWay(genesis, cpLatest, meCurrent, (g, c, m) =>
435
+ HtmlDiff.executeThreeWayWithDepth(g, c, m, options, depth + 1)
365
436
  )
366
437
  : null
367
- const inV1 = tablePreprocess?.modifiedV1 ?? v1
368
- const inV2 = tablePreprocess?.modifiedV2 ?? v2
369
- const inV3 = tablePreprocess?.modifiedV3 ?? v3
370
-
438
+ const inGenesis = tablePreprocess?.modifiedGenesis ?? genesis
439
+ const inCp = tablePreprocess?.modifiedCp ?? cpLatest
440
+ const inMe = tablePreprocess?.modifiedMe ?? meCurrent
441
+
442
+ // Symmetric projection across both analyses. The genesis-spine
443
+ // algorithm requires `genesis` to tokenise identically on each
444
+ // pair-wise analysis (both have genesis as the OLD side), so the
445
+ // useProjections decision must agree across both calls.
371
446
  const useProjections =
372
447
  options.useProjections ??
373
- (HtmlDiff.evaluateProjectionApplicability(inV1, inV2) && HtmlDiff.evaluateProjectionApplicability(inV2, inV3))
448
+ (HtmlDiff.evaluateProjectionApplicability(inGenesis, inCp) &&
449
+ HtmlDiff.evaluateProjectionApplicability(inGenesis, inMe))
374
450
 
375
451
  const analyzeOpts: AnalyzeOptions = {
376
452
  useProjections,
@@ -379,21 +455,21 @@ export default class HtmlDiff {
379
455
  orphanMatchThreshold: options.orphanMatchThreshold,
380
456
  ignoreWhitespaceDifferences: options.ignoreWhitespaceDifferences,
381
457
  }
382
- const d1 = HtmlDiff.analyze(inV1, inV2, analyzeOpts)
383
- const d2 = HtmlDiff.analyze(inV2, inV3, analyzeOpts)
458
+ const dCp = HtmlDiff.analyze(inGenesis, inCp, analyzeOpts)
459
+ const dMe = HtmlDiff.analyze(inGenesis, inMe, analyzeOpts)
384
460
 
385
- // Spine sanity check. Symmetric `useProjections` should guarantee
386
- // alignment, but if a bug ever lets these diverge we want to fail
387
- // loudly rather than silently produce a misattributed output.
388
- if (d1.newDiffWords.length !== d2.oldDiffWords.length) {
461
+ // Spine sanity check — both analyses must share an identical genesis
462
+ // tokenisation. Symmetric useProjections guarantees this; if it ever
463
+ // diverges, fail loudly rather than silently misattribute.
464
+ if (dCp.oldDiffWords.length !== dMe.oldDiffWords.length) {
389
465
  throw new Error(
390
- 'HtmlDiff.executeThreeWay: V2 tokenisation diverged across pair-wise analyses ' +
391
- `(${d1.newDiffWords.length} vs ${d2.oldDiffWords.length}). ` +
466
+ 'HtmlDiff.executeThreeWay: genesis tokenisation diverged across pair-wise analyses ' +
467
+ `(${dCp.oldDiffWords.length} vs ${dMe.oldDiffWords.length}). ` +
392
468
  'This indicates the symmetric-projection coordination has a bug.'
393
469
  )
394
470
  }
395
471
 
396
- const segments = buildSegments(d1, d2)
472
+ const segments = buildSegments(dCp, dMe)
397
473
  const merged = HtmlDiff.emitSegments(segments)
398
474
  return tablePreprocess ? restoreTablePlaceholders(merged, tablePreprocess.placeholderToDiff) : merged
399
475
  }
@@ -404,6 +480,25 @@ export default class HtmlDiff {
404
480
  * buffer. Reusing the instance keeps the formatting-tag stack
405
481
  * (`specialTagDiffStack`) coherent across segments — a `<strong>`
406
482
  * opened in one segment and closed in another stays balanced.
483
+ *
484
+ * Edge case: an ins/del segment can open a formatting wrap whose
485
+ * matching closer ends up in an equal segment (`<strong>` deleted
486
+ * by CP but `</strong>` kept by both — buildSegments emits the open
487
+ * as del-cp and the close as equal). Equal segments bypass
488
+ * `insertTag` and push raw, so the stack entry for the open is
489
+ * never popped. Rather than throw — which forces the caller's UI
490
+ * into an error boundary — close every leftover wrap with `</ins>`
491
+ * at the end of emission.
492
+ *
493
+ * Caveat: the `</ins>` close is honest for the mod-wrap that the
494
+ * opener pushed (every formatting opener emits an inner `<ins…>`
495
+ * postInject regardless of whether the outer segment is ins or
496
+ * del). For del-segment formatting openers the outer `<del>` may
497
+ * itself be left open by the same emission imbalance; this fixup
498
+ * doesn't address that. Downstream browsers/DOMParser normalise
499
+ * mildly-malformed HTML by closing dangling tags, so the rendered
500
+ * output is usually acceptable — but the warning IS the signal
501
+ * that the input had a real imbalance worth investigating.
407
502
  */
408
503
  private static emitSegments(segments: Segment[]): string {
409
504
  const emitter = new HtmlDiff('', '')
@@ -416,18 +511,21 @@ export default class HtmlDiff {
416
511
  // insertTag mutates its `words` array; pass a copy.
417
512
  emitter.insertTag(tag, baseClass, [...seg.words], metadata)
418
513
  }
419
- // Stack-balance invariant: every special-case opening tag pushed onto
420
- // `specialTagDiffStack` during emission must have been matched by a
421
- // closing tag. An unbalanced stack means the input had unbalanced
422
- // formatting tags AND a Replace at an inconvenient position — the
423
- // output would be silently malformed (half-closed `<ins>`). Fail
424
- // loudly so the caller can investigate rather than ship broken HTML.
425
514
  if (emitter.specialTagDiffStack.length > 0) {
426
- throw new Error(
515
+ // Log once so we can spot bad inputs in dev tools, but don't
516
+ // throw — the caller's only fallback was to crash the React
517
+ // tree, which is worse than emitting slightly-imperfect HTML.
518
+ // eslint-disable-next-line no-console
519
+ console.warn(
427
520
  `HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} ` +
428
- 'unclosed formatting tag(s) on the stack input may have unbalanced ' +
429
- '<strong>/<em>/etc. or there is a bug in segment emission.'
521
+ 'unclosed formatting wrap(s) on the stack. Closing defensively. ' +
522
+ 'This usually means a formatting tag opens in a del/ins segment ' +
523
+ 'and its matching closer is in an equal segment.'
430
524
  )
525
+ while (emitter.specialTagDiffStack.length > 0) {
526
+ emitter.content.push('</ins>')
527
+ emitter.specialTagDiffStack.pop()
528
+ }
431
529
  }
432
530
  return emitter.content.join('')
433
531
  }
@@ -807,8 +905,13 @@ export default class HtmlDiff {
807
905
  // if there are nonTags, the index of the last tag is the index before the first nonTag.
808
906
  const indexLastTagInFirstTagBlock = indexOfFirstNonTag === -1 ? words.length - 1 : indexOfFirstNonTag - 1
809
907
 
810
- let specialCaseTagInjection = ''
811
- let specialCaseTagInjectionIsBefore = false
908
+ // Pre-injection sits BEFORE the extracted tag-block content (used
909
+ // by closing tags so `</ins></strong>` reads left-to-right).
910
+ // Post-injection sits AFTER (used by opening tags so the rendered
911
+ // order is `<strong><ins ...>` and by the overlap-split case so
912
+ // the re-opened `<ins>`s sit AFTER the actual closing tag).
913
+ let preInject = ''
914
+ let postInject = ''
812
915
 
813
916
  // handle opening tag
814
917
  if (HtmlDiff.SpecialCaseOpeningTagRegex.test(words[0])) {
@@ -820,10 +923,11 @@ export default class HtmlDiff {
820
923
  }
821
924
  const styledTagNames = Array.from(tagNames).join(' ')
822
925
 
823
- this.specialTagDiffStack.push(words[0])
824
926
  // Carry the caller's metadata into the formatting-tag wrapper so
825
927
  // a 3-way author tag survives a `<strong>`/`<em>` content edit.
826
- specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
928
+ const styledCssClass = `mod ${styledTagNames}`
929
+ this.specialTagDiffStack.push({ tag: words[0], styledTagNames, cssClass: styledCssClass, metadata })
930
+ postInject = `<ins${Utils.composeTagAttributes(styledCssClass, metadata ?? {})}>`
827
931
  if (tag === HtmlDiff.DelTag) {
828
932
  words.shift()
829
933
 
@@ -835,7 +939,6 @@ export default class HtmlDiff {
835
939
  }
836
940
  // handle closing tag
837
941
  else if (HtmlDiff.SpecialCaseClosingTagsSet.has(words[0].toLowerCase())) {
838
- const openingTag = this.specialTagDiffStack.length === 0 ? null : this.specialTagDiffStack.pop()
839
942
  // For delete operations: when the tag block contains a mix of formatting and
840
943
  // non-formatting closing tags (e.g. </strong></div>), compare against the first
841
944
  // closing tag (the formatting one) rather than the last tag in the block.
@@ -850,19 +953,39 @@ export default class HtmlDiff {
850
953
  tagIndexToCompare = 0
851
954
  }
852
955
  }
853
- const openingAndClosingTagsMatch =
854
- !!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
855
956
 
856
- if (openingTag && openingAndClosingTagsMatch) {
857
- specialCaseTagInjection = '</ins>'
858
- specialCaseTagInjectionIsBefore = true
957
+ // Search the stack for a matching opener (LIFO). When the match
958
+ // is the top entry, this is the normal balanced case and we
959
+ // emit a single `</ins>` before the closing tag. When the match
960
+ // is below an unmatched opener — i.e. another formatting wrap
961
+ // opened after it but hasn't been closed yet — the wraps
962
+ // overlap in source order, which has no valid LIFO HTML
963
+ // expression. Resolve by SPLITTING the wraps: close everything
964
+ // above the match (their `<ins>`s and the match's `<ins>`), then
965
+ // re-open the above wraps with fresh `<ins>` tags AFTER the
966
+ // closing tag emits. The above wraps continue to apply until
967
+ // their own closing tag arrives.
968
+ const closingTagName = Utils.getTagName(words[tagIndexToCompare])
969
+ let matchIdx = -1
970
+ for (let i = this.specialTagDiffStack.length - 1; i >= 0; i--) {
971
+ if (Utils.getTagName(this.specialTagDiffStack[i].tag) === closingTagName) {
972
+ matchIdx = i
973
+ break
974
+ }
859
975
  }
860
976
 
861
- // if the tag has a corresponding opening tag, but they don't match,
862
- // we need to push the opening tag back onto the stack
863
- else if (openingTag) {
864
- this.specialTagDiffStack.push(openingTag)
977
+ if (matchIdx >= 0) {
978
+ const aboveEntries = this.specialTagDiffStack.splice(matchIdx + 1)
979
+ this.specialTagDiffStack.pop() // pop the matched entry
980
+ // One `</ins>` per above entry, then one for the match itself.
981
+ preInject = '</ins>'.repeat(aboveEntries.length + 1)
982
+ for (const entry of aboveEntries) {
983
+ postInject += `<ins${Utils.composeTagAttributes(entry.cssClass, entry.metadata ?? {})}>`
984
+ this.specialTagDiffStack.push(entry) // their wrap continues via the new <ins>
985
+ }
865
986
  }
987
+ // No match in stack — orphan closing tag, drop the `<ins>` work
988
+ // and just let the tag itself flow through extractConsecutiveWords.
866
989
 
867
990
  if (tag === HtmlDiff.DelTag) {
868
991
  words.shift()
@@ -873,7 +996,7 @@ export default class HtmlDiff {
873
996
  }
874
997
  }
875
998
 
876
- if (words.length === 0 && specialCaseTagInjection.length === 0) {
999
+ if (words.length === 0 && preInject.length === 0 && postInject.length === 0) {
877
1000
  break
878
1001
  }
879
1002
 
@@ -889,11 +1012,7 @@ export default class HtmlDiff {
889
1012
  !HtmlDiff.SpecialCaseClosingTagsSet.has(x.toLowerCase())
890
1013
  : Utils.isTag
891
1014
 
892
- if (specialCaseTagInjectionIsBefore) {
893
- this.content.push(specialCaseTagInjection + this.extractConsecutiveWords(words, isTagForExtraction).join(''))
894
- } else {
895
- this.content.push(this.extractConsecutiveWords(words, isTagForExtraction).join('') + specialCaseTagInjection)
896
- }
1015
+ this.content.push(preInject + this.extractConsecutiveWords(words, isTagForExtraction).join('') + postInject)
897
1016
 
898
1017
  if (words.length === 0) continue
899
1018
 
@@ -1000,6 +1119,31 @@ export default class HtmlDiff {
1000
1119
  continue
1001
1120
  }
1002
1121
 
1122
+ // Never orphan-reject a match whose tokens are ALL HTML tags.
1123
+ // Tag tokens are structural; rejecting `</strong>` / `</em>` as
1124
+ // an orphan match between two content deletions merges the tag
1125
+ // into the deletion, leaving the matching opener unclosed —
1126
+ // browsers then auto-close the opener at the END of the
1127
+ // deletion, producing visually-wrong output (e.g. the body of
1128
+ // a section deletion rendered as bold-italic because the
1129
+ // closing `</strong></em>` ended up after the body deletion
1130
+ // rather than after the heading). The orphan threshold is
1131
+ // designed for stray word matches between heavily-edited spans,
1132
+ // not for formatting boundaries.
1133
+ let allTags = true
1134
+ for (let i = curr.startInNew; i < curr.endInNew; i++) {
1135
+ if (!Utils.isTag(wordsForDiffNew[i])) {
1136
+ allTags = false
1137
+ break
1138
+ }
1139
+ }
1140
+ if (allTags) {
1141
+ yield curr
1142
+ prev = curr
1143
+ curr = next
1144
+ continue
1145
+ }
1146
+
1003
1147
  let oldDistanceInChars = 0
1004
1148
  for (let i = prev.endInOld; i < next.startInOld; i++) {
1005
1149
  oldDistanceInChars += wordsForDiffOld[i].length