@createiq/htmldiff 1.2.0-beta.1 → 1.2.0-beta.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.1",
3
+ "version": "1.2.0-beta.10",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
71
71
  */
72
72
  export type ThreeWayOptions = AnalyzeOptions
73
73
 
74
+ /**
75
+ * Opinionated options that align htmldiff's output with Microsoft Word's
76
+ * track-changes rendering for legal-document rewrites.
77
+ *
78
+ * The library's bare default (`orphanMatchThreshold = 0`) keeps every
79
+ * LCS match, however small — which fragments long sentence rewrites
80
+ * into many tiny ins/del pairs around stray word matches ("of", "the",
81
+ * "shall"). Word collapses those into a single coarse del+ins, which is
82
+ * dramatically more readable for legal text.
83
+ *
84
+ * 0.25 was tuned empirically against a customer Word reference (US
85
+ * Commercial One CP, May 2026):
86
+ * - short edits (typo / one-word insert): output identical to
87
+ * threshold=0 — inter-match distances are tiny so every match
88
+ * trivially clears the bar;
89
+ * - long rewrites (the "Specified Indebtedness" rewrite in the
90
+ * reference): previously produced 6 dels + 5 ins fragmented around
91
+ * stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
92
+ * Word's 1+1 and a major readability win;
93
+ * - higher values (0.3+) collapsed short edits containing inline
94
+ * formatting changes into a single block — too aggressive.
95
+ *
96
+ * Consumers rendering legal documents should spread this into their
97
+ * options:
98
+ * `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
99
+ * `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
100
+ *
101
+ * Other consumers (machine-readable diff, exact-token alignment) can
102
+ * keep the bare default.
103
+ */
104
+ export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
105
+ orphanMatchThreshold: 0.25,
106
+ }
107
+
74
108
  export default class HtmlDiff {
75
109
  /**
76
110
  * This value defines balance between speed and memory utilization. The higher it is the faster it works and more memory consumes.
@@ -162,7 +196,22 @@ export default class HtmlDiff {
162
196
  // constructor overload that would re-leak the parameter we just hid.
163
197
  private tablePreprocessDepth = 0
164
198
 
165
- private specialTagDiffStack: string[] = []
199
+ /**
200
+ * Tracks currently-open formatting-tag wraps. Each entry pairs the
201
+ * opening tag (so a later closing tag can find its match) with the
202
+ * styling info needed to RE-OPEN the wrap if an overlapping
203
+ * formatting-tag close forces it to split. Without the styling info,
204
+ * an overlap like `<strong>X</strong>` ↔ `<u>X</u>` produces an
205
+ * unclosable wrap (the closing tag for the outer wrap arrives while
206
+ * an inner wrap is still on the stack); see `insertTag`'s closing
207
+ * handler for the split logic.
208
+ */
209
+ private specialTagDiffStack: Array<{
210
+ tag: string
211
+ styledTagNames: string
212
+ cssClass: string
213
+ metadata: WrapMetadata | undefined
214
+ }> = []
166
215
  private newWords: string[] = []
167
216
  private oldWords: string[] = []
168
217
  /**
@@ -234,8 +283,31 @@ export default class HtmlDiff {
234
283
  this.newText = newText
235
284
  }
236
285
 
237
- static execute(oldText: string, newText: string): string {
238
- return new HtmlDiff(oldText, newText).build()
286
+ /**
287
+ * Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
288
+ * `executeThreeWay`, with two intentional exceptions documented
289
+ * inline below. Consumers wanting Word-aligned output should spread
290
+ * `WORD_ALIGNED_OPTIONS` into the third argument.
291
+ *
292
+ * Note: unlike `analyze`, `execute` runs `build()` which performs
293
+ * full table preprocessing — `tablePreprocessDepth` stays at 0 so
294
+ * the recursive cell diff can happen. Callers can't override that.
295
+ */
296
+ static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
297
+ const inner = new HtmlDiff(oldText, newText)
298
+ if (options.blockExpressions) {
299
+ for (const expr of options.blockExpressions) inner.addBlockExpression(expr)
300
+ }
301
+ if (options.repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy
302
+ if (options.orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = options.orphanMatchThreshold
303
+ if (options.ignoreWhitespaceDifferences !== undefined) {
304
+ inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
305
+ }
306
+ // `useProjections` is intentionally NOT plumbed here — the 2-way
307
+ // path's build() runs its own heuristic. `analyze` honours it; if
308
+ // you need to force it for a 2-way result, route through `analyze`
309
+ // and consume the operations directly.
310
+ return inner.build()
239
311
  }
240
312
 
241
313
  /**
@@ -320,22 +392,6 @@ export default class HtmlDiff {
320
392
  return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
321
393
  }
322
394
 
323
- /**
324
- * Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
325
- * version CP sent back), and V3 (Me's current draft), produces a
326
- * single attributed HTML output where CP's and Me's changes are
327
- * distinguished by `data-author` ('cp' or 'me') and matching
328
- * `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
329
- * CP's proposal" case (Me deleted text CP had inserted) gets a
330
- * dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
331
- *
332
- * Coordinates the symmetric-projection decision (D1) across both
333
- * internal `analyze` calls so V2 tokenises identically on each side
334
- * of the spine. When `useProjections` is left undefined, the decision
335
- * is the conjunction of both pair-wise heuristics — project iff both
336
- * pairs would project on their own. Pass an explicit boolean to
337
- * override.
338
- */
339
395
  /**
340
396
  * Three-way HTML diff against a shared genesis. Produces attributed
341
397
  * HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
@@ -424,6 +480,25 @@ export default class HtmlDiff {
424
480
  * buffer. Reusing the instance keeps the formatting-tag stack
425
481
  * (`specialTagDiffStack`) coherent across segments — a `<strong>`
426
482
  * opened in one segment and closed in another stays balanced.
483
+ *
484
+ * Edge case: an ins/del segment can open a formatting wrap whose
485
+ * matching closer ends up in an equal segment (`<strong>` deleted
486
+ * by CP but `</strong>` kept by both — buildSegments emits the open
487
+ * as del-cp and the close as equal). Equal segments bypass
488
+ * `insertTag` and push raw, so the stack entry for the open is
489
+ * never popped. Rather than throw — which forces the caller's UI
490
+ * into an error boundary — close every leftover wrap with `</ins>`
491
+ * at the end of emission.
492
+ *
493
+ * Caveat: the `</ins>` close is honest for the mod-wrap that the
494
+ * opener pushed (every formatting opener emits an inner `<ins…>`
495
+ * postInject regardless of whether the outer segment is ins or
496
+ * del). For del-segment formatting openers the outer `<del>` may
497
+ * itself be left open by the same emission imbalance; this fixup
498
+ * doesn't address that. Downstream browsers/DOMParser normalise
499
+ * mildly-malformed HTML by closing dangling tags, so the rendered
500
+ * output is usually acceptable — but the warning IS the signal
501
+ * that the input had a real imbalance worth investigating.
427
502
  */
428
503
  private static emitSegments(segments: Segment[]): string {
429
504
  const emitter = new HtmlDiff('', '')
@@ -436,18 +511,21 @@ export default class HtmlDiff {
436
511
  // insertTag mutates its `words` array; pass a copy.
437
512
  emitter.insertTag(tag, baseClass, [...seg.words], metadata)
438
513
  }
439
- // Stack-balance invariant: every special-case opening tag pushed onto
440
- // `specialTagDiffStack` during emission must have been matched by a
441
- // closing tag. An unbalanced stack means the input had unbalanced
442
- // formatting tags AND a Replace at an inconvenient position — the
443
- // output would be silently malformed (half-closed `<ins>`). Fail
444
- // loudly so the caller can investigate rather than ship broken HTML.
445
514
  if (emitter.specialTagDiffStack.length > 0) {
446
- throw new Error(
515
+ // Log once so we can spot bad inputs in dev tools, but don't
516
+ // throw — the caller's only fallback was to crash the React
517
+ // tree, which is worse than emitting slightly-imperfect HTML.
518
+ // eslint-disable-next-line no-console
519
+ console.warn(
447
520
  `HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} ` +
448
- 'unclosed formatting tag(s) on the stack input may have unbalanced ' +
449
- '<strong>/<em>/etc. or there is a bug in segment emission.'
521
+ 'unclosed formatting wrap(s) on the stack. Closing defensively. ' +
522
+ 'This usually means a formatting tag opens in a del/ins segment ' +
523
+ 'and its matching closer is in an equal segment.'
450
524
  )
525
+ while (emitter.specialTagDiffStack.length > 0) {
526
+ emitter.content.push('</ins>')
527
+ emitter.specialTagDiffStack.pop()
528
+ }
451
529
  }
452
530
  return emitter.content.join('')
453
531
  }
@@ -827,8 +905,13 @@ export default class HtmlDiff {
827
905
  // if there are nonTags, the index of the last tag is the index before the first nonTag.
828
906
  const indexLastTagInFirstTagBlock = indexOfFirstNonTag === -1 ? words.length - 1 : indexOfFirstNonTag - 1
829
907
 
830
- let specialCaseTagInjection = ''
831
- let specialCaseTagInjectionIsBefore = false
908
+ // Pre-injection sits BEFORE the extracted tag-block content (used
909
+ // by closing tags so `</ins></strong>` reads left-to-right).
910
+ // Post-injection sits AFTER (used by opening tags so the rendered
911
+ // order is `<strong><ins ...>` and by the overlap-split case so
912
+ // the re-opened `<ins>`s sit AFTER the actual closing tag).
913
+ let preInject = ''
914
+ let postInject = ''
832
915
 
833
916
  // handle opening tag
834
917
  if (HtmlDiff.SpecialCaseOpeningTagRegex.test(words[0])) {
@@ -840,10 +923,11 @@ export default class HtmlDiff {
840
923
  }
841
924
  const styledTagNames = Array.from(tagNames).join(' ')
842
925
 
843
- this.specialTagDiffStack.push(words[0])
844
926
  // Carry the caller's metadata into the formatting-tag wrapper so
845
927
  // a 3-way author tag survives a `<strong>`/`<em>` content edit.
846
- specialCaseTagInjection = `<ins${Utils.composeTagAttributes(`mod ${styledTagNames}`, metadata ?? {})}>`
928
+ const styledCssClass = `mod ${styledTagNames}`
929
+ this.specialTagDiffStack.push({ tag: words[0], styledTagNames, cssClass: styledCssClass, metadata })
930
+ postInject = `<ins${Utils.composeTagAttributes(styledCssClass, metadata ?? {})}>`
847
931
  if (tag === HtmlDiff.DelTag) {
848
932
  words.shift()
849
933
 
@@ -855,7 +939,6 @@ export default class HtmlDiff {
855
939
  }
856
940
  // handle closing tag
857
941
  else if (HtmlDiff.SpecialCaseClosingTagsSet.has(words[0].toLowerCase())) {
858
- const openingTag = this.specialTagDiffStack.length === 0 ? null : this.specialTagDiffStack.pop()
859
942
  // For delete operations: when the tag block contains a mix of formatting and
860
943
  // non-formatting closing tags (e.g. </strong></div>), compare against the first
861
944
  // closing tag (the formatting one) rather than the last tag in the block.
@@ -870,19 +953,39 @@ export default class HtmlDiff {
870
953
  tagIndexToCompare = 0
871
954
  }
872
955
  }
873
- const openingAndClosingTagsMatch =
874
- !!openingTag && Utils.getTagName(openingTag) === Utils.getTagName(words[tagIndexToCompare])
875
956
 
876
- if (openingTag && openingAndClosingTagsMatch) {
877
- specialCaseTagInjection = '</ins>'
878
- specialCaseTagInjectionIsBefore = true
957
+ // Search the stack for a matching opener (LIFO). When the match
958
+ // is the top entry, this is the normal balanced case and we
959
+ // emit a single `</ins>` before the closing tag. When the match
960
+ // is below an unmatched opener — i.e. another formatting wrap
961
+ // opened after it but hasn't been closed yet — the wraps
962
+ // overlap in source order, which has no valid LIFO HTML
963
+ // expression. Resolve by SPLITTING the wraps: close everything
964
+ // above the match (their `<ins>`s and the match's `<ins>`), then
965
+ // re-open the above wraps with fresh `<ins>` tags AFTER the
966
+ // closing tag emits. The above wraps continue to apply until
967
+ // their own closing tag arrives.
968
+ const closingTagName = Utils.getTagName(words[tagIndexToCompare])
969
+ let matchIdx = -1
970
+ for (let i = this.specialTagDiffStack.length - 1; i >= 0; i--) {
971
+ if (Utils.getTagName(this.specialTagDiffStack[i].tag) === closingTagName) {
972
+ matchIdx = i
973
+ break
974
+ }
879
975
  }
880
976
 
881
- // if the tag has a corresponding opening tag, but they don't match,
882
- // we need to push the opening tag back onto the stack
883
- else if (openingTag) {
884
- this.specialTagDiffStack.push(openingTag)
977
+ if (matchIdx >= 0) {
978
+ const aboveEntries = this.specialTagDiffStack.splice(matchIdx + 1)
979
+ this.specialTagDiffStack.pop() // pop the matched entry
980
+ // One `</ins>` per above entry, then one for the match itself.
981
+ preInject = '</ins>'.repeat(aboveEntries.length + 1)
982
+ for (const entry of aboveEntries) {
983
+ postInject += `<ins${Utils.composeTagAttributes(entry.cssClass, entry.metadata ?? {})}>`
984
+ this.specialTagDiffStack.push(entry) // their wrap continues via the new <ins>
985
+ }
885
986
  }
987
+ // No match in stack — orphan closing tag, drop the `<ins>` work
988
+ // and just let the tag itself flow through extractConsecutiveWords.
886
989
 
887
990
  if (tag === HtmlDiff.DelTag) {
888
991
  words.shift()
@@ -893,7 +996,7 @@ export default class HtmlDiff {
893
996
  }
894
997
  }
895
998
 
896
- if (words.length === 0 && specialCaseTagInjection.length === 0) {
999
+ if (words.length === 0 && preInject.length === 0 && postInject.length === 0) {
897
1000
  break
898
1001
  }
899
1002
 
@@ -909,11 +1012,7 @@ export default class HtmlDiff {
909
1012
  !HtmlDiff.SpecialCaseClosingTagsSet.has(x.toLowerCase())
910
1013
  : Utils.isTag
911
1014
 
912
- if (specialCaseTagInjectionIsBefore) {
913
- this.content.push(specialCaseTagInjection + this.extractConsecutiveWords(words, isTagForExtraction).join(''))
914
- } else {
915
- this.content.push(this.extractConsecutiveWords(words, isTagForExtraction).join('') + specialCaseTagInjection)
916
- }
1015
+ this.content.push(preInject + this.extractConsecutiveWords(words, isTagForExtraction).join('') + postInject)
917
1016
 
918
1017
  if (words.length === 0) continue
919
1018
 
@@ -1020,6 +1119,31 @@ export default class HtmlDiff {
1020
1119
  continue
1021
1120
  }
1022
1121
 
1122
+ // Never orphan-reject a match whose tokens are ALL HTML tags.
1123
+ // Tag tokens are structural; rejecting `</strong>` / `</em>` as
1124
+ // an orphan match between two content deletions merges the tag
1125
+ // into the deletion, leaving the matching opener unclosed —
1126
+ // browsers then auto-close the opener at the END of the
1127
+ // deletion, producing visually-wrong output (e.g. the body of
1128
+ // a section deletion rendered as bold-italic because the
1129
+ // closing `</strong></em>` ended up after the body deletion
1130
+ // rather than after the heading). The orphan threshold is
1131
+ // designed for stray word matches between heavily-edited spans,
1132
+ // not for formatting boundaries.
1133
+ let allTags = true
1134
+ for (let i = curr.startInNew; i < curr.endInNew; i++) {
1135
+ if (!Utils.isTag(wordsForDiffNew[i])) {
1136
+ allTags = false
1137
+ break
1138
+ }
1139
+ }
1140
+ if (allTags) {
1141
+ yield curr
1142
+ prev = curr
1143
+ curr = next
1144
+ continue
1145
+ }
1146
+
1023
1147
  let oldDistanceInChars = 0
1024
1148
  for (let i = prev.endInOld; i < next.startInOld; i++) {
1025
1149
  oldDistanceInChars += wordsForDiffOld[i].length
@@ -1,4 +1,5 @@
1
1
  import Action from './Action'
2
+ import { lcsAlign } from './Alignment'
2
3
  import type { AnalyzeResult } from './HtmlDiff'
3
4
  import type Operation from './Operation'
4
5
  import type { WrapMetadata } from './Utils'
@@ -183,13 +184,33 @@ function collectInsertionsKeyedByEnd(d: AnalyzeResult): Map<number, string[]> {
183
184
  }
184
185
 
185
186
  /**
186
- * Emit any insertions at boundary `b`. When both authors inserted at
187
- * the same boundary AND the inserted token sequences are textually
188
- * identical, the insertion is treated as agreed and emitted unmarked.
189
- * Otherwise each side's insertion is emitted with author attribution.
187
+ * Emit any insertions at boundary `b`.
190
188
  *
191
- * The CP-then-Me ordering for disagreement is arbitrary but consistent;
192
- * callers don't depend on it.
189
+ * Reading model: a legal reviewer wants to see CP's INTENT relative
190
+ * to Me's current content. Me's content is the base; CP's deltas are
191
+ * what they need to act on. Under that framing:
192
+ * - tokens both authors inserted at the same boundary → settled
193
+ * - tokens CP inserted that Me doesn't have → ins-cp (CP wants
194
+ * this added)
195
+ * - tokens Me inserted that CP doesn't have → del-cp (CP wants
196
+ * this removed from Me's content)
197
+ *
198
+ * The third case is the load-bearing attribution flip. The
199
+ * genesis-spine view technically labels me-only-at-boundary tokens
200
+ * as "ins-me" (Me added them; CP didn't), but that's confusing to
201
+ * a reviewer: they see "Me added X" alongside "CP added Y" and have
202
+ * to mentally derive "CP wants X gone, replaced with Y". Surfacing
203
+ * me-only tokens as `del-cp` shows CP's intent directly:
204
+ * - "CP accepted Me's text minus `things`": settled bulk + del-cp
205
+ * `things` (no parallel redundant insertions)
206
+ * - "CP wants `cruel` where Me wrote `brave`": ins-cp `cruel` +
207
+ * del-cp `brave` (the substitution intent reads directly)
208
+ * - "CP added extra words": cp-extras stay as ins-cp (same as
209
+ * before; the cp-only direction was always intent-correct)
210
+ *
211
+ * Pure single-side insertions (Me added text CP doesn't engage
212
+ * with at all, or vice versa) keep their genesis-spine attribution
213
+ * — these aren't refinement cases, just Me's own content additions.
193
214
  */
194
215
  function emitBoundary(
195
216
  b: number,
@@ -205,14 +226,40 @@ function emitBoundary(
205
226
  const hasMe = !!meIns && meIns.length > 0
206
227
  if (!hasCp && !hasMe) return
207
228
 
208
- if (hasCp && hasMe && tokenArraysEqual(cpIns, meIns)) {
209
- // Both authors inserted the same content — settled. Emit unmarked.
210
- appendSegment(segments, { kind: 'equal' }, cpIns)
229
+ // Only-one-side: emit verbatim with that side's attribution.
230
+ // Genuine single-author additions stay author-attributed.
231
+ if (!hasCp) {
232
+ appendSegment(segments, { kind: 'ins', author: 'me' }, meIns!)
233
+ return
234
+ }
235
+ if (!hasMe) {
236
+ appendSegment(segments, { kind: 'ins', author: 'cp' }, cpIns!)
211
237
  return
212
238
  }
213
239
 
214
- if (hasCp) appendSegment(segments, { kind: 'ins', author: 'cp' }, cpIns)
215
- if (hasMe) appendSegment(segments, { kind: 'ins', author: 'me' }, meIns)
240
+ // Both sides inserted. Identical → settled. Otherwise LCS-align
241
+ // and apply the asymmetric intent reading.
242
+ if (tokenArraysEqual(cpIns!, meIns!)) {
243
+ appendSegment(segments, { kind: 'equal' }, cpIns!)
244
+ return
245
+ }
246
+
247
+ const alignment = lcsAlign(cpIns! as string[], meIns! as string[])
248
+ for (const a of alignment) {
249
+ if (a.oldIdx !== null && a.newIdx !== null) {
250
+ // Token appears in both insertions → settled.
251
+ appendSegment(segments, { kind: 'equal' }, [cpIns![a.oldIdx]])
252
+ } else if (a.oldIdx !== null) {
253
+ // Token in cp's insertion only → CP wants this added.
254
+ appendSegment(segments, { kind: 'ins', author: 'cp' }, [cpIns![a.oldIdx]])
255
+ } else if (a.newIdx !== null) {
256
+ // Token in me's insertion only → CP wants this removed from
257
+ // Me's content. (Genesis-spine would label this ins-me, but
258
+ // that reading is misleading for a reviewer at this kind of
259
+ // shared boundary — see the function-level comment.)
260
+ appendSegment(segments, { kind: 'del', author: 'cp' }, [meIns![a.newIdx]])
261
+ }
262
+ }
216
263
  }
217
264
 
218
265
  function tokenArraysEqual(a: readonly string[], b: readonly string[]): boolean {
@@ -1,4 +1,4 @@
1
- import { lcsAlign, textSimilarity } from './Alignment'
1
+ import { type Alignment, lcsAlign, pairSimilarUnmatched, textSimilarity } from './Alignment'
2
2
  import { injectClass, parseOpeningTagAt } from './HtmlScanner'
3
3
  import {
4
4
  type CellRange,
@@ -8,6 +8,7 @@ import {
8
8
  PLACEHOLDER_SUFFIX,
9
9
  type RowRange,
10
10
  rowKey,
11
+ rowText,
11
12
  sameDimensions,
12
13
  spliceString,
13
14
  type TableRange,
@@ -143,8 +144,14 @@ function preprocessByContent(
143
144
  const cKeys = cTables.map(t => tableKey(cpLatest, t))
144
145
  const mKeys = mTables.map(t => tableKey(meCurrent, t))
145
146
 
146
- const alignCp = lcsAlign(gKeys, cKeys)
147
- const alignMe = lcsAlign(gKeys, mKeys)
147
+ // Exact tableKey LCS, then fuzzy-pair unmatched runs by content
148
+ // similarity. Without this, a table whose cells were edited (but
149
+ // not its overall shape) fails the exact tableKey match and the
150
+ // table-level aligner pulls it apart into a whole-table del + a
151
+ // whole-table ins. Same fuzzy pass `TableDiff` uses for the 2-way
152
+ // path — `pairSimilarTablesThreeWay` is defined below.
153
+ const alignCp = pairSimilarTablesThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, gTables, cTables)
154
+ const alignMe = pairSimilarTablesThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, gTables, mTables)
148
155
 
149
156
  // Maps: genesisIdx → matching cpIdx (-1 if none); cpIdx → matching genesisIdx; etc.
150
157
  const gToCp = new Array<number>(gTables.length).fill(-1)
@@ -303,7 +310,21 @@ function preprocessByContent(
303
310
  return { modifiedGenesis, modifiedCp, modifiedMe, placeholderToDiff }
304
311
  }
305
312
 
306
- const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.5
313
+ // Positional pairing is the strict-default for three-way table merge:
314
+ // when all three inputs have the same number of tables in the same
315
+ // order, we pair them by index and let `diffTableThreeWay` handle
316
+ // per-table cell/row level differences. The similarity guard below
317
+ // only kicks in to *reject* positional alignment when a pair is
318
+ // SO dissimilar that it's near-certainly a table reorder/rename
319
+ // where content-LCS pairing would be materially better. The
320
+ // threshold is intentionally low — the 2-way path has no such guard
321
+ // and pairs purely by index (its `diffTable` falls back through
322
+ // same-dimension → equal-row-count → row-LCS → whole-table on its
323
+ // own), so the three-way path was stricter than its sibling and
324
+ // silently dropped to whole-table del+ins for legitimate edits
325
+ // like "rename one column and tweak its values". Aligning the
326
+ // threshold here keeps the two-way and three-way paths in step.
327
+ const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.15
307
328
 
308
329
  function positionallyAligned(
309
330
  genesis: string,
@@ -328,6 +349,79 @@ function tableKey(html: string, table: TableRange): string {
328
349
  return html.slice(table.tableStart, table.tableEnd).replace(/\s+/g, ' ').trim()
329
350
  }
330
351
 
352
+ /**
353
+ * Character-level similarity above which the three-way aligner treats
354
+ * two rows / tables as "the same logical entry, edited" rather than
355
+ * an unrelated delete + insert. Matched to TableDiff's
356
+ * `ROW_FUZZY_THRESHOLD` / `CELL_FUZZY_THRESHOLD` so 2-way and 3-way
357
+ * agree on which pairings are reachable; if a row's content overlap
358
+ * is enough to fool the 2-way diff into pairing, it should also be
359
+ * enough for 3-way.
360
+ */
361
+ const THREE_WAY_FUZZY_THRESHOLD = 0.5
362
+
363
+ /**
364
+ * Run the same fuzzy-pairing pass `TableDiff.pairSimilarUnmatchedRows`
365
+ * applies after its exact-LCS, but against one side of the genesis
366
+ * spine (either cp or me). The genesis tables/rows are always the
367
+ * "old" side; `newTable` is the cp or me table being aligned. Returns
368
+ * the enriched alignment with additional paired entries.
369
+ *
370
+ * Cell-count guard: only fuzzy-pair when both rows have the same cell
371
+ * count. Without this guard an asymmetric restructure — e.g. CP and
372
+ * Me both added a different column — leads to ONE side fuzzy-pairing
373
+ * its row with genesis (content overlap above threshold) while the
374
+ * other side falls below threshold. That mismatch routes through
375
+ * `diffTableStructural`'s "Me dropped, CP kept" (or the mirror)
376
+ * branch, which emits CP's row as a Me-attributed deletion. In
377
+ * cp-only mode `stripMeAttributedMarkers` then removes the row
378
+ * entirely and CP's edit vanishes from the view — exactly the
379
+ * content-loss case we're meant to prevent. Restricting fuzzy
380
+ * pairing to same-shape rows preserves the common case (single cell
381
+ * edit, identical row shape) while pushing structural mismatches
382
+ * back to the boundary-insertion path that emits both sides
383
+ * explicitly.
384
+ */
385
+ function pairSimilarRowsThreeWay(
386
+ alignment: Alignment[],
387
+ genesis: string,
388
+ newHtml: string,
389
+ oldTable: TableRange,
390
+ newTable: TableRange
391
+ ): Alignment[] {
392
+ const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
393
+ const newTexts = newTable.rows.map(r => rowText(newHtml, r))
394
+ return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
395
+ // Returning 0 sits below any positive threshold so
396
+ // `pairSimilarUnmatched` won't pair these rows; the guard remains
397
+ // defensive should the threshold ever be lowered to 0.
398
+ if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
399
+ return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
400
+ })
401
+ }
402
+
403
+ /**
404
+ * Table-level counterpart: after `lcsAlign(gKeys, otherKeys)` over
405
+ * full table HTML keys, fuzzy-pair unmatched table runs by their
406
+ * row-text-concatenated content. Without this, a table whose body
407
+ * was edited (but not its outer shape) fails the exact-key match
408
+ * and the preprocessing emits whole-table del + whole-table ins
409
+ * instead of recursing into per-cell three-way diffs.
410
+ */
411
+ function pairSimilarTablesThreeWay(
412
+ alignment: Alignment[],
413
+ oldHtml: string,
414
+ newHtml: string,
415
+ oldTables: TableRange[],
416
+ newTables: TableRange[]
417
+ ): Alignment[] {
418
+ const oldTexts = oldTables.map(t => t.rows.map(r => rowText(oldHtml, r)).join(' '))
419
+ const newTexts = newTables.map(t => t.rows.map(r => rowText(newHtml, r)).join(' '))
420
+ return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
421
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
422
+ )
423
+ }
424
+
331
425
  // ────────────────────────────────────────────────────────────────────────────
332
426
  // Per-table diff: positional cells or row-level structural change.
333
427
 
@@ -412,8 +506,17 @@ function diffTableStructural(
412
506
  const cKeys = tC.rows.map(r => rowKey(cpLatest, r))
413
507
  const mKeys = tM.rows.map(r => rowKey(meCurrent, r))
414
508
 
415
- const alignCp = lcsAlign(gKeys, cKeys)
416
- const alignMe = lcsAlign(gKeys, mKeys)
509
+ // Exact LCS first, then fuzzy-pair remaining unmatched runs. Without
510
+ // the fuzzy pass, a row where CP edited just a single cell's text
511
+ // produces no key match — the row aligner emits the genesis row as
512
+ // CP-deleted AND CP's reshaped row as inserted, when a cell-level
513
+ // diff against the paired row would render the edit far more
514
+ // legibly. The 2-way path (`TableDiff.pairSimilarUnmatchedRows`)
515
+ // has done this since inception; bringing the three-way path in
516
+ // step removes the asymmetry where the cp-only / all-changes view
517
+ // looks markedly worse than plain 2-way for ordinary cell edits.
518
+ const alignCp = pairSimilarRowsThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, tG, tC)
519
+ const alignMe = pairSimilarRowsThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, tG, tM)
417
520
 
418
521
  // genesisIdx → matching cpIdx (-1 if cp deleted this row)
419
522
  const gToCp = new Array<number>(tG.rows.length).fill(-1)
@@ -521,9 +624,40 @@ function emitPreservedRow(
521
624
  return out.join('')
522
625
  }
523
626
  // Cell-count mismatch within a preserved row — cell-level structural
524
- // change deferred. Fall back to me-attributed Replace (genesis row
525
- // removed, me row inserted). Lossy for CP within that row.
526
- return emitFullRowAttributed(genesis, rG, 'del', 'me') + emitFullRowAttributed(meCurrent, rM, 'ins', 'me')
627
+ // alignment is non-trivial (which Me cell maps to which CP cell when
628
+ // the counts diverge?). The previous fallback emitted only
629
+ // genesis-as-del + me-as-ins, which silently destroyed CP's row
630
+ // content whenever CP changed the cell count — a content-loss bug
631
+ // (a row where CP added a column would disappear from the rendered
632
+ // diff entirely). Emit each side's row as a distinct attributed
633
+ // block so neither party's restructure can vanish:
634
+ // - if both restructured (different shapes on both sides) the
635
+ // genesis row is settled-deleted (silent) and we emit cp + me
636
+ // rows side by side, each attributed to its author;
637
+ // - if only one restructured, the genesis row is del-attributed to
638
+ // the restructuring author so the reader sees what was there
639
+ // before, then the new shape ins-attributed to the same author.
640
+ //
641
+ // Content edits inside a side that DID keep the genesis cell count
642
+ // are not surfaced here (no positional path is available across
643
+ // mismatched shapes); the underlying data is still present in the
644
+ // source document but the visual diff doesn't decompose it. That is
645
+ // a degradation of detail, not content loss — symmetric for cp/me.
646
+ const cpRestructured = rC.cells.length !== rG.cells.length
647
+ const meRestructured = rM.cells.length !== rG.cells.length
648
+ const blocks: string[] = []
649
+ if (cpRestructured && meRestructured) {
650
+ // Both sides restructured; genesis shape retained by neither.
651
+ blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
652
+ blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
653
+ } else if (cpRestructured) {
654
+ blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'cp'))
655
+ blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
656
+ } else {
657
+ blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'me'))
658
+ blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
659
+ }
660
+ return blocks.join('')
527
661
  }
528
662
 
529
663
  /**
@@ -48,6 +48,21 @@ describe('HtmlDiff', () => {
48
48
  'Some formatted text',
49
49
  "Some <ins class='mod strong i'>formatted</ins> text",
50
50
  ],
51
+ // Overlapping formatting wraps — old wraps a word in <strong>, new wraps the same
52
+ // word in <u>. The wraps cross (mod-strong opens before mod-u, but the </strong>
53
+ // closing arrives before </u>), so emission must split the inner wrap to keep
54
+ // HTML well-formed. Regression: previously left mod-strong unclosed and the
55
+ // 3-way path threw on the unbalanced stack.
56
+ [
57
+ '<strong>three</strong>',
58
+ '<u>three</u>',
59
+ "<ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u>",
60
+ ],
61
+ [
62
+ 'a <strong>three</strong> b',
63
+ 'a <u>three</u> b',
64
+ "a <ins class='mod strong'><u><ins class='mod u'>three</ins></ins><ins class='mod u'></ins></u> b",
65
+ ],
51
66
  [
52
67
  '<table><tr><td>col1</td><td>col2</td></tr><tr><td>Data 1</td><td>Data 2</td></tr></table>',
53
68
  '<table><tr><td>col1</td><td>col2</td></tr></table>',