@createiq/htmldiff 1.2.0-beta.2 → 1.2.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.2",
3
+ "version": "1.2.0-beta.4",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -335,22 +335,6 @@ export default class HtmlDiff {
335
335
  return HtmlDiff.shouldUseContentProjections(oldWords, newWords, oldProj, newProj)
336
336
  }
337
337
 
338
- /**
339
- * Three-way HTML diff. Given V1 (the version Me last sent), V2 (the
340
- * version CP sent back), and V3 (Me's current draft), produces a
341
- * single attributed HTML output where CP's and Me's changes are
342
- * distinguished by `data-author` ('cp' or 'me') and matching
343
- * `class='diffins cp'` / `class='diffdel me'` etc. The "Me rejected
344
- * CP's proposal" case (Me deleted text CP had inserted) gets a
345
- * dedicated marker: `data-rejects='cp'` plus `class='... rejects-cp'`.
346
- *
347
- * Coordinates the symmetric-projection decision (D1) across both
348
- * internal `analyze` calls so V2 tokenises identically on each side
349
- * of the spine. When `useProjections` is left undefined, the decision
350
- * is the conjunction of both pair-wise heuristics — project iff both
351
- * pairs would project on their own. Pass an explicit boolean to
352
- * override.
353
- */
354
338
  /**
355
339
  * Three-way HTML diff against a shared genesis. Produces attributed
356
340
  * HTML that distinguishes CP's accumulated changes (genesis → cpLatest)
@@ -439,6 +423,17 @@ export default class HtmlDiff {
439
423
  * buffer. Reusing the instance keeps the formatting-tag stack
440
424
  * (`specialTagDiffStack`) coherent across segments — a `<strong>`
441
425
  * opened in one segment and closed in another stays balanced.
426
+ *
427
+ * Edge case: an ins/del segment can open a formatting wrap whose
428
+ * matching closer ends up in an equal segment (`<strong>` deleted
429
+ * by CP but `</strong>` kept by both — buildSegments emits the open
430
+ * as del-cp and the close as equal). Equal segments bypass
431
+ * `insertTag` and push raw, so the stack entry for the open is
432
+ * never popped. Rather than throw — which forces the caller's UI
433
+ * into an error boundary — close every leftover wrap with `</ins>`
434
+ * at the end of emission. The resulting HTML has an extra
435
+ * `</ins>` at the very end of the output; DOMParser-normalisation
436
+ * downstream produces sensible nesting.
442
437
  */
443
438
  private static emitSegments(segments: Segment[]): string {
444
439
  const emitter = new HtmlDiff('', '')
@@ -451,18 +446,21 @@ export default class HtmlDiff {
451
446
  // insertTag mutates its `words` array; pass a copy.
452
447
  emitter.insertTag(tag, baseClass, [...seg.words], metadata)
453
448
  }
454
- // Stack-balance invariant: every special-case opening tag pushed onto
455
- // `specialTagDiffStack` during emission must have been matched by a
456
- // closing tag. An unbalanced stack means the input had unbalanced
457
- // formatting tags AND a Replace at an inconvenient position — the
458
- // output would be silently malformed (half-closed `<ins>`). Fail
459
- // loudly so the caller can investigate rather than ship broken HTML.
460
449
  if (emitter.specialTagDiffStack.length > 0) {
461
- throw new Error(
450
+ // Log once so we can spot bad inputs in dev tools, but don't
451
+ // throw — the caller's only fallback was to crash the React
452
+ // tree, which is worse than emitting slightly-imperfect HTML.
453
+ // eslint-disable-next-line no-console
454
+ console.warn(
462
455
  `HtmlDiff.executeThreeWay: emission left ${emitter.specialTagDiffStack.length} ` +
463
- 'unclosed formatting tag(s) on the stack input may have unbalanced ' +
464
- '<strong>/<em>/etc. or there is a bug in segment emission.'
456
+ 'unclosed formatting wrap(s) on the stack. Closing defensively. ' +
457
+ 'This usually means a formatting tag opens in a del/ins segment ' +
458
+ 'and its matching closer is in an equal segment.'
465
459
  )
460
+ while (emitter.specialTagDiffStack.length > 0) {
461
+ emitter.content.push('</ins>')
462
+ emitter.specialTagDiffStack.pop()
463
+ }
466
464
  }
467
465
  return emitter.content.join('')
468
466
  }
@@ -303,7 +303,21 @@ function preprocessByContent(
303
303
  return { modifiedGenesis, modifiedCp, modifiedMe, placeholderToDiff }
304
304
  }
305
305
 
306
- const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.5
306
+ // Positional pairing is the strict-default for three-way table merge:
307
+ // when all three inputs have the same number of tables in the same
308
+ // order, we pair them by index and let `diffTableThreeWay` handle
309
+ // per-table cell/row level differences. The similarity guard below
310
+ // only kicks in to *reject* positional alignment when a pair is
311
+ // SO dissimilar that it's near-certainly a table reorder/rename
312
+ // where content-LCS pairing would be materially better. The
313
+ // threshold is intentionally low — the 2-way path has no such guard
314
+ // and pairs purely by index (its `diffTable` falls back through
315
+ // same-dimension → equal-row-count → row-LCS → whole-table on its
316
+ // own), so the three-way path was stricter than its sibling and
317
+ // silently dropped to whole-table del+ins for legitimate edits
318
+ // like "rename one column and tweak its values". Aligning the
319
+ // threshold here keeps the two-way and three-way paths in step.
320
+ const POSITIONAL_PAIR_SIMILARITY_THRESHOLD = 0.15
307
321
 
308
322
  function positionallyAligned(
309
323
  genesis: string,
@@ -521,9 +535,40 @@ function emitPreservedRow(
521
535
  return out.join('')
522
536
  }
523
537
  // Cell-count mismatch within a preserved row — cell-level structural
524
- // change deferred. Fall back to me-attributed Replace (genesis row
525
- // removed, me row inserted). Lossy for CP within that row.
526
- return emitFullRowAttributed(genesis, rG, 'del', 'me') + emitFullRowAttributed(meCurrent, rM, 'ins', 'me')
538
+ // alignment is non-trivial (which Me cell maps to which CP cell when
539
+ // the counts diverge?). The previous fallback emitted only
540
+ // genesis-as-del + me-as-ins, which silently destroyed CP's row
541
+ // content whenever CP changed the cell count — a content-loss bug
542
+ // (a row where CP added a column would disappear from the rendered
543
+ // diff entirely). Emit each side's row as a distinct attributed
544
+ // block so neither party's restructure can vanish:
545
+ // - if both restructured (different shapes on both sides) the
546
+ // genesis row is settled-deleted (silent) and we emit cp + me
547
+ // rows side by side, each attributed to its author;
548
+ // - if only one restructured, the genesis row is del-attributed to
549
+ // the restructuring author so the reader sees what was there
550
+ // before, then the new shape ins-attributed to the same author.
551
+ //
552
+ // Content edits inside a side that DID keep the genesis cell count
553
+ // are not surfaced here (no positional path is available across
554
+ // mismatched shapes); the underlying data is still present in the
555
+ // source document but the visual diff doesn't decompose it. That is
556
+ // a degradation of detail, not content loss — symmetric for cp/me.
557
+ const cpRestructured = rC.cells.length !== rG.cells.length
558
+ const meRestructured = rM.cells.length !== rG.cells.length
559
+ const blocks: string[] = []
560
+ if (cpRestructured && meRestructured) {
561
+ // Both sides restructured; genesis shape retained by neither.
562
+ blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
563
+ blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
564
+ } else if (cpRestructured) {
565
+ blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'cp'))
566
+ blocks.push(emitFullRowAttributed(cpLatest, rC, 'ins', 'cp'))
567
+ } else {
568
+ blocks.push(emitFullRowAttributed(genesis, rG, 'del', 'me'))
569
+ blocks.push(emitFullRowAttributed(meCurrent, rM, 'ins', 'me'))
570
+ }
571
+ return blocks.join('')
527
572
  }
528
573
 
529
574
  /**
@@ -120,11 +120,19 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
120
120
  })
121
121
 
122
122
  it('cp matches genesis (only Me changed)', () => {
123
- expect(HtmlDiff.executeThreeWay('Hello world', 'Hello world', 'Hello brave world')).toContain("data-author='me'")
123
+ // Negative assertion is load-bearing: without `not.toContain`
124
+ // a partial cp↔me mix-up inside the genesis-spine merge would still
124
+ // emit `data-author='me'` somewhere in the output and the
126
+ // positive assertion would silently pass.
127
+ const out = HtmlDiff.executeThreeWay('Hello world', 'Hello world', 'Hello brave world')
128
+ expect(out).toContain("data-author='me'")
129
+ expect(out).not.toContain("data-author='cp'")
124
130
  })
125
131
 
126
132
  it('me matches genesis (only CP changed)', () => {
127
- expect(HtmlDiff.executeThreeWay('Hello world', 'Hello cruel world', 'Hello world')).toContain("data-author='cp'")
133
+ const out = HtmlDiff.executeThreeWay('Hello world', 'Hello cruel world', 'Hello world')
134
+ expect(out).toContain("data-author='cp'")
135
+ expect(out).not.toContain("data-author='me'")
128
136
  })
129
137
  })
130
138
 
@@ -163,10 +171,47 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
163
171
  const without = HtmlDiff.executeThreeWay('a b', 'a b', 'a b')
164
172
  const withFlag = HtmlDiff.executeThreeWay('a b', 'a b', 'a b', { ignoreWhitespaceDifferences: true })
165
173
  expect(without).toContain("data-author='me'")
174
+ // CP matches genesis — any cp attribution would be a mis-merge.
175
+ expect(without).not.toContain("data-author='cp'")
166
176
  expect(withFlag).not.toContain('data-author=')
167
177
  })
168
178
  })
169
179
 
180
+ describe('stack-balance defence', () => {
181
+ // The emission walks segments built by `buildSegments`: ins/del
182
+ // segments go through `insertTag` (which manages the formatting-
183
+ // tag stack), but equal segments push raw words straight to the
184
+ // content buffer. When a formatting opener is in a del segment
185
+ // and its matching closer falls in an equal segment, the stack
186
+ // entry never gets popped — the emitter used to throw "emission
187
+ // left 1 unclosed formatting tag(s) on the stack" and crash the
188
+ // caller. Now it closes the leftover wraps defensively with
189
+ // `</ins>` so the output stays renderable.
190
+
191
+ it('CP inserted a <strong> opener whose closer is matched as equal — does not throw', () => {
192
+ // Genesis has an orphan closer (`X</strong>`); CP wrapped X in
193
+ // a fresh `<strong>`. The opener is ins-cp (no genesis match)
194
+ // but the closer is shared by all three and emits as equal.
195
+ // The mod-`<ins>` opened on the strong push needs to be closed
196
+ // somehow; the defensive path emits a trailing `</ins>`.
197
+ expect(() => HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')).not.toThrow()
198
+ })
199
+
200
+ it('CP deleted only the <strong> opener — does not throw', () => {
201
+ // Symmetric: genesis had `<strong>X</strong>`, CP dropped the
202
+ // opener but kept the closer. The opener-delete pushes onto
203
+ // the stack and the closer arrives via an equal segment.
204
+ expect(() => HtmlDiff.executeThreeWay('<strong>X</strong>', 'X</strong>', '<strong>X</strong>')).not.toThrow()
205
+ })
206
+
207
+ it('produces non-empty output even when the stack is left unbalanced at end', () => {
208
+ const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
209
+ // The content is still there, the formatting wraps just close
210
+ // defensively. Sanity-check the visible content survives.
211
+ expect(out).toContain('X')
212
+ })
213
+ })
214
+
170
215
  describe('first-turn fallback', () => {
171
216
  it('cp == genesis means CP made no changes — Me-only attribution', () => {
172
217
  // Common case: this is the first turn where the counterparty hasn't
@@ -259,6 +259,47 @@ describe('HtmlDiff.executeThreeWay (tables, genesis-spine)', () => {
259
259
  const html = `<table>${rows}</table>`
260
260
  expect(HtmlDiff.executeThreeWay(html, html, html)).toBe(html)
261
261
  })
262
+
263
+ it('cell-count mismatch: CP added a column — CP row content is visible (not silently dropped)', () => {
264
+ // Regression: the previous fallback in emitPreservedRow emitted
265
+ // only `del me` + `ins me` for any cell-count mismatch, which
266
+ // silently destroyed CP's row content whenever CP changed the
267
+ // cell count. A reader in cp-only mode would see no trace of
268
+ // CP's added column — a content-loss bug that violates the
269
+ // "CP's changes always visible" invariant.
270
+ const out = HtmlDiff.executeThreeWay(
271
+ '<table><tr><td>a</td><td>b</td></tr></table>',
272
+ '<table><tr><td>a</td><td>X</td><td>b</td></tr></table>',
273
+ '<table><tr><td>a</td><td>b</td></tr></table>'
274
+ )
275
+ expect(out).toBe(
276
+ "<table><tr class='diffdel cp' data-author='cp'><td class='diffdel cp' data-author='cp'><del class='diffdel cp' data-author='cp'>a</del></td><td class='diffdel cp' data-author='cp'><del class='diffdel cp' data-author='cp'>b</del></td></tr><tr class='diffins cp' data-author='cp'><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>a</ins></td><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>X</ins></td><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>b</ins></td></tr></table>"
277
+ )
278
+ })
279
+
280
+ it('cell-count mismatch: Me removed a column — symmetric to the CP case', () => {
281
+ const out = HtmlDiff.executeThreeWay(
282
+ '<table><tr><td>a</td><td>b</td></tr></table>',
283
+ '<table><tr><td>a</td><td>b</td></tr></table>',
284
+ '<table><tr><td>a</td></tr></table>'
285
+ )
286
+ expect(out).toBe(
287
+ "<table><tr class='diffdel me' data-author='me'><td class='diffdel me' data-author='me'><del class='diffdel me' data-author='me'>a</del></td><td class='diffdel me' data-author='me'><del class='diffdel me' data-author='me'>b</del></td></tr><tr class='diffins me' data-author='me'><td class='diffins me' data-author='me'><ins class='diffins me' data-author='me'>a</ins></td></tr></table>"
288
+ )
289
+ })
290
+
291
+ it('cell-count mismatch: both sides restructured differently — both ins rows attributed', () => {
292
+ // Genesis 2 cells, CP 3 cells, Me 4 cells. Neither side keeps
293
+ // the genesis shape, so both restructures must be visible.
294
+ const out = HtmlDiff.executeThreeWay(
295
+ '<table><tr><td>a</td><td>b</td></tr></table>',
296
+ '<table><tr><td>a</td><td>X</td><td>b</td></tr></table>',
297
+ '<table><tr><td>a</td><td>b</td><td>Y</td><td>Z</td></tr></table>'
298
+ )
299
+ expect(out).toBe(
300
+ "<table><tr class='diffins cp' data-author='cp'><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>a</ins></td><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>X</ins></td><td class='diffins cp' data-author='cp'><ins class='diffins cp' data-author='cp'>b</ins></td></tr><tr class='diffins me' data-author='me'><td class='diffins me' data-author='me'><ins class='diffins me' data-author='me'>a</ins></td><td class='diffins me' data-author='me'><ins class='diffins me' data-author='me'>b</ins></td><td class='diffins me' data-author='me'><ins class='diffins me' data-author='me'>Y</ins></td><td class='diffins me' data-author='me'><ins class='diffins me' data-author='me'>Z</ins></td></tr></table>"
301
+ )
302
+ })
262
303
  })
263
304
 
264
305
  describe('nested tables', () => {
@@ -270,6 +311,9 @@ describe('HtmlDiff.executeThreeWay (tables, genesis-spine)', () => {
270
311
  )
271
312
  expect(out).toMatch(/<del[^>]*data-author='cp'[^>]*>inner<\/del>/)
272
313
  expect(out).toMatch(/<ins[^>]*data-author='cp'[^>]*>INNER<\/ins>/)
314
+ // me == genesis here, so any me attribution would indicate a
315
+ // cp↔me swap inside the table-cell merge.
316
+ expect(out).not.toContain("data-author='me'")
273
317
  expect(out.startsWith('<table><tr><td><table>')).toBe(true)
274
318
  expect(out.endsWith('</table></td></tr></table>')).toBe(true)
275
319
  })
@@ -298,4 +342,34 @@ describe('HtmlDiff.executeThreeWay (tables, genesis-spine)', () => {
298
342
  expect(HtmlDiff.executeThreeWay('<p>a</p>', '<p>a</p>', '<p>a</p>')).toBe('<p>a</p>')
299
343
  })
300
344
  })
345
+
346
+ describe('positional pairing under moderate dissimilarity', () => {
347
+ it('column rename + value rewrite still routes through cell-level diff (not whole-table del+ins)', () => {
348
+ // Real-world regression: cp renamed a column ("Form/Document/Certificate"
349
+ // → "Extra column") and replaced the values in that column with short
350
+ // tokens. Word-level Jaccard between the genesis table and cp's edited
351
+ // table drops to ~0.38 — under the 0.5 threshold the three-way path
352
+ // used to take, which kicked the diff into multi-table content-LCS
353
+ // and produced whole-table del+ins (the CP-bubble showed the
354
+ // entire old table struck through and the entire new table inserted).
355
+ // 2-way had no such guard and produced a cell-level diff for the same
356
+ // inputs; lowering the 3-way threshold brings the two paths in step.
357
+ const genesis =
358
+ '<table><tr><td>A</td><td>Form/Document/Certificate</td><td>Date</td></tr><tr><td>Party A</td><td>IRS W-8</td><td>On execution</td></tr></table>'
359
+ const cp =
360
+ '<table><tr><td>A</td><td>Extra column</td><td>Date</td></tr><tr><td>Party A</td><td>Yes</td><td>On execution</td></tr></table>'
361
+ const me = genesis
362
+ const out = HtmlDiff.executeThreeWay(genesis, cp, me)
363
+ // Expect cell-level cp attribution INSIDE the table cells, NOT a
364
+ // whole-table del+ins wrapping the entire <table>.
365
+ expect(out).not.toMatch(/<del[^>]*><table/)
366
+ expect(out).toMatch(/data-author='cp'/)
367
+ // me === genesis, so any me-attribution markers would mean the
368
+ // diff swapped CP's edits onto Me. Negative assertion locks the
369
+ // attribution direction.
370
+ expect(out).not.toContain("data-author='me'")
371
+ expect(out).toContain('Extra column')
372
+ expect(out).toContain('Form/Document/Certificate')
373
+ })
374
+ })
301
375
  })
@@ -138,10 +138,10 @@ describe('Utils', () => {
138
138
  it('combines extraClasses and dataAttrs in one call', () => {
139
139
  expect(
140
140
  Utils.wrapText('hello', 'del', 'diffdel', {
141
- extraClasses: 'me rejects-cp',
142
- dataAttrs: { author: 'me', rejects: 'cp' },
141
+ extraClasses: 'me',
142
+ dataAttrs: { author: 'me', source: 'edit' },
143
143
  })
144
- ).toBe("<del class='diffdel me rejects-cp' data-author='me' data-rejects='cp'>hello</del>")
144
+ ).toBe("<del class='diffdel me' data-author='me' data-source='edit'>hello</del>")
145
145
  })
146
146
 
147
147
  it('skips the metadata path entirely when neither extraClasses nor dataAttrs is set', () => {