@createiq/htmldiff 1.2.0-beta.6 → 1.2.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.6",
3
+ "version": "1.2.0-beta.8",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
71
71
  */
72
72
  export type ThreeWayOptions = AnalyzeOptions
73
73
 
74
+ /**
75
+ * Opinionated options that align htmldiff's output with Microsoft Word's
76
+ * track-changes rendering for legal-document rewrites.
77
+ *
78
+ * The library's bare default (`orphanMatchThreshold = 0`) keeps every
79
+ * LCS match, however small — which fragments long sentence rewrites
80
+ * into many tiny ins/del pairs around stray word matches ("of", "the",
81
+ * "shall"). Word collapses those into a single coarse del+ins, which is
82
+ * dramatically more readable for legal text.
83
+ *
84
+ * 0.25 was tuned empirically against a customer Word reference (US
85
+ * Commercial One CP, May 2026):
86
+ * - short edits (typo / one-word insert): output identical to
87
+ * threshold=0 — inter-match distances are tiny so every match
88
+ * trivially clears the bar;
89
+ * - long rewrites (the "Specified Indebtedness" rewrite in the
90
+ * reference): previously produced 6 dels + 5 ins fragmented around
91
+ * stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
92
+ * Word's 1+1 and a major readability win;
93
+ * - higher values (0.3+) collapsed short edits containing inline
94
+ * formatting changes into a single block — too aggressive.
95
+ *
96
+ * Consumers rendering legal documents should spread this into their
97
+ * options:
98
+ * `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
99
+ * `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
100
+ *
101
+ * Other consumers (machine-readable diff, exact-token alignment) can
102
+ * keep the bare default.
103
+ */
104
+ export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
105
+ orphanMatchThreshold: 0.25,
106
+ }
107
+
74
108
  export default class HtmlDiff {
75
109
  /**
76
110
  * This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
@@ -249,6 +283,16 @@ export default class HtmlDiff {
249
283
  this.newText = newText
250
284
  }
251
285
 
286
+ /**
287
+ * Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
288
+ * `executeThreeWay`, with two intentional exceptions documented
289
+ * inline below. Consumers wanting Word-aligned output should spread
290
+ * `WORD_ALIGNED_OPTIONS` into the third argument.
291
+ *
292
+ * Note: unlike `analyze`, `execute` runs `build()` which performs
293
+ * full table preprocessing — `tablePreprocessDepth` stays at 0 so
294
+ * the recursive cell diff can happen. Callers can't override that.
295
+ */
252
296
  static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
253
297
  const inner = new HtmlDiff(oldText, newText)
254
298
  if (options.blockExpressions) {
@@ -260,8 +304,9 @@ export default class HtmlDiff {
260
304
  inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
261
305
  }
262
306
  // `useProjections` is intentionally NOT plumbed here — the 2-way
263
- // path's build() runs its own heuristic. Pass via `analyze` if a
264
- // caller needs to force it.
307
+ // path's build() runs its own heuristic. `analyze` honours it; if
308
+ // you need to force it for a 2-way result, route through `analyze`
309
+ // and consume the operations directly.
265
310
  return inner.build()
266
311
  }
267
312
 
@@ -443,9 +488,17 @@ export default class HtmlDiff {
443
488
  * `insertTag` and push raw, so the stack entry for the open is
444
489
  * never popped. Rather than throw — which forces the caller's UI
445
490
  * into an error boundary — close every leftover wrap with `</ins>`
446
- * at the end of emission. The resulting HTML has an extra
447
- * `</ins>` next to the formatting closer; DOMParser-normalisation
448
- * downstream produces sensible nesting.
491
+ * at the end of emission.
492
+ *
493
+ * Caveat: the `</ins>` close is honest for the mod-wrap that the
494
+ * opener pushed (every formatting opener emits an inner `<ins…>`
495
+ * postInject regardless of whether the outer segment is ins or
496
+ * del). For del-segment formatting openers the outer `<del>` may
497
+ * itself be left open by the same emission imbalance; this fixup
498
+ * doesn't address that. Downstream browsers/DOMParser normalise
499
+ * mildly-malformed HTML by closing dangling tags, so the rendered
500
+ * output is usually acceptable — but the warning IS the signal
501
+ * that the input had a real imbalance worth investigating.
449
502
  */
450
503
  private static emitSegments(segments: Segment[]): string {
451
504
  const emitter = new HtmlDiff('', '')
@@ -1066,6 +1119,31 @@ export default class HtmlDiff {
1066
1119
  continue
1067
1120
  }
1068
1121
 
1122
+ // Never orphan-reject a match whose tokens are ALL HTML tags.
1123
+ // Tag tokens are structural; rejecting `</strong>` / `</em>` as
1124
+ // an orphan match between two content deletions merges the tag
1125
+ // into the deletion, leaving the matching opener unclosed —
1126
+ // browsers then auto-close the opener at the END of the
1127
+ // deletion, producing visually-wrong output (e.g. the body of
1128
+ // a section deletion rendered as bold-italic because the
1129
+ // closing `</strong></em>` ended up after the body deletion
1130
+ // rather than after the heading). The orphan threshold is
1131
+ // designed for stray word matches between heavily-edited spans,
1132
+ // not for formatting boundaries.
1133
+ let allTags = true
1134
+ for (let i = curr.startInNew; i < curr.endInNew; i++) {
1135
+ if (!Utils.isTag(wordsForDiffNew[i])) {
1136
+ allTags = false
1137
+ break
1138
+ }
1139
+ }
1140
+ if (allTags) {
1141
+ yield curr
1142
+ prev = curr
1143
+ curr = next
1144
+ continue
1145
+ }
1146
+
1069
1147
  let oldDistanceInChars = 0
1070
1148
  for (let i = prev.endInOld; i < next.startInOld; i++) {
1071
1149
  oldDistanceInChars += wordsForDiffOld[i].length
@@ -392,6 +392,9 @@ function pairSimilarRowsThreeWay(
392
392
  const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
393
393
  const newTexts = newTable.rows.map(r => rowText(newHtml, r))
394
394
  return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
395
+ // Returning 0 sits below any positive threshold so
396
+ // `pairSimilarUnmatched` won't pair these rows; the guard remains
397
+ // defensive should the threshold ever be lowered to 0.
395
398
  if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
396
399
  return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
397
400
  })
@@ -1,6 +1,6 @@
1
- import { describe, expect, it } from 'vitest'
1
+ import { describe, expect, it, vi } from 'vitest'
2
2
 
3
- import HtmlDiff from '../src/HtmlDiff'
3
+ import HtmlDiff, { WORD_ALIGNED_OPTIONS } from '../src/HtmlDiff'
4
4
 
5
5
  /**
6
6
  * Three-way diff tests under the genesis-spine model.
@@ -204,11 +204,99 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
204
204
  expect(() => HtmlDiff.executeThreeWay('<strong>X</strong>', 'X</strong>', '<strong>X</strong>')).not.toThrow()
205
205
  })
206
206
 
207
- it('produces non-empty output even when the stack is left unbalanced at end', () => {
208
- const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
209
- // The content is still there, the formatting wraps just close
210
- // defensively. Sanity-check the visible content survives.
211
- expect(out).toContain('X')
207
+ it('emits the defensive </ins> close and logs a warning when the stack is unbalanced', async () => {
208
+ const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
209
+ try {
210
+ const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
211
+ // The content survives.
212
+ expect(out).toContain('X')
213
+ // The output contains at least one `</ins>`. On its own this
214
+ // check cannot tell the defensive close apart from a normally
215
+ // paired `</ins>`; the warn assertion below pins that path.
216
+ expect(out).toMatch(/<\/ins>/)
217
+ // And the warn was emitted. Without this assertion the path
218
+ // could silently stop firing in a future refactor and the
219
+ // test would still pass on the (incidentally-present) content.
220
+ expect(warn).toHaveBeenCalledWith(expect.stringContaining('unclosed formatting wrap'))
221
+ } finally {
222
+ warn.mockRestore()
223
+ }
224
+ })
225
+ })
226
+
227
+ describe('WORD_ALIGNED_OPTIONS — opinionated consumer defaults', () => {
228
+ // The library default (`orphanMatchThreshold = 0`) keeps every LCS
229
+ // match, however small — which fragments long sentence rewrites
230
+ // into many tiny ins/del pairs around stray word matches. Word's
231
+ // track-changes collapses those into a single coarse del+ins,
232
+ // which is markedly more readable for legal text. The exported
233
+ // `WORD_ALIGNED_OPTIONS` lets consumers opt into that without
234
+ // re-tuning the magic number themselves.
235
+ const longGenesis =
236
+ '"Specified Indebtedness" will have the meaning specified in Section 14 and shall include, with respect to Party B, any obligation (whether present or future, contingent or otherwise) for the payment or repayment of money.'
237
+ const longCp =
238
+ '"Specified Indebtedness" will have the meaning specified in Section 14 of the Agreement except that such term shall not include obligations.'
239
+
240
+ it('exports a 0.25 orphan threshold tuned for Word-aligned output', () => {
241
+ expect(WORD_ALIGNED_OPTIONS).toEqual({ orphanMatchThreshold: 0.25 })
242
+ })
243
+
244
+ it('plumbs through HtmlDiff.execute and reduces fragmentation versus the bare default', () => {
245
+ const bare = HtmlDiff.execute(longGenesis, longCp)
246
+ const aligned = HtmlDiff.execute(longGenesis, longCp, WORD_ALIGNED_OPTIONS)
247
+ const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
248
+ // The bare default keeps every tiny match — Word-aligned produces
249
+ // strictly fewer ins/del wrappers for the same input.
250
+ expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
251
+ expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
252
+ })
253
+
254
+ it('plumbs through HtmlDiff.executeThreeWay too', () => {
255
+ const bare = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis)
256
+ const aligned = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis, WORD_ALIGNED_OPTIONS)
257
+ const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
258
+ expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
259
+ expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
260
+ })
261
+ })
262
+
263
+ describe('orphan-match guard for structural tags', () => {
264
+ // Real regression from the live preview (Additional Condition
265
+ // Precedent in the 2002 ISDA Schedule): when CP deletes a section
266
+ // whose answer renders as an empty formatting shell —
267
+ // <p data-html="x"><em><strong></strong></em></p>
268
+ // — the `</strong>` and `</em>` matches sit between two content
269
+ // deletions ("Heading. " before, body after). At
270
+ // WORD_ALIGNED_OPTIONS.orphanMatchThreshold=0.25 those structural
271
+ // matches were rejected as orphans, swallowed into the deletion
272
+ // span, and the browser auto-closed the openers AT THE END of
273
+ // the deletion — visually rendering the entire deletion as
274
+ // bold-italic. The orphan filter now exempts tag-only matches
275
+ // so structural boundaries always survive.
276
+
277
+ it('CP deletes section with em+strong heading + plain body — closers stay between heading and body', () => {
278
+ const genesis =
279
+ '<p data-html="x"><em><strong>Additional Condition Precedent. </strong></em>For the purposes of Section 2(a)(iii).</p>'
280
+ const cp = '<p data-html="x"><em><strong></strong></em></p>'
281
+ const me = genesis
282
+
283
+ const out = HtmlDiff.executeThreeWay(genesis, cp, me, WORD_ALIGNED_OPTIONS)
284
+
285
+ // </strong> appears BEFORE the body deletion — meaning the
286
+ // body sits outside the bold-italic wrap, not inside it.
287
+ const closeStrongIdx = out.indexOf('</strong>')
288
+ const bodyDelIdx = out.indexOf('For the purposes')
289
+ expect(closeStrongIdx).toBeGreaterThan(0)
290
+ expect(bodyDelIdx).toBeGreaterThan(closeStrongIdx)
291
+ // No `<strong>…<del>body` substring exists — confirm by exact
292
+ // shape too. Heading wraps in strong+em, body is a plain del.
293
+ expect(out).toBe(
294
+ '<p data-html="x"><em><strong>' +
295
+ "<del class='diffdel cp' data-author='cp'>Additional Condition Precedent. </del>" +
296
+ '</strong></em>' +
297
+ "<del class='diffdel cp' data-author='cp'>For the purposes of Section 2(a)(iii).</del>" +
298
+ '</p>'
299
+ )
212
300
  })
213
301
  })
214
302