@createiq/htmldiff 1.2.0-beta.5 → 1.2.0-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.5",
3
+ "version": "1.2.0-beta.7",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
package/src/HtmlDiff.ts CHANGED
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
71
71
  */
72
72
  export type ThreeWayOptions = AnalyzeOptions
73
73
 
74
+ /**
75
+ * Opinionated options that align htmldiff's output with Microsoft Word's
76
+ * track-changes rendering for legal-document rewrites.
77
+ *
78
+ * The library's bare default (`orphanMatchThreshold = 0`) keeps every
79
+ * LCS match, however small — which fragments long sentence rewrites
80
+ * into many tiny ins/del pairs around stray word matches ("of", "the",
81
+ * "shall"). Word collapses those into a single coarse del+ins, which is
82
+ * dramatically more readable for legal text.
83
+ *
84
+ * 0.25 was tuned empirically against a customer Word reference (US
85
+ * Commercial One CP, May 2026):
86
+ * - short edits (typo / one-word insert): output identical to
87
+ * threshold=0 — inter-match distances are tiny so every match
88
+ * trivially clears the bar;
89
+ * - long rewrites (the "Specified Indebtedness" rewrite in the
90
+ * reference): previously produced 6 dels + 5 ins fragmented around
91
+ * stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
92
+ * Word's 1+1 and a major readability win;
93
+ * - higher values (0.3+) collapsed short edits containing inline
94
+ * formatting changes into a single block — too aggressive.
95
+ *
96
+ * Consumers rendering legal documents should spread this into their
97
+ * options:
98
+ * `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
99
+ * `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
100
+ *
101
+ * Other consumers (machine-readable diff, exact-token alignment) can
102
+ * keep the bare default.
103
+ */
104
+ export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
105
+ orphanMatchThreshold: 0.25,
106
+ }
107
+
74
108
  export default class HtmlDiff {
75
109
  /**
76
110
  * This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
@@ -249,8 +283,31 @@ export default class HtmlDiff {
249
283
  this.newText = newText
250
284
  }
251
285
 
252
- static execute(oldText: string, newText: string): string {
253
- return new HtmlDiff(oldText, newText).build()
286
+ /**
287
+ * Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
288
+ * `executeThreeWay`, with two intentional exceptions documented
289
+ * inline below. Consumers wanting Word-aligned output should spread
290
+ * `WORD_ALIGNED_OPTIONS` into the third argument.
291
+ *
292
+ * Note: unlike `analyze`, `execute` runs `build()` which performs
293
+ * full table preprocessing — `tablePreprocessDepth` stays at 0 so
294
+ * the recursive cell diff can happen. Callers can't override that.
295
+ */
296
+ static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
297
+ const inner = new HtmlDiff(oldText, newText)
298
+ if (options.blockExpressions) {
299
+ for (const expr of options.blockExpressions) inner.addBlockExpression(expr)
300
+ }
301
+ if (options.repeatingWordsAccuracy !== undefined) inner.repeatingWordsAccuracy = options.repeatingWordsAccuracy
302
+ if (options.orphanMatchThreshold !== undefined) inner.orphanMatchThreshold = options.orphanMatchThreshold
303
+ if (options.ignoreWhitespaceDifferences !== undefined) {
304
+ inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
305
+ }
306
+ // `useProjections` is intentionally NOT plumbed here — the 2-way
307
+ // path's build() runs its own heuristic. `analyze` honours it; if
308
+ // you need to force it for a 2-way result, route through `analyze`
309
+ // and consume the operations directly.
310
+ return inner.build()
254
311
  }
255
312
 
256
313
  /**
@@ -431,9 +488,17 @@ export default class HtmlDiff {
431
488
  * `insertTag` and push raw, so the stack entry for the open is
432
489
  * never popped. Rather than throw — which forces the caller's UI
433
490
  * into an error boundary — close every leftover wrap with `</ins>`
434
- * at the end of emission. The resulting HTML has an extra
435
- * `</ins>` next to the formatting closer; DOMParser-normalisation
436
- * downstream produces sensible nesting.
491
+ * at the end of emission.
492
+ *
493
+ * Caveat: the `</ins>` close is honest for the mod-wrap that the
494
+ * opener pushed (every formatting opener emits an inner `<ins…>`
495
+ * postInject regardless of whether the outer segment is ins or
496
+ * del). For del-segment formatting openers the outer `<del>` may
497
+ * itself be left open by the same emission imbalance; this fixup
498
+ * doesn't address that. Downstream browsers/DOMParser normalise
499
+ * mildly-malformed HTML by closing dangling tags, so the rendered
500
+ * output is usually acceptable — but the warning IS the signal
501
+ * that the input had a real imbalance worth investigating.
437
502
  */
438
503
  private static emitSegments(segments: Segment[]): string {
439
504
  const emitter = new HtmlDiff('', '')
@@ -392,6 +392,9 @@ function pairSimilarRowsThreeWay(
392
392
  const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
393
393
  const newTexts = newTable.rows.map(r => rowText(newHtml, r))
394
394
  return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
395
+ // Returning 0 sits below any positive threshold so
396
+ // `pairSimilarUnmatched` won't pair these rows; the guard remains
397
+ // defensive should the threshold ever be lowered to 0.
395
398
  if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
396
399
  return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
397
400
  })
@@ -1,6 +1,6 @@
1
- import { describe, expect, it } from 'vitest'
1
+ import { describe, expect, it, vi } from 'vitest'
2
2
 
3
- import HtmlDiff from '../src/HtmlDiff'
3
+ import HtmlDiff, { WORD_ALIGNED_OPTIONS } from '../src/HtmlDiff'
4
4
 
5
5
  /**
6
6
  * Three-way diff tests under the genesis-spine model.
@@ -204,11 +204,59 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
204
204
  expect(() => HtmlDiff.executeThreeWay('<strong>X</strong>', 'X</strong>', '<strong>X</strong>')).not.toThrow()
205
205
  })
206
206
 
207
- it('produces non-empty output even when the stack is left unbalanced at end', () => {
208
- const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
209
- // The content is still there, the formatting wraps just close
210
- // defensively. Sanity-check the visible content survives.
211
- expect(out).toContain('X')
207
+ it('emits the defensive </ins> close and logs a warning when the stack is unbalanced', async () => {
208
+ const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
209
+ try {
210
+ const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
211
+ // The content survives.
212
+ expect(out).toContain('X')
213
+ // The defensive close path actually ran — output contains
214
+ // at least one `</ins>` that wasn't paired by `insertTag`
215
+ // (the only way the defensive branch can add one).
216
+ expect(out).toMatch(/<\/ins>/)
217
+ // And the warn was emitted. Without this assertion the path
218
+ // could silently stop firing in a future refactor and the
219
+ // test would still pass on the (incidentally-present) content.
220
+ expect(warn).toHaveBeenCalledWith(expect.stringContaining('unclosed formatting wrap'))
221
+ } finally {
222
+ warn.mockRestore()
223
+ }
224
+ })
225
+ })
226
+
227
+ describe('WORD_ALIGNED_OPTIONS — opinionated consumer defaults', () => {
228
+ // The library default (`orphanMatchThreshold = 0`) keeps every LCS
229
+ // match, however small — which fragments long sentence rewrites
230
+ // into many tiny ins/del pairs around stray word matches. Word's
231
+ // track-changes collapses those into a single coarse del+ins,
232
+ // which is markedly more readable for legal text. The exported
233
+ // `WORD_ALIGNED_OPTIONS` lets consumers opt into that without
234
+ // re-tuning the magic number themselves.
235
+ const longGenesis =
236
+ '"Specified Indebtedness" will have the meaning specified in Section 14 and shall include, with respect to Party B, any obligation (whether present or future, contingent or otherwise) for the payment or repayment of money.'
237
+ const longCp =
238
+ '"Specified Indebtedness" will have the meaning specified in Section 14 of the Agreement except that such term shall not include obligations.'
239
+
240
+ it('exports a 0.25 orphan threshold tuned for Word-aligned output', () => {
241
+ expect(WORD_ALIGNED_OPTIONS).toEqual({ orphanMatchThreshold: 0.25 })
242
+ })
243
+
244
+ it('plumbs through HtmlDiff.execute and reduces fragmentation versus the bare default', () => {
245
+ const bare = HtmlDiff.execute(longGenesis, longCp)
246
+ const aligned = HtmlDiff.execute(longGenesis, longCp, WORD_ALIGNED_OPTIONS)
247
+ const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
248
+ // The bare default keeps every tiny match — Word-aligned produces
249
+ // strictly fewer ins/del wrappers for the same input.
250
+ expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
251
+ expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
252
+ })
253
+
254
+ it('plumbs through HtmlDiff.executeThreeWay too', () => {
255
+ const bare = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis)
256
+ const aligned = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis, WORD_ALIGNED_OPTIONS)
257
+ const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
258
+ expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
259
+ expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
212
260
  })
213
261
  })
214
262