@createiq/htmldiff 1.2.0-beta.6 → 1.2.0-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/HtmlDiff.cjs +58 -4
- package/dist/HtmlDiff.cjs.map +1 -1
- package/dist/HtmlDiff.d.cts +53 -4
- package/dist/HtmlDiff.d.mts +53 -4
- package/dist/HtmlDiff.mjs +53 -4
- package/dist/HtmlDiff.mjs.map +1 -1
- package/package.json +1 -1
- package/src/HtmlDiff.ts +58 -5
- package/src/ThreeWayTable.ts +3 -0
- package/test/HtmlDiff.threeWay.spec.ts +55 -7
package/package.json
CHANGED
package/src/HtmlDiff.ts
CHANGED
|
@@ -71,6 +71,40 @@ export interface AnalyzeResult {
|
|
|
71
71
|
*/
|
|
72
72
|
export type ThreeWayOptions = AnalyzeOptions
|
|
73
73
|
|
|
74
|
+
/**
|
|
75
|
+
* Opinionated options that align htmldiff's output with Microsoft Word's
|
|
76
|
+
* track-changes rendering for legal-document rewrites.
|
|
77
|
+
*
|
|
78
|
+
* The library's bare default (`orphanMatchThreshold = 0`) keeps every
|
|
79
|
+
* LCS match, however small — which fragments long sentence rewrites
|
|
80
|
+
* into many tiny ins/del pairs around stray word matches ("of", "the",
|
|
81
|
+
* "shall"). Word collapses those into a single coarse del+ins, which is
|
|
82
|
+
* dramatically more readable for legal text.
|
|
83
|
+
*
|
|
84
|
+
* 0.25 was tuned empirically against a customer Word reference (US
|
|
85
|
+
* Commercial One CP, May 2026):
|
|
86
|
+
* - short edits (typo / one-word insert): output identical to
|
|
87
|
+
* threshold=0 — inter-match distances are tiny so every match
|
|
88
|
+
* trivially clears the bar;
|
|
89
|
+
* - long rewrites (the "Specified Indebtedness" rewrite in the
|
|
90
|
+
* reference): previously produced 6 dels + 5 ins fragmented around
|
|
91
|
+
* stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
|
|
92
|
+
* Word's 1+1 and a major readability win;
|
|
93
|
+
* - higher values (0.3+) collapsed short edits containing inline
|
|
94
|
+
* formatting changes into a single block — too aggressive.
|
|
95
|
+
*
|
|
96
|
+
* Consumers rendering legal documents should spread this into their
|
|
97
|
+
* options:
|
|
98
|
+
* `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
|
|
99
|
+
* `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
|
|
100
|
+
*
|
|
101
|
+
* Other consumers (machine-readable diff, exact-token alignment) can
|
|
102
|
+
* keep the bare default.
|
|
103
|
+
*/
|
|
104
|
+
export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
|
|
105
|
+
orphanMatchThreshold: 0.25,
|
|
106
|
+
}
|
|
107
|
+
|
|
74
108
|
export default class HtmlDiff {
|
|
75
109
|
/**
|
|
76
110
|
* This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
|
|
@@ -249,6 +283,16 @@ export default class HtmlDiff {
|
|
|
249
283
|
this.newText = newText
|
|
250
284
|
}
|
|
251
285
|
|
|
286
|
+
/**
|
|
287
|
+
* Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
|
|
288
|
+
* `executeThreeWay`, with two intentional exceptions documented
|
|
289
|
+
* inline below. Consumers wanting Word-aligned output should spread
|
|
290
|
+
* `WORD_ALIGNED_OPTIONS` into the third argument.
|
|
291
|
+
*
|
|
292
|
+
* Note: unlike `analyze`, `execute` runs `build()`, which performs
|
|
293
|
+
* full table preprocessing — `tablePreprocessDepth` stays at 0 so
|
|
294
|
+
* the recursive cell diff can happen. Callers can't override that.
|
|
295
|
+
*/
|
|
252
296
|
static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
|
|
253
297
|
const inner = new HtmlDiff(oldText, newText)
|
|
254
298
|
if (options.blockExpressions) {
|
|
@@ -260,8 +304,9 @@ export default class HtmlDiff {
|
|
|
260
304
|
inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
|
|
261
305
|
}
|
|
262
306
|
// `useProjections` is intentionally NOT plumbed here — the 2-way
|
|
263
|
-
// path's build() runs its own heuristic.
|
|
264
|
-
//
|
|
307
|
+
// path's build() runs its own heuristic. `analyze` honours it; if
|
|
308
|
+
// you need to force it for a 2-way result, route through `analyze`
|
|
309
|
+
// and consume the operations directly.
|
|
265
310
|
return inner.build()
|
|
266
311
|
}
|
|
267
312
|
|
|
@@ -443,9 +488,17 @@ export default class HtmlDiff {
|
|
|
443
488
|
* `insertTag` and push raw, so the stack entry for the open is
|
|
444
489
|
* never popped. Rather than throw — which forces the caller's UI
|
|
445
490
|
* into an error boundary — close every leftover wrap with `</ins>`
|
|
446
|
-
* at the end of emission.
|
|
447
|
-
*
|
|
448
|
-
*
|
|
491
|
+
* at the end of emission.
|
|
492
|
+
*
|
|
493
|
+
* Caveat: the `</ins>` close is honest for the mod-wrap that the
|
|
494
|
+
* opener pushed (every formatting opener emits an inner `<ins…>`
|
|
495
|
+
* postInject regardless of whether the outer segment is ins or
|
|
496
|
+
* del). For del-segment formatting openers the outer `<del>` may
|
|
497
|
+
* itself be left open by the same emission imbalance; this fixup
|
|
498
|
+
* doesn't address that. Downstream browsers/DOMParser normalise
|
|
499
|
+
* mildly-malformed HTML by closing dangling tags, so the rendered
|
|
500
|
+
* output is usually acceptable — but the warning IS the signal
|
|
501
|
+
* that the input had a real imbalance worth investigating.
|
|
449
502
|
*/
|
|
450
503
|
private static emitSegments(segments: Segment[]): string {
|
|
451
504
|
const emitter = new HtmlDiff('', '')
|
package/src/ThreeWayTable.ts
CHANGED
|
@@ -392,6 +392,9 @@ function pairSimilarRowsThreeWay(
|
|
|
392
392
|
const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
|
|
393
393
|
const newTexts = newTable.rows.map(r => rowText(newHtml, r))
|
|
394
394
|
return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
|
|
395
|
+
// Returning 0 sits below any positive threshold so
|
|
396
|
+
// `pairSimilarUnmatched` won't pair these rows; the guard remains
|
|
397
|
+
// defensive should the threshold ever be lowered to 0.
|
|
395
398
|
if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
|
|
396
399
|
return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
|
|
397
400
|
})
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { describe, expect, it } from 'vitest'
|
|
1
|
+
import { describe, expect, it, vi } from 'vitest'
|
|
2
2
|
|
|
3
|
-
import HtmlDiff from '../src/HtmlDiff'
|
|
3
|
+
import HtmlDiff, { WORD_ALIGNED_OPTIONS } from '../src/HtmlDiff'
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Three-way diff tests under the genesis-spine model.
|
|
@@ -204,11 +204,59 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
|
|
|
204
204
|
expect(() => HtmlDiff.executeThreeWay('<strong>X</strong>', 'X</strong>', '<strong>X</strong>')).not.toThrow()
|
|
205
205
|
})
|
|
206
206
|
|
|
207
|
-
it('
|
|
208
|
-
const
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
207
|
+
it('emits the defensive </ins> close and logs a warning when the stack is unbalanced', async () => {
|
|
208
|
+
const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
|
|
209
|
+
try {
|
|
210
|
+
const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
|
|
211
|
+
// The content survives.
|
|
212
|
+
expect(out).toContain('X')
|
|
213
|
+
// The output carries a closing `</ins>`. On its own this regex
|
|
214
|
+
// matches any `</ins>`, paired or not, so it cannot prove the
|
|
215
|
+
// defensive branch ran; the warn assertion below pins that path.
|
|
216
|
+
expect(out).toMatch(/<\/ins>/)
|
|
217
|
+
// And the warn was emitted. Without this assertion the path
|
|
218
|
+
// could silently stop firing in a future refactor and the
|
|
219
|
+
// test would still pass on the (incidentally-present) content.
|
|
220
|
+
expect(warn).toHaveBeenCalledWith(expect.stringContaining('unclosed formatting wrap'))
|
|
221
|
+
} finally {
|
|
222
|
+
warn.mockRestore()
|
|
223
|
+
}
|
|
224
|
+
})
|
|
225
|
+
})
|
|
226
|
+
|
|
227
|
+
describe('WORD_ALIGNED_OPTIONS — opinionated consumer defaults', () => {
|
|
228
|
+
// The library default (`orphanMatchThreshold = 0`) keeps every LCS
|
|
229
|
+
// match, however small — which fragments long sentence rewrites
|
|
230
|
+
// into many tiny ins/del pairs around stray word matches. Word's
|
|
231
|
+
// track-changes collapses those into a single coarse del+ins,
|
|
232
|
+
// which is markedly more readable for legal text. The exported
|
|
233
|
+
// `WORD_ALIGNED_OPTIONS` lets consumers opt into that without
|
|
234
|
+
// re-tuning the magic number themselves.
|
|
235
|
+
const longGenesis =
|
|
236
|
+
'"Specified Indebtedness" will have the meaning specified in Section 14 and shall include, with respect to Party B, any obligation (whether present or future, contingent or otherwise) for the payment or repayment of money.'
|
|
237
|
+
const longCp =
|
|
238
|
+
'"Specified Indebtedness" will have the meaning specified in Section 14 of the Agreement except that such term shall not include obligations.'
|
|
239
|
+
|
|
240
|
+
it('exports a 0.25 orphan threshold tuned for Word-aligned output', () => {
|
|
241
|
+
expect(WORD_ALIGNED_OPTIONS).toEqual({ orphanMatchThreshold: 0.25 })
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
it('plumbs through HtmlDiff.execute and reduces fragmentation versus the bare default', () => {
|
|
245
|
+
const bare = HtmlDiff.execute(longGenesis, longCp)
|
|
246
|
+
const aligned = HtmlDiff.execute(longGenesis, longCp, WORD_ALIGNED_OPTIONS)
|
|
247
|
+
const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
|
|
248
|
+
// The bare default keeps every tiny match — Word-aligned produces
|
|
249
|
+
// strictly fewer ins/del wrappers for the same input.
|
|
250
|
+
expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
|
|
251
|
+
expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
|
|
252
|
+
})
|
|
253
|
+
|
|
254
|
+
it('plumbs through HtmlDiff.executeThreeWay too', () => {
|
|
255
|
+
const bare = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis)
|
|
256
|
+
const aligned = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis, WORD_ALIGNED_OPTIONS)
|
|
257
|
+
const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
|
|
258
|
+
expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
|
|
259
|
+
expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
|
|
212
260
|
})
|
|
213
261
|
})
|
|
214
262
|
|