npm - @createiq/htmldiff - Versions diffs - 1.2.0-beta.6 → 1.2.0-beta.8 - Mend

@createiq/htmldiff 1.2.0-beta.6 → 1.2.0-beta.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/HtmlDiff.cjs +69 -4
package/dist/HtmlDiff.cjs.map +1 -1
package/dist/HtmlDiff.d.cts +53 -4
package/dist/HtmlDiff.d.mts +53 -4
package/dist/HtmlDiff.mjs +64 -4
package/dist/HtmlDiff.mjs.map +1 -1
package/package.json +1 -1
package/src/HtmlDiff.ts +83 -5
package/src/ThreeWayTable.ts +3 -0
package/test/HtmlDiff.threeWay.spec.ts +95 -7

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@createiq/htmldiff",
-  "version": "1.2.0-beta.6",
+  "version": "1.2.0-beta.8",
   "description": "TypeScript port of htmldiff.net",
   "type": "module",
   "author": "Mathew Mannion <mathew.mannion@linklaters.com>",

package/src/HtmlDiff.ts CHANGED Viewed

@@ -71,6 +71,40 @@ export interface AnalyzeResult {
  */
 export type ThreeWayOptions = AnalyzeOptions
+/**
+ * Opinionated options that align htmldiff's output with Microsoft Word's
+ * track-changes rendering for legal-document rewrites.
+ *
+ * The library's bare default (`orphanMatchThreshold = 0`) keeps every
+ * LCS match, however small — which fragments long sentence rewrites
+ * into many tiny ins/del pairs around stray word matches ("of", "the",
+ * "shall"). Word collapses those into a single coarse del+ins, which is
+ * dramatically more readable for legal text.
+ *
+ * 0.25 was tuned empirically against a customer Word reference (US
+ * Commercial One CP, May 2026):
+ *   - short edits (typo / one-word insert): output identical to
+ *     threshold=0 — inter-match distances are tiny so every match
+ *     trivially clears the bar;
+ *   - long rewrites (the "Specified Indebtedness" rewrite in the
+ *     reference): previously produced 6 dels + 5 ins fragmented around
+ *     stray matches; at 0.25 it condenses to 3 dels + 2 ins — close to
+ *     Word's 1+1 and a major readability win;
+ *   - higher values (0.3+) collapsed short edits containing inline
+ *     formatting changes into a single block — too aggressive.
+ *
+ * Consumers rendering legal documents should spread this into their
+ * options:
+ *   `HtmlDiff.execute(old, new, { ...WORD_ALIGNED_OPTIONS })`
+ *   `HtmlDiff.executeThreeWay(g, c, m, { ...WORD_ALIGNED_OPTIONS })`
+ *
+ * Other consumers (machine-readable diff, exact-token alignment) can
+ * keep the bare default.
+ */
+export const WORD_ALIGNED_OPTIONS: AnalyzeOptions = {
+  orphanMatchThreshold: 0.25,
+}
 export default class HtmlDiff {
   /**
    * This value defines the balance between speed and memory utilization. The higher it is, the faster it works and the more memory it consumes.
@@ -249,6 +283,16 @@ export default class HtmlDiff {
     this.newText = newText
   }
+  /**
+   * Two-way diff entry point. Accepts the same `AnalyzeOptions` bag as
+   * `executeThreeWay`, with two intentional exceptions documented
+   * inline below. Consumers wanting Word-aligned output should spread
+   * `WORD_ALIGNED_OPTIONS` into the third argument.
+   *
+   * Note: unlike `analyze`, `execute` runs `build()` which performs
+   * full table preprocessing — `tablePreprocessDepth` stays at 0 so
+   * the recursive cell diff can happen. Callers can't override that.
+   */
   static execute(oldText: string, newText: string, options: AnalyzeOptions = {}): string {
     const inner = new HtmlDiff(oldText, newText)
     if (options.blockExpressions) {
@@ -260,8 +304,9 @@ export default class HtmlDiff {
       inner.ignoreWhitespaceDifferences = options.ignoreWhitespaceDifferences
     }
     // `useProjections` is intentionally NOT plumbed here — the 2-way
-    // path's build() runs its own heuristic. Pass via `analyze` if a
-    // caller needs to force it.
+    // path's build() runs its own heuristic. `analyze` honours it; if
+    // you need to force it for a 2-way result, route through `analyze`
+    // and consume the operations directly.
     return inner.build()
   }
@@ -443,9 +488,17 @@ export default class HtmlDiff {
    * `insertTag` and push raw, so the stack entry for the open is
    * never popped. Rather than throw — which forces the caller's UI
    * into an error boundary — close every leftover wrap with `</ins>`
-   * at the end of emission. The resulting HTML has an extra
-   * `</ins>` next to the formatting closer; DOMParser-normalisation
-   * downstream produces sensible nesting.
+   * at the end of emission.
+   *
+   * Caveat: the `</ins>` close is honest for the mod-wrap that the
+   * opener pushed (every formatting opener emits an inner `<ins…>`
+   * postInject regardless of whether the outer segment is ins or
+   * del). For del-segment formatting openers the outer `<del>` may
+   * itself be left open by the same emission imbalance; this fixup
+   * doesn't address that. Downstream browsers/DOMParser normalise
+   * mildly-malformed HTML by closing dangling tags, so the rendered
+   * output is usually acceptable — but the warning IS the signal
+   * that the input had a real imbalance worth investigating.
    */
   private static emitSegments(segments: Segment[]): string {
     const emitter = new HtmlDiff('', '')
@@ -1066,6 +1119,31 @@ export default class HtmlDiff {
         continue
       }
+      // Never orphan-reject a match whose tokens are ALL HTML tags.
+      // Tag tokens are structural; rejecting `</strong>` / `</em>` as
+      // an orphan match between two content deletions merges the tag
+      // into the deletion, leaving the matching opener unclosed —
+      // browsers then auto-close the opener at the END of the
+      // deletion, producing visually-wrong output (e.g. the body of
+      // a section deletion rendered as bold-italic because the
+      // closing `</strong></em>` ended up after the body deletion
+      // rather than after the heading). The orphan threshold is
+      // designed for stray word matches between heavily-edited spans,
+      // not for formatting boundaries.
+      let allTags = true
+      for (let i = curr.startInNew; i < curr.endInNew; i++) {
+        if (!Utils.isTag(wordsForDiffNew[i])) {
+          allTags = false
+          break
+        }
+      }
+      if (allTags) {
+        yield curr
+        prev = curr
+        curr = next
+        continue
+      }
       let oldDistanceInChars = 0
       for (let i = prev.endInOld; i < next.startInOld; i++) {
         oldDistanceInChars += wordsForDiffOld[i].length

package/src/ThreeWayTable.ts CHANGED Viewed

@@ -392,6 +392,9 @@ function pairSimilarRowsThreeWay(
   const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
   const newTexts = newTable.rows.map(r => rowText(newHtml, r))
   return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
+    // Returning 0 sits below any positive threshold so
+    // `pairSimilarUnmatched` won't pair these rows; the guard remains
+    // defensive should the threshold ever be lowered to 0.
     if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
     return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
   })

package/test/HtmlDiff.threeWay.spec.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { describe, expect, it } from 'vitest'
+import { describe, expect, it, vi } from 'vitest'
-import HtmlDiff from '../src/HtmlDiff'
+import HtmlDiff, { WORD_ALIGNED_OPTIONS } from '../src/HtmlDiff'
 /**
  * Three-way diff tests under the genesis-spine model.
@@ -204,11 +204,99 @@ describe('HtmlDiff.executeThreeWay (genesis-spine)', () => {
       expect(() => HtmlDiff.executeThreeWay('<strong>X</strong>', 'X</strong>', '<strong>X</strong>')).not.toThrow()
     })
-    it('produces non-empty output even when the stack is left unbalanced at end', () => {
-      const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
-      // The content is still there, the formatting wraps just close
-      // defensively. Sanity-check the visible content survives.
-      expect(out).toContain('X')
+    it('emits the defensive </ins> close and logs a warning when the stack is unbalanced', async () => {
+      const warn = vi.spyOn(console, 'warn').mockImplementation(() => {})
+      try {
+        const out = HtmlDiff.executeThreeWay('X</strong>', '<strong>X</strong>', 'X</strong>')
+        // The content survives.
+        expect(out).toContain('X')
+        // The output contains at least one `</ins>`. On its own this
+        // regex cannot tell a defensive close from one paired by
+        // `insertTag`; the warn assertion below pins the defensive path.
+        expect(out).toMatch(/<\/ins>/)
+        // And the warn was emitted. Without this assertion the path
+        // could silently stop firing in a future refactor and the
+        // test would still pass on the (incidentally-present) content.
+        expect(warn).toHaveBeenCalledWith(expect.stringContaining('unclosed formatting wrap'))
+      } finally {
+        warn.mockRestore()
+      }
+    })
+  })
+  describe('WORD_ALIGNED_OPTIONS — opinionated consumer defaults', () => {
+    // The library default (`orphanMatchThreshold = 0`) keeps every LCS
+    // match, however small — which fragments long sentence rewrites
+    // into many tiny ins/del pairs around stray word matches. Word's
+    // track-changes collapses those into a single coarse del+ins,
+    // which is markedly more readable for legal text. The exported
+    // `WORD_ALIGNED_OPTIONS` lets consumers opt into that without
+    // re-tuning the magic number themselves.
+    const longGenesis =
+      '"Specified Indebtedness" will have the meaning specified in Section 14 and shall include, with respect to Party B, any obligation (whether present or future, contingent or otherwise) for the payment or repayment of money.'
+    const longCp =
+      '"Specified Indebtedness" will have the meaning specified in Section 14 of the Agreement except that such term shall not include obligations.'
+    it('exports a 0.25 orphan threshold tuned for Word-aligned output', () => {
+      expect(WORD_ALIGNED_OPTIONS).toEqual({ orphanMatchThreshold: 0.25 })
+    })
+    it('plumbs through HtmlDiff.execute and reduces fragmentation versus the bare default', () => {
+      const bare = HtmlDiff.execute(longGenesis, longCp)
+      const aligned = HtmlDiff.execute(longGenesis, longCp, WORD_ALIGNED_OPTIONS)
+      const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
+      // The bare default keeps every tiny match — Word-aligned produces
+      // strictly fewer ins/del wrappers for the same input.
+      expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
+      expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
+    })
+    it('plumbs through HtmlDiff.executeThreeWay too', () => {
+      const bare = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis)
+      const aligned = HtmlDiff.executeThreeWay(longGenesis, longCp, longGenesis, WORD_ALIGNED_OPTIONS)
+      const count = (s: string, re: RegExp) => (s.match(re) ?? []).length
+      expect(count(aligned, /<ins/g)).toBeLessThan(count(bare, /<ins/g))
+      expect(count(aligned, /<del/g)).toBeLessThan(count(bare, /<del/g))
+    })
+  })
+  describe('orphan-match guard for structural tags', () => {
+    // Real regression from the live preview (Additional Condition
+    // Precedent in the 2002 ISDA Schedule): when CP deletes a section
+    // whose answer renders as an empty formatting shell —
+    //   <p data-html="x"><em><strong></strong></em></p>
+    // — the `</strong>` and `</em>` matches sit between two content
+    // deletions ("Heading. " before, body after). At
+    // WORD_ALIGNED_OPTIONS.orphanMatchThreshold=0.25 those structural
+    // matches were rejected as orphans, swallowed into the deletion
+    // span, and the browser auto-closed the openers AT THE END of
+    // the deletion — visually rendering the entire deletion as
+    // bold-italic. The orphan filter now exempts tag-only matches
+    // so structural boundaries always survive.
+    it('CP deletes section with em+strong heading + plain body — closers stay between heading and body', () => {
+      const genesis =
+        '<p data-html="x"><em><strong>Additional Condition Precedent. </strong></em>For the purposes of Section 2(a)(iii).</p>'
+      const cp = '<p data-html="x"><em><strong></strong></em></p>'
+      const me = genesis
+      const out = HtmlDiff.executeThreeWay(genesis, cp, me, WORD_ALIGNED_OPTIONS)
+      // </strong> appears BEFORE the body deletion — meaning the
+      // body sits outside the bold-italic wrap, not inside it.
+      const closeStrongIdx = out.indexOf('</strong>')
+      const bodyDelIdx = out.indexOf('For the purposes')
+      expect(closeStrongIdx).toBeGreaterThan(0)
+      expect(bodyDelIdx).toBeGreaterThan(closeStrongIdx)
+      // No `<strong>…<del>body` substring exists — confirm by exact
+      // shape too. Heading wraps in strong+em, body is a plain del.
+      expect(out).toBe(
+        '<p data-html="x"><em><strong>' +
+          "<del class='diffdel cp' data-author='cp'>Additional Condition Precedent. </del>" +
+          '</strong></em>' +
+          "<del class='diffdel cp' data-author='cp'>For the purposes of Section 2(a)(iii).</del>" +
+          '</p>'
+      )
     })
   })