npm - @createiq/htmldiff - Versions diffs - 1.0.5-beta.2 → 1.0.5-beta.4 - Mend

@createiq/htmldiff 1.0.5-beta.2 → 1.0.5-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/dist/HtmlDiff.cjs +204 -26
package/dist/HtmlDiff.cjs.map +1 -1
package/dist/HtmlDiff.mjs +204 -26
package/dist/HtmlDiff.mjs.map +1 -1
package/package.json +1 -1
package/src/TableDiff.ts +280 -45
package/test/HtmlDiff.tables.matrix.spec.ts +367 -0
package/test/HtmlDiff.tables.spec.ts +80 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@createiq/htmldiff",
-  "version": "1.0.5-beta.2",
+  "version": "1.0.5-beta.4",
   "description": "TypeScript port of htmldiff.net",
   "type": "module",
   "author": "Mathew Mannion <mathew.mannion@linklaters.com>",

package/src/TableDiff.ts CHANGED Viewed

@@ -379,20 +379,25 @@ function diffStructurallyAlignedTable(
   const oldKeys = oldTable.rows.map(row => rowKey(oldHtml, row))
   const newKeys = newTable.rows.map(row => rowKey(newHtml, row))
   const exactAlignment = lcsAlign(oldKeys, newKeys)
-  const alignment = pairSimilarUnmatchedRows(exactAlignment, oldTable, newTable, oldHtml, newHtml)
+  const paired = pairSimilarUnmatchedRows(exactAlignment, oldTable, newTable, oldHtml, newHtml)
+  // Reorder so unpaired deleted rows appear at their *natural old-side
+  // position* — immediately after the preserved/paired row that came
+  // before them in old. Without this, runs of unpaired dels at low
+  // alignment indices end up emitted before any preserved row (the
+  // "deleted rows out of order" bug).
+  const alignment = orderAlignmentForEmission(paired)
   // Walk new's tableStart→tableEnd, substituting rows with their diffed
   // form so `<thead>`/`<tbody>` wrappers and inter-row whitespace are
   // preserved verbatim. Deleted rows (no position in new) are injected
-  // inline at their alignment position. If new has no rows at all, fall
-  // back to a from-scratch reconstruction so we still emit deleted rows.
+  // inline at the cursor's current position, which now corresponds to
+  // their natural old-side slot thanks to the reordering above. If new
+  // has no rows at all, fall back to a from-scratch reconstruction so
+  // we still emit deleted rows.
   if (newTable.rows.length === 0) {
     return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell)
   }
-  // Emit the table header (`<table>` + any `<thead>`/`<tbody>` opening
-  // text up to the first row) up-front so a leading run of deleted-only
-  // alignments doesn't slip in before the table opens.
   const out: string[] = []
   out.push(newHtml.slice(newTable.tableStart, newTable.rows[0].rowStart))
   let cursor = newTable.rows[0].rowStart
@@ -407,8 +412,6 @@ function diffStructurallyAlignedTable(
       }
       cursor = newRow.rowEnd
     } else if (align.oldIdx !== null) {
-      // Deleted row: inject inline at the current cursor (between the
-      // previously emitted row and the next one in new).
       out.push(emitFullRow(oldHtml, oldTable.rows[align.oldIdx], 'del', diffCell))
     }
   }
@@ -416,6 +419,79 @@ function diffStructurallyAlignedTable(
   return out.join('')
 }
+/**
+ * Reorders the alignment so emission produces rows in the visually-
+ * correct order. Each entry is assigned a fractional "position" in
+ * new's flow:
+ *
+ *   • Preserved/paired (oldIdx, newIdx): position = newIdx.
+ *   • Pure insert (null, newIdx): position = newIdx.
+ *   • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
+ *     before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
+ *     they appear in old's row order. The +0.5 places dels BEFORE any
+ *     insert at the same gap (insert at newIdx N1+1 has position N1+1
+ *     which is > N1+0.5), giving the natural "delete first, insert
+ *     second" reading order at a replaced position.
+ *
+ * This handles the full range:
+ *   • Run of unpaired dels at the start (no preserved predecessor):
+ *     position -0.5, sorted by oldIdx.
+ *   • Dels in the middle: positioned right after their preceding
+ *     preserved row.
+ *   • Dels at the end (no preserved successor): positioned after the
+ *     last preserved row.
+ *
+ * Without this reordering, a run of unpaired deletes at low alignment
+ * indices got emitted at cursor = first-new-row position — putting
+ * all deletes before any preserved row in the output, regardless of
+ * where they came from in old.
+ */
+function orderAlignmentForEmission(alignment: Alignment[]): Alignment[] {
+  const preserved: Array<{ oldIdx: number; newIdx: number }> = []
+  for (const a of alignment) {
+    if (a.oldIdx !== null && a.newIdx !== null) {
+      preserved.push({ oldIdx: a.oldIdx, newIdx: a.newIdx })
+    }
+  }
+  preserved.sort((a, b) => a.oldIdx - b.oldIdx)
+  // For a deleted row with oldIdx K, return the newIdx of the preserved
+  // entry with the largest oldIdx less than K, or -1 if none.
+  function newIdxOfPreservedBefore(oldIdx: number): number {
+    let result = -1
+    for (const p of preserved) {
+      if (p.oldIdx >= oldIdx) break
+      result = p.newIdx
+    }
+    return result
+  }
+  // Decorate each alignment with a fractional position. We use
+  // (primary, secondary) tuples so dels at the same gap sort by oldIdx
+  // (in old's row order) and inserts at the same newIdx stay stable.
+  const decorated = alignment.map((a, i) => {
+    let primary: number
+    let secondary: number
+    if (a.newIdx !== null) {
+      primary = a.newIdx
+      secondary = a.oldIdx === null ? 1 : 0 // preserved before pure-insert at same newIdx (rare)
+    } else {
+      // Pure delete
+      primary = newIdxOfPreservedBefore(a.oldIdx as number) + 0.5
+      secondary = a.oldIdx as number
+    }
+    return { entry: a, primary, secondary, originalIdx: i }
+  })
+  decorated.sort((a, b) => {
+    if (a.primary !== b.primary) return a.primary - b.primary
+    if (a.secondary !== b.secondary) return a.secondary - b.secondary
+    return a.originalIdx - b.originalIdx // stable
+  })
+  return decorated.map(d => d.entry)
+}
 function rebuildStructurallyAlignedTable(
   oldHtml: string,
   newHtml: string,
@@ -469,13 +545,165 @@ function diffPreservedRow(
   }
   // Cell counts differ. Try to interpret it as a horizontal merge/split via
   // colspan first — preserving the new structure with `class='mod colspan'`
-  // on each affected cell. Falls back to the cell-LCS path if the cells
-  // don't align cleanly on logical column positions.
+  // on each affected cell.
   const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
   if (colspanAligned !== null) return colspanAligned
+  // Not a clean colspan merge/split, so try treating the row as a column
+  // add/delete. `delta` is the signed change in cell count: positive
+  // when new has more cells than old, negative when it has fewer. The
+  // positional search below is not limited to a single column; it also
+  // covers a column add alongside an unrelated content edit, keeping
+  // the edited cell aligned by position instead of orphaned by cell-LCS.
+  const delta = newRow.cells.length - oldRow.cells.length
+  // For column add/delete (cell counts differ), find the best insertion
+  // or deletion positions via positional similarity scan and align the
+  // remaining cells positionally. This handles content-edit alongside
+  // column-add by keeping the edited cell in its column position rather
+  // than orphaning it via the cell-LCS exact match.
+  // Guardrail: the search is C(widerCellCount, |delta|) combinations, so
+  // |delta| and row width are capped to avoid explosion on very wide tables.
+  // The caps are generous for real legal schedules; above them we fall
+  const absDelta = Math.abs(delta)
+  if (
+    absDelta > 0 &&
+    absDelta <= MAX_COLUMN_DELTA &&
+    Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH
+  ) {
+    if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell)
+    return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell)
+  }
   return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell)
 }
+const MAX_COLUMN_DELTA = 6
+const MAX_COLUMN_SEARCH_WIDTH = 40
+/**
+ * For a row where new has K more cells than old, find the K column
+ * positions in new where cells were inserted by scanning all C(newCount,
+ * K) combinations and picking the one that maximises positional content
+ * similarity with the remaining cells. The inserted cells are emitted
+ * with diff markers; the rest are aligned positionally with content
+ * diff for matched pairs.
+ */
+function diffMultiColumnAddRow(
+  oldHtml: string,
+  newHtml: string,
+  oldRow: RowRange,
+  newRow: RowRange,
+  k: number,
+  diffCell: DiffCellFn
+): string {
+  const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml)
+  const inserted = new Set(insertedPositions)
+  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
+  let oldIdx = 0
+  for (let c = 0; c < newRow.cells.length; c++) {
+    if (inserted.has(c)) {
+      out.push(emitFullCell(newHtml, newRow.cells[c], 'ins', diffCell))
+    } else {
+      out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell))
+      oldIdx++
+    }
+  }
+  out.push('</tr>')
+  return out.join('')
+}
+function diffMultiColumnDeleteRow(
+  oldHtml: string,
+  newHtml: string,
+  oldRow: RowRange,
+  newRow: RowRange,
+  k: number,
+  diffCell: DiffCellFn
+): string {
+  const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml)
+  const deleted = new Set(deletedPositions)
+  const out: string[] = [rowHeaderSlice(newHtml, newRow)]
+  let newIdx = 0
+  for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
+    if (deleted.has(oldIdx)) {
+      out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], 'del', diffCell))
+      continue
+    }
+    out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell))
+    newIdx++
+  }
+  out.push('</tr>')
+  return out.join('')
+}
+function findBestColumnInsertPositions(
+  oldRow: RowRange,
+  newRow: RowRange,
+  k: number,
+  oldHtml: string,
+  newHtml: string
+): number[] {
+  let bestPositions: number[] = []
+  let bestScore = -1
+  for (const combo of combinationsOfRange(newRow.cells.length, k)) {
+    const inserted = new Set(combo)
+    let score = 0
+    let oldIdx = 0
+    for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
+      if (inserted.has(newIdx)) continue
+      score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
+      oldIdx++
+    }
+    if (score > bestScore) {
+      bestScore = score
+      bestPositions = combo
+    }
+  }
+  return bestPositions
+}
+function findBestColumnDeletePositions(
+  oldRow: RowRange,
+  newRow: RowRange,
+  k: number,
+  oldHtml: string,
+  newHtml: string
+): number[] {
+  let bestPositions: number[] = []
+  let bestScore = -1
+  for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
+    const deleted = new Set(combo)
+    let score = 0
+    let newIdx = 0
+    for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
+      if (deleted.has(oldIdx)) continue
+      score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)
+      newIdx++
+    }
+    if (score > bestScore) {
+      bestScore = score
+      bestPositions = combo
+    }
+  }
+  return bestPositions
+}
+/**
+ * Yields every ascending combination of `k` distinct integers from
+ * [0, n), in lexicographic order, and nothing at all when `k` is 0 or
+ * `k > n`. Iterative (no recursion); working state stays at O(k).
+ */
+function* combinationsOfRange(n: number, k: number): IterableIterator<number[]> {
+  if (k === 0 || k > n) return
+  const indices = Array.from({ length: k }, (_, i) => i)
+  while (true) {
+    yield indices.slice()
+    let i = k - 1
+    while (i >= 0 && indices[i] === n - k + i) i--
+    if (i < 0) return
+    indices[i]++
+    for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1
+  }
+}
 /**
  * Try to align cells by logical column position (sum of colspans). When
  * one side has a colspan'd cell that absorbs multiple cells on the other
@@ -887,18 +1115,41 @@ function pairSimilarUnmatched(
 }
 /**
- * Character-level similarity using shared prefix + suffix as a fraction
- * of the longer string. Catches "single edit somewhere in a long row"
- * (which token-Jaccard misses on short rows) while still correctly
- * rejecting rows with no positional overlap. HTML tags are stripped to
- * keep the comparison content-focused.
+ * Combined similarity metric used for both row-level and cell-level
+ * fuzzy pairing. Returns the MAX of two complementary metrics:
+ *
+ *   1. **Character prefix+suffix similarity** — fraction of the longer
+ *      string covered by shared prefix + shared suffix. Catches small
+ *      edits in the middle of a string (one word changed in a row).
+ *      Misses cases where the bulk of common content is in the middle
+ *      and the ends differ.
+ *
+ *   2. **Token Jaccard similarity** — intersection-over-union of the
+ *      sets of distinct whitespace-split tokens (repeats count once).
+ *      Catches "most of the content is the same but bookended by
+ *      different bits" — e.g. a row whose only edit is a column added
+ *      at the start and another at the end, where the ~50 chars in the
+ *      middle that DO match would be invisible to prefix+suffix.
+ *
+ * Either metric exceeding the threshold means pair. Neither alone is
+ * sufficient for the full range of legal-doc edits we see in
+ * production tables.
  */
 function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newHtml: string): number {
-  const a = rowText(oldHtml, oldRow)
-  const b = rowText(newHtml, newRow)
+  return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow))
+}
+function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
+  return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell))
+}
+function textSimilarity(a: string, b: string): number {
   if (a === b) return 1
   if (a.length === 0 || b.length === 0) return 0
+  return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b))
+}
+function charPrefixSuffixSimilarity(a: string, b: string): number {
   let prefix = 0
   const minLen = Math.min(a.length, b.length)
   while (prefix < minLen && a[prefix] === b[prefix]) prefix++
@@ -915,6 +1166,18 @@ function rowSimilarity(oldRow: RowRange, newRow: RowRange, oldHtml: string, newH
   return (prefix + suffix) / Math.max(a.length, b.length)
 }
+function tokenJaccardSimilarity(a: string, b: string): number {
+  const tokensA = new Set(a.split(/\s+/).filter(Boolean))
+  const tokensB = new Set(b.split(/\s+/).filter(Boolean))
+  if (tokensA.size === 0 && tokensB.size === 0) return 1
+  let intersection = 0
+  for (const t of tokensA) {
+    if (tokensB.has(t)) intersection++
+  }
+  const union = tokensA.size + tokensB.size - intersection
+  return union === 0 ? 0 : intersection / union
+}
 function rowText(html: string, row: RowRange): string {
   const parts: string[] = []
   for (const cell of row.cells) {
@@ -923,34 +1186,6 @@ function rowText(html: string, row: RowRange): string {
   return parts.join(' ').replace(/\s+/g, ' ').trim().toLowerCase()
 }
-/**
- * Character-level prefix+suffix similarity for a single cell's text
- * content. Same metric as rowSimilarity, scoped to one cell so we can
- * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
- * a column add in the same row).
- */
-function cellSimilarity(oldCell: CellRange, newCell: CellRange, oldHtml: string, newHtml: string): number {
-  const a = cellText(oldHtml, oldCell)
-  const b = cellText(newHtml, newCell)
-  if (a === b) return 1
-  if (a.length === 0 || b.length === 0) return 0
-  let prefix = 0
-  const minLen = Math.min(a.length, b.length)
-  while (prefix < minLen && a[prefix] === b[prefix]) prefix++
-  let suffix = 0
-  while (
-    suffix < a.length - prefix &&
-    suffix < b.length - prefix &&
-    a[a.length - 1 - suffix] === b[b.length - 1 - suffix]
-  ) {
-    suffix++
-  }
-  return (prefix + suffix) / Math.max(a.length, b.length)
-}
 function cellText(html: string, cell: CellRange): string {
   return html
     .slice(cell.contentStart, cell.contentEnd)