@createiq/htmldiff 1.2.0-beta.4 → 1.2.0-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@createiq/htmldiff",
3
- "version": "1.2.0-beta.4",
3
+ "version": "1.2.0-beta.5",
4
4
  "description": "TypeScript port of htmldiff.net",
5
5
  "type": "module",
6
6
  "author": "Mathew Mannion <mathew.mannion@linklaters.com>",
@@ -1,4 +1,4 @@
1
- import { lcsAlign, textSimilarity } from './Alignment'
1
+ import { type Alignment, lcsAlign, pairSimilarUnmatched, textSimilarity } from './Alignment'
2
2
  import { injectClass, parseOpeningTagAt } from './HtmlScanner'
3
3
  import {
4
4
  type CellRange,
@@ -8,6 +8,7 @@ import {
8
8
  PLACEHOLDER_SUFFIX,
9
9
  type RowRange,
10
10
  rowKey,
11
+ rowText,
11
12
  sameDimensions,
12
13
  spliceString,
13
14
  type TableRange,
@@ -143,8 +144,14 @@ function preprocessByContent(
143
144
  const cKeys = cTables.map(t => tableKey(cpLatest, t))
144
145
  const mKeys = mTables.map(t => tableKey(meCurrent, t))
145
146
 
146
- const alignCp = lcsAlign(gKeys, cKeys)
147
- const alignMe = lcsAlign(gKeys, mKeys)
147
+ // Exact tableKey LCS, then fuzzy-pair unmatched runs by content
148
+ // similarity. Without this, a table whose cells were edited (but
149
+ // not its overall shape) fails the exact tableKey match and the
150
+ // table-level aligner pulls it apart into a whole-table del + a
151
+ // whole-table ins. Same fuzzy pass `TableDiff` uses for the 2-way
152
+ // path — `pairSimilarTablesThreeWay` is defined below.
153
+ const alignCp = pairSimilarTablesThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, gTables, cTables)
154
+ const alignMe = pairSimilarTablesThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, gTables, mTables)
148
155
 
149
156
  // Maps: genesisIdx → matching cpIdx (-1 if none); cpIdx → matching genesisIdx; etc.
150
157
  const gToCp = new Array<number>(gTables.length).fill(-1)
@@ -342,6 +349,76 @@ function tableKey(html: string, table: TableRange): string {
342
349
  return html.slice(table.tableStart, table.tableEnd).replace(/\s+/g, ' ').trim()
343
350
  }
344
351
 
352
+ /**
353
+ * Character-level similarity above which the three-way aligner treats
354
+ * two rows / tables as "the same logical entry, edited" rather than
355
+ * an unrelated delete + insert. Matched to TableDiff's
356
+ * `ROW_FUZZY_THRESHOLD` / `CELL_FUZZY_THRESHOLD` so 2-way and 3-way
357
+ * agree on which pairings are reachable; if a row's content overlap
358
+ * is enough to fool the 2-way diff into pairing, it should also be
359
+ * enough for 3-way.
360
+ */
361
+ const THREE_WAY_FUZZY_THRESHOLD = 0.5
362
+
363
+ /**
364
+ * Run the same fuzzy-pairing pass that `TableDiff.pairSimilarUnmatchedRows`
365
+ * applies after its exact-LCS, but against one side of the genesis
366
+ * spine (either cp or me). The genesis tables/rows are always the
367
+ * "old" side; `newTable` is the cp or me table being aligned. Returns
368
+ * the enriched alignment with additional paired entries.
369
+ *
370
+ * Cell-count guard: only fuzzy-pair when both rows have the same cell
371
+ * count. Without this guard an asymmetric restructure — e.g. CP and
372
+ * Me both added a different column — leads to ONE side fuzzy-pairing
373
+ * its row with genesis (content overlap above threshold) while the
374
+ * other side falls below threshold. That mismatch routes through
375
+ * `diffTableStructural`'s "Me dropped, CP kept" (or the mirror)
376
+ * branch, which emits CP's row as a Me-attributed deletion. In
377
+ * cp-only mode `stripMeAttributedMarkers` then removes the row
378
+ * entirely and CP's edit vanishes from the view — exactly the
379
+ * content-loss case we're meant to prevent. Restricting fuzzy
380
+ * pairing to same-shape rows preserves the common case (single cell
381
+ * edit, identical row shape) while pushing structural mismatches
382
+ * back to the boundary-insertion path that emits both sides
383
+ * explicitly.
384
+ */
385
+ function pairSimilarRowsThreeWay(
386
+ alignment: Alignment[],
387
+ genesis: string,
388
+ newHtml: string,
389
+ oldTable: TableRange,
390
+ newTable: TableRange
391
+ ): Alignment[] {
392
+ const oldTexts = oldTable.rows.map(r => rowText(genesis, r))
393
+ const newTexts = newTable.rows.map(r => rowText(newHtml, r))
394
+ return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) => {
395
+ if (oldTable.rows[oldIdx].cells.length !== newTable.rows[newIdx].cells.length) return 0
396
+ return textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
397
+ })
398
+ }
399
+
400
+ /**
401
+ * Table-level counterpart: after `lcsAlign(gKeys, otherKeys)` over
402
+ * full table HTML keys, fuzzy-pair unmatched table runs by their
403
+ * row-text-concatenated content. Without this, a table whose body
404
+ * was edited (but not its outer shape) fails the exact-key match
405
+ * and the preprocessing emits whole-table del + whole-table ins
406
+ * instead of recursing into per-cell three-way diffs.
407
+ */
408
+ function pairSimilarTablesThreeWay(
409
+ alignment: Alignment[],
410
+ oldHtml: string,
411
+ newHtml: string,
412
+ oldTables: TableRange[],
413
+ newTables: TableRange[]
414
+ ): Alignment[] {
415
+ const oldTexts = oldTables.map(t => t.rows.map(r => rowText(oldHtml, r)).join(' '))
416
+ const newTexts = newTables.map(t => t.rows.map(r => rowText(newHtml, r)).join(' '))
417
+ return pairSimilarUnmatched(alignment, THREE_WAY_FUZZY_THRESHOLD, (oldIdx, newIdx) =>
418
+ textSimilarity(oldTexts[oldIdx], newTexts[newIdx])
419
+ )
420
+ }
421
+
345
422
  // ────────────────────────────────────────────────────────────────────────────
346
423
  // Per-table diff: positional cells or row-level structural change.
347
424
 
@@ -426,8 +503,17 @@ function diffTableStructural(
426
503
  const cKeys = tC.rows.map(r => rowKey(cpLatest, r))
427
504
  const mKeys = tM.rows.map(r => rowKey(meCurrent, r))
428
505
 
429
- const alignCp = lcsAlign(gKeys, cKeys)
430
- const alignMe = lcsAlign(gKeys, mKeys)
506
+ // Exact LCS first, then fuzzy-pair remaining unmatched runs. Without
507
+ // the fuzzy pass, a row where CP edited just a single cell's text
508
+ // produces no key match — the row aligner emits the genesis row as
509
+ // CP-deleted AND CP's reshaped row as inserted, when a cell-level
510
+ // diff against the paired row would render the edit far more
511
+ // legibly. The 2-way path (`TableDiff.pairSimilarUnmatchedRows`)
512
+ // has done this since inception; bringing the three-way path in
513
+ // step removes the asymmetry where the cp-only / all-changes view
514
+ // looks markedly worse than plain 2-way for ordinary cell edits.
515
+ const alignCp = pairSimilarRowsThreeWay(lcsAlign(gKeys, cKeys), genesis, cpLatest, tG, tC)
516
+ const alignMe = pairSimilarRowsThreeWay(lcsAlign(gKeys, mKeys), genesis, meCurrent, tG, tM)
431
517
 
432
518
  // genesisIdx → matching cpIdx (-1 if cp deleted this row)
433
519
  const gToCp = new Array<number>(tG.rows.length).fill(-1)
@@ -288,6 +288,34 @@ describe('HtmlDiff.executeThreeWay (tables, genesis-spine)', () => {
288
288
  )
289
289
  })
290
290
 
291
+ it('CP edited one cell in a row (same shape) — fuzzy-pairs and emits a cell-level diff, not whole-row del+ins', () => {
292
+ // Regression: the 3-way row aligner only did exact lcsAlign over
293
+ // rowKey, so a row where CP edited a single cell's text produced
294
+ // no key match and the algorithm split the row into a whole-row
295
+ // delete + whole-row insert. The 2-way path has always run a
296
+ // fuzzy-pairing pass after lcsAlign; bringing the 3-way path in
297
+ // step removes the asymmetry where cp-only / all-changes views
298
+ // looked materially worse than 2-way for ordinary cell edits.
299
+ //
300
+ // Same-shape genesis/cp/me; CP edited the middle cell's text.
301
+ // Me === genesis. Expect a paired row with cell-level cp-ins
302
+ // markup, NOT two distinct whole-row entries.
303
+ const out = HtmlDiff.executeThreeWay(
304
+ '<table><tr><td>Party A</td><td>old details</td><td>kept</td></tr></table>',
305
+ '<table><tr><td>Party A</td><td>new details</td><td>kept</td></tr></table>',
306
+ '<table><tr><td>Party A</td><td>old details</td><td>kept</td></tr></table>'
307
+ )
308
+ // CP's edit lives inside the row, not as a parallel whole-row
309
+ // delete-then-insert. Whole-row markers would carry `class='diffdel ...'`
310
+ // or `class='diffins ...'` on the `<tr>` itself.
311
+ expect(out).not.toMatch(/<tr [^>]*class=['"]diffdel/)
312
+ expect(out).not.toMatch(/<tr [^>]*class=['"]diffins/)
313
+ expect(out).toContain('Party A')
314
+ expect(out).toContain("data-author='cp'")
315
+ // Me === genesis so any me attribution would indicate a swap.
316
+ expect(out).not.toContain("data-author='me'")
317
+ })
318
+
291
319
  it('cell-count mismatch: both sides restructured differently — both ins rows attributed', () => {
292
320
  // Genesis 2 cells, CP 3 cells, Me 4 cells. Neither side keeps
293
321
  // the genesis shape, so both restructures must be visible.