@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/HtmlDiff.cjs CHANGED
@@ -447,8 +447,108 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
447
447
  if (oldRow.cells.length === newRow.cells.length) return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
448
448
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
449
449
  if (colspanAligned !== null) return colspanAligned;
450
+ const delta = newRow.cells.length - oldRow.cells.length;
451
+ const absDelta = Math.abs(delta);
452
+ if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
453
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell);
454
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell);
455
+ }
450
456
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
451
457
  }
458
+ const MAX_COLUMN_DELTA = 6;
459
+ const MAX_COLUMN_SEARCH_WIDTH = 40;
460
+ /**
461
+ * For a row where new has K more cells than old, find the K column
462
+ * positions in new where cells were inserted by scanning all C(newCount,
463
+ * K) combinations and picking the one that maximises positional content
464
+ * similarity with the remaining cells. The inserted cells are emitted
465
+ * with diff markers; the rest are aligned positionally with content
466
+ * diff for matched pairs.
467
+ */
468
+ function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
469
+ const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml);
470
+ const inserted = new Set(insertedPositions);
471
+ const out = [rowHeaderSlice(newHtml, newRow)];
472
+ let oldIdx = 0;
473
+ for (let c = 0; c < newRow.cells.length; c++) if (inserted.has(c)) out.push(emitFullCell(newHtml, newRow.cells[c], "ins", diffCell));
474
+ else {
475
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell));
476
+ oldIdx++;
477
+ }
478
+ out.push("</tr>");
479
+ return out.join("");
480
+ }
481
+ function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
482
+ const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml);
483
+ const deleted = new Set(deletedPositions);
484
+ const out = [rowHeaderSlice(newHtml, newRow)];
485
+ let newIdx = 0;
486
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
487
+ if (deleted.has(oldIdx)) {
488
+ out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], "del", diffCell));
489
+ continue;
490
+ }
491
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell));
492
+ newIdx++;
493
+ }
494
+ out.push("</tr>");
495
+ return out.join("");
496
+ }
497
+ function findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml) {
498
+ let bestPositions = [];
499
+ let bestScore = -1;
500
+ for (const combo of combinationsOfRange(newRow.cells.length, k)) {
501
+ const inserted = new Set(combo);
502
+ let score = 0;
503
+ let oldIdx = 0;
504
+ for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
505
+ if (inserted.has(newIdx)) continue;
506
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
507
+ oldIdx++;
508
+ }
509
+ if (score > bestScore) {
510
+ bestScore = score;
511
+ bestPositions = combo;
512
+ }
513
+ }
514
+ return bestPositions;
515
+ }
516
+ function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
517
+ let bestPositions = [];
518
+ let bestScore = -1;
519
+ for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
520
+ const deleted = new Set(combo);
521
+ let score = 0;
522
+ let newIdx = 0;
523
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
524
+ if (deleted.has(oldIdx)) continue;
525
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
526
+ newIdx++;
527
+ }
528
+ if (score > bestScore) {
529
+ bestScore = score;
530
+ bestPositions = combo;
531
+ }
532
+ }
533
+ return bestPositions;
534
+ }
535
+ /**
536
+ * Yields all sorted-ascending combinations of `k` distinct integers
537
+ * from [0, n). Iterative implementation avoids recursion overhead and
538
+ * keeps memory at O(k). Yields nothing when `k` is 0 or `k > n`.
539
+ */
540
+ function* combinationsOfRange(n, k) {
541
+ if (k === 0 || k > n) return;
542
+ const indices = Array.from({ length: k }, (_, i) => i);
543
+ while (true) {
544
+ yield indices.slice();
545
+ let i = k - 1;
546
+ while (i >= 0 && indices[i] === n - k + i) i--;
547
+ if (i < 0) return;
548
+ indices[i]++;
549
+ for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1;
550
+ }
551
+ }
452
552
  /**
453
553
  * Try to align cells by logical column position (sum of colspans). When
454
554
  * one side has a colspan'd cell that absorbs multiple cells on the other
@@ -544,7 +644,7 @@ function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
544
644
  return out.join("");
545
645
  }
546
646
  function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
547
- const alignment = lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell)));
647
+ const alignment = pairSimilarUnmatchedCells(lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell))), oldRow, newRow, oldHtml, newHtml);
548
648
  const out = [];
549
649
  out.push(rowHeaderSlice(newHtml, newRow));
550
650
  for (const align of alignment) if (align.oldIdx !== null && align.newIdx !== null) {
@@ -643,9 +743,16 @@ function rowHeaderSlice(html, row) {
643
743
  if (row.cells.length === 0) return html.slice(row.rowStart, opening.end);
644
744
  return html.slice(row.rowStart, row.cells[0].cellStart);
645
745
  }
646
- /** Jaccard similarity threshold above which we treat two rows as "the same row, edited". */
746
+ /** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
647
747
  const ROW_FUZZY_THRESHOLD = .5;
648
748
  /**
749
+ * Threshold for "this cell is a content-edit of that cell." Tuned the same
750
+ * as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
751
+ * content typically ARE the same logical cell with a body edit, so 0.5
752
+ * works for both granularities in practice.
753
+ */
754
+ const CELL_FUZZY_THRESHOLD = .5;
755
+ /**
649
756
  * After exact LCS, scan the alignment for runs of "old deleted, then new
650
757
  * inserted" (or vice versa) and pair entries whose content is similar
651
758
  * enough to be treated as an edit rather than a delete+insert. This keeps
@@ -654,6 +761,26 @@ const ROW_FUZZY_THRESHOLD = .5;
654
761
  * expect from a typical track-changes view.
655
762
  */
656
763
  function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
764
+ return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) => rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml));
765
+ }
766
+ function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
767
+ return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) => cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml));
768
+ }
769
+ /**
770
+ * Identify pairings inside each unmatched-only run, then build the output
771
+ * alignment by walking the original and substituting paired entries at
772
+ * the *ins position* (not the del position). This keeps the result
773
+ * monotonic in newIdx — critical because the cursor-based emission
774
+ * downstream walks new's html in order. Emitting at the del position
775
+ * would be fine when del<ins in the alignment array (the typical case),
776
+ * but can violate monotonicity when there are mixed unpaired entries in
777
+ * between (column-add + row-add together, content-edit + column-add,
778
+ * etc.).
779
+ *
780
+ * Generic over what's being paired — works for both rows (by full row
781
+ * content similarity) and cells (by per-cell content similarity).
782
+ */
783
+ function pairSimilarUnmatched(alignment, threshold, similarity) {
657
784
  const pairs = /* @__PURE__ */ new Map();
658
785
  let i = 0;
659
786
  while (i < alignment.length) {
@@ -671,10 +798,10 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
671
798
  const usedIns = /* @__PURE__ */ new Set();
672
799
  for (const di of delIndices) {
673
800
  let bestIi = -1;
674
- let bestSim = ROW_FUZZY_THRESHOLD;
801
+ let bestSim = threshold;
675
802
  for (const ii of insIndices) {
676
803
  if (usedIns.has(ii)) continue;
677
- const sim = rowSimilarity(oldTable.rows[alignment[di].oldIdx], newTable.rows[alignment[ii].newIdx], oldHtml, newHtml);
804
+ const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
678
805
  if (sim > bestSim) {
679
806
  bestSim = sim;
680
807
  bestIi = ii;
@@ -703,17 +830,38 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
703
830
  return result;
704
831
  }
705
832
  /**
706
- * Character-level similarity using shared prefix + suffix as a fraction
707
- * of the longer string. Catches "single edit somewhere in a long row"
708
- * (which token-Jaccard misses on short rows) while still correctly
709
- * rejecting rows with no positional overlap. HTML tags are stripped to
710
- * keep the comparison content-focused.
833
+ * Combined similarity metric used for both row-level and cell-level
834
+ * fuzzy pairing. Returns the MAX of two complementary metrics:
835
+ *
836
+ * 1. **Character prefix+suffix similarity** — fraction of the longer
837
+ * string covered by shared prefix + shared suffix. Catches small
838
+ * edits in the middle of a string (one word changed in a row).
839
+ * Misses cases where the bulk of common content is in the middle
840
+ * and the ends differ.
841
+ *
842
+ * 2. **Token Jaccard similarity** — intersection-over-union of the
843
+ * whitespace-split tokens. Catches "most of the content is the
844
+ * same but bookended by different bits" — e.g. a row whose only
845
+ * edit is a column added at the start and another at the end,
846
+ * where the ~50 chars in the middle that DO match would be
847
+ * invisible to prefix+suffix.
848
+ *
849
+ * If either metric exceeds the threshold, the two are paired. Neither alone is
850
+ * sufficient for the full range of legal-doc edits we see in
851
+ * production tables.
711
852
  */
712
853
  function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
713
- const a = rowText(oldHtml, oldRow);
714
- const b = rowText(newHtml, newRow);
854
+ return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow));
855
+ }
856
+ function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
857
+ return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell));
858
+ }
859
+ function textSimilarity(a, b) {
715
860
  if (a === b) return 1;
716
861
  if (a.length === 0 || b.length === 0) return 0;
862
+ return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b));
863
+ }
864
+ function charPrefixSuffixSimilarity(a, b) {
717
865
  let prefix = 0;
718
866
  const minLen = Math.min(a.length, b.length);
719
867
  while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
@@ -721,11 +869,23 @@ function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
721
869
  while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
722
870
  return (prefix + suffix) / Math.max(a.length, b.length);
723
871
  }
872
+ function tokenJaccardSimilarity(a, b) {
873
+ const tokensA = new Set(a.split(/\s+/).filter(Boolean));
874
+ const tokensB = new Set(b.split(/\s+/).filter(Boolean));
875
+ if (tokensA.size === 0 && tokensB.size === 0) return 1;
876
+ let intersection = 0;
877
+ for (const t of tokensA) if (tokensB.has(t)) intersection++;
878
+ const union = tokensA.size + tokensB.size - intersection;
879
+ return union === 0 ? 0 : intersection / union;
880
+ }
724
881
  function rowText(html, row) {
725
882
  const parts = [];
726
883
  for (const cell of row.cells) parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " "));
727
884
  return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
728
885
  }
886
+ function cellText(html, cell) {
887
+ return html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
888
+ }
729
889
  /**
730
890
  * Standard LCS alignment: walks both sequences and emits a list of pairs
731
891
  * where `(oldIdx, newIdx)` are both set for matching positions, and one