@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/HtmlDiff.cjs CHANGED
@@ -544,7 +544,7 @@ function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
544
544
  return out.join("");
545
545
  }
546
546
  function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
547
- const alignment = lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell)));
547
+ const alignment = pairSimilarUnmatchedCells(lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell))), oldRow, newRow, oldHtml, newHtml);
548
548
  const out = [];
549
549
  out.push(rowHeaderSlice(newHtml, newRow));
550
550
  for (const align of alignment) if (align.oldIdx !== null && align.newIdx !== null) {
@@ -643,9 +643,16 @@ function rowHeaderSlice(html, row) {
643
643
  if (row.cells.length === 0) return html.slice(row.rowStart, opening.end);
644
644
  return html.slice(row.rowStart, row.cells[0].cellStart);
645
645
  }
646
- /** Jaccard similarity threshold above which we treat two rows as "the same row, edited". */
646
/**
 * Minimum character-level similarity score that a pair of unmatched rows
 * must exceed before they are treated as "the same row, edited" instead of
 * an unrelated delete plus insert.
 */
const ROW_FUZZY_THRESHOLD = 0.5;
648
648
  /**
649
+ * Threshold for "this cell is a content-edit of that cell." Tuned the same
650
+ * as ROW_FUZZY_THRESHOLD; cells in legal docs that share most of their
651
+ * content typically ARE the same logical cell with a body edit, so 0.5
652
+ * works for both granularities in practice.
653
+ */
654
+ const CELL_FUZZY_THRESHOLD = .5;
655
+ /**
649
656
  * After exact LCS, scan the alignment for runs of "old deleted, then new
650
657
  * inserted" (or vice versa) and pair entries whose content is similar
651
658
  * enough to be treated as an edit rather than a delete+insert. This keeps
@@ -654,6 +661,26 @@ const ROW_FUZZY_THRESHOLD = .5;
654
661
  * expect from a typical track-changes view.
655
662
  */
656
663
  function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) {
664
+ return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) => rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml));
665
+ }
666
/**
 * Cell-level fuzzy pairing: hands the alignment to pairSimilarUnmatched
 * together with CELL_FUZZY_THRESHOLD and a scorer that compares one cell of
 * the old row with one cell of the new row via cellSimilarity.
 *
 * @param alignment - Alignment entries whose `oldIdx` / `newIdx` index into
 *   `oldRow.cells` / `newRow.cells`.
 * @param oldRow - Row from the old document; only `cells` is read here.
 * @param newRow - Row from the new document; only `cells` is read here.
 * @param oldHtml - HTML source the old row's cells refer to.
 * @param newHtml - HTML source the new row's cells refer to.
 * @returns Whatever pairSimilarUnmatched returns for these arguments.
 */
function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) {
  const scoreCellPair = (oldIdx, newIdx) => {
    const oldCell = oldRow.cells[oldIdx];
    const newCell = newRow.cells[newIdx];
    return cellSimilarity(oldCell, newCell, oldHtml, newHtml);
  };
  return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, scoreCellPair);
}
669
+ /**
670
+ * Identify pairings inside each unmatched-only run, then build the output
671
+ * alignment by walking the original and substituting paired entries at
672
+ * the *ins position* (not the del position). This keeps the result
673
+ * monotonic in newIdx — critical because the cursor-based emission
674
+ * downstream walks new's html in order. Emitting at the del position
675
+ * would be fine when del<ins in the alignment array (the typical case),
676
+ * but can violate monotonicity when there are mixed unpaired entries in
677
+ * between (column-add + row-add together, content-edit + column-add,
678
+ * etc.).
679
+ *
680
+ * Generic over what's being paired — works for both rows (by full row
681
+ * content similarity) and cells (by per-cell content similarity).
682
+ */
683
+ function pairSimilarUnmatched(alignment, threshold, similarity) {
657
684
  const pairs = /* @__PURE__ */ new Map();
658
685
  let i = 0;
659
686
  while (i < alignment.length) {
@@ -671,10 +698,10 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
671
698
  const usedIns = /* @__PURE__ */ new Set();
672
699
  for (const di of delIndices) {
673
700
  let bestIi = -1;
674
- let bestSim = ROW_FUZZY_THRESHOLD;
701
+ let bestSim = threshold;
675
702
  for (const ii of insIndices) {
676
703
  if (usedIns.has(ii)) continue;
677
- const sim = rowSimilarity(oldTable.rows[alignment[di].oldIdx], newTable.rows[alignment[ii].newIdx], oldHtml, newHtml);
704
+ const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
678
705
  if (sim > bestSim) {
679
706
  bestSim = sim;
680
707
  bestIi = ii;
@@ -727,6 +754,27 @@ function rowText(html, row) {
727
754
  return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
728
755
  }
729
756
  /**
757
+ * Character-level prefix+suffix similarity for a single cell's text
758
+ * content. Same metric as rowSimilarity, scoped to one cell so we can
759
+ * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
760
+ * a column add in the same row).
761
+ */
762
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
  const oldText = cellText(oldHtml, oldCell);
  const newText = cellText(newHtml, newCell);
  // Identical normalized text (two empty cells included) is a perfect match.
  if (oldText === newText) return 1;
  // Only one side can be empty here, so the two share nothing.
  if (oldText.length === 0 || newText.length === 0) return 0;
  const shorter = Math.min(oldText.length, newText.length);
  const longer = Math.max(oldText.length, newText.length);
  // Length of the common leading run.
  let head = 0;
  for (; head < shorter; head++) {
    if (oldText[head] !== newText[head]) break;
  }
  // Length of the common trailing run, capped so that it never overlaps
  // the characters already counted in the leading run.
  const tailLimit = shorter - head;
  let tail = 0;
  for (; tail < tailLimit; tail++) {
    const oldChar = oldText[oldText.length - 1 - tail];
    const newChar = newText[newText.length - 1 - tail];
    if (oldChar !== newChar) break;
  }
  // Shared characters as a fraction of the longer text.
  return (head + tail) / longer;
}
774
/**
 * Extract a cell's comparable text: the slice of `html` between
 * `cell.contentStart` and `cell.contentEnd`, with every `<...>` tag replaced
 * by a space, whitespace runs collapsed to a single space, the result
 * trimmed and lower-cased. HTML entities are left exactly as written.
 *
 * @param {string} html - Source HTML that the cell offsets point into.
 * @param {{contentStart: number, contentEnd: number}} cell - Content offsets.
 * @returns {string} Normalized text of the cell.
 */
function cellText(html, cell) {
  const rawContent = html.slice(cell.contentStart, cell.contentEnd);
  const withoutTags = rawContent.replace(/<[^>]+>/g, " ");
  const singleSpaced = withoutTags.replace(/\s+/g, " ");
  return singleSpaced.trim().toLowerCase();
}
777
+ /**
730
778
  * Standard LCS alignment: walks both sequences and emits a list of pairs
731
779
  * where `(oldIdx, newIdx)` are both set for matching positions, and one
732
780
  * side is null for an unmatched entry on the other side. Equality uses