@createiq/htmldiff 1.0.5-beta.2 → 1.0.5-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/HtmlDiff.mjs CHANGED
@@ -447,8 +447,108 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
447
447
  if (oldRow.cells.length === newRow.cells.length) return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
448
448
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
449
449
  if (colspanAligned !== null) return colspanAligned;
450
+ const delta = newRow.cells.length - oldRow.cells.length;
451
+ const absDelta = Math.abs(delta);
452
+ if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
453
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell);
454
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell);
455
+ }
450
456
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
451
457
  }
458
+ const MAX_COLUMN_DELTA = 6;
459
+ const MAX_COLUMN_SEARCH_WIDTH = 40;
460
/**
 * Renders a row whose new version has `k` more cells than its old version.
 *
 * The column positions treated as insertions come from
 * findBestColumnInsertPositions. Cells at those positions are emitted via
 * emitFullCell with the "ins" marker; every other new cell is paired, in
 * order, with the next unconsumed old cell and emitted via emitDiffedCell.
 *
 * @param {string} oldHtml - Source HTML the old row's offsets refer to.
 * @param {string} newHtml - Source HTML the new row's offsets refer to.
 * @param {{cells: Array}} oldRow - Row descriptor from the old table.
 * @param {{cells: Array}} newRow - Row descriptor from the new table.
 * @param {number} k - How many more cells the new row has than the old row.
 * @param {Function} diffCell - Passed through to the cell emitters.
 * @returns {string} The row header, the emitted cells and a closing `</tr>`.
 */
function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
  const insertedColumns = new Set(findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml));
  const pieces = [rowHeaderSlice(newHtml, newRow)];
  const newCells = newRow.cells;
  // Index of the next old cell that has not been paired yet.
  let oldCursor = 0;
  for (let column = 0; column < newCells.length; column += 1) {
    const newCell = newCells[column];
    if (insertedColumns.has(column)) {
      pieces.push(emitFullCell(newHtml, newCell, "ins", diffCell));
      continue;
    }
    pieces.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldCursor], newCell, diffCell));
    oldCursor += 1;
  }
  pieces.push("</tr>");
  return pieces.join("");
}
481
/**
 * Renders a row whose new version has `k` fewer cells than its old version.
 *
 * The column positions treated as deletions come from
 * findBestColumnDeletePositions. Old cells at those positions are emitted
 * via emitFullCell with the "del" marker; every other old cell is paired,
 * in order, with the next unconsumed new cell and emitted via
 * emitDiffedCell. The row header is taken from the new row.
 *
 * @param {string} oldHtml - Source HTML the old row's offsets refer to.
 * @param {string} newHtml - Source HTML the new row's offsets refer to.
 * @param {{cells: Array}} oldRow - Row descriptor from the old table.
 * @param {{cells: Array}} newRow - Row descriptor from the new table.
 * @param {number} k - How many more cells the old row has than the new row.
 * @param {Function} diffCell - Passed through to the cell emitters.
 * @returns {string} The row header, the emitted cells and a closing `</tr>`.
 */
function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
  const removedColumns = new Set(findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml));
  const pieces = [rowHeaderSlice(newHtml, newRow)];
  const oldCells = oldRow.cells;
  // Index of the next new cell that has not been paired yet.
  let newCursor = 0;
  for (let column = 0; column < oldCells.length; column += 1) {
    const oldCell = oldCells[column];
    if (removedColumns.has(column)) {
      pieces.push(emitFullCell(oldHtml, oldCell, "del", diffCell));
    } else {
      pieces.push(emitDiffedCell(oldHtml, newHtml, oldCell, newRow.cells[newCursor], diffCell));
      newCursor += 1;
    }
  }
  pieces.push("</tr>");
  return pieces.join("");
}
497
/**
 * Chooses which `k` positions of the new row are best explained as inserted
 * columns.
 *
 * Every combination yielded by combinationsOfRange(newCount, k) is scored by
 * skipping the candidate positions and summing cellSimilarity over the
 * remaining new cells paired in order with the old cells. The combination
 * with the highest total wins; ties keep the earliest combination yielded,
 * because only a strictly greater score replaces the current best.
 *
 * The same (old cell, new cell) pair appears in many combinations, so each
 * pair's similarity is computed once and cached for the duration of the
 * call instead of being recomputed per combination.
 *
 * @param {{cells: Array}} oldRow - Row descriptor from the old table.
 * @param {{cells: Array}} newRow - Row descriptor from the new table.
 * @param {number} k - Number of inserted columns to locate.
 * @param {string} oldHtml - Source HTML for the old row's cells.
 * @param {string} newHtml - Source HTML for the new row's cells.
 * @returns {number[]} Best-scoring positions in the new row, or an empty
 *   array when no combination was yielded.
 */
function findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml) {
  const oldCells = oldRow.cells;
  const newCells = newRow.cells;
  const newCount = newCells.length;
  // Cache keyed by a flat (oldIdx, newIdx) index; values are similarity scores.
  const pairScores = new Map();
  const pairSimilarity = (oldIdx, newIdx) => {
    const key = oldIdx * newCount + newIdx;
    let value = pairScores.get(key);
    if (value === undefined) {
      value = cellSimilarity(oldCells[oldIdx], newCells[newIdx], oldHtml, newHtml);
      pairScores.set(key, value);
    }
    return value;
  };

  let bestPositions = [];
  let bestScore = -1;
  for (const combo of combinationsOfRange(newCount, k)) {
    const inserted = new Set(combo);
    let score = 0;
    let oldIdx = 0;
    for (let newIdx = 0; newIdx < newCount; newIdx++) {
      if (inserted.has(newIdx)) continue;
      score += pairSimilarity(oldIdx, newIdx);
      oldIdx++;
    }
    if (score > bestScore) {
      bestScore = score;
      bestPositions = combo;
    }
  }
  return bestPositions;
}
516
/**
 * Chooses which `k` positions of the old row are best explained as deleted
 * columns.
 *
 * Every combination yielded by combinationsOfRange(oldCount, k) is scored by
 * skipping the candidate positions and summing cellSimilarity over the
 * remaining old cells paired in order with the new cells. The combination
 * with the highest total wins; ties keep the earliest combination yielded,
 * because only a strictly greater score replaces the current best.
 *
 * The same (old cell, new cell) pair appears in many combinations, so each
 * pair's similarity is computed once and cached for the duration of the
 * call instead of being recomputed per combination.
 *
 * @param {{cells: Array}} oldRow - Row descriptor from the old table.
 * @param {{cells: Array}} newRow - Row descriptor from the new table.
 * @param {number} k - Number of deleted columns to locate.
 * @param {string} oldHtml - Source HTML for the old row's cells.
 * @param {string} newHtml - Source HTML for the new row's cells.
 * @returns {number[]} Best-scoring positions in the old row, or an empty
 *   array when no combination was yielded.
 */
function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
  const oldCells = oldRow.cells;
  const newCells = newRow.cells;
  const oldCount = oldCells.length;
  const newCount = newCells.length;
  // Cache keyed by a flat (oldIdx, newIdx) index; values are similarity scores.
  const pairScores = new Map();
  const pairSimilarity = (oldIdx, newIdx) => {
    const key = oldIdx * (newCount + 1) + newIdx;
    let value = pairScores.get(key);
    if (value === undefined) {
      value = cellSimilarity(oldCells[oldIdx], newCells[newIdx], oldHtml, newHtml);
      pairScores.set(key, value);
    }
    return value;
  };

  let bestPositions = [];
  let bestScore = -1;
  for (const combo of combinationsOfRange(oldCount, k)) {
    const deleted = new Set(combo);
    let score = 0;
    let newIdx = 0;
    for (let oldIdx = 0; oldIdx < oldCount; oldIdx++) {
      if (deleted.has(oldIdx)) continue;
      score += pairSimilarity(oldIdx, newIdx);
      newIdx++;
    }
    if (score > bestScore) {
      bestScore = score;
      bestPositions = combo;
    }
  }
  return bestPositions;
}
535
/**
 * Yields every ascending combination of `k` distinct integers drawn from
 * [0, n), in lexicographic order. Each yielded array is a fresh copy, so
 * callers may keep or mutate it.
 *
 * Implemented iteratively: the only working state is one array of `k`
 * indices that is advanced in place between yields.
 *
 * Nothing is yielded when `k` is not a positive integer, when `n` is not an
 * integer, or when `k > n`. Rejecting non-integers matters: a fractional
 * `k` would otherwise never reach the termination check and loop forever.
 *
 * @param {number} n - Exclusive upper bound of the integer range.
 * @param {number} k - Size of each combination.
 * @yields {number[]} The next combination, sorted ascending.
 */
function* combinationsOfRange(n, k) {
  if (!Number.isInteger(n) || !Number.isInteger(k)) return;
  if (k <= 0 || k > n) return;
  const indices = Array.from({ length: k }, (_, i) => i);
  while (true) {
    yield indices.slice();
    // Find the rightmost index that has not yet reached its maximum value.
    let i = k - 1;
    while (i >= 0 && indices[i] === n - k + i) i--;
    if (i < 0) return;
    indices[i]++;
    // Reset everything to its right to the smallest ascending continuation.
    for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1;
  }
}
452
552
  /**
453
553
  * Try to align cells by logical column position (sum of colspans). When
454
554
  * one side has a colspan'd cell that absorbs multiple cells on the other
@@ -730,17 +830,38 @@ function pairSimilarUnmatched(alignment, threshold, similarity) {
730
830
  return result;
731
831
  }
732
832
  /**
733
- * Character-level similarity using shared prefix + suffix as a fraction
734
- * of the longer string. Catches "single edit somewhere in a long row"
735
- * (which token-Jaccard misses on short rows) while still correctly
736
- * rejecting rows with no positional overlap. HTML tags are stripped to
737
- * keep the comparison content-focused.
833
+ * Combined similarity metric used for both row-level and cell-level
834
+ * fuzzy pairing. Returns the MAX of two complementary metrics:
835
+ *
836
+ * 1. **Character prefix+suffix similarity** — fraction of the longer
837
+ * string covered by shared prefix + shared suffix. Catches small
838
+ * edits in the middle of a string (one word changed in a row).
839
+ * Misses cases where the bulk of common content is in the middle
840
+ * and the ends differ.
841
+ *
842
+ * 2. **Token Jaccard similarity** — intersection-over-union of the
843
+ * whitespace-split tokens. Catches "most of the content is the
844
+ * same but bookended by different bits" — e.g. a row whose only
845
+ * edit is a column added at the start and another at the end,
846
+ * where the ~50 chars in the middle that DO match would be
847
+ * invisible to prefix+suffix.
848
+ *
849
+ * Either metric exceeding the threshold means pair. Neither alone is
850
+ * sufficient for the full range of legal-doc edits we see in
851
+ * production tables.
738
852
  */
739
853
  function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
740
- const a = rowText(oldHtml, oldRow);
741
- const b = rowText(newHtml, newRow);
854
+ return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow));
855
+ }
856
/**
 * Similarity score for a pair of cells: each cell's content is reduced to
 * text with cellText and the two texts are compared with textSimilarity.
 *
 * @param {object} oldCell - Cell descriptor whose offsets refer to `oldHtml`.
 * @param {object} newCell - Cell descriptor whose offsets refer to `newHtml`.
 * @param {string} oldHtml - Source HTML containing the old cell.
 * @param {string} newHtml - Source HTML containing the new cell.
 * @returns {number} Whatever textSimilarity returns for the two texts.
 */
function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
  const oldText = cellText(oldHtml, oldCell);
  const newText = cellText(newHtml, newCell);
  return textSimilarity(oldText, newText);
}
859
/**
 * Similarity of two strings.
 *
 * Identical strings score 1. If exactly one side is empty the score is 0
 * (two empty strings are identical and already scored 1). Otherwise the
 * score is the larger of charPrefixSuffixSimilarity and
 * tokenJaccardSimilarity for the pair.
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {number} The combined similarity score.
 */
function textSimilarity(a, b) {
  if (a === b) {
    return 1;
  }
  const eitherEmpty = a.length === 0 || b.length === 0;
  if (eitherEmpty) {
    return 0;
  }
  const positional = charPrefixSuffixSimilarity(a, b);
  const tokenBased = tokenJaccardSimilarity(a, b);
  return Math.max(positional, tokenBased);
}
864
+ function charPrefixSuffixSimilarity(a, b) {
744
865
  let prefix = 0;
745
866
  const minLen = Math.min(a.length, b.length);
746
867
  while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
@@ -748,29 +869,20 @@ function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
748
869
  while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
749
870
  return (prefix + suffix) / Math.max(a.length, b.length);
750
871
  }
872
/**
 * Jaccard similarity (intersection over union) of the distinct
 * whitespace-separated tokens of two strings.
 *
 * Two strings with no tokens at all (empty or whitespace-only) score 1.
 *
 * @param {string} a - First text.
 * @param {string} b - Second text.
 * @returns {number} A value in [0, 1]; 1 means identical token sets.
 */
function tokenJaccardSimilarity(a, b) {
  const tokensA = new Set(a.split(/\s+/).filter(Boolean));
  const tokensB = new Set(b.split(/\s+/).filter(Boolean));
  if (tokensA.size === 0 && tokensB.size === 0) return 1;
  let intersection = 0;
  for (const token of tokensA) {
    if (tokensB.has(token)) intersection++;
  }
  // At least one set is non-empty here, so the union is never zero and no
  // division guard is needed.
  return intersection / (tokensA.size + tokensB.size - intersection);
}
751
881
  function rowText(html, row) {
752
882
  const parts = [];
753
883
  for (const cell of row.cells) parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " "));
754
884
  return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
755
885
  }
756
- /**
757
- * Character-level prefix+suffix similarity for a single cell's text
758
- * content. Same metric as rowSimilarity, scoped to one cell so we can
759
- * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
760
- * a column add in the same row).
761
- */
762
- function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
763
- const a = cellText(oldHtml, oldCell);
764
- const b = cellText(newHtml, newCell);
765
- if (a === b) return 1;
766
- if (a.length === 0 || b.length === 0) return 0;
767
- let prefix = 0;
768
- const minLen = Math.min(a.length, b.length);
769
- while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
770
- let suffix = 0;
771
- while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
772
- return (prefix + suffix) / Math.max(a.length, b.length);
773
- }
774
886
  function cellText(html, cell) {
775
887
  return html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
776
888
  }