@createiq/htmldiff 1.0.5-beta.2 → 1.0.5-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/HtmlDiff.mjs CHANGED
@@ -411,7 +411,7 @@ function diffPositionalTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
411
411
  * sides.
412
412
  */
413
413
  function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diffCell) {
414
- const alignment = pairSimilarUnmatchedRows(lcsAlign(oldTable.rows.map((row) => rowKey(oldHtml, row)), newTable.rows.map((row) => rowKey(newHtml, row))), oldTable, newTable, oldHtml, newHtml);
414
+ const alignment = orderAlignmentForEmission(pairSimilarUnmatchedRows(lcsAlign(oldTable.rows.map((row) => rowKey(oldHtml, row)), newTable.rows.map((row) => rowKey(newHtml, row))), oldTable, newTable, oldHtml, newHtml));
415
415
  if (newTable.rows.length === 0) return rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell);
416
416
  const out = [];
417
417
  out.push(newHtml.slice(newTable.tableStart, newTable.rows[0].rowStart));
@@ -426,6 +426,72 @@ function diffStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, diff
426
426
  out.push(newHtml.slice(cursor, newTable.tableEnd));
427
427
  return out.join("");
428
428
  }
429
+ /**
430
+ * Reorders the alignment so emission produces rows in the visually-
431
+ * correct order. Each entry is assigned a fractional "position" in
432
+ * new's flow:
433
+ *
434
+ * • Preserved/paired (oldIdx, newIdx): position = newIdx.
435
+ * • Pure insert (null, newIdx): position = newIdx.
436
+ * • Pure delete (oldIdx, null): position = newIdx-of-preserved-just-
437
+ * before-this-oldIdx + 0.5. Dels at the same gap sort by oldIdx so
438
+ * they appear in old's row order. The +0.5 places dels BEFORE any
439
+ * insert at the same gap (insert at newIdx N1+1 has position N1+1
440
+ * which is > N1+0.5), giving the natural "delete first, insert
441
+ * second" reading order at a replaced position.
442
+ *
443
+ * This handles the full range:
444
+ * • Run of unpaired dels at the start (no preserved predecessor):
445
+ * position -0.5, sorted by oldIdx.
446
+ * • Dels in the middle: positioned right after their preceding
447
+ * preserved row.
448
+ * • Dels at the end (no preserved successor): positioned after the
449
+ * last preserved row.
450
+ *
451
+ * Without this reordering, a run of unpaired deletes at low alignment
452
+ * indices would be emitted at cursor = first-new-row position — putting
453
+ * all deletes before any preserved row in the output, regardless of
454
+ * where they came from in old.
455
+ */
456
+ function orderAlignmentForEmission(alignment) {
457
+ const preserved = [];
458
+ for (const a of alignment) if (a.oldIdx !== null && a.newIdx !== null) preserved.push({
459
+ oldIdx: a.oldIdx,
460
+ newIdx: a.newIdx
461
+ });
462
+ preserved.sort((a, b) => a.oldIdx - b.oldIdx);
463
+ function newIdxOfPreservedBefore(oldIdx) {
464
+ let result = -1;
465
+ for (const p of preserved) {
466
+ if (p.oldIdx >= oldIdx) break;
467
+ result = p.newIdx;
468
+ }
469
+ return result;
470
+ }
471
+ const decorated = alignment.map((a, i) => {
472
+ let primary;
473
+ let secondary;
474
+ if (a.newIdx !== null) {
475
+ primary = a.newIdx;
476
+ secondary = a.oldIdx === null ? 1 : 0;
477
+ } else {
478
+ primary = newIdxOfPreservedBefore(a.oldIdx) + .5;
479
+ secondary = a.oldIdx;
480
+ }
481
+ return {
482
+ entry: a,
483
+ primary,
484
+ secondary,
485
+ originalIdx: i
486
+ };
487
+ });
488
+ decorated.sort((a, b) => {
489
+ if (a.primary !== b.primary) return a.primary - b.primary;
490
+ if (a.secondary !== b.secondary) return a.secondary - b.secondary;
491
+ return a.originalIdx - b.originalIdx;
492
+ });
493
+ return decorated.map((d) => d.entry);
494
+ }
429
495
  function rebuildStructurallyAlignedTable(oldHtml, newHtml, oldTable, newTable, alignment, diffCell) {
430
496
  const out = [];
431
497
  out.push(headerSlice(newHtml, newTable, oldHtml, oldTable));
@@ -447,8 +513,108 @@ function diffPreservedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
447
513
  if (oldRow.cells.length === newRow.cells.length) return diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell);
448
514
  const colspanAligned = diffColspanChangedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
449
515
  if (colspanAligned !== null) return colspanAligned;
516
+ const delta = newRow.cells.length - oldRow.cells.length;
517
+ const absDelta = Math.abs(delta);
518
+ if (absDelta > 0 && absDelta <= MAX_COLUMN_DELTA && Math.max(oldRow.cells.length, newRow.cells.length) <= MAX_COLUMN_SEARCH_WIDTH) {
519
+ if (delta > 0) return diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, delta, diffCell);
520
+ return diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, -delta, diffCell);
521
+ }
450
522
  return diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell);
451
523
  }
524
+ const MAX_COLUMN_DELTA = 6;
525
+ const MAX_COLUMN_SEARCH_WIDTH = 40;
526
+ /**
527
+ * For a row where new has K more cells than old, find the K column
528
+ * positions in new where cells were inserted by scanning all C(newCount,
529
+ * K) combinations and picking the one that maximises positional content
530
+ * similarity with the remaining cells. The inserted cells are emitted
531
+ * with diff markers; the rest are aligned positionally with content
532
+ * diff for matched pairs.
533
+ */
534
+ function diffMultiColumnAddRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
535
+ const insertedPositions = findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml);
536
+ const inserted = new Set(insertedPositions);
537
+ const out = [rowHeaderSlice(newHtml, newRow)];
538
+ let oldIdx = 0;
539
+ for (let c = 0; c < newRow.cells.length; c++) if (inserted.has(c)) out.push(emitFullCell(newHtml, newRow.cells[c], "ins", diffCell));
540
+ else {
541
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[c], diffCell));
542
+ oldIdx++;
543
+ }
544
+ out.push("</tr>");
545
+ return out.join("");
546
+ }
547
+ function diffMultiColumnDeleteRow(oldHtml, newHtml, oldRow, newRow, k, diffCell) {
548
+ const deletedPositions = findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml);
549
+ const deleted = new Set(deletedPositions);
550
+ const out = [rowHeaderSlice(newHtml, newRow)];
551
+ let newIdx = 0;
552
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
553
+ if (deleted.has(oldIdx)) {
554
+ out.push(emitFullCell(oldHtml, oldRow.cells[oldIdx], "del", diffCell));
555
+ continue;
556
+ }
557
+ out.push(emitDiffedCell(oldHtml, newHtml, oldRow.cells[oldIdx], newRow.cells[newIdx], diffCell));
558
+ newIdx++;
559
+ }
560
+ out.push("</tr>");
561
+ return out.join("");
562
+ }
563
+ function findBestColumnInsertPositions(oldRow, newRow, k, oldHtml, newHtml) {
564
+ let bestPositions = [];
565
+ let bestScore = -1;
566
+ for (const combo of combinationsOfRange(newRow.cells.length, k)) {
567
+ const inserted = new Set(combo);
568
+ let score = 0;
569
+ let oldIdx = 0;
570
+ for (let newIdx = 0; newIdx < newRow.cells.length; newIdx++) {
571
+ if (inserted.has(newIdx)) continue;
572
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
573
+ oldIdx++;
574
+ }
575
+ if (score > bestScore) {
576
+ bestScore = score;
577
+ bestPositions = combo;
578
+ }
579
+ }
580
+ return bestPositions;
581
+ }
582
+ function findBestColumnDeletePositions(oldRow, newRow, k, oldHtml, newHtml) {
583
+ let bestPositions = [];
584
+ let bestScore = -1;
585
+ for (const combo of combinationsOfRange(oldRow.cells.length, k)) {
586
+ const deleted = new Set(combo);
587
+ let score = 0;
588
+ let newIdx = 0;
589
+ for (let oldIdx = 0; oldIdx < oldRow.cells.length; oldIdx++) {
590
+ if (deleted.has(oldIdx)) continue;
591
+ score += cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml);
592
+ newIdx++;
593
+ }
594
+ if (score > bestScore) {
595
+ bestScore = score;
596
+ bestPositions = combo;
597
+ }
598
+ }
599
+ return bestPositions;
600
+ }
601
+ /**
602
+ * Yields all sorted-ascending combinations of `k` distinct integers
603
+ * from [0, n). Iterative implementation avoids recursion overhead and
604
+ * keeps memory at O(k). Yields nothing when `k` is 0 or `k > n`.
605
+ */
606
+ function* combinationsOfRange(n, k) {
607
+ if (k === 0 || k > n) return;
608
+ const indices = Array.from({ length: k }, (_, i) => i);
609
+ while (true) {
610
+ yield indices.slice();
611
+ let i = k - 1;
612
+ while (i >= 0 && indices[i] === n - k + i) i--;
613
+ if (i < 0) return;
614
+ indices[i]++;
615
+ for (let j = i + 1; j < k; j++) indices[j] = indices[j - 1] + 1;
616
+ }
617
+ }
452
618
  /**
453
619
  * Try to align cells by logical column position (sum of colspans). When
454
620
  * one side has a colspan'd cell that absorbs multiple cells on the other
@@ -730,17 +896,38 @@ function pairSimilarUnmatched(alignment, threshold, similarity) {
730
896
  return result;
731
897
  }
732
898
  /**
733
- * Character-level similarity using shared prefix + suffix as a fraction
734
- * of the longer string. Catches "single edit somewhere in a long row"
735
- * (which token-Jaccard misses on short rows) while still correctly
736
- * rejecting rows with no positional overlap. HTML tags are stripped to
737
- * keep the comparison content-focused.
899
+ * Combined similarity metric used for both row-level and cell-level
900
+ * fuzzy pairing. Returns the MAX of two complementary metrics:
901
+ *
902
+ * 1. **Character prefix+suffix similarity** — fraction of the longer
903
+ * string covered by shared prefix + shared suffix. Catches small
904
+ * edits in the middle of a string (one word changed in a row).
905
+ * Misses cases where the bulk of common content is in the middle
906
+ * and the ends differ.
907
+ *
908
+ * 2. **Token Jaccard similarity** — intersection-over-union of the
909
+ * whitespace-split tokens. Catches "most of the content is the
910
+ * same but bookended by different bits" — e.g. a row whose only
911
+ * edit is a column added at the start and another at the end,
912
+ * where the ~50 chars in the middle that DO match would be
913
+ * invisible to prefix+suffix.
914
+ *
915
+ * Either metric exceeding the threshold means pair. Neither alone is
916
+ * sufficient for the full range of legal-doc edits we see in
917
+ * production tables.
738
918
  */
739
919
  function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
740
- const a = rowText(oldHtml, oldRow);
741
- const b = rowText(newHtml, newRow);
920
+ return textSimilarity(rowText(oldHtml, oldRow), rowText(newHtml, newRow));
921
+ }
922
+ function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
923
+ return textSimilarity(cellText(oldHtml, oldCell), cellText(newHtml, newCell));
924
+ }
925
+ function textSimilarity(a, b) {
742
926
  if (a === b) return 1;
743
927
  if (a.length === 0 || b.length === 0) return 0;
928
+ return Math.max(charPrefixSuffixSimilarity(a, b), tokenJaccardSimilarity(a, b));
929
+ }
930
+ function charPrefixSuffixSimilarity(a, b) {
744
931
  let prefix = 0;
745
932
  const minLen = Math.min(a.length, b.length);
746
933
  while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
@@ -748,29 +935,20 @@ function rowSimilarity(oldRow, newRow, oldHtml, newHtml) {
748
935
  while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
749
936
  return (prefix + suffix) / Math.max(a.length, b.length);
750
937
  }
938
+ function tokenJaccardSimilarity(a, b) {
939
+ const tokensA = new Set(a.split(/\s+/).filter(Boolean));
940
+ const tokensB = new Set(b.split(/\s+/).filter(Boolean));
941
+ if (tokensA.size === 0 && tokensB.size === 0) return 1;
942
+ let intersection = 0;
943
+ for (const t of tokensA) if (tokensB.has(t)) intersection++;
944
+ const union = tokensA.size + tokensB.size - intersection;
945
+ return union === 0 ? 0 : intersection / union;
946
+ }
751
947
  function rowText(html, row) {
752
948
  const parts = [];
753
949
  for (const cell of row.cells) parts.push(html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " "));
754
950
  return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
755
951
  }
756
- /**
757
- * Character-level prefix+suffix similarity for a single cell's text
758
- * content. Same metric as rowSimilarity, scoped to one cell so we can
759
- * fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
760
- * a column add in the same row).
761
- */
762
- function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
763
- const a = cellText(oldHtml, oldCell);
764
- const b = cellText(newHtml, newCell);
765
- if (a === b) return 1;
766
- if (a.length === 0 || b.length === 0) return 0;
767
- let prefix = 0;
768
- const minLen = Math.min(a.length, b.length);
769
- while (prefix < minLen && a[prefix] === b[prefix]) prefix++;
770
- let suffix = 0;
771
- while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++;
772
- return (prefix + suffix) / Math.max(a.length, b.length);
773
- }
774
952
  function cellText(html, cell) {
775
953
  return html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase();
776
954
  }