npm - @createiq/htmldiff - Versions diffs - 1.0.5-beta.1 → 1.0.5-beta.2 - Mend

@createiq/htmldiff 1.0.5-beta.1 → 1.0.5-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/HtmlDiff.cjs +52 -4
package/dist/HtmlDiff.cjs.map +1 -1
package/dist/HtmlDiff.mjs +52 -4
package/dist/HtmlDiff.mjs.map +1 -1
package/package.json +1 -1
package/src/TableDiff.ts +90 -17
package/test/HtmlDiff.tables.spec.ts +39 -0

package/dist/HtmlDiff.cjs CHANGED Viewed

@@ -544,7 +544,7 @@ function diffPositionalRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
 	return out.join("");
 }
 function diffStructurallyAlignedRow(oldHtml, newHtml, oldRow, newRow, diffCell) {
-	const alignment = lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell)));
+	const alignment = pairSimilarUnmatchedCells(lcsAlign(oldRow.cells.map((cell) => cellKey(oldHtml, cell)), newRow.cells.map((cell) => cellKey(newHtml, cell))), oldRow, newRow, oldHtml, newHtml);
 	const out = [];
 	out.push(rowHeaderSlice(newHtml, newRow));
 	for (const align of alignment) if (align.oldIdx !== null && align.newIdx !== null) {
@@ -643,9 +643,16 @@ function rowHeaderSlice(html, row) {
 	if (row.cells.length === 0) return html.slice(row.rowStart, opening.end);
 	return html.slice(row.rowStart, row.cells[0].cellStart);
 }
-/** Jaccard similarity threshold above which we treat two rows as "the same row, edited". */
+/** Character-level similarity threshold above which we treat two rows as "the same row, edited". */
 const ROW_FUZZY_THRESHOLD = .5;
 /**
+* Threshold for "this cell is a content-edit of that cell." Set to the
+* same value as ROW_FUZZY_THRESHOLD; cells in legal docs that share most
+* of their content typically ARE the same logical cell with a body edit,
+* so 0.5 works for both granularities in practice. A candidate pair must
+* score strictly above this value to be paired: the pairing loop starts
+* its best score at the threshold and only accepts a greater similarity.
+*/
+const CELL_FUZZY_THRESHOLD = .5;
+/**
 * After exact LCS, scan the alignment for runs of "old deleted, then new
 * inserted" (or vice versa) and pair entries whose content is similar
 * enough to be treated as an edit rather than a delete+insert. This keeps
@@ -654,6 +661,26 @@ const ROW_FUZZY_THRESHOLD = .5;
 * expect from a typical track-changes view.
 */
 function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtml) { // Row-level entry point: delegates to the generic pairSimilarUnmatched.
+	return pairSimilarUnmatched(alignment, ROW_FUZZY_THRESHOLD, (oldIdx, newIdx) => rowSimilarity(oldTable.rows[oldIdx], newTable.rows[newIdx], oldHtml, newHtml)); // Similarity callback maps alignment indices to rows of the old/new table.
+}
+function pairSimilarUnmatchedCells(alignment, oldRow, newRow, oldHtml, newHtml) { // Cell-level entry point: same generic pairing, applied to the cells of one row pair.
+	return pairSimilarUnmatched(alignment, CELL_FUZZY_THRESHOLD, (oldIdx, newIdx) => cellSimilarity(oldRow.cells[oldIdx], newRow.cells[newIdx], oldHtml, newHtml)); // Similarity callback maps alignment indices to cells of oldRow/newRow.
+}
+/**
+* Identify pairings inside each unmatched-only run, then build the output
+* alignment by walking the original and substituting paired entries at
+* the *ins position* (not the del position). This keeps the result
+* monotonic in newIdx — critical because the cursor-based emission
+* downstream walks new's html in order. Emitting at the del position
+* would be fine when del<ins in the alignment array (the typical case),
+* but can violate monotonicity when there are mixed unpaired entries in
+* between (column-add + row-add together, content-edit + column-add,
+* etc.).
+*
+* Generic over what's being paired — works for both rows (by full row
+* content similarity) and cells (by per-cell content similarity).
+*/
+function pairSimilarUnmatched(alignment, threshold, similarity) {
 	const pairs = /* @__PURE__ */ new Map();
 	let i = 0;
 	while (i < alignment.length) {
@@ -671,10 +698,10 @@ function pairSimilarUnmatchedRows(alignment, oldTable, newTable, oldHtml, newHtm
 		const usedIns = /* @__PURE__ */ new Set();
 		for (const di of delIndices) {
 			let bestIi = -1;
-			let bestSim = ROW_FUZZY_THRESHOLD;
+			let bestSim = threshold;
 			for (const ii of insIndices) {
 				if (usedIns.has(ii)) continue;
-				const sim = rowSimilarity(oldTable.rows[alignment[di].oldIdx], newTable.rows[alignment[ii].newIdx], oldHtml, newHtml);
+				const sim = similarity(alignment[di].oldIdx, alignment[ii].newIdx);
 				if (sim > bestSim) {
 					bestSim = sim;
 					bestIi = ii;
@@ -727,6 +754,27 @@ function rowText(html, row) {
 	return parts.join(" ").replace(/\s+/g, " ").trim().toLowerCase();
 }
 /**
+* Character-level prefix+suffix similarity for a single cell's text
+* content. Same metric as rowSimilarity, scoped to one cell so we can
+* fuzzy-pair unmatched cells (e.g. a cell with a content edit alongside
+* a column add in the same row).
+*
+* The score is (shared prefix length + shared suffix length) divided by
+* the length of the longer text, measured on the normalised text
+* returned by cellText. Identical texts (including two empty texts)
+* score 1; if exactly one side is empty the score is 0.
+*
+* @param oldCell - cell whose contentStart/contentEnd offsets index into oldHtml
+* @param newCell - cell whose contentStart/contentEnd offsets index into newHtml
+* @param oldHtml - HTML string that oldCell's offsets refer to
+* @param newHtml - HTML string that newCell's offsets refer to
+* @returns similarity score in the range 0 to 1
+*/
+function cellSimilarity(oldCell, newCell, oldHtml, newHtml) {
+	const a = cellText(oldHtml, oldCell);
+	const b = cellText(newHtml, newCell);
+	if (a === b) return 1; // Checked first, so two empty texts count as identical.
+	if (a.length === 0 || b.length === 0) return 0; // Also avoids scoring against a zero-length side.
+	let prefix = 0;
+	const minLen = Math.min(a.length, b.length);
+	while (prefix < minLen && a[prefix] === b[prefix]) prefix++; // Length of the common leading run.
+	let suffix = 0;
+	while (suffix < a.length - prefix && suffix < b.length - prefix && a[a.length - 1 - suffix] === b[b.length - 1 - suffix]) suffix++; // Common trailing run, bounded so it never overlaps the prefix in either string.
+	return (prefix + suffix) / Math.max(a.length, b.length);
+}
+function cellText(html, cell) { // Normalised text of one cell, taken from html between cell.contentStart and cell.contentEnd.
+	return html.slice(cell.contentStart, cell.contentEnd).replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim().toLowerCase(); // Tags become spaces, whitespace runs collapse to one space, then trim and lower-case.
+}
+/**
 * Standard LCS alignment: walks both sequences and emits a list of pairs
 * where `(oldIdx, newIdx)` are both set for matching positions, and one
 * side is null for an unmatched entry on the other side. Equality uses