npm - @adeu/core - Versions diffs - 1.6.7 → 1.6.9 - Mend

@adeu/core 1.6.7 → 1.6.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/dist/index.cjs +3969 -1859
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +95 -8
package/dist/index.d.ts +95 -8
package/dist/index.js +3966 -1859
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/src/consistency.test.ts +134 -0
package/src/diff.test.ts +13 -1
package/src/diff.ts +220 -47
package/src/docx/bridge.ts +111 -57
package/src/docx/dom.ts +66 -7
package/src/domain.test.ts +280 -0
package/src/domain.ts +264 -10
package/src/engine.bugs.test.ts +481 -0
package/src/engine.ts +1346 -192
package/src/index.ts +7 -8
package/src/ingest.ts +8 -0
package/src/markup.ts +160 -53
package/src/outline.ts +199 -69
package/src/sanitize/core.ts +130 -0
package/src/sanitize/report.ts +125 -0
package/src/sanitize/sanitize.test.ts +237 -0
package/src/sanitize/transforms.ts +452 -0
package/src/utils/docx.ts +292 -158

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adeu/core",
-  "version": "1.6.7",
+  "version": "1.6.9",
   "description": "",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",

package/src/consistency.test.ts ADDED Viewed

@@ -0,0 +1,134 @@
+import { describe, it, expect } from "vitest";
+import {
+  readFileSync,
+  existsSync,
+  readdirSync,
+  writeFileSync,
+  unlinkSync,
+} from "node:fs";
+import { resolve, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { execSync } from "node:child_process";
+import { tmpdir } from "node:os";
+import { DocumentObject } from "./docx/bridge.js";
+import { RedlineEngine } from "./engine.js";
+import { extractTextFromBuffer } from "./ingest.js";
+// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+// Directory scanned for scenario folders; resolved four levels above this file's directory.
+const CORPUS_DIR = resolve(
+  __dirname,
+  "../../../../shared/cross_platform_tests",
+);
+// Python script executed against each produced .docx; its stdout is compared with golden_abstract.xml.
+const PYTHON_ABSTRACT_CMD = resolve(
+  __dirname,
+  "../../../../python/scripts/abstract_xml.py",
+);
+// Working directory used when the Python script is invoked.
+const PYTHON_DIR = resolve(__dirname, "../../../../python");
+/**
+ * Masks timestamps in extracted markdown so output can be compared with golden files.
+ * Every `@ YYYY-MM-DDTHH:MM:SSZ` occurrence is rewritten to the literal `@ DATE`.
+ */
+function normalizeMdTimestamps(mdText: string): string {
+  const timestampMarker = /@ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z/g;
+  const masked = mdText.replace(timestampMarker, "@ DATE");
+  return masked;
+}
+// Corpus-driven consistency suite: each sub-folder of CORPUS_DIR containing both
+// `test.json` and `input.docx` becomes one scenario whose output is compared
+// against the golden files stored next to it (golden_abstract.xml,
+// golden_raw.md, golden_clean.md). Each golden file is optional; a missing one
+// simply skips that comparison.
+describe("Polyglot Consistency Framework (TS vs Python)", () => {
+  // Without the corpus directory, register a single skipped placeholder and stop.
+  if (!existsSync(CORPUS_DIR)) {
+    it.skip("Cross-platform test corpus not found", () => {});
+    return;
+  }
+  // Scenario discovery runs while the suite is being defined, not inside a test.
+  const testFolders = readdirSync(CORPUS_DIR, { withFileTypes: true })
+    .filter((dirent) => dirent.isDirectory())
+    .map((dirent) => dirent.name);
+  for (const folder of testFolders) {
+    const testDir = resolve(CORPUS_DIR, folder);
+    const testJsonPath = resolve(testDir, "test.json");
+    const inputDocxPath = resolve(testDir, "input.docx");
+    // Folders missing either required file are ignored without registering a test.
+    if (!existsSync(testJsonPath) || !existsSync(inputDocxPath)) {
+      continue;
+    }
+    // Fields read from test.json in this suite: read_only, author, changes.
+    const testConfig = JSON.parse(readFileSync(testJsonPath, "utf-8"));
+    const isReadOnly = testConfig.read_only || false;
+    // CRITICAL: We must inherit the author from the JSON so the XML Abstraction comparison
+    // doesn't fail on `w:author="Adeu AI"` vs `w:author="Adeu AI (TS)"`.
+    const author = testConfig.author || "Adeu AI";
+    describe(`Corpus Scenario: [${folder}]`, () => {
+      it("Strictly matches the Python Golden Masters", async () => {
+        const inputBuffer = readFileSync(inputDocxPath);
+        let outBuffer: Buffer;
+        // 1. Process Edits (if not read-only)
+        if (isReadOnly) {
+          // Read-only scenarios use the input unchanged; the XML comparison below is not run for them.
+          outBuffer = inputBuffer;
+        } else {
+          const doc = await DocumentObject.load(inputBuffer);
+          const engine = new RedlineEngine(doc, author);
+          engine.process_batch(testConfig.changes || []);
+          outBuffer = await doc.save();
+          // 2. Assert XML Structure Parity (via Python Bridge)
+          const goldenXmlPath = resolve(testDir, "golden_abstract.xml");
+          if (existsSync(goldenXmlPath)) {
+            const expectedXml = readFileSync(goldenXmlPath, "utf-8");
+            // The saved document is written to a temp file so the Python script can read it from disk.
+            const tmpDocx = resolve(
+              tmpdir(),
+              `adeu_test_${folder}_${Date.now()}.docx`,
+            );
+            writeFileSync(tmpDocx, outBuffer);
+            try {
+              // Pipe to Python to bypass Node vs Python XML serialization differences
+              const cmd = `uv run python "${PYTHON_ABSTRACT_CMD}" "${tmpDocx}"`;
+              // stdout is captured as the abstract XML; stderr is passed through to this process.
+              const actualXml = execSync(cmd, {
+                cwd: PYTHON_DIR,
+                encoding: "utf-8",
+                stdio: ["pipe", "pipe", "inherit"],
+                env: { ...process.env, PYTHONIOENCODING: "utf-8" },
+              });
+              // Normalize line endings for reliable string comparison
+              const normExpected = expectedXml.replace(/\r\n/g, "\n").trim();
+              const normActual = actualXml.replace(/\r\n/g, "\n").trim();
+              expect(normActual).toBe(normExpected);
+            } finally {
+              // Remove the temp file whether or not the command or the assertion failed.
+              if (existsSync(tmpDocx)) unlinkSync(tmpDocx);
+            }
+          }
+        }
+        // 3. Assert Markdown Extraction Parity (Raw View)
+        const rawMdPath = resolve(testDir, "golden_raw.md");
+        if (existsSync(rawMdPath)) {
+          const expectedRaw = readFileSync(rawMdPath, "utf-8").replace(
+            /\r\n/g,
+            "\n",
+          );
+          // Only the extracted text has its timestamps masked; the golden file is compared as stored.
+          const actualRaw = normalizeMdTimestamps(
+            await extractTextFromBuffer(outBuffer, false),
+          ).replace(/\r\n/g, "\n");
+          expect(actualRaw).toBe(expectedRaw);
+        }
+        // 4. Assert Markdown Extraction Parity (Clean View)
+        const cleanMdPath = resolve(testDir, "golden_clean.md");
+        if (existsSync(cleanMdPath)) {
+          const expectedClean = readFileSync(cleanMdPath, "utf-8").replace(
+            /\r\n/g,
+            "\n",
+          );
+          // Same comparison as the raw view, with `true` passed as the second argument.
+          const actualClean = normalizeMdTimestamps(
+            await extractTextFromBuffer(outBuffer, true),
+          ).replace(/\r\n/g, "\n");
+          expect(actualClean).toBe(expectedClean);
+        }
+      });
+    });
+  }
+});

package/src/diff.test.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest';
-import { trim_common_context, generate_edits_from_text } from './diff.js';
+import { trim_common_context, generate_edits_from_text, create_word_patch_diff } from './diff.js';
 describe('Diff Logic & Context Trimming', () => {
   it('handles basic prefix and suffix', () => {
@@ -59,4 +59,16 @@ describe('Diff Logic & Context Trimming', () => {
       expect(edit.new_text).toContain('Big');
     }
   });
+  it('generates a Word Patch formatted diff matching Python parity', () => {
+    const original = "This agreement is made between the Company and the Contractor.";
+    const modified = "This agreement is made between the Corporation and the Contractor.";
+    const diff = create_word_patch_diff(original, modified);
+    expect(diff).toContain("@@ Word Patch @@");
+    expect(diff).toContain("- Company");
+    expect(diff).toContain("+ Corporation");
+    expect(diff).toContain(" This agreement is made between the"); // Within 40-char context window so no truncation
+  });
 });

package/src/diff.ts CHANGED Viewed

@@ -1,7 +1,10 @@
-import diff_match_patch from 'diff-match-patch';
-import { ModifyText } from './models.js';
+import diff_match_patch from "diff-match-patch";
+import { ModifyText } from "./models.js";
-export function trim_common_context(target: string, new_val: string): [number, number] {
+export function trim_common_context(
+  target: string,
+  new_val: string,
+): [number, number] {
   if (!target || !new_val) return [0, 0];
   const isSpace = (char: string) => /\s/.test(char);
@@ -16,8 +19,10 @@ export function trim_common_context(target: string, new_val: string): [number, n
   // Backtrack to nearest whitespace if we split a word
   if (prefix_len < target.length && prefix_len < new_val.length) {
     while (prefix_len > 0) {
-      const target_split = !isSpace(target[prefix_len - 1]) && !isSpace(target[prefix_len]);
-      const new_split = !isSpace(new_val[prefix_len - 1]) && !isSpace(new_val[prefix_len]);
+      const target_split =
+        !isSpace(target[prefix_len - 1]) && !isSpace(target[prefix_len]);
+      const new_split =
+        !isSpace(new_val[prefix_len - 1]) && !isSpace(new_val[prefix_len]);
       if (target_split || new_split) {
         prefix_len--;
       } else {
@@ -30,7 +35,7 @@ export function trim_common_context(target: string, new_val: string): [number, n
   while (prefix_len > 0) {
     if (prefix_len < target.length) {
       const charSeq = target.substring(prefix_len - 1, prefix_len + 1);
-      if (charSeq === '**' || charSeq === '__') {
+      if (charSeq === "**" || charSeq === "__") {
         prefix_len--;
         continue;
       }
@@ -39,22 +44,24 @@ export function trim_common_context(target: string, new_val: string): [number, n
     const left = target.substring(0, prefix_len);
     const b_count = (left.match(/\*\*/g) || []).length;
     const u2_count = (left.match(/__/g) || []).length;
-    const u1_count = (left.replace(/__/g, '').match(/_/g) || []).length;
+    const u1_count = (left.replace(/__/g, "").match(/_/g) || []).length;
     if (b_count % 2 !== 0) {
-      prefix_len = left.lastIndexOf('**');
+      prefix_len = left.lastIndexOf("**");
       continue;
     }
     if (u2_count % 2 !== 0) {
-      prefix_len = left.lastIndexOf('__');
+      prefix_len = left.lastIndexOf("__");
       continue;
     }
     if (u1_count % 2 !== 0) {
       let idx = left.length - 1;
       while (idx >= 0) {
-        if (left[idx] === '_' &&
-           (idx === 0 || left[idx - 1] !== '_') &&
-           (idx === left.length - 1 || left[idx + 1] !== '_')) {
+        if (
+          left[idx] === "_" &&
+          (idx === 0 || left[idx - 1] !== "_") &&
+          (idx === left.length - 1 || left[idx + 1] !== "_")
+        ) {
           prefix_len = idx;
           break;
         }
@@ -68,15 +75,15 @@ export function trim_common_context(target: string, new_val: string): [number, n
     let hit_header = false;
     while (temp_len > 0) {
       const char = target[temp_len - 1];
-      if (char === '#') {
+      if (char === "#") {
         prefix_len = temp_len - 1;
-        while (prefix_len > 0 && target[prefix_len - 1] !== '\n') {
+        while (prefix_len > 0 && target[prefix_len - 1] !== "\n") {
           prefix_len--;
         }
         hit_header = true;
         break;
       }
-      if (char === '\n') break;
+      if (char === "\n") break;
       temp_len--;
     }
     if (hit_header) continue;
@@ -90,7 +97,11 @@ export function trim_common_context(target: string, new_val: string): [number, n
   const new_rem_len = new_val.length - prefix_len;
   const limit_suffix = Math.min(target_rem_len, new_rem_len);
-  while (suffix_len < limit_suffix && target[target.length - 1 - suffix_len] === new_val[new_val.length - 1 - suffix_len]) {
+  while (
+    suffix_len < limit_suffix &&
+    target[target.length - 1 - suffix_len] ===
+      new_val[new_val.length - 1 - suffix_len]
+  ) {
     suffix_len++;
   }
@@ -98,11 +109,15 @@ export function trim_common_context(target: string, new_val: string): [number, n
     while (suffix_len > 0) {
       let target_split = false;
       if (suffix_len < target.length) {
-        target_split = !isSpace(target[target.length - 1 - suffix_len]) && !isSpace(target[target.length - suffix_len]);
+        target_split =
+          !isSpace(target[target.length - 1 - suffix_len]) &&
+          !isSpace(target[target.length - suffix_len]);
       }
       let new_split = false;
       if (suffix_len < new_val.length) {
-        new_split = !isSpace(new_val[new_val.length - 1 - suffix_len]) && !isSpace(new_val[new_val.length - suffix_len]);
+        new_split =
+          !isSpace(new_val[new_val.length - 1 - suffix_len]) &&
+          !isSpace(new_val[new_val.length - suffix_len]);
       }
       if (target_split || new_split) {
         suffix_len--;
@@ -116,7 +131,7 @@ export function trim_common_context(target: string, new_val: string): [number, n
     const idx = target.length - suffix_len;
     if (idx > 0) {
       const charSeq = target.substring(idx - 1, idx + 1);
-      if (charSeq === '**' || charSeq === '__') {
+      if (charSeq === "**" || charSeq === "__") {
         suffix_len--;
         continue;
       }
@@ -125,22 +140,24 @@ export function trim_common_context(target: string, new_val: string): [number, n
     const right = target.substring(target.length - suffix_len);
     const b_count = (right.match(/\*\*/g) || []).length;
     const u2_count = (right.match(/__/g) || []).length;
-    const u1_count = (right.replace(/__/g, '').match(/_/g) || []).length;
+    const u1_count = (right.replace(/__/g, "").match(/_/g) || []).length;
     if (b_count % 2 !== 0) {
-      suffix_len -= right.indexOf('**') + 2;
+      suffix_len -= right.indexOf("**") + 2;
       continue;
     }
     if (u2_count % 2 !== 0) {
-      suffix_len -= right.indexOf('__') + 2;
+      suffix_len -= right.indexOf("__") + 2;
       continue;
     }
     if (u1_count % 2 !== 0) {
       let idx_in_right = 0;
       while (idx_in_right < right.length) {
-        if (right[idx_in_right] === '_' &&
-           (idx_in_right === 0 || right[idx_in_right - 1] !== '_') &&
-           (idx_in_right === right.length - 1 || right[idx_in_right + 1] !== '_')) {
+        if (
+          right[idx_in_right] === "_" &&
+          (idx_in_right === 0 || right[idx_in_right - 1] !== "_") &&
+          (idx_in_right === right.length - 1 || right[idx_in_right + 1] !== "_")
+        ) {
           suffix_len -= idx_in_right + 1;
           break;
         }
@@ -151,20 +168,26 @@ export function trim_common_context(target: string, new_val: string): [number, n
     break;
   }
-  if (suffix_len > 0 && /^\s+$/.test(target.substring(target.length - suffix_len))) {
+  if (
+    suffix_len > 0 &&
+    /^\s+$/.test(target.substring(target.length - suffix_len))
+  ) {
     suffix_len = 0;
   }
   // Absorb balanced wrappers
-  for (const marker of ['**', '__', '_']) {
+  for (const marker of ["**", "__", "_"]) {
     const mlen = marker.length;
     const tgt_rem = target.substring(prefix_len, target.length - suffix_len);
     const new_rem = new_val.substring(prefix_len, new_val.length - suffix_len);
     if (
-      tgt_rem.startsWith(marker) && new_rem.startsWith(marker) &&
-      tgt_rem.endsWith(marker) && new_rem.endsWith(marker) &&
-      tgt_rem.length >= 2 * mlen && new_rem.length >= 2 * mlen
+      tgt_rem.startsWith(marker) &&
+      new_rem.startsWith(marker) &&
+      tgt_rem.endsWith(marker) &&
+      new_rem.endsWith(marker) &&
+      tgt_rem.length >= 2 * mlen &&
+      new_rem.length >= 2 * mlen
     ) {
       prefix_len += mlen;
       suffix_len += mlen;
@@ -174,17 +197,20 @@ export function trim_common_context(target: string, new_val: string): [number, n
   return [prefix_len, suffix_len];
 }
-function _words_to_chars(text1: string, text2: string): [string, string, string[]] {
+function _words_to_chars(
+  text1: string,
+  text2: string,
+): [string, string, string[]] {
   const token_array: string[] = [];
   const token_hash: Record<string, number> = {};
   // RegExp equivalent to Python's r"(\s+|\w+|[^\w\s])" with unicode support
   const split_pattern = /(\s+|[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s])/gu;
   const encode_text = (text: string) => {
     // Keep delimiters via capture group in split
     const tokens = text.split(split_pattern).filter(Boolean);
-    let encoded_chars = '';
+    let encoded_chars = "";
     for (const token of tokens) {
       if (token in token_hash) {
         encoded_chars += String.fromCharCode(token_hash[token]);
@@ -201,18 +227,26 @@ function _words_to_chars(text1: string, text2: string): [string, string, string[
   return [encode_text(text1), encode_text(text2), token_array];
 }
-export function generate_edits_from_text(original_text: string, modified_text: string): ModifyText[] {
+export function generate_edits_from_text(
+  original_text: string,
+  modified_text: string,
+): ModifyText[] {
   const dmp = new diff_match_patch.diff_match_patch();
-  const [chars1, chars2, token_array] = _words_to_chars(original_text, modified_text);
+  dmp.Diff_Timeout = 2.0; // Enforce strict 2-second timeout to prevent deep recursion hangs
+  const [chars1, chars2, token_array] = _words_to_chars(
+    original_text,
+    modified_text,
+  );
   const diffs = dmp.diff_main(chars1, chars2, false);
   dmp.diff_cleanupSemantic(diffs);
   // Manually map characters back to words to bypass prototype volatility (diff_charsToLines_)
   for (let i = 0; i < diffs.length; i++) {
     const chars = diffs[i][1];
-    let text = '';
-    for (let j = 0; j < chars.length; j++) text += token_array[chars.charCodeAt(j)];
+    let text = "";
+    for (let j = 0; j < chars.length; j++)
+      text += token_array[chars.charCodeAt(j)];
     diffs[i][1] = text;
   }
@@ -221,31 +255,170 @@ export function generate_edits_from_text(original_text: string, modified_text: s
   let pending_delete: [number, string] | null = null;
   for (const [op, text] of diffs) {
-    if (op === 0) { // Equal
+    if (op === 0) {
+      // Equal
       if (pending_delete) {
         const [idx, del_txt] = pending_delete;
-        edits.push({ type: 'modify', target_text: del_txt, new_text: '', comment: 'Diff: Text deleted', _match_start_index: idx });
+        edits.push({
+          type: "modify",
+          target_text: del_txt,
+          new_text: "",
+          comment: "Diff: Text deleted",
+          _match_start_index: idx,
+        });
         pending_delete = null;
       }
       current_original_index += text.length;
-    } else if (op === -1) { // Delete
+    } else if (op === -1) {
+      // Delete
       pending_delete = [current_original_index, text];
       current_original_index += text.length;
-    } else if (op === 1) { // Insert
+    } else if (op === 1) {
+      // Insert
       if (pending_delete) {
         const [idx, del_txt] = pending_delete;
-        edits.push({ type: 'modify', target_text: del_txt, new_text: text, comment: 'Diff: Replacement', _match_start_index: idx });
+        edits.push({
+          type: "modify",
+          target_text: del_txt,
+          new_text: text,
+          comment: "Diff: Replacement",
+          _match_start_index: idx,
+        });
         pending_delete = null;
       } else {
-        edits.push({ type: 'modify', target_text: '', new_text: text, comment: 'Diff: Text inserted', _match_start_index: current_original_index });
+        edits.push({
+          type: "modify",
+          target_text: "",
+          new_text: text,
+          comment: "Diff: Text inserted",
+          _match_start_index: current_original_index,
+        });
       }
     }
   }
   if (pending_delete) {
     const [idx, del_txt] = pending_delete;
-    edits.push({ type: 'modify', target_text: del_txt, new_text: '', comment: 'Diff: Text deleted', _match_start_index: idx });
+    edits.push({
+      type: "modify",
+      target_text: del_txt,
+      new_text: "",
+      comment: "Diff: Text deleted",
+      _match_start_index: idx,
+    });
   }
   return edits;
-}
+}
+/**
+ * Builds a line-based, unified-diff-style comparison of two texts.
+ *
+ * Lines are diffed with diff-match-patch's line mode (`diff_linesToChars_` /
+ * `diff_charsToLines_`). Every run of changes becomes a hunk introduced by the
+ * literal marker `@@ ... @@` (no line numbers are emitted). Unchanged lines are
+ * prefixed with a space, removed lines with `-` and added lines with `+`.
+ *
+ * @param original_text - Text shown as the `--- Original` side.
+ * @param modified_text - Text shown as the `+++ Modified` side.
+ * @param context_lines - Maximum number of unchanged lines shown before and
+ *   after each hunk. Unchanged runs of at most twice this many lines are kept
+ *   inline, so nearby changes share one hunk. Negative, fractional or NaN
+ *   values are normalised to a non-negative integer.
+ * @returns The diff text, or an empty string when no hunk was produced.
+ */
+export function create_unified_diff(
+  original_text: string,
+  modified_text: string,
+  context_lines: number = 3,
+): string {
+  // Normalise up front: `slice(-0)` is `slice(0)`, so an unguarded zero would
+  // emit the whole preceding unchanged run as context instead of none.
+  const context = Math.max(0, Math.trunc(context_lines) || 0);
+  const dmp = new diff_match_patch.diff_match_patch();
+  dmp.Diff_Timeout = 2.0;
+  const a = dmp.diff_linesToChars_(original_text, modified_text);
+  const diffs = dmp.diff_main(a.chars1, a.chars2, false);
+  dmp.diff_charsToLines_(diffs, a.lineArray);
+  // Strip one trailing newline first so the split yields no phantom empty last line.
+  const toLines = (text: string): string[] =>
+    text.replace(/\n$/, "").split("\n");
+  const output: string[] = [];
+  output.push("--- Original");
+  output.push("+++ Modified");
+  let i = 0;
+  while (i < diffs.length) {
+    // Skip unchanged runs until the next change starts a hunk.
+    while (i < diffs.length && diffs[i][0] === 0) i++;
+    if (i >= diffs.length) break;
+    const start = i;
+    let preContext: string[] = [];
+    if (context > 0 && start > 0 && diffs[start - 1][0] === 0) {
+      preContext = toLines(diffs[start - 1][1]).slice(-context);
+    }
+    const chunk: string[] = [];
+    chunk.push(...preContext.map((l) => ` ${l}`));
+    while (i < diffs.length) {
+      const [op, text] = diffs[i];
+      const lines = toLines(text);
+      if (op === 0) {
+        // A long unchanged run ends the hunk; a short one is kept inline.
+        if (lines.length > context * 2) break;
+        chunk.push(...lines.map((l) => ` ${l}`));
+      } else {
+        const prefix = op === -1 ? "-" : "+";
+        chunk.push(...lines.map((l) => `${prefix}${l}`));
+      }
+      i++;
+    }
+    let postContext: string[] = [];
+    if (i < diffs.length && diffs[i][0] === 0) {
+      postContext = toLines(diffs[i][1]).slice(0, context);
+    }
+    chunk.push(...postContext.map((l) => ` ${l}`));
+    output.push("@@ ... @@");
+    output.push(...chunk);
+  }
+  if (output.length === 2) return ""; // Only the two header lines: no changes
+  return output.join("\n");
+}
+/**
+ * Renders the edits between two texts as a "Word Patch" report.
+ *
+ * Edits come from `generate_edits_from_text`; each one is narrowed with
+ * `trim_common_context` and emitted as an `@@ Word Patch @@` section holding a
+ * line of preceding context, an optional `- ` line (removed text), an optional
+ * `+ ` line (added text), a line of following context and a blank line.
+ * Context is taken from `original_text`, limited to 40 characters per side,
+ * with `...` marking the side(s) where the original text continues.
+ *
+ * @param original_text - Text the edits are computed from.
+ * @param modified_text - Text the edits lead to.
+ * @param original_path - Label written after `--- ` in the header.
+ * @param modified_path - Label written after `+++ ` in the header.
+ * @returns The report joined with newlines; the header is always present.
+ */
+export function create_word_patch_diff(
+  original_text: string,
+  modified_text: string,
+  original_path: string = "Original",
+  modified_path: string = "Modified"
+): string {
+  const CONTEXT_SIZE = 40;
+  // Context is printed on a single line: newlines become spaces, carriage returns are dropped.
+  const flatten = (snippet: string): string =>
+    snippet.replace(/\n/g, " ").replace(/\r/g, "");
+  const report: string[] = [`--- ${original_path}`, `+++ ${modified_path}`, ""];
+  for (const edit of generate_edits_from_text(original_text, modified_text)) {
+    const old_text = edit.target_text || "";
+    const fresh_text = edit.new_text || "";
+    const [lead, tail] = trim_common_context(old_text, fresh_text);
+    // Only the part of each side that actually differs is displayed.
+    const removed = old_text.substring(lead, old_text.length - tail);
+    const added = fresh_text.substring(lead, fresh_text.length - tail);
+    // Location of the displayed removal inside the original text.
+    const from = (edit._match_start_index || 0) + lead;
+    const to = from + removed.length;
+    const window_start = Math.max(0, from - CONTEXT_SIZE);
+    const window_end = Math.min(original_text.length, to + CONTEXT_SIZE);
+    const before =
+      (window_start > 0 ? "..." : "") +
+      original_text.substring(window_start, from);
+    const after =
+      original_text.substring(to, window_end) +
+      (window_end < original_text.length ? "..." : "");
+    report.push("@@ Word Patch @@", ` ${flatten(before)}`);
+    if (removed) report.push(`- ${removed}`);
+    if (added) report.push(`+ ${added}`);
+    report.push(` ${flatten(after)}`, "");
+  }
+  return report.join("\n");
+}