@adeu/core 1.6.7 → 1.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3969 -1859
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +95 -8
- package/dist/index.d.ts +95 -8
- package/dist/index.js +3966 -1859
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/consistency.test.ts +134 -0
- package/src/diff.test.ts +13 -1
- package/src/diff.ts +220 -47
- package/src/docx/bridge.ts +111 -57
- package/src/docx/dom.ts +66 -7
- package/src/domain.test.ts +280 -0
- package/src/domain.ts +264 -10
- package/src/engine.bugs.test.ts +481 -0
- package/src/engine.ts +1346 -192
- package/src/index.ts +7 -8
- package/src/ingest.ts +8 -0
- package/src/markup.ts +160 -53
- package/src/outline.ts +199 -69
- package/src/sanitize/core.ts +130 -0
- package/src/sanitize/report.ts +125 -0
- package/src/sanitize/sanitize.test.ts +237 -0
- package/src/sanitize/transforms.ts +452 -0
- package/src/utils/docx.ts +292 -158
package/package.json
CHANGED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import {
|
|
3
|
+
readFileSync,
|
|
4
|
+
existsSync,
|
|
5
|
+
readdirSync,
|
|
6
|
+
writeFileSync,
|
|
7
|
+
unlinkSync,
|
|
8
|
+
} from "node:fs";
|
|
9
|
+
import { resolve, dirname } from "node:path";
|
|
10
|
+
import { fileURLToPath } from "node:url";
|
|
11
|
+
import { execSync } from "node:child_process";
|
|
12
|
+
import { tmpdir } from "node:os";
|
|
13
|
+
|
|
14
|
+
import { DocumentObject } from "./docx/bridge.js";
|
|
15
|
+
import { RedlineEngine } from "./engine.js";
|
|
16
|
+
import { extractTextFromBuffer } from "./ingest.js";
|
|
17
|
+
|
|
18
|
+
// ES modules have no built-in __filename/__dirname, so derive both from the
// module URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Directory scanned for scenario folders (each may hold test.json, input.docx
// and golden files).
const CORPUS_DIR = resolve(__dirname, "../../../../shared/cross_platform_tests");

// Working directory for the Python subprocess, and the script it runs.
const PYTHON_DIR = resolve(__dirname, "../../../../python");
const PYTHON_ABSTRACT_CMD = resolve(
  __dirname,
  "../../../../python/scripts/abstract_xml.py",
);
|
|
30
|
+
|
|
31
|
+
/**
 * Masks every `@ YYYY-MM-DDTHH:MM:SSZ` timestamp in the given Markdown with the
 * fixed token `@ DATE`, so output can be compared against golden files that
 * were generated at a different time.
 *
 * @param mdText Markdown text that may contain "@ <timestamp>" markers.
 * @returns The text with each matching timestamp replaced by `@ DATE`.
 */
function normalizeMdTimestamps(mdText: string): string {
  const stampPattern = /@ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z/g;
  return mdText.replace(stampPattern, () => "@ DATE");
}
|
|
34
|
+
|
|
35
|
+
// Replays every corpus scenario through the TypeScript engine and compares the
// result with the golden files stored next to the scenario.
describe("Polyglot Consistency Framework (TS vs Python)", () => {
  // Without the shared corpus there is nothing to compare against; register a
  // skipped placeholder so the omission shows up in the test report.
  if (!existsSync(CORPUS_DIR)) {
    it.skip("Cross-platform test corpus not found", () => {});
    return;
  }

  // Line endings are normalised to LF before any string comparison.
  const toLf = (text: string): string => text.replace(/\r\n/g, "\n");

  const scenarioFolders = readdirSync(CORPUS_DIR, { withFileTypes: true })
    .filter((entry) => entry.isDirectory())
    .map((entry) => entry.name);

  scenarioFolders.forEach((folder) => {
    const scenarioDir = resolve(CORPUS_DIR, folder);
    const configPath = resolve(scenarioDir, "test.json");
    const docxPath = resolve(scenarioDir, "input.docx");

    // A scenario needs both its config and its input document.
    if (!(existsSync(configPath) && existsSync(docxPath))) {
      return;
    }

    const scenario = JSON.parse(readFileSync(configPath, "utf-8"));
    const readOnly = scenario.read_only || false;
    // The author must come from the JSON config; otherwise the abstracted XML
    // comparison fails on `w:author="Adeu AI"` vs `w:author="Adeu AI (TS)"`.
    const author = scenario.author || "Adeu AI";

    // Compares the saved document with golden_abstract.xml (when present) by
    // running the Python abstraction script on a temporary copy, which avoids
    // Node vs Python XML serialization differences.
    const expectAbstractXmlParity = (produced: Buffer): void => {
      const goldenPath = resolve(scenarioDir, "golden_abstract.xml");
      if (!existsSync(goldenPath)) return;

      const expectedXml = readFileSync(goldenPath, "utf-8");
      const scratchDocx = resolve(
        tmpdir(),
        `adeu_test_${folder}_${Date.now()}.docx`,
      );
      writeFileSync(scratchDocx, produced);

      try {
        const actualXml = execSync(
          `uv run python "${PYTHON_ABSTRACT_CMD}" "${scratchDocx}"`,
          {
            cwd: PYTHON_DIR,
            encoding: "utf-8",
            stdio: ["pipe", "pipe", "inherit"],
            env: { ...process.env, PYTHONIOENCODING: "utf-8" },
          },
        );
        expect(toLf(actualXml).trim()).toBe(toLf(expectedXml).trim());
      } finally {
        // Always remove the temporary document, even when the assertion fails.
        if (existsSync(scratchDocx)) unlinkSync(scratchDocx);
      }
    };

    // Compares extracted Markdown with the named golden file (when present);
    // `cleanView` is forwarded as the second argument of extractTextFromBuffer.
    const expectMarkdownParity = async (
      goldenFile: string,
      cleanView: boolean,
      produced: Buffer,
    ): Promise<void> => {
      const goldenPath = resolve(scenarioDir, goldenFile);
      if (!existsSync(goldenPath)) return;

      const expected = toLf(readFileSync(goldenPath, "utf-8"));
      const extracted = await extractTextFromBuffer(produced, cleanView);
      const actual = toLf(normalizeMdTimestamps(extracted));
      expect(actual).toBe(expected);
    };

    describe(`Corpus Scenario: [${folder}]`, () => {
      it("Strictly matches the Python Golden Masters", async () => {
        const source = readFileSync(docxPath);
        let produced: Buffer;

        if (readOnly) {
          // Read-only scenarios skip editing and the XML comparison entirely.
          produced = source;
        } else {
          // 1. Apply the scenario's edits.
          const doc = await DocumentObject.load(source);
          new RedlineEngine(doc, author).process_batch(scenario.changes || []);
          produced = await doc.save();

          // 2. XML structure parity (via the Python bridge).
          expectAbstractXmlParity(produced);
        }

        // 3. Markdown extraction parity, raw view.
        await expectMarkdownParity("golden_raw.md", false, produced);

        // 4. Markdown extraction parity, clean view.
        await expectMarkdownParity("golden_clean.md", true, produced);
      });
    });
  });
});
|
package/src/diff.test.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, it, expect } from 'vitest';
|
|
2
|
-
import { trim_common_context, generate_edits_from_text } from './diff.js';
|
|
2
|
+
import { trim_common_context, generate_edits_from_text, create_word_patch_diff } from './diff.js';
|
|
3
3
|
|
|
4
4
|
describe('Diff Logic & Context Trimming', () => {
|
|
5
5
|
it('handles basic prefix and suffix', () => {
|
|
@@ -59,4 +59,16 @@ describe('Diff Logic & Context Trimming', () => {
|
|
|
59
59
|
expect(edit.new_text).toContain('Big');
|
|
60
60
|
}
|
|
61
61
|
});
|
|
62
|
+
|
|
63
|
+
it('generates a Word Patch formatted diff matching Python parity', () => {
  const before = "This agreement is made between the Company and the Contractor.";
  const after = "This agreement is made between the Corporation and the Contractor.";

  const patch = create_word_patch_diff(before, after);

  // The last fragment is leading context: the change sits within the 40-char
  // context window, so that context is not truncated.
  const expectedFragments = [
    "@@ Word Patch @@",
    "- Company",
    "+ Corporation",
    " This agreement is made between the",
  ];
  for (const fragment of expectedFragments) {
    expect(patch).toContain(fragment);
  }
});
|
|
62
74
|
});
|
package/src/diff.ts
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
-
import diff_match_patch from
|
|
2
|
-
import { ModifyText } from
|
|
1
|
+
import diff_match_patch from "diff-match-patch";
|
|
2
|
+
import { ModifyText } from "./models.js";
|
|
3
3
|
|
|
4
|
-
export function trim_common_context(
|
|
4
|
+
export function trim_common_context(
|
|
5
|
+
target: string,
|
|
6
|
+
new_val: string,
|
|
7
|
+
): [number, number] {
|
|
5
8
|
if (!target || !new_val) return [0, 0];
|
|
6
9
|
|
|
7
10
|
const isSpace = (char: string) => /\s/.test(char);
|
|
@@ -16,8 +19,10 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
16
19
|
// Backtrack to nearest whitespace if we split a word
|
|
17
20
|
if (prefix_len < target.length && prefix_len < new_val.length) {
|
|
18
21
|
while (prefix_len > 0) {
|
|
19
|
-
const target_split =
|
|
20
|
-
|
|
22
|
+
const target_split =
|
|
23
|
+
!isSpace(target[prefix_len - 1]) && !isSpace(target[prefix_len]);
|
|
24
|
+
const new_split =
|
|
25
|
+
!isSpace(new_val[prefix_len - 1]) && !isSpace(new_val[prefix_len]);
|
|
21
26
|
if (target_split || new_split) {
|
|
22
27
|
prefix_len--;
|
|
23
28
|
} else {
|
|
@@ -30,7 +35,7 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
30
35
|
while (prefix_len > 0) {
|
|
31
36
|
if (prefix_len < target.length) {
|
|
32
37
|
const charSeq = target.substring(prefix_len - 1, prefix_len + 1);
|
|
33
|
-
if (charSeq ===
|
|
38
|
+
if (charSeq === "**" || charSeq === "__") {
|
|
34
39
|
prefix_len--;
|
|
35
40
|
continue;
|
|
36
41
|
}
|
|
@@ -39,22 +44,24 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
39
44
|
const left = target.substring(0, prefix_len);
|
|
40
45
|
const b_count = (left.match(/\*\*/g) || []).length;
|
|
41
46
|
const u2_count = (left.match(/__/g) || []).length;
|
|
42
|
-
const u1_count = (left.replace(/__/g,
|
|
47
|
+
const u1_count = (left.replace(/__/g, "").match(/_/g) || []).length;
|
|
43
48
|
|
|
44
49
|
if (b_count % 2 !== 0) {
|
|
45
|
-
prefix_len = left.lastIndexOf(
|
|
50
|
+
prefix_len = left.lastIndexOf("**");
|
|
46
51
|
continue;
|
|
47
52
|
}
|
|
48
53
|
if (u2_count % 2 !== 0) {
|
|
49
|
-
prefix_len = left.lastIndexOf(
|
|
54
|
+
prefix_len = left.lastIndexOf("__");
|
|
50
55
|
continue;
|
|
51
56
|
}
|
|
52
57
|
if (u1_count % 2 !== 0) {
|
|
53
58
|
let idx = left.length - 1;
|
|
54
59
|
while (idx >= 0) {
|
|
55
|
-
if (
|
|
56
|
-
|
|
57
|
-
|
|
60
|
+
if (
|
|
61
|
+
left[idx] === "_" &&
|
|
62
|
+
(idx === 0 || left[idx - 1] !== "_") &&
|
|
63
|
+
(idx === left.length - 1 || left[idx + 1] !== "_")
|
|
64
|
+
) {
|
|
58
65
|
prefix_len = idx;
|
|
59
66
|
break;
|
|
60
67
|
}
|
|
@@ -68,15 +75,15 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
68
75
|
let hit_header = false;
|
|
69
76
|
while (temp_len > 0) {
|
|
70
77
|
const char = target[temp_len - 1];
|
|
71
|
-
if (char ===
|
|
78
|
+
if (char === "#") {
|
|
72
79
|
prefix_len = temp_len - 1;
|
|
73
|
-
while (prefix_len > 0 && target[prefix_len - 1] !==
|
|
80
|
+
while (prefix_len > 0 && target[prefix_len - 1] !== "\n") {
|
|
74
81
|
prefix_len--;
|
|
75
82
|
}
|
|
76
83
|
hit_header = true;
|
|
77
84
|
break;
|
|
78
85
|
}
|
|
79
|
-
if (char ===
|
|
86
|
+
if (char === "\n") break;
|
|
80
87
|
temp_len--;
|
|
81
88
|
}
|
|
82
89
|
if (hit_header) continue;
|
|
@@ -90,7 +97,11 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
90
97
|
const new_rem_len = new_val.length - prefix_len;
|
|
91
98
|
const limit_suffix = Math.min(target_rem_len, new_rem_len);
|
|
92
99
|
|
|
93
|
-
while (
|
|
100
|
+
while (
|
|
101
|
+
suffix_len < limit_suffix &&
|
|
102
|
+
target[target.length - 1 - suffix_len] ===
|
|
103
|
+
new_val[new_val.length - 1 - suffix_len]
|
|
104
|
+
) {
|
|
94
105
|
suffix_len++;
|
|
95
106
|
}
|
|
96
107
|
|
|
@@ -98,11 +109,15 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
98
109
|
while (suffix_len > 0) {
|
|
99
110
|
let target_split = false;
|
|
100
111
|
if (suffix_len < target.length) {
|
|
101
|
-
target_split =
|
|
112
|
+
target_split =
|
|
113
|
+
!isSpace(target[target.length - 1 - suffix_len]) &&
|
|
114
|
+
!isSpace(target[target.length - suffix_len]);
|
|
102
115
|
}
|
|
103
116
|
let new_split = false;
|
|
104
117
|
if (suffix_len < new_val.length) {
|
|
105
|
-
new_split =
|
|
118
|
+
new_split =
|
|
119
|
+
!isSpace(new_val[new_val.length - 1 - suffix_len]) &&
|
|
120
|
+
!isSpace(new_val[new_val.length - suffix_len]);
|
|
106
121
|
}
|
|
107
122
|
if (target_split || new_split) {
|
|
108
123
|
suffix_len--;
|
|
@@ -116,7 +131,7 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
116
131
|
const idx = target.length - suffix_len;
|
|
117
132
|
if (idx > 0) {
|
|
118
133
|
const charSeq = target.substring(idx - 1, idx + 1);
|
|
119
|
-
if (charSeq ===
|
|
134
|
+
if (charSeq === "**" || charSeq === "__") {
|
|
120
135
|
suffix_len--;
|
|
121
136
|
continue;
|
|
122
137
|
}
|
|
@@ -125,22 +140,24 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
125
140
|
const right = target.substring(target.length - suffix_len);
|
|
126
141
|
const b_count = (right.match(/\*\*/g) || []).length;
|
|
127
142
|
const u2_count = (right.match(/__/g) || []).length;
|
|
128
|
-
const u1_count = (right.replace(/__/g,
|
|
143
|
+
const u1_count = (right.replace(/__/g, "").match(/_/g) || []).length;
|
|
129
144
|
|
|
130
145
|
if (b_count % 2 !== 0) {
|
|
131
|
-
suffix_len -= right.indexOf(
|
|
146
|
+
suffix_len -= right.indexOf("**") + 2;
|
|
132
147
|
continue;
|
|
133
148
|
}
|
|
134
149
|
if (u2_count % 2 !== 0) {
|
|
135
|
-
suffix_len -= right.indexOf(
|
|
150
|
+
suffix_len -= right.indexOf("__") + 2;
|
|
136
151
|
continue;
|
|
137
152
|
}
|
|
138
153
|
if (u1_count % 2 !== 0) {
|
|
139
154
|
let idx_in_right = 0;
|
|
140
155
|
while (idx_in_right < right.length) {
|
|
141
|
-
if (
|
|
142
|
-
|
|
143
|
-
|
|
156
|
+
if (
|
|
157
|
+
right[idx_in_right] === "_" &&
|
|
158
|
+
(idx_in_right === 0 || right[idx_in_right - 1] !== "_") &&
|
|
159
|
+
(idx_in_right === right.length - 1 || right[idx_in_right + 1] !== "_")
|
|
160
|
+
) {
|
|
144
161
|
suffix_len -= idx_in_right + 1;
|
|
145
162
|
break;
|
|
146
163
|
}
|
|
@@ -151,20 +168,26 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
151
168
|
break;
|
|
152
169
|
}
|
|
153
170
|
|
|
154
|
-
if (
|
|
171
|
+
if (
|
|
172
|
+
suffix_len > 0 &&
|
|
173
|
+
/^\s+$/.test(target.substring(target.length - suffix_len))
|
|
174
|
+
) {
|
|
155
175
|
suffix_len = 0;
|
|
156
176
|
}
|
|
157
177
|
|
|
158
178
|
// Absorb balanced wrappers
|
|
159
|
-
for (const marker of [
|
|
179
|
+
for (const marker of ["**", "__", "_"]) {
|
|
160
180
|
const mlen = marker.length;
|
|
161
181
|
const tgt_rem = target.substring(prefix_len, target.length - suffix_len);
|
|
162
182
|
const new_rem = new_val.substring(prefix_len, new_val.length - suffix_len);
|
|
163
183
|
|
|
164
184
|
if (
|
|
165
|
-
tgt_rem.startsWith(marker) &&
|
|
166
|
-
|
|
167
|
-
tgt_rem.
|
|
185
|
+
tgt_rem.startsWith(marker) &&
|
|
186
|
+
new_rem.startsWith(marker) &&
|
|
187
|
+
tgt_rem.endsWith(marker) &&
|
|
188
|
+
new_rem.endsWith(marker) &&
|
|
189
|
+
tgt_rem.length >= 2 * mlen &&
|
|
190
|
+
new_rem.length >= 2 * mlen
|
|
168
191
|
) {
|
|
169
192
|
prefix_len += mlen;
|
|
170
193
|
suffix_len += mlen;
|
|
@@ -174,17 +197,20 @@ export function trim_common_context(target: string, new_val: string): [number, n
|
|
|
174
197
|
return [prefix_len, suffix_len];
|
|
175
198
|
}
|
|
176
199
|
|
|
177
|
-
function _words_to_chars(
|
|
200
|
+
function _words_to_chars(
|
|
201
|
+
text1: string,
|
|
202
|
+
text2: string,
|
|
203
|
+
): [string, string, string[]] {
|
|
178
204
|
const token_array: string[] = [];
|
|
179
205
|
const token_hash: Record<string, number> = {};
|
|
180
|
-
|
|
206
|
+
|
|
181
207
|
// RegExp equivalent to Python's r"(\s+|\w+|[^\w\s])" with unicode support
|
|
182
208
|
const split_pattern = /(\s+|[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s])/gu;
|
|
183
209
|
|
|
184
210
|
const encode_text = (text: string) => {
|
|
185
211
|
// Keep delimiters via capture group in split
|
|
186
212
|
const tokens = text.split(split_pattern).filter(Boolean);
|
|
187
|
-
let encoded_chars =
|
|
213
|
+
let encoded_chars = "";
|
|
188
214
|
for (const token of tokens) {
|
|
189
215
|
if (token in token_hash) {
|
|
190
216
|
encoded_chars += String.fromCharCode(token_hash[token]);
|
|
@@ -201,18 +227,26 @@ function _words_to_chars(text1: string, text2: string): [string, string, string[
|
|
|
201
227
|
return [encode_text(text1), encode_text(text2), token_array];
|
|
202
228
|
}
|
|
203
229
|
|
|
204
|
-
export function generate_edits_from_text(
|
|
230
|
+
export function generate_edits_from_text(
|
|
231
|
+
original_text: string,
|
|
232
|
+
modified_text: string,
|
|
233
|
+
): ModifyText[] {
|
|
205
234
|
const dmp = new diff_match_patch.diff_match_patch();
|
|
206
|
-
|
|
207
|
-
|
|
235
|
+
dmp.Diff_Timeout = 2.0; // Enforce strict 2-second timeout to prevent deep recursion hangs
|
|
236
|
+
|
|
237
|
+
const [chars1, chars2, token_array] = _words_to_chars(
|
|
238
|
+
original_text,
|
|
239
|
+
modified_text,
|
|
240
|
+
);
|
|
208
241
|
const diffs = dmp.diff_main(chars1, chars2, false);
|
|
209
242
|
dmp.diff_cleanupSemantic(diffs);
|
|
210
|
-
|
|
243
|
+
|
|
211
244
|
// Manually map characters back to words to bypass prototype volatility (diff_charsToLines_)
|
|
212
245
|
for (let i = 0; i < diffs.length; i++) {
|
|
213
246
|
const chars = diffs[i][1];
|
|
214
|
-
let text =
|
|
215
|
-
for (let j = 0; j < chars.length; j++)
|
|
247
|
+
let text = "";
|
|
248
|
+
for (let j = 0; j < chars.length; j++)
|
|
249
|
+
text += token_array[chars.charCodeAt(j)];
|
|
216
250
|
diffs[i][1] = text;
|
|
217
251
|
}
|
|
218
252
|
|
|
@@ -221,31 +255,170 @@ export function generate_edits_from_text(original_text: string, modified_text: s
|
|
|
221
255
|
let pending_delete: [number, string] | null = null;
|
|
222
256
|
|
|
223
257
|
for (const [op, text] of diffs) {
|
|
224
|
-
if (op === 0) {
|
|
258
|
+
if (op === 0) {
|
|
259
|
+
// Equal
|
|
225
260
|
if (pending_delete) {
|
|
226
261
|
const [idx, del_txt] = pending_delete;
|
|
227
|
-
edits.push({
|
|
262
|
+
edits.push({
|
|
263
|
+
type: "modify",
|
|
264
|
+
target_text: del_txt,
|
|
265
|
+
new_text: "",
|
|
266
|
+
comment: "Diff: Text deleted",
|
|
267
|
+
_match_start_index: idx,
|
|
268
|
+
});
|
|
228
269
|
pending_delete = null;
|
|
229
270
|
}
|
|
230
271
|
current_original_index += text.length;
|
|
231
|
-
} else if (op === -1) {
|
|
272
|
+
} else if (op === -1) {
|
|
273
|
+
// Delete
|
|
232
274
|
pending_delete = [current_original_index, text];
|
|
233
275
|
current_original_index += text.length;
|
|
234
|
-
} else if (op === 1) {
|
|
276
|
+
} else if (op === 1) {
|
|
277
|
+
// Insert
|
|
235
278
|
if (pending_delete) {
|
|
236
279
|
const [idx, del_txt] = pending_delete;
|
|
237
|
-
edits.push({
|
|
280
|
+
edits.push({
|
|
281
|
+
type: "modify",
|
|
282
|
+
target_text: del_txt,
|
|
283
|
+
new_text: text,
|
|
284
|
+
comment: "Diff: Replacement",
|
|
285
|
+
_match_start_index: idx,
|
|
286
|
+
});
|
|
238
287
|
pending_delete = null;
|
|
239
288
|
} else {
|
|
240
|
-
edits.push({
|
|
289
|
+
edits.push({
|
|
290
|
+
type: "modify",
|
|
291
|
+
target_text: "",
|
|
292
|
+
new_text: text,
|
|
293
|
+
comment: "Diff: Text inserted",
|
|
294
|
+
_match_start_index: current_original_index,
|
|
295
|
+
});
|
|
241
296
|
}
|
|
242
297
|
}
|
|
243
298
|
}
|
|
244
299
|
|
|
245
300
|
if (pending_delete) {
|
|
246
301
|
const [idx, del_txt] = pending_delete;
|
|
247
|
-
edits.push({
|
|
302
|
+
edits.push({
|
|
303
|
+
type: "modify",
|
|
304
|
+
target_text: del_txt,
|
|
305
|
+
new_text: "",
|
|
306
|
+
comment: "Diff: Text deleted",
|
|
307
|
+
_match_start_index: idx,
|
|
308
|
+
});
|
|
248
309
|
}
|
|
249
310
|
|
|
250
311
|
return edits;
|
|
251
|
-
}
|
|
312
|
+
}
|
|
313
|
+
/**
 * Builds a line-based, unified-style text diff between two strings.
 *
 * The output starts with `--- Original` / `+++ Modified`, followed by one
 * `@@ ... @@` section per group of changes. Removed lines are prefixed with
 * `-`, added lines with `+`, and unchanged context lines with a single space.
 * An unchanged run between two changes is kept inside the same section when it
 * has at most `2 * context_lines` lines; a longer run ends the section.
 *
 * @param original_text Text before the change.
 * @param modified_text Text after the change.
 * @param context_lines Number of unchanged lines shown around each section.
 *   Zero, negative or NaN values mean "no context".
 * @returns The diff text, or an empty string when there are no changes.
 */
export function create_unified_diff(
  original_text: string,
  modified_text: string,
  context_lines: number = 3,
): string {
  const dmp = new diff_match_patch.diff_match_patch();
  dmp.Diff_Timeout = 2.0;

  // Clamp so that 0 / negative / NaN all mean "no context". Without the clamp,
  // `slice(-0)` is `slice(0)` and would return the WHOLE preceding block.
  const context = context_lines > 0 ? context_lines : 0;

  // A diff segment holds whole lines; drop the final newline before splitting
  // so it does not produce a spurious empty trailing line.
  const to_lines = (text: string): string[] =>
    text.replace(/\n$/, "").split("\n");
  const as_context = (lines: string[]): string[] => lines.map((l) => ` ${l}`);

  // Diff line-by-line: encode lines as characters, diff, then decode in place.
  const encoded = dmp.diff_linesToChars_(original_text, modified_text);
  const diffs = dmp.diff_main(encoded.chars1, encoded.chars2, false);
  dmp.diff_charsToLines_(diffs, encoded.lineArray);

  const output: string[] = ["--- Original", "+++ Modified"];

  let i = 0;
  while (i < diffs.length) {
    // Skip unchanged segments until the next change.
    while (i < diffs.length && diffs[i][0] === 0) i++;
    if (i >= diffs.length) break;

    const chunk: string[] = [];

    // Leading context: the tail of the unchanged segment just before the change.
    if (context > 0 && i > 0 && diffs[i - 1][0] === 0) {
      chunk.push(...as_context(to_lines(diffs[i - 1][1]).slice(-context)));
    }

    // Consume changes, folding in short unchanged runs, until a long unchanged
    // run (more than twice the context) closes the section.
    while (i < diffs.length) {
      const [op, text] = diffs[i];
      const lines = to_lines(text);

      if (op === 0) {
        if (lines.length > context * 2) break;
        chunk.push(...as_context(lines));
      } else {
        const prefix = op === -1 ? "-" : "+";
        chunk.push(...lines.map((l) => `${prefix}${l}`));
      }
      i++;
    }

    // Trailing context: the head of the unchanged segment that closed the section.
    if (i < diffs.length && diffs[i][0] === 0) {
      chunk.push(...as_context(to_lines(diffs[i][1]).slice(0, context)));
    }

    output.push("@@ ... @@", ...chunk);
  }

  // Only the two header lines means nothing changed.
  if (output.length === 2) return "";
  return output.join("\n");
}
|
|
372
|
+
|
|
373
|
+
/**
 * Renders the word-level edits between two texts as a "Word Patch" listing.
 *
 * The output starts with `--- <original_path>` / `+++ <modified_path>` and a
 * blank line. Each edit from `generate_edits_from_text` then produces:
 *
 *   @@ Word Patch @@
 *    <up to 40 chars of original text before the change>
 *   - <removed text>      (omitted when nothing is removed)
 *   + <inserted text>     (omitted when nothing is inserted)
 *    <up to 40 chars of original text after the change>
 *   <blank line>
 *
 * Context that was cut short is marked with `...`. Newlines inside context are
 * replaced by spaces and carriage returns are dropped; the removed/inserted
 * text itself is emitted verbatim.
 *
 * @param original_text Text before the change; context is taken from it.
 * @param modified_text Text after the change.
 * @param original_path Label for the `---` header line.
 * @param modified_path Label for the `+++` header line.
 * @returns The patch listing. When there are no edits it consists of the two
 *   header lines followed by a newline.
 */
export function create_word_patch_diff(
  original_text: string,
  modified_text: string,
  original_path: string = "Original",
  modified_path: string = "Modified",
): string {
  // Characters of original text shown on each side of a change.
  const CONTEXT_SIZE = 40;

  // Context must fit on one output line.
  const to_single_line = (text: string): string =>
    text.replace(/\n/g, " ").replace(/\r/g, "");

  const output: string[] = [`--- ${original_path}`, `+++ ${modified_path}`, ""];

  for (const edit of generate_edits_from_text(original_text, modified_text)) {
    const raw_start = edit._match_start_index ?? 0;
    const raw_target = edit.target_text ?? "";
    const raw_new = edit.new_text ?? "";

    // Drop the prefix/suffix the two sides share so only the real change shows.
    const [prefix_len, suffix_len] = trim_common_context(raw_target, raw_new);
    const display_target = raw_target.substring(
      prefix_len,
      raw_target.length - suffix_len,
    );
    const display_new = raw_new.substring(prefix_len, raw_new.length - suffix_len);

    // Position of the trimmed change inside the original text.
    const change_start = raw_start + prefix_len;
    const change_end = change_start + display_target.length;

    const pre_start = Math.max(0, change_start - CONTEXT_SIZE);
    const pre_context =
      (pre_start > 0 ? "..." : "") +
      original_text.substring(pre_start, change_start);

    const post_end = Math.min(original_text.length, change_end + CONTEXT_SIZE);
    const post_context =
      original_text.substring(change_end, post_end) +
      (post_end < original_text.length ? "..." : "");

    output.push("@@ Word Patch @@");
    output.push(` ${to_single_line(pre_context)}`);
    if (display_target) output.push(`- ${display_target}`);
    if (display_new) output.push(`+ ${display_new}`);
    output.push(` ${to_single_line(post_context)}`);
    output.push("");
  }

  return output.join("\n");
}
|