@adeu/core 1.6.6 → 1.6.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1774 -957
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +21 -8
- package/dist/index.d.ts +21 -8
- package/dist/index.js +1772 -957
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/diff.ts +54 -0
- package/src/docx/bridge.ts +15 -3
- package/src/domain.test.ts +280 -0
- package/src/domain.ts +264 -10
- package/src/index.ts +7 -8
- package/src/ingest.ts +8 -0
- package/src/sanitize/core.ts +104 -0
- package/src/sanitize/report.ts +125 -0
- package/src/sanitize/sanitize.test.ts +192 -0
- package/src/sanitize/transforms.ts +365 -0
- package/src/utils/docx.ts +12 -3
package/package.json
CHANGED
package/src/diff.ts
CHANGED
|
@@ -248,4 +248,58 @@ export function generate_edits_from_text(original_text: string, modified_text: s
|
|
|
248
248
|
}
|
|
249
249
|
|
|
250
250
|
return edits;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/**
 * Builds a simplified, line-based unified-style diff between two texts.
 *
 * The output starts with the fixed headers `--- Original` / `+++ Modified`.
 * Each hunk is introduced by the literal marker `@@ ... @@` (no line numbers
 * are computed) and lists context lines prefixed with a space, removed lines
 * prefixed with `-` and added lines prefixed with `+`.
 *
 * @param original_text - Text before the change.
 * @param modified_text - Text after the change.
 * @param context_lines - Number of unchanged lines shown around each change.
 *   Zero or negative values produce hunks without surrounding context.
 * @returns The diff text, or an empty string when no change was found.
 */
export function create_unified_diff(original_text: string, modified_text: string, context_lines: number = 3): string {
  const dmp = new diff_match_patch.diff_match_patch();
  // Line-mode diff: map each line to a single char, diff, then map back.
  const a = dmp.diff_linesToChars_(original_text, modified_text);
  const diffs = dmp.diff_main(a.chars1, a.chars2, false);
  dmp.diff_charsToLines_(diffs, a.lineArray);

  // Splits a diff payload into lines, ignoring one trailing newline.
  const to_lines = (text: string): string[] => text.replace(/\n$/, '').split('\n');
  // `slice(-0)` equals `slice(0)` and would return every line, so a
  // non-positive context size has to be handled explicitly.
  const last_context = (lines: string[]): string[] => (context_lines > 0 ? lines.slice(-context_lines) : []);
  const first_context = (lines: string[]): string[] => (context_lines > 0 ? lines.slice(0, context_lines) : []);
  const as_context = (lines: string[]): string[] => lines.map(l => ` ${l}`);

  const output: string[] = ['--- Original', '+++ Modified'];

  let i = 0;
  while (i < diffs.length) {
    // Skip unchanged runs until the next insertion or deletion.
    while (i < diffs.length && diffs[i][0] === 0) i++;
    if (i >= diffs.length) break;

    const chunk: string[] = [];

    // Leading context: tail of the unchanged run right before the change.
    if (i > 0 && diffs[i - 1][0] === 0) {
      chunk.push(...as_context(last_context(to_lines(diffs[i - 1][1]))));
    }

    while (i < diffs.length) {
      const [op, text] = diffs[i];
      const lines = to_lines(text);

      if (op === 0) {
        // A long unchanged run ends the hunk; a short one is kept whole so
        // that neighbouring changes are merged into a single hunk.
        if (lines.length > context_lines * 2) break;
        chunk.push(...as_context(lines));
      } else {
        const prefix = op === -1 ? '-' : '+';
        chunk.push(...lines.map(l => `${prefix}${l}`));
      }
      i++;
    }

    // Trailing context: head of the unchanged run that ended the hunk.
    if (i < diffs.length && diffs[i][0] === 0) {
      chunk.push(...as_context(first_context(to_lines(diffs[i][1]))));
    }

    output.push('@@ ... @@');
    output.push(...chunk);
  }

  if (output.length === 2) return ''; // Only the two headers: no changes
  return output.join('\n');
}
|
package/src/docx/bridge.ts
CHANGED
|
@@ -26,11 +26,12 @@ export class Part {
|
|
|
26
26
|
public addRelationship(id: string, type: string, target: string, isExternal: boolean = false) {
|
|
27
27
|
this.rels.set(id, new Relationship(id, type, target, isExternal));
|
|
28
28
|
|
|
29
|
-
//
|
|
30
|
-
if (this.
|
|
29
|
+
// Directly append the relationship element to the document structure
|
|
30
|
+
if (this.partname.endsWith('.rels')) {
|
|
31
31
|
const doc = this._element.ownerDocument;
|
|
32
32
|
if (doc) {
|
|
33
|
-
|
|
33
|
+
// Use strict namespace to ensure it parses successfully on reload
|
|
34
|
+
const relEl = doc.createElementNS('http://schemas.openxmlformats.org/package/2006/relationships', 'Relationship');
|
|
34
35
|
relEl.setAttribute('Id', id);
|
|
35
36
|
relEl.setAttribute('Type', type);
|
|
36
37
|
relEl.setAttribute('Target', target);
|
|
@@ -176,6 +177,17 @@ export class DocumentObject {
|
|
|
176
177
|
relsPart.addRelationship(id, relType, target, false);
|
|
177
178
|
}
|
|
178
179
|
|
|
180
|
+
/**
 * Registers a relationship flagged as external on this document's part and
 * on the matching rels part, using the first unused `rId<N>` identifier.
 *
 * @param target - Relationship target.
 * @param relType - Relationship type URI.
 * @returns The relationship id that was allocated.
 */
public relateToExternal(target: string, relType: string): string {
  // Probe rId1, rId2, ... until an id not present in the rels map is found.
  let counter = 1;
  let id = `rId${counter}`;
  while (this.part.rels.has(id)) {
    counter += 1;
    id = `rId${counter}`;
  }

  const relationship = new Relationship(id, relType, target, true);
  this.part.rels.set(id, relationship);

  this.pkg.getOrCreateRelsPart(this.part.partname).addRelationship(id, relType, target, true);
  return id;
}
|
|
190
|
+
|
|
179
191
|
public async save(): Promise<Buffer> {
|
|
180
192
|
for (const part of this.pkg.parts) {
|
|
181
193
|
let xmlStr = serializeXml(part._element.ownerDocument || part._element);
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { createTestDocument, addParagraph } from './test-utils.js';
|
|
3
|
+
import { DocumentObject } from './docx/bridge.js';
|
|
4
|
+
import { extractTextFromBuffer } from './ingest.js';
|
|
5
|
+
import { RedlineEngine, BatchValidationError } from './engine.js';
|
|
6
|
+
import { ModifyText } from './models.js';
|
|
7
|
+
import { split_structural_appendix } from './pagination.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Test helper: appends a `w:bookmarkStart` / `w:bookmarkEnd` pair to a
 * paragraph, optionally with a text run between the two markers.
 *
 * @param paragraph - Paragraph element receiving the new children.
 * @param name - Value of the bookmark's `w:name` attribute.
 * @param idVal - Value of `w:id` on both markers.
 * @param text - Text placed between the markers; no run is added when empty.
 */
function addBookmark(paragraph: Element, name: string, idVal: string = "0", text: string = "") {
  const ownerDoc = paragraph.ownerDocument!;

  const bookmarkStart = ownerDoc.createElement('w:bookmarkStart');
  bookmarkStart.setAttribute('w:name', name);
  bookmarkStart.setAttribute('w:id', idVal);
  paragraph.appendChild(bookmarkStart);

  if (text) {
    const run = ownerDoc.createElement('w:r');
    const textEl = ownerDoc.createElement('w:t');
    textEl.textContent = text;
    // Mark the text as whitespace-preserving whenever it contains a space.
    const containsSpace = text.includes(' ');
    if (containsSpace) {
      textEl.setAttribute('xml:space', 'preserve');
    }
    run.appendChild(textEl);
    paragraph.appendChild(run);
  }

  const bookmarkEnd = ownerDoc.createElement('w:bookmarkEnd');
  bookmarkEnd.setAttribute('w:id', idVal);
  paragraph.appendChild(bookmarkEnd);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Test helper: appends a `w:fldSimple` field carrying a `REF` instruction
 * for the given bookmark name, with a single text run as the field content.
 *
 * @param paragraph - Paragraph element receiving the field.
 * @param refName - Bookmark name placed in the `REF` instruction.
 * @param text - Text of the run inside the field.
 */
function addCrossReference(paragraph: Element, refName: string, text: string) {
  const ownerDoc = paragraph.ownerDocument!;

  const field = ownerDoc.createElement('w:fldSimple');
  field.setAttribute('w:instr', ` REF ${refName} \\h `);

  const run = ownerDoc.createElement('w:r');
  const textEl = ownerDoc.createElement('w:t');
  textEl.textContent = text;
  // Mark the text as whitespace-preserving whenever it contains a space.
  const containsSpace = text.includes(' ');
  if (containsSpace) {
    textEl.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textEl);
  field.appendChild(run);
  paragraph.appendChild(field);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Test helper: registers `url` as an external hyperlink relationship on the
 * document and appends a `w:hyperlink` element that points at it.
 *
 * @param docObj - Document used to allocate the external relationship id.
 * @param paragraph - Paragraph element receiving the hyperlink.
 * @param url - Link target.
 * @param text - Text of the run inside the hyperlink.
 */
function addHyperlink(docObj: DocumentObject, paragraph: Element, url: string, text: string) {
  const relationshipId = docObj.relateToExternal(
    url,
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
  );

  const ownerDoc = paragraph.ownerDocument!;
  const link = ownerDoc.createElement('w:hyperlink');
  link.setAttribute('r:id', relationshipId);

  const run = ownerDoc.createElement('w:r');
  const textEl = ownerDoc.createElement('w:t');
  textEl.textContent = text;
  // Mark the text as whitespace-preserving whenever it contains a space.
  const containsSpace = text.includes(' ');
  if (containsSpace) {
    textEl.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textEl);
  link.appendChild(run);
  paragraph.appendChild(link);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Test helper: adds a `/word/footnotes.xml` part containing a separator
 * footnote (id -1) and one content footnote (id 1), then relates it to the
 * document with the footnotes relationship type.
 *
 * @param docObj - Document whose package receives the footnotes part.
 */
function setupFootnotesPart(docObj: DocumentObject) {
  const footnotesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:type="separator" w:id="-1">
<w:p><w:r><w:separator/></w:r></w:p>
</w:footnote>
<w:footnote w:id="1">
<w:p><w:r><w:t>Footnote content.</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>`;

  const footnotesPart = docObj.pkg.addPart(
    '/word/footnotes.xml',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml',
    footnotesXml
  );

  docObj.relateTo(
    footnotesPart,
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes'
  );
}
|
|
76
|
+
|
|
77
|
+
/**
 * Builds the fixture document shared by the tests in this file and returns
 * the result of saving it. The document contains numbered headings with
 * quoted definitions, a bookmark and a cross-reference to it, bookmarks named
 * `_Ref12345`, `_GoBack` and `_Toc123456789`, one footnote reference, one
 * external hyperlink and a cross-reference to `_Ref12345`.
 */
async function createDomainSemanticsStream() {
  const doc = await createTestDocument();

  // Inserts an empty w:pPr as the first child of a paragraph.
  const prependParagraphProps = (paragraph: Element) => {
    paragraph.insertBefore(paragraph.ownerDocument!.createElement('w:pPr'), paragraph.firstChild);
  };

  // Appends a run whose text is always flagged xml:space="preserve".
  const appendPreservedRun = (paragraph: Element, content: string) => {
    const run = paragraph.ownerDocument!.createElement('w:r');
    const textEl = paragraph.ownerDocument!.createElement('w:t');
    textEl.textContent = content;
    textEl.setAttribute('xml:space', 'preserve');
    run.appendChild(textEl);
    paragraph.appendChild(run);
  };

  // 1. Appendix / Definitions
  const definitionsHeading = addParagraph(doc, "1. Definitions");
  prependParagraphProps(definitionsHeading);
  addParagraph(doc, '"Affiliate" means any entity that controls, is controlled by, or is under common control.');
  addParagraph(doc, "“Confidential Information” means all non-public information disclosed by one party to the other.");
  addParagraph(doc, "This paragraph does not define anything.");

  const obligationsHeading = addParagraph(doc, "2. Obligations");
  prependParagraphProps(obligationsHeading);
  addParagraph(doc, "The Affiliate shall protect the Confidential Information to the highest standard.");

  // 3. Bookmarks and Cross-References
  const bookmarked = addParagraph(doc, "Subject to ");
  addBookmark(bookmarked, "MyBookmark_1", "1", "Anchored Clause");
  appendPreservedRun(bookmarked, ", the parties agree to...");

  const referencing = addParagraph(doc, "As strictly stated in ");
  addCrossReference(referencing, "MyBookmark_1", "Anchored Clause");
  appendPreservedRun(referencing, ", either party may terminate.");

  // 4. Internal Anchors
  const anchorParagraph = addParagraph(doc, "Section 5. Indemnification");
  addBookmark(anchorParagraph, "_Ref12345", "0");

  const noiseParagraph = addParagraph(doc, "Some text.");
  addBookmark(noiseParagraph, "_GoBack", "2");
  addBookmark(noiseParagraph, "_Toc123456789", "3");

  // 5. Footnotes
  const footnoteParagraph = addParagraph(doc, "Sentence with footnote");
  const footnoteRun = footnoteParagraph.ownerDocument!.createElement('w:r');
  const footnoteRef = footnoteParagraph.ownerDocument!.createElement('w:footnoteReference');
  footnoteRef.setAttribute('w:id', "1");
  footnoteRun.appendChild(footnoteRef);
  footnoteParagraph.appendChild(footnoteRun);
  setupFootnotesPart(doc);

  // 6. Links and Cross references
  const linkParagraph = addParagraph(doc, "Please visit ");
  addHyperlink(doc, linkParagraph, "https://adeu.com", "Adeu HQ");

  const xrefParagraph = addParagraph(doc, "As detailed in ");
  addCrossReference(xrefParagraph, "_Ref12345", "Section 5");

  return doc.save();
}
|
|
136
|
+
|
|
137
|
+
// Suite covering text projection of domain structures (defined terms,
// anchors, footnotes, links), rejection of edits that would corrupt them,
// footnote editing, and the defined-term / typo diagnostics.
describe('Domain Semantics Engine', () => {
  // Loads the shared fixture document into a fresh redline engine.
  const loadFixtureEngine = async () => {
    const fixtureBuffer = await createDomainSemanticsStream();
    const document = await DocumentObject.load(fixtureBuffer);
    return { document, engine: new RedlineEngine(document) };
  };

  it('extracts and projects structural appendix and diagnostics correctly', async () => {
    const fixtureBuffer = await createDomainSemanticsStream();
    const text = await extractTextFromBuffer(fixtureBuffer);

    const expectedSnippets = [
      "<!-- READONLY_BOUNDARY_START -->",
      "# Document Structure (Read-Only)",
      // Definitions
      "## Defined Terms",
      '"Affiliate"',
      '"Confidential Information"',
      "used 1 times",
      // Named Anchors & Back-References
      "## Named Anchors",
      "MyBookmark_1",
      "Anchored to:",
      "Referenced from:",
      // Internal anchors
      "{#_Ref12345}",
      "Section 5. Indemnification{#_Ref12345}",
    ];
    for (const snippet of expectedSnippets) {
      expect(text).toContain(snippet);
    }

    // Noise suppression
    expect(text).not.toContain("{#_GoBack}");
    expect(text).not.toContain("{#_Toc123456789}");

    const expectedTail = [
      // Footnotes
      "[^fn-1]",
      "## Footnotes",
      "[^fn-1]: Footnote content.",
      // Links
      "[Adeu HQ](https://adeu.com)",
      "[~Section 5~](#_Ref12345)",
    ];
    for (const snippet of expectedTail) {
      expect(text).toContain(snippet);
    }
  });

  // Each case is an edit that must be rejected; errChecker receives the
  // lower-cased, newline-joined validation errors.
  const edgeCases = [
    { target: "# Document Structure (Read-Only)", newText: "# Modified Document Structure", errChecker: (m: string) => m.includes('read-only boundary') || m.includes('appendix') },
    { target: "Sentence with footnote[^fn-1]", newText: "Sentence with footnote", errChecker: (m: string) => m.includes('footnote') && (m.includes('delete') || m.includes('remove')) },
    { target: "Sentence with footnote", newText: "Sentence with footnote[^fn-99]", errChecker: (m: string) => m.includes('footnote') && (m.includes('insert') || m.includes('create')) },
    { target: "Some text.", newText: "Some text.{#_Ref99999}", errChecker: (m: string) => m.includes('anchor') },
    { target: "Section 5. Indemnification{#_Ref12345}", newText: "Section 5. Indemnification{#_Ref99999}", errChecker: (m: string) => m.includes('anchor') },
    { target: "[~Section 5~](#_Ref12345)", newText: "[~Section 6~](#_Ref12345)", errChecker: (m: string) => m.includes('cross-reference') || m.includes('rejected') },
    { target: "[~Section 5~](#_Ref12345)", newText: "[~Section 5~](#_Ref99999)", errChecker: (m: string) => m.includes('dependency corruption') || m.includes('rejected') },
    { target: "As detailed in [~Section 5~](#_Ref12345)", newText: "As detailed in [~Section 5~](#_Ref12345) and [~Section 6~](#_Ref999)", errChecker: (m: string) => m.includes('cross-reference') || m.includes('read-only') },
    { target: "As detailed in [~Section 5~](#_Ref12345)", newText: "As detailed in nothing", errChecker: (m: string) => m.includes('cross-reference') || m.includes('delete') },
    { target: "Please visit [Adeu HQ](https://adeu.com)", newText: "Please visit [Adeu HQ](https://adeu.com) and [Google](https://google.com)", errChecker: (m: string) => m.includes('hyperlink') || m.includes('insert') },
    { target: "Please visit [Adeu HQ](https://adeu.com)", newText: "Please visit nothing", errChecker: (m: string) => m.includes('hyperlink') || m.includes('delete') },
  ];

  edgeCases.forEach((tc) => {
    it(`rejects invalid edits: ${tc.target} -> ${tc.newText}`, async () => {
      const { engine } = await loadFixtureEngine();
      const edit: ModifyText = { type: 'modify', target_text: tc.target, new_text: tc.newText };

      let threw = false;
      let caught: unknown;
      try {
        engine.process_batch([edit]);
      } catch (err) {
        threw = true;
        caught = err;
      }

      if (threw) {
        // Anything other than a validation failure is an unexpected error.
        if (!(caught instanceof BatchValidationError)) throw caught;
        const msg = caught.errors.join('\n').toLowerCase();
        expect(tc.errChecker(msg)).toBe(true);
      }
      expect(threw).toBe(true);
    });
  });

  it('safely edits footnotes and accepts changes', async () => {
    const { document, engine } = await loadFixtureEngine();

    const edit: ModifyText = { type: 'modify', target_text: "Footnote content.", new_text: "This is an edited footnote." };
    const stats = engine.process_batch([edit]);
    expect(stats.edits_applied).toBe(1);

    engine.accept_all_revisions();
    const savedBuffer = await document.save();
    const cleanText = await extractTextFromBuffer(savedBuffer, true);

    expect(cleanText).toContain("[^fn-1]: This is an edited footnote.");
  });

  it('extracts defined terms and finds typos correctly', async () => {
    const doc = await createTestDocument();
    const paragraphs = [
      '"Agreement" means this contract.',
      "“Party” shall mean either side.",
      '"Agreement" means another thing.', // Duplicate
      'This contract (hereinafter, the "Contract") is valid.',
      '"Confidential Information" on salainen asia.',
      '1.1 "Affiliate" tarkoittaa osakkuusyhtiötä.',
      'We will act as the disclosing party (jäljempänä "Discloser").',
      'This is a syntax example: ("Heading*") and ("<Term>")',
      "The Agreement is binding. The Contract is signed.",
      "There is an Agrement here.",
      "We shared Confidential Information with the Affiliate. The Discloser is happy.",
    ];
    for (const paragraphText of paragraphs) {
      addParagraph(doc, paragraphText);
    }

    const savedBuffer = await doc.save();
    const full_text = await extractTextFromBuffer(savedBuffer, false);
    const [, appendix] = split_structural_appendix(full_text);

    for (const term of ['"Agreement"', '"Contract"', '"Confidential Information"', '"Affiliate"', '"Discloser"']) {
      expect(appendix).toContain(`${term} \u2014 used`);
    }

    for (const absent of ['"Party"', '"Heading*"', '"<Term>"']) {
      expect(appendix).not.toContain(absent);
    }

    expect(appendix).toContain("[Error] Duplicate Definition: 'Agreement' is defined multiple times.");
    expect(appendix).toContain("[Info] Possible Typos for 'Agreement': Found 'Agrement'");
  });

  it('reduces typo noise for short acronyms', async () => {
    const doc = await createTestDocument();
    const paragraphs = [
      '"PSUs" means power supply units.',
      '"CPU" means central processing unit.',
      '"Party" means the entity.',
      "We rely on ESAs, LSPs, and GPUs for the servers.",
      "The GPU is very fast.",
      "The Pary signed the contract.",
      "We bought PSUs and a CPU.",
      "The Party begins today.",
    ];
    for (const paragraphText of paragraphs) {
      addParagraph(doc, paragraphText);
    }

    const savedBuffer = await doc.save();
    const full_text = await extractTextFromBuffer(savedBuffer, false);
    const [, appendix] = split_structural_appendix(full_text);

    expect(appendix).toContain("[Info] Possible Typos for 'Party': Found 'Pary'");
    for (const acronym of ["'GPU'", "'GPUs'", "'ESAs'", "'LSPs'"]) {
      expect(appendix).not.toContain(acronym);
    }
  });
});
|
package/src/domain.ts
CHANGED
|
@@ -1,11 +1,265 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
return
|
|
1
|
+
import { DocumentObject } from './docx/bridge.js';
|
|
2
|
+
import { Paragraph, Run } from './docx/primitives.js';
|
|
3
|
+
import { iter_block_items, get_run_text } from './utils/docx.js';
|
|
4
|
+
import { findAllDescendants } from './docx/dom.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Levenshtein edit distance between two strings, capped at `maxDist`.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about.
 * @returns The exact distance when it is at most `maxDist`; otherwise
 *   `maxDist + 1` as an "over the limit" marker.
 */
function boundedLevenshtein(a: string, b: string, maxDist: number = 2): number {
  const overLimit = maxDist + 1;

  if (a === b) return 0;
  // The length difference is a lower bound on the distance.
  if (Math.abs(a.length - b.length) > maxDist) return overLimit;
  if (a.length === 0) return b.length <= maxDist ? b.length : overLimit;
  if (b.length === 0) return a.length <= maxDist ? a.length : overLimit;

  // Keep the DP row as short as possible by indexing it with the shorter string.
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];

  let previous: number[] = [];
  for (let k = 0; k <= shorter.length; k++) previous.push(k);

  for (let i = 1; i <= longer.length; i++) {
    const current = [i];
    let rowMinimum = i;

    for (let j = 1; j <= shorter.length; j++) {
      const substitution = previous[j - 1] + (shorter[j - 1] === longer[i - 1] ? 0 : 1);
      const deletion = previous[j] + 1;
      const insertion = current[j - 1] + 1;
      const cell = Math.min(deletion, insertion, substitution);
      current.push(cell);
      if (cell < rowMinimum) rowMinimum = cell;
    }

    // Once every cell of a row exceeds the cap, the result cannot recover.
    if (rowMinimum > maxDist) return overLimit;
    previous = current;
  }

  const distance = previous[shorter.length];
  return distance <= maxDist ? distance : overLimit;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Concatenates the text of every `w:r` descendant of a paragraph, in the
 * order returned by `findAllDescendants`.
 *
 * @param p - Paragraph whose runs are read.
 * @returns The joined run text; an empty string when there are no runs.
 */
function _get_paragraph_text(p: Paragraph): string {
  let combined = '';
  for (const runElement of findAllDescendants(p._element, 'w:r')) {
    const run = new Run(runElement, p);
    combined = combined + get_run_text(run);
  }
  return combined;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Collects domain metadata from a document in a single pass over its
 * paragraphs, then derives usage counts and diagnostics from `base_text`.
 *
 * - Defined terms: quoted, capitalised phrases at the start of a paragraph
 *   (optionally after a numbering prefix) or inside parentheses. Terms that
 *   never occur unquoted in `base_text` are dropped.
 * - Anchors: `w:bookmarkStart` names that do not start with `_`, plus those
 *   starting with `_Ref`. `REF` fields (`w:fldSimple` / `w:instrText`) are
 *   attached to the anchor they target.
 * - Diagnostics: duplicate definitions (`[Error]`) and capitalised phrases
 *   within a small edit distance of a defined term (`[Info]`), sorted by
 *   severity and then alphabetically.
 *
 * @param doc - Document whose block items are scanned.
 * @param base_text - Projected text used for usage counting and typo search.
 * @returns A tuple of `[definitions, diagnostics, anchors]`.
 */
export function extract_all_domain_metadata(
  doc: DocumentObject,
  base_text: string
): [Record<string, { count: number }>, string[], Record<string, { anchored_to: string; referenced_from: string[] }>] {
  // Keys below come from document content. A plain truthiness lookup would
  // resolve names such as "constructor" or "__proto__" to Object.prototype
  // members, so presence is always tested as an own property.
  const has_own = (obj: object, key: string): boolean => Object.prototype.hasOwnProperty.call(obj, key);

  const definitions: Record<string, { count: number }> = {};
  const duplicates = new Set<string>();
  const raw_anchors: Record<string, { anchored_to: string; referenced_from: string[] }> = {};
  const raw_references: [string, string][] = [];

  const leading_re = /^(?:[\d.\-()a-zA-Z]+\s*)?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”]/;
  const inline_re = /\([^)]*?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”][^)]*?\)/g;

  for (const item of iter_block_items(doc)) {
    if (!(item instanceof Paragraph)) continue;

    const text = _get_paragraph_text(item).trim();
    if (!text) continue;

    const extracted_terms: string[] = [];
    const leading_match = text.match(leading_re);
    if (leading_match) extracted_terms.push(leading_match[1].trim());

    const inline_matches = text.matchAll(inline_re);
    for (const m of inline_matches) {
      extracted_terms.push(m[1].trim());
    }

    for (const term of extracted_terms) {
      if (has_own(definitions, term)) duplicates.add(term);
      else definitions[term] = { count: 0 };
    }

    // Anchors and references are described by a short excerpt of the paragraph.
    const short_text = text.length > 60 ? text.substring(0, 60) + '...' : text;

    const nodes = findAllDescendants(item._element, '*');
    for (const node of nodes) {
      if (node.tagName === 'w:bookmarkStart') {
        const b_name = node.getAttribute('w:name');
        // Underscore-prefixed bookmarks are skipped, except `_Ref...` ones.
        if (b_name && (!b_name.startsWith('_') || b_name.startsWith('_Ref'))) {
          // The first occurrence of a bookmark name wins.
          if (!has_own(raw_anchors, b_name)) {
            raw_anchors[b_name] = { anchored_to: short_text, referenced_from: [] };
          }
        }
      }

      let target: string | null = null;
      if (node.tagName === 'w:fldSimple') {
        const instr = node.getAttribute('w:instr') || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      } else if (node.tagName === 'w:instrText') {
        const instr = node.textContent || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      }

      if (target) raw_references.push([target, short_text]);
    }
  }

  // Resolve references only after all anchors are known, so a reference may
  // precede its anchor in document order.
  for (const [target, ref_text] of raw_references) {
    if (has_own(raw_anchors, target)) {
      raw_anchors[target].referenced_from.push(ref_text);
    }
  }

  const diagnostics: string[] = [];

  const def_keys = Object.keys(definitions);
  if (def_keys.length > 0) {
    // Longest terms first so the alternation prefers the longest match.
    const sorted_terms = def_keys.sort((a, b) => b.length - a.length);
    const escapeRegExp = (str: string) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const alt = sorted_terms.map(escapeRegExp).join('|');
    // Occurrences wrapped in quotes are definitions, not usages.
    const usage_pattern = new RegExp(`(?<!["“])\\b(${alt})\\b(?![”"])`, 'g');

    for (const m of base_text.matchAll(usage_pattern)) {
      const matched_term = m[1];
      if (has_own(definitions, matched_term)) definitions[matched_term].count++;
    }

    // Quoted phrases that are never used are not treated as defined terms.
    for (const term of def_keys) {
      if (definitions[term].count === 0) {
        delete definitions[term];
        duplicates.delete(term);
      }
    }
  }

  for (const term of duplicates) {
    diagnostics.push(`[Error] Duplicate Definition: '${term}' is defined multiple times.`);
  }

  const stop_words = new Set([
    "The", "This", "That", "Such", "A", "An", "Any", "All", "Some", "No",
    "Every", "Each", "As", "In", "Of", "For", "To", "On", "By", "With"
  ]);

  // Runs of capitalised words are the typo candidates.
  const all_cap_pattern = /\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b/g;
  const all_caps = new Set(base_text.match(all_cap_pattern) || []);

  const valid_terms = new Set(Object.keys(definitions));
  const terms_by_first_letter: Record<string, string[]> = {};
  for (const term of valid_terms) {
    const fl = term[0].toLowerCase();
    if (!terms_by_first_letter[fl]) terms_by_first_letter[fl] = [];
    terms_by_first_letter[fl].push(term);
  }

  const candidates_by_term: Record<string, string[]> = {};

  for (const raw_candidate of all_caps) {
    let candidate = raw_candidate.trim();
    // Strip leading stop words such as "The" from the candidate phrase.
    const words = candidate.split(/\s+/);
    while (words.length > 0) {
      const first = words[0];
      const title = first.charAt(0).toUpperCase() + first.slice(1).toLowerCase();
      if (stop_words.has(title)) words.shift();
      else break;
    }
    candidate = words.join(' ');

    if (candidate.length < 4) continue;
    if (valid_terms.has(candidate)) continue;

    const first_letter = candidate[0].toLowerCase();
    let candidate_terms = terms_by_first_letter[first_letter] || [];

    // Longer candidates are also compared against terms with another initial.
    if (candidate.length > 5) {
      for (const [k, v] of Object.entries(terms_by_first_letter)) {
        if (k !== first_letter) candidate_terms = candidate_terms.concat(v);
      }
    }

    for (const term of candidate_terms) {
      if (Math.abs(candidate.length - term.length) > 2) continue;
      // Simple plural forms of a term are not typos.
      if (candidate === term + 's' || candidate === term + 'es') continue;
      if (term === candidate + 's' || term === candidate + 'es') continue;

      const dist = boundedLevenshtein(candidate, term, 2);
      if (dist === 0 || dist > 2) continue;

      // Short terms need a closer match that also shares the first letter.
      if (term.length <= 5) {
        if (dist > 1) continue;
        if (candidate[0].toLowerCase() !== term[0].toLowerCase()) continue;
      }

      if (!candidates_by_term[term]) candidates_by_term[term] = [];
      if (!candidates_by_term[term].includes(candidate)) candidates_by_term[term].push(candidate);
    }
  }

  for (const [term, candidates] of Object.entries(candidates_by_term)) {
    candidates.sort();
    const c_str = candidates.map(c => `'${c}'`).join(', ');
    diagnostics.push(`[Info] Possible Typos for '${term}': Found ${c_str}`);
  }

  // Severity rank: errors first, then warnings, then everything else.
  function diag_sort_key(msg: string) {
    if (msg.startsWith('[Error]')) return 0;
    if (msg.startsWith('[Warning]')) return 1;
    return 2;
  }

  diagnostics.sort((a, b) => {
    const keyA = diag_sort_key(a);
    const keyB = diag_sort_key(b);
    if (keyA !== keyB) return keyA - keyB;
    return a.localeCompare(b);
  });

  return [definitions, diagnostics, raw_anchors];
}
|
|
220
|
+
|
|
221
|
+
/**
 * Renders the read-only structural appendix for a document: a fixed header
 * followed by optional "Defined Terms", "Semantic Diagnostics" and
 * "Named Anchors" sections built from `extract_all_domain_metadata`.
 *
 * @param doc - Document to describe.
 * @param base_text - Projected text passed through to the metadata extractor.
 * @returns The appendix text, or an empty string when all sections are empty.
 */
export function build_structural_appendix(doc: DocumentObject, base_text: string): string {
  const [defs, diagnostics, anchors] = extract_all_domain_metadata(doc, base_text);

  const sections: string[] = [];

  const term_entries = Object.entries(defs);
  if (term_entries.length > 0) {
    sections.push("\n## Defined Terms");
    term_entries.forEach(([term, data]) => {
      sections.push(`- "${term}" \u2014 used ${data.count} times.`);
    });
  }

  if (diagnostics.length > 0) {
    sections.push("\n## Semantic Diagnostics");
    diagnostics.forEach((diag) => {
      sections.push(`- ${diag}`);
    });
  }

  const anchor_entries = Object.entries(anchors);
  if (anchor_entries.length > 0) {
    sections.push("\n## Named Anchors");
    anchor_entries.forEach(([b_name, data]) => {
      sections.push(`- ${b_name} \u2192 Anchored to: "${data.anchored_to}"`);
      data.referenced_from.forEach((ref) => {
        sections.push(` - Referenced from: "${ref}"`);
      });
    });
  }

  // Without any section there is nothing worth appending.
  if (sections.length === 0) {
    return "";
  }

  const header: string[] = [
    "\n\n---",
    "",
    "<!-- READONLY_BOUNDARY_START -->",
    "# Document Structure (Read-Only)",
    "The content below is metadata describing the document's reference structure. Do not include this section in any tracked changes or edits \u2014 it is for your context only and will be discarded on write."
  ];

  return header.concat(sections).join('\n');
}
|
package/src/index.ts
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
*/
|
|
5
|
-
export const identifyEngine = () => 'adeu-core-node';
|
|
1
|
+
/**
 * Returns the identifier string of this engine implementation.
 *
 * @returns Always `'adeu-core-node'`.
 */
export function identifyEngine(): string {
  const engineId = 'adeu-core-node';
  return engineId;
}
|
|
6
4
|
|
|
7
|
-
export { extractTextFromBuffer } from './ingest.js';
|
|
8
5
|
export { DocumentObject } from './docx/bridge.js';
|
|
9
6
|
export { DocumentMapper, TextSpan } from './mapper.js';
|
|
10
7
|
export { RedlineEngine, BatchValidationError } from './engine.js';
|
|
11
|
-
export { generate_edits_from_text, trim_common_context } from './diff.js';
|
|
8
|
+
export { generate_edits_from_text, trim_common_context, create_unified_diff } from './diff.js';
|
|
12
9
|
export { apply_edits_to_markdown } from './markup.js';
|
|
13
10
|
export { paginate, split_structural_appendix, PaginationResult, PageInfo } from './pagination.js';
|
|
14
|
-
export { extract_outline, OutlineNode } from './outline.js';
|
|
11
|
+
export { extract_outline, OutlineNode } from './outline.js';
|
|
12
|
+
export { extractTextFromBuffer } from './ingest.js';
|
|
13
|
+
export { finalize_document, FinalizeOptions, FinalizeResult } from './sanitize/core.js';
|