@adeu/core 1.6.6 → 1.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adeu/core",
3
- "version": "1.6.6",
3
+ "version": "1.6.8",
4
4
  "description": "",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
package/src/diff.ts CHANGED
@@ -248,4 +248,58 @@ export function generate_edits_from_text(original_text: string, modified_text: s
248
248
  }
249
249
 
250
250
  return edits;
251
+ }
252
+
253
/**
 * Renders a line-level, unified-diff-style comparison of two texts.
 *
 * The output starts with the fixed headers `--- Original` / `+++ Modified`, followed by one
 * `@@ ... @@` section per group of changes. Removed lines are prefixed with `-`, added lines
 * with `+`, and unchanged context lines with a single space. Hunk headers do not carry line
 * numbers.
 *
 * @param original_text - The text before the change.
 * @param modified_text - The text after the change.
 * @param context_lines - Maximum number of unchanged lines shown before and after each group of
 *   changes. Unchanged runs of at most `2 * context_lines` lines between two changes are kept
 *   inside the same hunk. Values that are not greater than zero (0, negatives, NaN) mean
 *   "no context".
 * @returns The rendered diff, or an empty string when the diff contains no changes.
 */
export function create_unified_diff(original_text: string, modified_text: string, context_lines: number = 3): string {
  // Clamp the context size. Without this, `slice(-0)` would return the WHOLE preceding block
  // as leading context, and negative values would produce nonsensical slices.
  const ctx = context_lines > 0 ? context_lines : 0;

  // Line-mode diff: encode each line as one character, diff, then expand back to lines.
  const dmp = new diff_match_patch.diff_match_patch();
  const a = dmp.diff_linesToChars_(original_text, modified_text);
  const diffs = dmp.diff_main(a.chars1, a.chars2, false);
  dmp.diff_charsToLines_(diffs, a.lineArray);

  // Split a diff segment into lines, ignoring the single trailing newline of the segment.
  const to_lines = (segment: string): string[] => segment.replace(/\n$/, '').split('\n');

  const output: string[] = ['--- Original', '+++ Modified'];

  let i = 0;
  while (i < diffs.length) {
    // Skip unchanged segments until the next change.
    while (i < diffs.length && diffs[i][0] === 0) i++;
    if (i >= diffs.length) break;

    const chunk: string[] = [];

    // Leading context: the tail of the unchanged segment right before the change.
    if (ctx > 0 && i > 0 && diffs[i - 1][0] === 0) {
      chunk.push(...to_lines(diffs[i - 1][1]).slice(-ctx).map(l => ` ${l}`));
    }

    // Consume changes, merging short unchanged runs into the same hunk.
    while (i < diffs.length) {
      const [op, text] = diffs[i];
      const lines = to_lines(text);

      if (op === 0) {
        // A long unchanged run ends the hunk; its head becomes the trailing context below.
        if (lines.length > ctx * 2) break;
        chunk.push(...lines.map(l => ` ${l}`));
      } else {
        const prefix = op === -1 ? '-' : '+';
        chunk.push(...lines.map(l => `${prefix}${l}`));
      }
      i++;
    }

    // Trailing context: the head of the unchanged segment that ended the hunk.
    if (i < diffs.length && diffs[i][0] === 0) {
      chunk.push(...to_lines(diffs[i][1]).slice(0, ctx).map(l => ` ${l}`));
    }

    output.push('@@ ... @@', ...chunk);
  }

  // Only the two header lines present means no changes were found.
  return output.length === 2 ? '' : output.join('\n');
}
@@ -26,11 +26,12 @@ export class Part {
26
26
  public addRelationship(id: string, type: string, target: string, isExternal: boolean = false) {
27
27
  this.rels.set(id, new Relationship(id, type, target, isExternal));
28
28
 
29
- // If this part represents a .rels file, update the XML directly
30
- if (this._element.tagName === 'Relationships') {
29
+ // Directly append the relationship element to the document structure
30
+ if (this.partname.endsWith('.rels')) {
31
31
  const doc = this._element.ownerDocument;
32
32
  if (doc) {
33
- const relEl = doc.createElement('Relationship');
33
+ // Use strict namespace to ensure it parses successfully on reload
34
+ const relEl = doc.createElementNS('http://schemas.openxmlformats.org/package/2006/relationships', 'Relationship');
34
35
  relEl.setAttribute('Id', id);
35
36
  relEl.setAttribute('Type', type);
36
37
  relEl.setAttribute('Target', target);
@@ -176,6 +177,17 @@ export class DocumentObject {
176
177
  relsPart.addRelationship(id, relType, target, false);
177
178
  }
178
179
 
180
/**
 * Registers an external relationship (e.g. a hyperlink URL) for this document's part.
 *
 * Picks the first `rId<N>` (N starting at 1) not yet present in the part's relationship map,
 * records the relationship on the part, and mirrors it into the part's `.rels` part.
 *
 * @param target - The external target of the relationship.
 * @param relType - The relationship type URI.
 * @returns The relationship id that was allocated.
 */
public relateToExternal(target: string, relType: string): string {
  // Probe rId1, rId2, ... until an unused id is found.
  let counter = 1;
  let id = `rId${counter}`;
  while (this.part.rels.has(id)) {
    counter += 1;
    id = `rId${counter}`;
  }

  const relationship = new Relationship(id, relType, target, true);
  this.part.rels.set(id, relationship);

  // Keep the serialized relationships part in sync with the in-memory map.
  this.pkg.getOrCreateRelsPart(this.part.partname).addRelationship(id, relType, target, true);

  return id;
}
190
+
179
191
  public async save(): Promise<Buffer> {
180
192
  for (const part of this.pkg.parts) {
181
193
  let xmlStr = serializeXml(part._element.ownerDocument || part._element);
@@ -0,0 +1,280 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { createTestDocument, addParagraph } from './test-utils.js';
3
+ import { DocumentObject } from './docx/bridge.js';
4
+ import { extractTextFromBuffer } from './ingest.js';
5
+ import { RedlineEngine, BatchValidationError } from './engine.js';
6
+ import { ModifyText } from './models.js';
7
+ import { split_structural_appendix } from './pagination.js';
8
+
9
/**
 * Appends a `w:bookmarkStart` / `w:bookmarkEnd` pair to a paragraph element.
 * When `text` is non-empty, a run containing that text is placed between the two markers.
 *
 * @param paragraph - The paragraph element that receives the bookmark.
 * @param name - Bookmark name written to `w:name`.
 * @param idVal - Bookmark id written to `w:id` on both markers.
 * @param text - Optional text wrapped by the bookmark.
 */
function addBookmark(paragraph: Element, name: string, idVal: string = "0", text: string = "") {
  const ownerDoc = paragraph.ownerDocument!;

  const startMarker = ownerDoc.createElement('w:bookmarkStart');
  startMarker.setAttribute('w:name', name);
  startMarker.setAttribute('w:id', idVal);
  paragraph.appendChild(startMarker);

  if (text !== "") {
    const run = ownerDoc.createElement('w:r');
    const textEl = ownerDoc.createElement('w:t');
    textEl.textContent = text;
    // Mark the text as whitespace-preserving when it contains a space.
    if (text.includes(' ')) {
      textEl.setAttribute('xml:space', 'preserve');
    }
    run.appendChild(textEl);
    paragraph.appendChild(run);
  }

  const endMarker = ownerDoc.createElement('w:bookmarkEnd');
  endMarker.setAttribute('w:id', idVal);
  paragraph.appendChild(endMarker);
}
29
+
30
/**
 * Appends a `w:fldSimple` REF field pointing at `refName` to a paragraph element,
 * with `text` as the field's displayed run text.
 *
 * @param paragraph - The paragraph element that receives the field.
 * @param refName - Name of the bookmark the field refers to.
 * @param text - Text shown for the cross-reference.
 */
function addCrossReference(paragraph: Element, refName: string, text: string) {
  const ownerDoc = paragraph.ownerDocument!;

  const field = ownerDoc.createElement('w:fldSimple');
  field.setAttribute('w:instr', ` REF ${refName} \\h `);

  const run = ownerDoc.createElement('w:r');
  const textEl = ownerDoc.createElement('w:t');
  textEl.textContent = text;
  // Mark the text as whitespace-preserving when it contains a space.
  if (text.includes(' ')) {
    textEl.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textEl);
  field.appendChild(run);
  paragraph.appendChild(field);
}
42
+
43
/**
 * Appends a `w:hyperlink` element to a paragraph, backed by a new external hyperlink
 * relationship registered on `docObj`.
 *
 * @param docObj - Document on which the external relationship is created.
 * @param paragraph - The paragraph element that receives the hyperlink.
 * @param url - External target of the link.
 * @param text - Text shown for the link.
 */
function addHyperlink(docObj: DocumentObject, paragraph: Element, url: string, text: string) {
  // Register the relationship first; its id is what the hyperlink element points at.
  const relationshipId = docObj.relateToExternal(
    url,
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink'
  );

  const ownerDoc = paragraph.ownerDocument!;
  const link = ownerDoc.createElement('w:hyperlink');
  link.setAttribute('r:id', relationshipId);

  const run = ownerDoc.createElement('w:r');
  const textEl = ownerDoc.createElement('w:t');
  textEl.textContent = text;
  // Mark the text as whitespace-preserving when it contains a space.
  if (text.includes(' ')) {
    textEl.setAttribute('xml:space', 'preserve');
  }

  run.appendChild(textEl);
  link.appendChild(run);
  paragraph.appendChild(link);
}
57
+
58
/**
 * Adds a `/word/footnotes.xml` part to the document's package and relates the document to it.
 * The part contains a separator footnote (id -1) and one regular footnote (id 1) whose text is
 * "Footnote content.".
 *
 * @param docObj - Document that receives the footnotes part.
 */
function setupFootnotesPart(docObj: DocumentObject) {
  const footnotesPartName = '/word/footnotes.xml';
  const footnotesContentType = 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml';
  const footnotesRelType = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes';

  const footnotesXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:type="separator" w:id="-1">
<w:p><w:r><w:separator/></w:r></w:p>
</w:footnote>
<w:footnote w:id="1">
<w:p><w:r><w:t>Footnote content.</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>`;

  const footnotesPart = docObj.pkg.addPart(footnotesPartName, footnotesContentType, footnotesXml);
  docObj.relateTo(footnotesPart, footnotesRelType);
}
76
+
77
/**
 * Builds the fixture document used by the domain-semantics tests and returns the result of
 * saving it. The fixture contains definition paragraphs, a named bookmark with a
 * cross-reference, internal anchors (including names expected to be suppressed), a footnote
 * reference, an external hyperlink, and a cross-reference to an internal `_Ref` anchor.
 */
async function createDomainSemanticsStream() {
  const doc = await createTestDocument();

  // Adds a paragraph and gives it an (empty) paragraph-properties element as first child.
  const addParagraphWithProps = (title: string) => {
    const para = addParagraph(doc, title);
    para.insertBefore(para.ownerDocument!.createElement('w:pPr'), para.firstChild);
    return para;
  };

  // Appends a whitespace-preserving text run to an existing paragraph.
  const appendPreservedRun = (para: ReturnType<typeof addParagraph>, content: string) => {
    const run = para.ownerDocument!.createElement('w:r');
    const textEl = para.ownerDocument!.createElement('w:t');
    textEl.textContent = content;
    textEl.setAttribute('xml:space', 'preserve');
    run.appendChild(textEl);
    para.appendChild(run);
  };

  // Definitions section
  addParagraphWithProps("1. Definitions");
  addParagraph(doc, '"Affiliate" means any entity that controls, is controlled by, or is under common control.');
  addParagraph(doc, "“Confidential Information” means all non-public information disclosed by one party to the other.");
  addParagraph(doc, "This paragraph does not define anything.");

  // Obligations section (uses the defined terms)
  addParagraphWithProps("2. Obligations");
  addParagraph(doc, "The Affiliate shall protect the Confidential Information to the highest standard.");

  // Named bookmark and a cross-reference to it
  const bookmarkedPara = addParagraph(doc, "Subject to ");
  addBookmark(bookmarkedPara, "MyBookmark_1", "1", "Anchored Clause");
  appendPreservedRun(bookmarkedPara, ", the parties agree to...");

  const referencingPara = addParagraph(doc, "As strictly stated in ");
  addCrossReference(referencingPara, "MyBookmark_1", "Anchored Clause");
  appendPreservedRun(referencingPara, ", either party may terminate.");

  // Internal anchors: one `_Ref` anchor plus `_GoBack` / `_Toc` bookmarks
  const anchorPara = addParagraph(doc, "Section 5. Indemnification");
  addBookmark(anchorPara, "_Ref12345", "0");

  const noisePara = addParagraph(doc, "Some text.");
  addBookmark(noisePara, "_GoBack", "2");
  addBookmark(noisePara, "_Toc123456789", "3");

  // Footnote reference plus the footnotes part it points into
  const footnotePara = addParagraph(doc, "Sentence with footnote");
  const footnoteRun = footnotePara.ownerDocument!.createElement('w:r');
  const footnoteRef = footnotePara.ownerDocument!.createElement('w:footnoteReference');
  footnoteRef.setAttribute('w:id', "1");
  footnoteRun.appendChild(footnoteRef);
  footnotePara.appendChild(footnoteRun);
  setupFootnotesPart(doc);

  // External hyperlink and a cross-reference to the internal anchor
  const linkPara = addParagraph(doc, "Please visit ");
  addHyperlink(doc, linkPara, "https://adeu.com", "Adeu HQ");

  const xrefPara = addParagraph(doc, "As detailed in ");
  addCrossReference(xrefPara, "_Ref12345", "Section 5");

  return doc.save();
}
136
+
137
describe('Domain Semantics Engine', () => {
  it('extracts and projects structural appendix and diagnostics correctly', async () => {
    const docxBuffer = await createDomainSemanticsStream();
    const text = await extractTextFromBuffer(docxBuffer);

    // Appendix header, defined terms, named anchors with back-references, internal anchor.
    const presentBeforeNoiseCheck = [
      "<!-- READONLY_BOUNDARY_START -->",
      "# Document Structure (Read-Only)",
      "## Defined Terms",
      '"Affiliate"',
      '"Confidential Information"',
      "used 1 times",
      "## Named Anchors",
      "MyBookmark_1",
      "Anchored to:",
      "Referenced from:",
      "{#_Ref12345}",
      "Section 5. Indemnification{#_Ref12345}",
    ];
    for (const fragment of presentBeforeNoiseCheck) {
      expect(text).toContain(fragment);
    }

    // Noise bookmarks must not be projected as anchors.
    for (const fragment of ["{#_GoBack}", "{#_Toc123456789}"]) {
      expect(text).not.toContain(fragment);
    }

    // Footnotes, hyperlinks and cross-references.
    const presentAfterNoiseCheck = [
      "[^fn-1]",
      "## Footnotes",
      "[^fn-1]: Footnote content.",
      "[Adeu HQ](https://adeu.com)",
      "[~Section 5~](#_Ref12345)",
    ];
    for (const fragment of presentAfterNoiseCheck) {
      expect(text).toContain(fragment);
    }
  });

  // Builds a predicate that is true when the message contains at least one of the needles.
  const mentionsAny = (...needles: string[]) => (msg: string) => needles.some(needle => msg.includes(needle));

  const edgeCases = [
    {
      target: "# Document Structure (Read-Only)",
      newText: "# Modified Document Structure",
      errChecker: mentionsAny('read-only boundary', 'appendix'),
    },
    {
      target: "Sentence with footnote[^fn-1]",
      newText: "Sentence with footnote",
      errChecker: (msg: string) => msg.includes('footnote') && mentionsAny('delete', 'remove')(msg),
    },
    {
      target: "Sentence with footnote",
      newText: "Sentence with footnote[^fn-99]",
      errChecker: (msg: string) => msg.includes('footnote') && mentionsAny('insert', 'create')(msg),
    },
    {
      target: "Some text.",
      newText: "Some text.{#_Ref99999}",
      errChecker: mentionsAny('anchor'),
    },
    {
      target: "Section 5. Indemnification{#_Ref12345}",
      newText: "Section 5. Indemnification{#_Ref99999}",
      errChecker: mentionsAny('anchor'),
    },
    {
      target: "[~Section 5~](#_Ref12345)",
      newText: "[~Section 6~](#_Ref12345)",
      errChecker: mentionsAny('cross-reference', 'rejected'),
    },
    {
      target: "[~Section 5~](#_Ref12345)",
      newText: "[~Section 5~](#_Ref99999)",
      errChecker: mentionsAny('dependency corruption', 'rejected'),
    },
    {
      target: "As detailed in [~Section 5~](#_Ref12345)",
      newText: "As detailed in [~Section 5~](#_Ref12345) and [~Section 6~](#_Ref999)",
      errChecker: mentionsAny('cross-reference', 'read-only'),
    },
    {
      target: "As detailed in [~Section 5~](#_Ref12345)",
      newText: "As detailed in nothing",
      errChecker: mentionsAny('cross-reference', 'delete'),
    },
    {
      target: "Please visit [Adeu HQ](https://adeu.com)",
      newText: "Please visit [Adeu HQ](https://adeu.com) and [Google](https://google.com)",
      errChecker: mentionsAny('hyperlink', 'insert'),
    },
    {
      target: "Please visit [Adeu HQ](https://adeu.com)",
      newText: "Please visit nothing",
      errChecker: mentionsAny('hyperlink', 'delete'),
    },
  ];

  edgeCases.forEach(tc => {
    it(`rejects invalid edits: ${tc.target} -> ${tc.newText}`, async () => {
      const docxBuffer = await createDomainSemanticsStream();
      const doc = await DocumentObject.load(docxBuffer);
      const engine = new RedlineEngine(doc);
      const edit: ModifyText = { type: 'modify', target_text: tc.target, new_text: tc.newText };

      let threw = false;
      let caught: unknown;
      try {
        engine.process_batch([edit]);
      } catch (err) {
        threw = true;
        caught = err;
      }

      if (threw) {
        // Anything other than a validation failure is an unexpected error: let it surface.
        if (!(caught instanceof BatchValidationError)) {
          throw caught;
        }
        const msg = caught.errors.join('\n').toLowerCase();
        expect(tc.errChecker(msg)).toBe(true);
      }
      expect(threw).toBe(true);
    });
  });

  it('safely edits footnotes and accepts changes', async () => {
    const docxBuffer = await createDomainSemanticsStream();
    const doc = await DocumentObject.load(docxBuffer);
    const engine = new RedlineEngine(doc);

    const footnoteEdit: ModifyText = {
      type: 'modify',
      target_text: "Footnote content.",
      new_text: "This is an edited footnote.",
    };
    const stats = engine.process_batch([footnoteEdit]);
    expect(stats.edits_applied).toBe(1);

    engine.accept_all_revisions();
    const savedBuffer = await doc.save();
    const cleanText = await extractTextFromBuffer(savedBuffer, true);

    expect(cleanText).toContain("[^fn-1]: This is an edited footnote.");
  });

  it('extracts defined terms and finds typos correctly', async () => {
    const doc = await createTestDocument();

    // Definition-style paragraphs (the second "Agreement" is a deliberate duplicate).
    const definitionParagraphs = [
      '"Agreement" means this contract.',
      "“Party” shall mean either side.",
      '"Agreement" means another thing.',
      'This contract (hereinafter, the "Contract") is valid.',
      '"Confidential Information" on salainen asia.',
      '1.1 "Affiliate" tarkoittaa osakkuusyhtiötä.',
      'We will act as the disclosing party (jäljempänä "Discloser").',
      'This is a syntax example: ("Heading*") and ("<Term>")',
    ];
    // Usage paragraphs, including the misspelling "Agrement".
    const usageParagraphs = [
      "The Agreement is binding. The Contract is signed.",
      "There is an Agrement here.",
      "We shared Confidential Information with the Affiliate. The Discloser is happy.",
    ];
    for (const paragraphText of [...definitionParagraphs, ...usageParagraphs]) {
      addParagraph(doc, paragraphText);
    }

    const savedBuffer = await doc.save();
    const full_text = await extractTextFromBuffer(savedBuffer, false);
    const appendix = split_structural_appendix(full_text)[1];

    const listedTerms = [
      '"Agreement" \u2014 used',
      '"Contract" \u2014 used',
      '"Confidential Information" \u2014 used',
      '"Affiliate" \u2014 used',
      '"Discloser" \u2014 used',
    ];
    for (const fragment of listedTerms) {
      expect(appendix).toContain(fragment);
    }

    for (const fragment of ['"Party"', '"Heading*"', '"<Term>"']) {
      expect(appendix).not.toContain(fragment);
    }

    expect(appendix).toContain("[Error] Duplicate Definition: 'Agreement' is defined multiple times.");
    expect(appendix).toContain("[Info] Possible Typos for 'Agreement': Found 'Agrement'");
  });

  it('reduces typo noise for short acronyms', async () => {
    const doc = await createTestDocument();

    const paragraphs = [
      '"PSUs" means power supply units.',
      '"CPU" means central processing unit.',
      '"Party" means the entity.',
      "We rely on ESAs, LSPs, and GPUs for the servers.",
      "The GPU is very fast.",
      "The Pary signed the contract.",
      "We bought PSUs and a CPU.",
      "The Party begins today.",
    ];
    for (const paragraphText of paragraphs) {
      addParagraph(doc, paragraphText);
    }

    const savedBuffer = await doc.save();
    const full_text = await extractTextFromBuffer(savedBuffer, false);
    const appendix = split_structural_appendix(full_text)[1];

    expect(appendix).toContain("[Info] Possible Typos for 'Party': Found 'Pary'");
    for (const acronym of ["'GPU'", "'GPUs'", "'ESAs'", "'LSPs'"]) {
      expect(appendix).not.toContain(acronym);
    }
  });
});
package/src/domain.ts CHANGED
@@ -1,11 +1,265 @@
1
- /**
2
- * Lightweight port of domain.py (Semantic Diagnostics & Appendix).
3
- * Uses a simplified heuristic since full rapidfuzz isn't available.
4
- */
5
-
6
- export function build_structural_appendix(doc: any, base_text: string): string {
7
- // To keep the initial ingestion port lean and maintain 100% parity on body text,
8
- // we will return an empty appendix string for now. The python port can be completed
9
- // in a follow-up PR if diagnostics are required in Node MCPs.
10
- return '';
1
+ import { DocumentObject } from './docx/bridge.js';
2
+ import { Paragraph, Run } from './docx/primitives.js';
3
+ import { iter_block_items, get_run_text } from './utils/docx.js';
4
+ import { findAllDescendants } from './docx/dom.js';
5
+
6
/**
 * Computes the Levenshtein edit distance between two strings, capped at `maxDist`.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about.
 * @returns The exact distance when it is at most `maxDist`; otherwise `maxDist + 1`.
 */
function boundedLevenshtein(a: string, b: string, maxDist: number = 2): number {
  const overLimit = maxDist + 1;

  if (a === b) return 0;
  // The length difference is a lower bound on the distance.
  if (Math.abs(a.length - b.length) > maxDist) return overLimit;
  if (a.length === 0) return b.length <= maxDist ? b.length : overLimit;
  if (b.length === 0) return a.length <= maxDist ? a.length : overLimit;

  // Keep the DP row as short as possible: columns follow the shorter string.
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];

  let previous = Array.from({ length: shorter.length + 1 }, (_, k) => k);

  for (let r = 1; r <= longer.length; r++) {
    const current = [r];
    let smallest = r;

    for (let c = 1; c <= shorter.length; c++) {
      const substitution = previous[c - 1] + (shorter[c - 1] === longer[r - 1] ? 0 : 1);
      const cell = Math.min(previous[c] + 1, current[c - 1] + 1, substitution);
      current.push(cell);
      if (cell < smallest) smallest = cell;
    }

    // Every cell in this row already exceeds the cap, so the final distance must too.
    if (smallest > maxDist) return overLimit;
    previous = current;
  }

  const distance = previous[shorter.length];
  return distance <= maxDist ? distance : overLimit;
}
38
+
39
/**
 * Returns the concatenated text of every `w:r` run found beneath the paragraph's element.
 *
 * @param p - The paragraph whose run text is collected.
 */
function _get_paragraph_text(p: Paragraph): string {
  return [...findAllDescendants(p._element, 'w:r')].reduce(
    (collected, runElement) => collected + get_run_text(new Run(runElement, p)),
    ''
  );
}
47
+
48
/**
 * Collects defined terms, semantic diagnostics and named anchors from a document.
 *
 * Per paragraph it:
 *  - extracts defined terms: a quoted, capitalised term at the start of the paragraph
 *    (optionally after a short numbering prefix), or a quoted term inside parentheses;
 *  - records `w:bookmarkStart` names as anchors, skipping names that start with `_` unless
 *    they start with `_Ref`;
 *  - records `REF <name>` field instructions found in `w:fldSimple` / `w:instrText`.
 *
 * Afterwards it counts unquoted usages of each term in `base_text`, drops terms that are never
 * used, reports terms defined more than once, and reports capitalised phrases in `base_text`
 * that are within a small edit distance of a defined term as possible typos.
 *
 * @param doc - The document whose block items are scanned.
 * @param base_text - The projected body text used for usage counting and typo detection.
 * @returns A tuple of: defined terms with usage counts; diagnostics sorted by severity
 *   (`[Error]`, `[Warning]`, then everything else) and then alphabetically; and anchors with
 *   the (truncated) paragraph text they sit in and the paragraphs that reference them.
 */
export function extract_all_domain_metadata(
  doc: DocumentObject,
  base_text: string
): [Record<string, { count: number }>, string[], Record<string, { anchored_to: string; referenced_from: string[] }>] {
  // Terms and bookmark names come straight from the document. A plain `obj[key]` truthiness
  // check would be fooled by inherited members such as "constructor" or "toString": the anchor
  // would never be recorded and a REF to it would crash on `.referenced_from.push`.
  const has_own = (obj: object, key: string): boolean => Object.prototype.hasOwnProperty.call(obj, key);

  const definitions: Record<string, { count: number }> = {};
  const duplicates = new Set<string>();
  const raw_anchors: Record<string, { anchored_to: string; referenced_from: string[] }> = {};
  const raw_references: [string, string][] = [];

  // Quoted term at the start of a paragraph, optionally preceded by a numbering prefix.
  const leading_re = /^(?:[\d.\-()a-zA-Z]+\s*)?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”]/;
  // Quoted term anywhere inside a parenthesised group.
  const inline_re = /\([^)]*?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”][^)]*?\)/g;

  for (const item of iter_block_items(doc)) {
    if (!(item instanceof Paragraph)) continue;

    const text = _get_paragraph_text(item).trim();
    if (!text) continue;

    const extracted_terms: string[] = [];
    const leading_match = text.match(leading_re);
    if (leading_match) extracted_terms.push(leading_match[1].trim());

    for (const m of text.matchAll(inline_re)) {
      extracted_terms.push(m[1].trim());
    }

    // A term seen a second time is a duplicate definition.
    for (const term of extracted_terms) {
      if (has_own(definitions, term)) duplicates.add(term);
      else definitions[term] = { count: 0 };
    }

    // Paragraph preview used to describe where anchors and references live.
    const short_text = text.length > 60 ? text.substring(0, 60) + '...' : text;

    const nodes = findAllDescendants(item._element, '*');
    for (const node of nodes) {
      if (node.tagName === 'w:bookmarkStart') {
        const b_name = node.getAttribute('w:name');
        // Skip underscore-prefixed bookmarks, except `_Ref...` cross-reference targets.
        if (b_name && (!b_name.startsWith('_') || b_name.startsWith('_Ref'))) {
          // The first occurrence of a bookmark name wins.
          if (!has_own(raw_anchors, b_name)) {
            raw_anchors[b_name] = { anchored_to: short_text, referenced_from: [] };
          }
        }
      }

      // A REF field names its target bookmark as the second token of the instruction.
      let target: string | null = null;
      if (node.tagName === 'w:fldSimple') {
        const instr = node.getAttribute('w:instr') || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      } else if (node.tagName === 'w:instrText') {
        const instr = node.textContent || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      }

      if (target) raw_references.push([target, short_text]);
    }
  }

  // Resolve references only after the whole document has been scanned, so that a reference
  // appearing before its anchor is still attached.
  for (const [target, ref_text] of raw_references) {
    if (has_own(raw_anchors, target)) {
      raw_anchors[target].referenced_from.push(ref_text);
    }
  }

  const diagnostics: string[] = [];

  const def_keys = Object.keys(definitions);
  if (def_keys.length > 0) {
    // Longest terms first so the alternation prefers the longest match.
    const sorted_terms = [...def_keys].sort((a, b) => b.length - a.length);
    const escapeRegExp = (str: string) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const alt = sorted_terms.map(escapeRegExp).join('|');
    // Whole-word occurrences that are not wrapped in quotes (i.e. usages, not definitions).
    const usage_pattern = new RegExp(`(?<!["“])\\b(${alt})\\b(?![”"])`, 'g');

    for (const m of base_text.matchAll(usage_pattern)) {
      const matched_term = m[1];
      if (has_own(definitions, matched_term)) definitions[matched_term].count++;
    }

    // Terms that are never used are dropped, together with any duplicate flag.
    for (const term of def_keys) {
      if (definitions[term].count === 0) {
        delete definitions[term];
        duplicates.delete(term);
      }
    }
  }

  for (const term of duplicates) {
    diagnostics.push(`[Error] Duplicate Definition: '${term}' is defined multiple times.`);
  }

  // Leading words stripped from capitalised phrases before comparing them with terms.
  const stop_words = new Set([
    "The", "This", "That", "Such", "A", "An", "Any", "All", "Some", "No",
    "Every", "Each", "As", "In", "Of", "For", "To", "On", "By", "With"
  ]);

  // Runs of capitalised words in the body text are the typo candidates.
  const all_cap_pattern = /\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b/g;
  const all_caps = new Set(base_text.match(all_cap_pattern) || []);

  const valid_terms = new Set(Object.keys(definitions));
  const terms_by_first_letter: Record<string, string[]> = {};
  for (const term of valid_terms) {
    const fl = term[0].toLowerCase();
    if (!terms_by_first_letter[fl]) terms_by_first_letter[fl] = [];
    terms_by_first_letter[fl].push(term);
  }

  const candidates_by_term: Record<string, string[]> = {};

  for (const raw_candidate of all_caps) {
    let candidate = raw_candidate.trim();
    const words = candidate.split(/\s+/);
    while (words.length > 0) {
      const first = words[0];
      const title = first.charAt(0).toUpperCase() + first.slice(1).toLowerCase();
      if (stop_words.has(title)) words.shift();
      else break;
    }
    candidate = words.join(' ');

    if (candidate.length < 4) continue;
    if (valid_terms.has(candidate)) continue;

    // Short candidates are compared only with terms sharing their first letter; longer ones
    // are compared with every term.
    const first_letter = candidate[0].toLowerCase();
    let candidate_terms = terms_by_first_letter[first_letter] || [];

    if (candidate.length > 5) {
      for (const [k, v] of Object.entries(terms_by_first_letter)) {
        if (k !== first_letter) candidate_terms = candidate_terms.concat(v);
      }
    }

    for (const term of candidate_terms) {
      if (Math.abs(candidate.length - term.length) > 2) continue;
      // Simple plural forms are not typos.
      if (candidate === term + 's' || candidate === term + 'es') continue;
      if (term === candidate + 's' || term === candidate + 'es') continue;

      const dist = boundedLevenshtein(candidate, term, 2);
      if (dist === 0 || dist > 2) continue;

      // Short terms need a stricter match: one edit at most and the same first letter.
      if (term.length <= 5) {
        if (dist > 1) continue;
        if (candidate[0].toLowerCase() !== term[0].toLowerCase()) continue;
      }

      if (!candidates_by_term[term]) candidates_by_term[term] = [];
      if (!candidates_by_term[term].includes(candidate)) candidates_by_term[term].push(candidate);
    }
  }

  for (const [term, candidates] of Object.entries(candidates_by_term)) {
    candidates.sort();
    const c_str = candidates.map(c => `'${c}'`).join(', ');
    diagnostics.push(`[Info] Possible Typos for '${term}': Found ${c_str}`);
  }

  // Severity rank used for ordering diagnostics.
  function diag_sort_key(msg: string) {
    if (msg.startsWith('[Error]')) return 0;
    if (msg.startsWith('[Warning]')) return 1;
    return 2;
  }

  diagnostics.sort((a, b) => {
    const keyA = diag_sort_key(a);
    const keyB = diag_sort_key(b);
    if (keyA !== keyB) return keyA - keyB;
    return a.localeCompare(b);
  });

  return [definitions, diagnostics, raw_anchors];
}
220
+
221
/**
 * Builds the read-only "Document Structure" appendix for a document.
 *
 * The appendix lists defined terms with usage counts, semantic diagnostics, and named anchors
 * with the paragraphs referencing them, using the metadata returned by
 * `extract_all_domain_metadata`.
 *
 * @param doc - The document to describe.
 * @param base_text - The projected body text of the document.
 * @returns The appendix text, or an empty string when there are no terms, diagnostics or anchors.
 */
export function build_structural_appendix(doc: DocumentObject, base_text: string): string {
  const [defs, diagnostics, anchors] = extract_all_domain_metadata(doc, base_text);

  const defEntries = Object.entries(defs);
  const anchorEntries = Object.entries(anchors);

  // Nothing to report: emit no appendix at all (not even the header).
  if (defEntries.length === 0 && diagnostics.length === 0 && anchorEntries.length === 0) {
    return "";
  }

  const lines: string[] = [
    "\n\n---",
    "",
    "<!-- READONLY_BOUNDARY_START -->",
    "# Document Structure (Read-Only)",
    "The content below is metadata describing the document's reference structure. Do not include this section in any tracked changes or edits \u2014 it is for your context only and will be discarded on write."
  ];

  if (defEntries.length > 0) {
    lines.push("\n## Defined Terms");
    defEntries.forEach(([term, data]) => {
      lines.push(`- "${term}" \u2014 used ${data.count} times.`);
    });
  }

  if (diagnostics.length > 0) {
    lines.push("\n## Semantic Diagnostics");
    diagnostics.forEach(diag => {
      lines.push(`- ${diag}`);
    });
  }

  if (anchorEntries.length > 0) {
    lines.push("\n## Named Anchors");
    anchorEntries.forEach(([b_name, data]) => {
      lines.push(`- ${b_name} \u2192 Anchored to: "${data.anchored_to}"`);
      data.referenced_from.forEach(ref => {
        lines.push(` - Referenced from: "${ref}"`);
      });
    });
  }

  return lines.join('\n');
}
package/src/index.ts CHANGED
@@ -1,14 +1,13 @@
1
- /**
2
- * @adeu/core
3
- * Cross-platform XML Redlining Engine
4
- */
5
- export const identifyEngine = () => 'adeu-core-node';
1
/**
 * Returns the identifier string of this engine implementation.
 */
export function identifyEngine() {
  const engineId = 'adeu-core-node';
  return engineId;
}
6
4
 
7
- export { extractTextFromBuffer } from './ingest.js';
8
5
  export { DocumentObject } from './docx/bridge.js';
9
6
  export { DocumentMapper, TextSpan } from './mapper.js';
10
7
  export { RedlineEngine, BatchValidationError } from './engine.js';
11
- export { generate_edits_from_text, trim_common_context } from './diff.js';
8
+ export { generate_edits_from_text, trim_common_context, create_unified_diff } from './diff.js';
12
9
  export { apply_edits_to_markdown } from './markup.js';
13
10
  export { paginate, split_structural_appendix, PaginationResult, PageInfo } from './pagination.js';
14
- export { extract_outline, OutlineNode } from './outline.js';
11
+ export { extract_outline, OutlineNode } from './outline.js';
12
+ export { extractTextFromBuffer } from './ingest.js';
13
+ export { finalize_document, FinalizeOptions, FinalizeResult } from './sanitize/core.js';