@adeu/core 1.6.7 → 1.6.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/domain.ts CHANGED
@@ -1,11 +1,265 @@
1
- /**
2
- * Lightweight port of domain.py (Semantic Diagnostics & Appendix).
3
- * Uses a simplified heuristic since full rapidfuzz isn't available.
4
- */
5
-
6
- export function build_structural_appendix(doc: any, base_text: string): string {
7
- // To keep the initial ingestion port lean and maintain 100% parity on body text,
8
- // we will return an empty appendix string for now. The python port can be completed
9
- // in a follow-up PR if diagnostics are required in Node MCPs.
10
- return '';
1
+ import { DocumentObject } from './docx/bridge.js';
2
+ import { Paragraph, Run } from './docx/primitives.js';
3
+ import { iter_block_items, get_run_text } from './utils/docx.js';
4
+ import { findAllDescendants } from './docx/dom.js';
5
+
6
/**
 * Levenshtein edit distance between `a` and `b`, capped at `maxDist`.
 *
 * Returns the exact distance when it is at most `maxDist`; otherwise returns
 * `maxDist + 1` as an "exceeded" sentinel. Only two rows of the dynamic
 * programming table are kept, and the scan stops as soon as every cell in the
 * current row is already over the cap.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about (defaults to 2).
 * @returns The distance, or `maxDist + 1` when the distance is larger than `maxDist`.
 */
function boundedLevenshtein(a: string, b: string, maxDist: number = 2): number {
  const exceeded = maxDist + 1;

  if (a === b) return 0;
  // A length gap larger than the cap can never be bridged within the cap.
  if (Math.abs(a.length - b.length) > maxDist) return exceeded;
  if (a.length === 0) return b.length <= maxDist ? b.length : exceeded;
  if (b.length === 0) return a.length <= maxDist ? a.length : exceeded;

  // Lay the shorter string along the row so the rows stay as small as possible.
  const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
  const width = shorter.length;

  let previous: number[] = [];
  for (let col = 0; col <= width; col++) previous.push(col);

  for (let rowIdx = 1; rowIdx <= longer.length; rowIdx++) {
    const current: number[] = [rowIdx];
    const rowChar = longer[rowIdx - 1];
    let best = rowIdx;

    for (let col = 1; col <= width; col++) {
      const deletion = previous[col] + 1;
      const insertion = current[col - 1] + 1;
      const substitution = previous[col - 1] + (shorter[col - 1] === rowChar ? 0 : 1);
      const cell = Math.min(deletion, insertion, substitution);
      current.push(cell);
      best = Math.min(best, cell);
    }

    // The smallest value in a row never decreases in later rows, so once the
    // whole row is over the cap the final answer must be as well.
    if (best > maxDist) return exceeded;
    previous = current;
  }

  const distance = previous[width];
  return distance <= maxDist ? distance : exceeded;
}
38
+
39
/**
 * Concatenates the text of every `w:r` descendant of a paragraph.
 *
 * Each run element found under `p._element` is wrapped in a `Run` (with `p`
 * as its second constructor argument) and passed to `get_run_text`; the
 * results are appended in the order the elements are returned.
 *
 * @param p - Paragraph whose run text is collected.
 * @returns The concatenated run text (empty string when there are no runs).
 */
function _get_paragraph_text(p: Paragraph): string {
  const runElements = Array.from(findAllDescendants(p._element, 'w:r'));
  return runElements.reduce(
    (collected: string, runElement) => collected + get_run_text(new Run(runElement, p)),
    ''
  );
}
47
+
48
/**
 * Own-property membership test for plain-object dictionaries.
 *
 * A bracket lookup on `{}` also resolves inherited members such as
 * `constructor` or `toString`, so keys that come from document content must
 * be tested with this helper rather than by truthiness of the lookup.
 */
function hasOwnKey(dict: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(dict, key);
}

/**
 * Collects defined terms, semantic diagnostics and bookmark anchors.
 *
 * Every `Paragraph` yielded by `iter_block_items(doc)` is scanned for:
 *  - quoted, capitalised terms at the start of the paragraph or inside a
 *    parenthetical (treated as definitions; a term seen again is a duplicate);
 *  - `w:bookmarkStart` elements (recorded as anchors unless the name starts
 *    with `_` and is not a `_Ref…` name; the first occurrence wins);
 *  - `REF <target>` instructions in `w:fldSimple` / `w:instrText` elements.
 *
 * Usage of each term is then counted in `base_text`; terms with no usage are
 * dropped (together with their duplicate flag). Finally, capitalised phrases
 * in `base_text` that are within a small edit distance of a surviving term
 * are reported as possible typos.
 *
 * @param doc - Document whose block items are scanned.
 * @param base_text - Text used for usage counting and typo detection.
 * @returns A tuple of
 *   1. surviving definitions mapped to their usage count,
 *   2. diagnostic messages ordered `[Error]`, `[Warning]`, then the rest,
 *      with ties broken by `localeCompare`,
 *   3. anchors mapped to the (at most 60 characters plus `...`) paragraph text
 *      they sit in and the paragraph texts that reference them.
 */
export function extract_all_domain_metadata(
  doc: DocumentObject,
  base_text: string
): [Record<string, { count: number }>, string[], Record<string, { anchored_to: string; referenced_from: string[] }>] {
  const definitions: Record<string, { count: number }> = {};
  const duplicates = new Set<string>();
  const raw_anchors: Record<string, { anchored_to: string; referenced_from: string[] }> = {};
  const raw_references: [string, string][] = [];

  // Quoted capitalised term at the start of a paragraph, optionally after a numbering token.
  const leading_re = /^(?:[\d.\-()a-zA-Z]+\s*)?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”]/;
  // Quoted capitalised term inside a parenthetical.
  const inline_re = /\([^)]*?["“]([A-Z][A-Za-z0-9\s\-&'’]{1,60})["”][^)]*?\)/g;

  for (const item of iter_block_items(doc)) {
    if (!(item instanceof Paragraph)) continue;

    const text = _get_paragraph_text(item).trim();
    if (!text) continue;

    const extracted_terms: string[] = [];
    const leading_match = text.match(leading_re);
    if (leading_match) extracted_terms.push(leading_match[1].trim());

    for (const m of text.matchAll(inline_re)) {
      extracted_terms.push(m[1].trim());
    }

    for (const term of extracted_terms) {
      if (hasOwnKey(definitions, term)) duplicates.add(term);
      else definitions[term] = { count: 0 };
    }

    const short_text = text.length > 60 ? text.substring(0, 60) + '...' : text;

    const nodes = findAllDescendants(item._element, '*');
    for (const node of nodes) {
      if (node.tagName === 'w:bookmarkStart') {
        const b_name = node.getAttribute('w:name');
        if (b_name && (!b_name.startsWith('_') || b_name.startsWith('_Ref'))) {
          // Own-property check: a bookmark called e.g. "constructor" must not be
          // mistaken for an already-recorded anchor.
          if (!hasOwnKey(raw_anchors, b_name)) {
            raw_anchors[b_name] = { anchored_to: short_text, referenced_from: [] };
          }
        }
      }

      let target: string | null = null;
      if (node.tagName === 'w:fldSimple') {
        const instr = node.getAttribute('w:instr') || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      } else if (node.tagName === 'w:instrText') {
        const instr = node.textContent || '';
        const parts = instr.trim().split(/\s+/);
        if (parts.length > 1 && parts[0] === 'REF') target = parts[1];
      }

      if (target) raw_references.push([target, short_text]);
    }
  }

  // Resolve references only after the whole document has been scanned, so a
  // REF that appears before its bookmark is still attached.
  for (const [target, ref_text] of raw_references) {
    // Own-property check: a REF to "toString" or "constructor" would otherwise
    // hit an inherited member and throw on `.referenced_from.push`.
    if (hasOwnKey(raw_anchors, target)) {
      raw_anchors[target].referenced_from.push(ref_text);
    }
  }

  const diagnostics: string[] = [];

  const def_keys = Object.keys(definitions);
  if (def_keys.length > 0) {
    // Longest terms first so the alternation prefers the longest match.
    const sorted_terms = [...def_keys].sort((a, b) => b.length - a.length);
    const escapeRegExp = (str: string) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const alt = sorted_terms.map(escapeRegExp).join('|');
    // Occurrences directly wrapped in double quotes are skipped by the lookarounds.
    const usage_pattern = new RegExp(`(?<!["“])\\b(${alt})\\b(?![”"])`, 'g');

    for (const m of base_text.matchAll(usage_pattern)) {
      const matched_term = m[1];
      if (hasOwnKey(definitions, matched_term)) definitions[matched_term].count++;
    }

    for (const term of def_keys) {
      if (definitions[term].count === 0) {
        delete definitions[term];
        duplicates.delete(term);
      }
    }
  }

  for (const term of duplicates) {
    diagnostics.push(`[Error] Duplicate Definition: '${term}' is defined multiple times.`);
  }

  const stop_words = new Set([
    "The", "This", "That", "Such", "A", "An", "Any", "All", "Some", "No",
    "Every", "Each", "As", "In", "Of", "For", "To", "On", "By", "With"
  ]);

  const all_cap_pattern = /\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)*\b/g;
  const all_caps = new Set(base_text.match(all_cap_pattern) || []);

  const valid_terms = new Set(Object.keys(definitions));
  const terms_by_first_letter: Record<string, string[]> = {};
  for (const term of valid_terms) {
    const fl = term[0].toLowerCase();
    if (!terms_by_first_letter[fl]) terms_by_first_letter[fl] = [];
    terms_by_first_letter[fl].push(term);
  }

  const candidates_by_term: Record<string, string[]> = {};

  for (const raw_candidate of all_caps) {
    let candidate = raw_candidate.trim();
    const words = candidate.split(/\s+/);
    // Strip leading stop words ("The Agreement" -> "Agreement").
    while (words.length > 0) {
      const first = words[0];
      const title = first.charAt(0).toUpperCase() + first.slice(1).toLowerCase();
      if (stop_words.has(title)) words.shift();
      else break;
    }
    candidate = words.join(' ');

    if (candidate.length < 4) continue;
    if (valid_terms.has(candidate)) continue;

    const first_letter = candidate[0].toLowerCase();
    let candidate_terms = terms_by_first_letter[first_letter] || [];

    // Longer candidates are also compared against terms with a different first letter.
    if (candidate.length > 5) {
      for (const [k, v] of Object.entries(terms_by_first_letter)) {
        if (k !== first_letter) candidate_terms = candidate_terms.concat(v);
      }
    }

    for (const term of candidate_terms) {
      if (Math.abs(candidate.length - term.length) > 2) continue;
      // Simple plural forms are not typos.
      if (candidate === term + 's' || candidate === term + 'es') continue;
      if (term === candidate + 's' || term === candidate + 'es') continue;

      const dist = boundedLevenshtein(candidate, term, 2);
      if (dist === 0 || dist > 2) continue;

      // Short terms need a closer match and the same first letter.
      if (term.length <= 5) {
        if (dist > 1) continue;
        if (candidate[0].toLowerCase() !== term[0].toLowerCase()) continue;
      }

      if (!candidates_by_term[term]) candidates_by_term[term] = [];
      if (!candidates_by_term[term].includes(candidate)) candidates_by_term[term].push(candidate);
    }
  }

  for (const [term, candidates] of Object.entries(candidates_by_term)) {
    candidates.sort();
    const c_str = candidates.map(c => `'${c}'`).join(', ');
    diagnostics.push(`[Info] Possible Typos for '${term}': Found ${c_str}`);
  }

  function diag_sort_key(msg: string) {
    if (msg.startsWith('[Error]')) return 0;
    if (msg.startsWith('[Warning]')) return 1;
    return 2;
  }

  diagnostics.sort((a, b) => {
    const keyA = diag_sort_key(a);
    const keyB = diag_sort_key(b);
    if (keyA !== keyB) return keyA - keyB;
    return a.localeCompare(b);
  });

  return [definitions, diagnostics, raw_anchors];
}
220
+
221
/**
 * Renders the read-only "Document Structure" appendix for a document.
 *
 * The metadata comes from `extract_all_domain_metadata(doc, base_text)`. When
 * it yields no defined terms, no diagnostics and no anchors, the result is the
 * empty string; otherwise the result is a fixed header followed by one
 * section per non-empty category, joined with newlines.
 *
 * @param doc - Document passed through to the metadata extraction.
 * @param base_text - Text passed through to the metadata extraction.
 * @returns The appendix text, or `""` when there is nothing to report.
 */
export function build_structural_appendix(doc: DocumentObject, base_text: string): string {
  const [termTable, messages, anchorTable] = extract_all_domain_metadata(doc, base_text);

  const termEntries = Object.entries(termTable);
  const anchorEntries = Object.entries(anchorTable);

  const nothingToReport =
    termEntries.length === 0 && messages.length === 0 && anchorEntries.length === 0;
  if (nothingToReport) {
    return "";
  }

  const out: string[] = [
    "\n\n---",
    "",
    "<!-- READONLY_BOUNDARY_START -->",
    "# Document Structure (Read-Only)",
    "The content below is metadata describing the document's reference structure. Do not include this section in any tracked changes or edits \u2014 it is for your context only and will be discarded on write."
  ];

  if (termEntries.length !== 0) {
    out.push("\n## Defined Terms");
    termEntries.forEach(([term, info]) => {
      out.push(`- "${term}" \u2014 used ${info.count} times.`);
    });
  }

  if (messages.length !== 0) {
    out.push("\n## Semantic Diagnostics");
    messages.forEach((message) => {
      out.push(`- ${message}`);
    });
  }

  if (anchorEntries.length !== 0) {
    out.push("\n## Named Anchors");
    anchorEntries.forEach(([anchorName, info]) => {
      out.push(`- ${anchorName} \u2192 Anchored to: "${info.anchored_to}"`);
      info.referenced_from.forEach((source) => {
        out.push(` - Referenced from: "${source}"`);
      });
    });
  }

  return out.join('\n');
}