@adeu/core 1.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3627 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +247 -0
- package/dist/index.d.ts +247 -0
- package/dist/index.js +3579 -0
- package/dist/index.js.map +1 -0
- package/package.json +38 -0
- package/src/comments.test.ts +38 -0
- package/src/comments.ts +451 -0
- package/src/diff.test.ts +62 -0
- package/src/diff.ts +251 -0
- package/src/docx/bridge.ts +189 -0
- package/src/docx/dom.ts +54 -0
- package/src/docx/primitives.ts +65 -0
- package/src/domain.ts +11 -0
- package/src/engine.atomic.test.ts +58 -0
- package/src/engine.batch.test.ts +93 -0
- package/src/engine.safety.test.ts +42 -0
- package/src/engine.tables.test.ts +166 -0
- package/src/engine.ts +735 -0
- package/src/index.test.ts +8 -0
- package/src/index.ts +14 -0
- package/src/ingest.test.ts +44 -0
- package/src/ingest.ts +400 -0
- package/src/mapper.test.ts +66 -0
- package/src/mapper.ts +835 -0
- package/src/markup.test.ts +150 -0
- package/src/markup.ts +323 -0
- package/src/models.ts +51 -0
- package/src/outline.ts +377 -0
- package/src/pagination.ts +239 -0
- package/src/test-utils.ts +142 -0
- package/src/utils/docx.ts +478 -0
- package/tsconfig.json +21 -0
- package/tsup.config.ts +10 -0
- package/vitest.config.ts +12 -0
package/src/diff.ts
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import diff_match_patch from 'diff-match-patch';
|
|
2
|
+
import { ModifyText } from './models.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Measures how much unchanged text surrounds the differing middle of two strings.
 *
 * The raw common prefix/suffix is shrunk by several heuristics so that the
 * remaining "changed" span stays well-formed:
 *  - a cut never lands in the middle of a word (both sides are backed off to
 *    a whitespace boundary);
 *  - a cut never splits a `**` / `__` marker, and never leaves an unbalanced
 *    `**`, `__` or single `_` marker inside the trimmed context;
 *  - the prefix never swallows part of a line that contains a `#` (Markdown
 *    header marker); it is backed off to the start of that line;
 *  - a suffix consisting solely of whitespace is discarded;
 *  - if what is left on both sides is wrapped in the same marker, the marker
 *    itself is absorbed into the context.
 *
 * @param target  The existing text.
 * @param new_val The replacement text.
 * @returns `[prefix_len, suffix_len]`: number of leading and trailing
 *          characters shared by both strings after the adjustments above.
 *          `[0, 0]` when either string is empty.
 */
export function trim_common_context(target: string, new_val: string): [number, number] {
  if (!target || !new_val) return [0, 0];

  const isSpace = (char: string) => /\s/.test(char);

  // True when text[i] is a `_` with no `_` directly before or after it.
  const isLoneUnderscore = (text: string, i: number) =>
    text[i] === '_' &&
    (i === 0 || text[i - 1] !== '_') &&
    (i === text.length - 1 || text[i + 1] !== '_');

  // 1. Prefix with Word Boundary Check
  let prefix_len = 0;
  const limit = Math.min(target.length, new_val.length);
  while (prefix_len < limit && target[prefix_len] === new_val[prefix_len]) {
    prefix_len++;
  }

  // Backtrack to nearest whitespace if we split a word
  if (prefix_len < target.length && prefix_len < new_val.length) {
    while (prefix_len > 0) {
      const target_split = !isSpace(target[prefix_len - 1]) && !isSpace(target[prefix_len]);
      const new_split = !isSpace(new_val[prefix_len - 1]) && !isSpace(new_val[prefix_len]);
      if (!target_split && !new_split) break;
      prefix_len--;
    }
  }

  // Backtrack prefix to avoid splitting markdown markers
  while (prefix_len > 0) {
    if (prefix_len < target.length) {
      const charSeq = target.substring(prefix_len - 1, prefix_len + 1);
      if (charSeq === '**' || charSeq === '__') {
        prefix_len--;
        continue;
      }
    }

    const left = target.substring(0, prefix_len);
    const b_count = (left.match(/\*\*/g) || []).length;
    const u2_count = (left.match(/__/g) || []).length;
    const u1_count = (left.replace(/__/g, '').match(/_/g) || []).length;

    if (b_count % 2 !== 0) {
      prefix_len = left.lastIndexOf('**');
      continue;
    }
    if (u2_count % 2 !== 0) {
      prefix_len = left.lastIndexOf('__');
      continue;
    }
    if (u1_count % 2 !== 0) {
      let idx = left.length - 1;
      while (idx >= 0 && !isLoneUnderscore(left, idx)) idx--;
      if (idx >= 0) {
        prefix_len = idx;
        continue;
      }
      // No isolated `_` exists: the odd count comes from a run of three or
      // more underscores (e.g. a "_____" blank), not from an open emphasis
      // marker. Fall through instead of retrying with an unchanged
      // prefix_len, which would loop forever.
    }

    // Safety: Backtrack if we consumed a Markdown Header marker (#)
    let temp_len = prefix_len;
    let hit_header = false;
    while (temp_len > 0) {
      const char = target[temp_len - 1];
      if (char === '#') {
        // Move the prefix back to the start of the line holding the `#`.
        prefix_len = temp_len - 1;
        while (prefix_len > 0 && target[prefix_len - 1] !== '\n') {
          prefix_len--;
        }
        hit_header = true;
        break;
      }
      if (char === '\n') break;
      temp_len--;
    }
    if (hit_header) continue;

    break;
  }

  // 2. Suffix with Word Boundary Check
  let suffix_len = 0;
  const target_rem_len = target.length - prefix_len;
  const new_rem_len = new_val.length - prefix_len;
  // The suffix may not overlap the prefix in either string.
  const limit_suffix = Math.min(target_rem_len, new_rem_len);

  while (
    suffix_len < limit_suffix &&
    target[target.length - 1 - suffix_len] === new_val[new_val.length - 1 - suffix_len]
  ) {
    suffix_len++;
  }

  // Shrink the suffix until its left edge sits on a whitespace boundary.
  while (suffix_len > 0) {
    let target_split = false;
    if (suffix_len < target.length) {
      target_split =
        !isSpace(target[target.length - 1 - suffix_len]) && !isSpace(target[target.length - suffix_len]);
    }
    let new_split = false;
    if (suffix_len < new_val.length) {
      new_split =
        !isSpace(new_val[new_val.length - 1 - suffix_len]) && !isSpace(new_val[new_val.length - suffix_len]);
    }
    if (!target_split && !new_split) break;
    suffix_len--;
  }

  // Shrink the suffix so it neither splits nor unbalances a markdown marker.
  while (suffix_len > 0) {
    const idx = target.length - suffix_len;
    if (idx > 0) {
      const charSeq = target.substring(idx - 1, idx + 1);
      if (charSeq === '**' || charSeq === '__') {
        suffix_len--;
        continue;
      }
    }

    const right = target.substring(target.length - suffix_len);
    const b_count = (right.match(/\*\*/g) || []).length;
    const u2_count = (right.match(/__/g) || []).length;
    const u1_count = (right.replace(/__/g, '').match(/_/g) || []).length;

    if (b_count % 2 !== 0) {
      suffix_len -= right.indexOf('**') + 2;
      continue;
    }
    if (u2_count % 2 !== 0) {
      suffix_len -= right.indexOf('__') + 2;
      continue;
    }
    if (u1_count % 2 !== 0) {
      let idx_in_right = 0;
      while (idx_in_right < right.length && !isLoneUnderscore(right, idx_in_right)) idx_in_right++;
      if (idx_in_right < right.length) {
        suffix_len -= idx_in_right + 1;
        continue;
      }
      // Same situation as in the prefix pass: only runs of underscores, no
      // isolated marker. Stop instead of spinning on an unchanged suffix_len.
    }
    break;
  }

  // A suffix made only of whitespace carries no useful context.
  if (suffix_len > 0 && /^\s+$/.test(target.substring(target.length - suffix_len))) {
    suffix_len = 0;
  }

  // Absorb balanced wrappers
  for (const marker of ['**', '__', '_']) {
    const mlen = marker.length;
    const tgt_rem = target.substring(prefix_len, target.length - suffix_len);
    const new_rem = new_val.substring(prefix_len, new_val.length - suffix_len);

    if (
      tgt_rem.startsWith(marker) && new_rem.startsWith(marker) &&
      tgt_rem.endsWith(marker) && new_rem.endsWith(marker) &&
      tgt_rem.length >= 2 * mlen && new_rem.length >= 2 * mlen
    ) {
      prefix_len += mlen;
      suffix_len += mlen;
    }
  }

  return [prefix_len, suffix_len];
}
|
|
176
|
+
|
|
177
|
+
/**
 * Encodes two texts as strings of single UTF-16 code units, one unit per
 * token, so that a character-level diff of the encodings is a token-level
 * diff of the originals.
 *
 * Tokens are runs of whitespace, runs of letters/digits/underscore, or a
 * single other character. Both texts share one token table.
 *
 * NOTE: `String.fromCharCode` keeps only the low 16 bits of its argument, so
 * inputs with more than 65,536 distinct tokens cannot be represented
 * faithfully by this encoding.
 *
 * @param text1 First text to encode.
 * @param text2 Second text to encode.
 * @returns `[encoded1, encoded2, token_array]` where `token_array[code]` is
 *          the token represented by the code unit `code`.
 */
function _words_to_chars(text1: string, text2: string): [string, string, string[]] {
  const token_array: string[] = [];
  // A Map (not a plain object) so that tokens such as "constructor",
  // "toString" or "__proto__" are not mistaken for already-registered
  // entries through Object.prototype.
  const token_codes = new Map<string, number>();

  // RegExp equivalent to Python's r"(\s+|\w+|[^\w\s])" with unicode support
  const split_pattern = /(\s+|[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s])/gu;

  const encode_text = (text: string): string => {
    let encoded_chars = '';
    // The capture group makes split() return the delimiters (the tokens)
    // themselves; the empty strings between them are skipped.
    for (const token of text.split(split_pattern)) {
      if (!token) continue;
      let code = token_codes.get(token);
      if (code === undefined) {
        code = token_array.length;
        token_codes.set(token, code);
        token_array.push(token);
      }
      encoded_chars += String.fromCharCode(code);
    }
    return encoded_chars;
  };

  return [encode_text(text1), encode_text(text2), token_array];
}
|
|
203
|
+
|
|
204
|
+
/**
 * Produces the list of `ModifyText` edits that turn `original_text` into
 * `modified_text`, using a token-level diff.
 *
 * A deletion immediately followed by an insertion is reported as a single
 * replacement; a lone deletion has an empty `new_text`, and a lone insertion
 * has an empty `target_text`. `_match_start_index` is the offset into
 * `original_text` where the edit applies.
 *
 * @param original_text Text before the change.
 * @param modified_text Text after the change.
 * @returns Edits in document order.
 */
export function generate_edits_from_text(original_text: string, modified_text: string): ModifyText[] {
  const differ = new diff_match_patch.diff_match_patch();

  const [encoded_original, encoded_modified, tokens] = _words_to_chars(original_text, modified_text);
  const diffs = differ.diff_main(encoded_original, encoded_modified, false);
  differ.diff_cleanupSemantic(diffs);

  // Expand every encoded code unit back into its token, in place, rather
  // than relying on the library's private diff_charsToLines_ helper.
  for (const diff of diffs) {
    const encoded = diff[1];
    let decoded = '';
    for (let k = 0; k < encoded.length; k++) {
      decoded += tokens[encoded.charCodeAt(k)];
    }
    diff[1] = decoded;
  }

  const deletion_edit = (start: number, removed: string): ModifyText => ({
    type: 'modify',
    target_text: removed,
    new_text: '',
    comment: 'Diff: Text deleted',
    _match_start_index: start,
  });

  const edits: ModifyText[] = [];
  let cursor = 0; // offset into original_text
  let pending: [number, string] | null = null; // deletion waiting for a possible insertion

  for (const [op, text] of diffs) {
    switch (op) {
      case 0: // Equal
        if (pending) {
          edits.push(deletion_edit(pending[0], pending[1]));
          pending = null;
        }
        cursor += text.length;
        break;

      case -1: // Delete
        pending = [cursor, text];
        cursor += text.length;
        break;

      case 1: // Insert
        if (pending) {
          edits.push({
            type: 'modify',
            target_text: pending[1],
            new_text: text,
            comment: 'Diff: Replacement',
            _match_start_index: pending[0],
          });
          pending = null;
        } else {
          edits.push({
            type: 'modify',
            target_text: '',
            new_text: text,
            comment: 'Diff: Text inserted',
            _match_start_index: cursor,
          });
        }
        break;
    }
  }

  // A deletion at the very end has no following segment to flush it.
  if (pending) {
    edits.push(deletion_edit(pending[0], pending[1]));
  }

  return edits;
}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
import JSZip from 'jszip';
|
|
2
|
+
import { parseXml, findChild, findAllDescendants, serializeXml } from './dom.js';
|
|
3
|
+
|
|
4
|
+
/**
 * One entry of a part's relationship table: an id, a relationship type, the
 * target it points at, and whether that target lies outside the package.
 */
export class Relationship {
  public id: string;
  public type: string;
  public target: string;
  public isExternal: boolean;

  constructor(id: string, type: string, target: string, isExternal: boolean) {
    this.id = id;
    this.type = type;
    this.target = target;
    this.isExternal = isExternal;
  }
}
|
|
12
|
+
|
|
13
|
+
/**
 * A single XML part of the package: its name, original text, parsed root
 * element, content type, and an in-memory relationship table.
 */
export class Part {
  public partname: string;
  public blob: string;
  public contentType: string;
  public rels: Map<string, Relationship>;
  public _element: Element;

  constructor(partname: string, blob: string, element: Element, contentType: string) {
    this.partname = partname;
    this.blob = blob;
    this.contentType = contentType;
    this.rels = new Map();
    this._element = element;
  }

  /**
   * Registers a relationship in `rels`. When this part's root element is
   * `Relationships` (i.e. the part is itself a .rels file), a matching
   * `Relationship` element is also appended to the XML.
   *
   * @param id         Relationship id (e.g. `rId5`).
   * @param type       Relationship type URI.
   * @param target     Relationship target.
   * @param isExternal When true, `TargetMode="External"` is written.
   */
  public addRelationship(id: string, type: string, target: string, isExternal: boolean = false) {
    this.rels.set(id, new Relationship(id, type, target, isExternal));

    // Only .rels parts mirror the relationship into their XML.
    if (this._element.tagName !== 'Relationships') return;

    const ownerDoc = this._element.ownerDocument;
    if (!ownerDoc) return;

    const node = ownerDoc.createElement('Relationship');
    node.setAttribute('Id', id);
    node.setAttribute('Type', type);
    node.setAttribute('Target', target);
    if (isExternal) {
      node.setAttribute('TargetMode', 'External');
    }
    this._element.appendChild(node);
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * In-memory view of a DOCX package: the underlying zip plus every XML part
 * that has been loaded or added.
 */
export class DocxPackage {
  public parts: Part[] = [];
  public mainDocumentPart!: Part;

  constructor(public zip: JSZip) {}

  /**
   * Looks a part up by name; the name matches with or without a leading slash.
   *
   * @returns The part, or `undefined` when none has that name.
   */
  public getPartByPath(path: string): Part | undefined {
    // Strip leading slash for jszip compat
    const searchPath = path.startsWith('/') ? path.substring(1) : path;
    return this.parts.find((p) => p.partname === searchPath || p.partname === '/' + searchPath);
  }

  /**
   * Returns the first unused part name produced from `pattern`, where `%d`
   * is replaced by nothing for the first candidate and by 2, 3, ... afterwards.
   *
   * @param pattern Part name template, e.g. `/word/comments%d.xml`.
   * @throws Error when `pattern` has no `%d` placeholder and that exact part
   *         already exists (no other candidate can be generated).
   */
  public nextPartname(pattern: string): string {
    if (!pattern.includes('%d')) {
      // Without a placeholder every candidate is identical, so searching
      // for a free one would never terminate.
      if (this.getPartByPath(pattern)) {
        throw new Error(`Part "${pattern}" already exists and the pattern has no %d placeholder`);
      }
      return pattern;
    }

    for (let i = 1; ; i++) {
      const candidate = pattern.replace('%d', i === 1 ? '' : i.toString());
      if (!this.getPartByPath(candidate)) return candidate;
    }
  }

  /**
   * Parses `xmlString`, registers it as a new part and, when a
   * `[Content_Types].xml` part is loaded, appends an `Override` entry for it.
   *
   * @returns The newly created part.
   */
  public addPart(partname: string, contentType: string, xmlString: string): Part {
    const doc = parseXml(xmlString);
    const part = new Part(partname, xmlString, doc.documentElement, contentType);
    this.parts.push(part);

    // Update [Content_Types].xml
    const ctPart = this.getPartByPath('[Content_Types].xml');
    if (ctPart) {
      const docCT = ctPart._element.ownerDocument;
      if (docCT) {
        const override = docCT.createElement('Override');
        // OPC part names always start with "/"; add it if the caller omitted it.
        override.setAttribute('PartName', partname.startsWith('/') ? partname : '/' + partname);
        override.setAttribute('ContentType', contentType);
        ctPart._element.appendChild(override);
      }
    }
    return part;
  }

  /**
   * Returns the .rels part that belongs to `sourcePartname`, creating an
   * empty one when it does not exist yet.
   */
  public getOrCreateRelsPart(sourcePartname: string): Part {
    // e.g., /word/document.xml -> /word/_rels/document.xml.rels
    const parts = sourcePartname.split('/');
    const file = parts.pop();
    const relsPath = parts.join('/') + '/_rels/' + file + '.rels';

    let relsPart = this.getPartByPath(relsPath);
    if (!relsPart) {
      const xml = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>`;
      relsPart = this.addPart(relsPath, 'application/vnd.openxmlformats-package.relationships+xml', xml);
    }
    return relsPart;
  }
}
|
|
98
|
+
|
|
99
|
+
/**
 * Wrapper around the main document part of a DOCX package.
 */
export class DocumentObject {
  public part: Part;
  public settings: { oddAndEvenPagesHeaderFooter: boolean } = { oddAndEvenPagesHeaderFooter: false };
  // Simplification for the TS port: sections hold header/footer refs
  public sections: any[] = [];

  constructor(public pkg: DocxPackage, part: Part) {
    this.part = part;
  }

  /** The `w:body` child of the main part's root, or the root itself when there is none. */
  public get element(): Element {
    return findChild(this.part._element, 'w:body') || this.part._element;
  }

  /**
   * Main entrypoint for loading a DOCX buffer into the DOM wrapper.
   *
   * Every `.xml` / `.rels` entry of the zip is parsed up front, and the
   * relationships of `word/document.xml` are read into the main part.
   *
   * @throws Error when the archive has no `word/document.xml`.
   */
  public static async load(buffer: Buffer | ArrayBuffer): Promise<DocumentObject> {
    const zip = await JSZip.loadAsync(buffer);
    const pkg = new DocxPackage(zip);

    // 1. Load Content Types (only <Override> entries are consulted)
    const ctFile = zip.file('[Content_Types].xml');
    const contentTypes: Record<string, string> = {};
    if (ctFile) {
      const ctXml = parseXml(await ctFile.async('text'));
      const overrides = findAllDescendants(ctXml.documentElement, 'Override');
      for (const override of overrides) {
        contentTypes[override.getAttribute('PartName') || ''] = override.getAttribute('ContentType') || '';
      }
    }

    // 2. Pre-load all XML parts to allow synchronous traversal later
    for (const [path, file] of Object.entries(zip.files)) {
      if (!file.dir && (path.endsWith('.xml') || path.endsWith('.rels'))) {
        const text = await file.async('text');
        const doc = parseXml(text);
        const cType = contentTypes['/' + path] || 'application/xml';
        const part = new Part('/' + path, text, doc.documentElement, cType);
        pkg.parts.push(part);
      }
    }

    // 3. Resolve Relationships for the main document
    const mainPart = pkg.getPartByPath('word/document.xml');
    if (!mainPart) throw new Error('Invalid DOCX: Missing word/document.xml');
    pkg.mainDocumentPart = mainPart;

    const relsPart = pkg.getPartByPath('word/_rels/document.xml.rels');
    if (relsPart) {
      const relElements = findAllDescendants(relsPart._element, 'Relationship');
      for (const rel of relElements) {
        const rId = rel.getAttribute('Id');
        const target = rel.getAttribute('Target');
        const type = rel.getAttribute('Type');
        const targetMode = rel.getAttribute('TargetMode');

        // Entries missing any of the three mandatory attributes are skipped.
        if (rId && target && type) {
          mainPart.rels.set(rId, new Relationship(rId, type, target, targetMode === 'External'));
        }
      }
    }

    return new DocumentObject(pkg, mainPart);
  }

  /**
   * Path of `targetPartname` relative to the directory of `sourcePartname`.
   * Same directory -> "comments.xml"; sub-directory -> "media/x.png";
   * elsewhere -> "../customXml/item1.xml".
   */
  private static relativeTarget(sourcePartname: string, targetPartname: string): string {
    const sourceDir = sourcePartname.split('/').filter(Boolean).slice(0, -1);
    const targetSegments = targetPartname.split('/').filter(Boolean);

    // Count leading directory segments shared by both paths (never the file name).
    let shared = 0;
    while (
      shared < sourceDir.length &&
      shared < targetSegments.length - 1 &&
      sourceDir[shared] === targetSegments[shared]
    ) {
      shared++;
    }

    const climbs = sourceDir.slice(shared).map(() => '..');
    return [...climbs, ...targetSegments.slice(shared)].join('/');
  }

  /**
   * Adds an internal relationship of type `relType` from the main document
   * part to `part`, using the lowest unused `rId<n>`, and mirrors it into
   * the main part's .rels file (created on demand).
   */
  public relateTo(part: Part, relType: string) {
    let rId = 1;
    while (this.part.rels.has(`rId${rId}`)) rId++;
    const id = `rId${rId}`;

    // In DOCX, targets in .rels are relative to the source part's directory.
    // /word/document.xml relating to /word/comments.xml -> target is "comments.xml"
    const target = DocumentObject.relativeTarget(this.part.partname, part.partname);

    this.part.rels.set(id, new Relationship(id, relType, target, false));
    const relsPart = this.pkg.getOrCreateRelsPart(this.part.partname);
    relsPart.addRelationship(id, relType, target, false);
  }

  /**
   * Serializes every loaded part back into the zip and returns the archive
   * as a Node buffer.
   */
  public async save(): Promise<Buffer> {
    for (const part of this.pkg.parts) {
      let xmlStr = serializeXml(part._element.ownerDocument || part._element);
      if (!xmlStr.startsWith('<?xml')) {
        xmlStr = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n' + xmlStr;
      }
      // JSZip entry names have no leading slash; only strip one that is
      // actually there so slash-less part names are not truncated.
      const entryName = part.partname.startsWith('/') ? part.partname.substring(1) : part.partname;
      this.pkg.zip.file(entryName, xmlStr);
    }
    return this.pkg.zip.generateAsync({ type: 'nodebuffer' });
  }
}
|
package/src/docx/dom.ts
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { DOMParser, XMLSerializer } from '@xmldom/xmldom';
|
|
2
|
+
|
|
3
|
+
/**
 * Stand-in for docx.oxml.ns.qn: returns the qualified name unchanged.
 * In xmldom, namespaces are preserved in tagName, so no translation is done.
 */
export function qn(name: string): string {
  return name;
}
|
|
7
|
+
|
|
8
|
+
/**
 * Counterpart of lxml's element.find("w:tag"): returns the first DIRECT child
 * element whose tagName equals `tagName`, or null when there is none.
 * Non-element nodes (text, comments, ...) are ignored.
 */
export function findChild(element: Element, tagName: string): Element | null {
  const kids = element.childNodes;
  let index = 0;
  while (index < kids.length) {
    const candidate = kids[index];
    index += 1;
    if (candidate.nodeType !== 1 /* ELEMENT_NODE */) continue;
    if ((candidate as Element).tagName === tagName) {
      return candidate as Element;
    }
  }
  return null;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Counterpart of lxml's element.findall("w:tag"): collects, in document order,
 * every DIRECT child element whose tagName equals `tagName`.
 */
export function findChildren(element: Element, tagName: string): Element[] {
  const matches: Element[] = [];
  const kids = element.childNodes;
  let index = 0;
  while (index < kids.length) {
    const candidate = kids[index];
    index += 1;
    const isMatch = candidate.nodeType === 1 && (candidate as Element).tagName === tagName;
    if (isMatch) {
      matches.push(candidate as Element);
    }
  }
  return matches;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Counterpart of lxml's element.findall(".//w:tag"): returns ALL descendants
 * with the given tag name as a plain array (a snapshot of the node list).
 */
export function findAllDescendants(element: Element, tagName: string): Element[] {
  const matches = element.getElementsByTagName(tagName);
  return Array.from(matches);
}
|
|
41
|
+
|
|
42
|
+
/**
 * Parses a raw XML string into an xmldom Document (MIME type `text/xml`).
 */
export function parseXml(xmlString: string): Document {
  const parser = new DOMParser();
  return parser.parseFromString(xmlString, 'text/xml');
}
|
|
48
|
+
|
|
49
|
+
/**
 * Turns an xmldom Document or Element back into its XML string form.
 */
export function serializeXml(node: Node): string {
  const serializer = new XMLSerializer();
  return serializer.serializeToString(node);
}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { findChild } from './dom.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Thin wrapper around a paragraph element and the object that owns it.
 */
export class Paragraph {
  public _element: Element;
  public _parent: any;

  constructor(_element: Element, _parent: any) {
    this._element = _element;
    this._parent = _parent;
  }

  /** Concatenated text content of every `w:t` descendant, in document order. */
  get text(): string {
    const textNodes = this._element.getElementsByTagName('w:t');
    const pieces: string[] = [];
    for (let k = 0; k < textNodes.length; k++) {
      pieces.push(textNodes[k].textContent || '');
    }
    return pieces.join('');
  }
}
|
|
15
|
+
|
|
16
|
+
/**
 * Thin wrapper pairing a run element with the object that owns it.
 */
export class Run {
  public _element: Element;
  public _parent: any;

  constructor(_element: Element, _parent: any) {
    this._element = _element;
    this._parent = _parent;
  }
}
|
|
19
|
+
|
|
20
|
+
/**
 * Thin wrapper pairing a table-cell element with the object that owns it.
 */
export class Cell {
  public _element: Element;
  public _parent: any;

  constructor(_element: Element, _parent: any) {
    this._element = _element;
    this._parent = _parent;
  }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Wrapper around a table-row element that eagerly builds its `cells`.
 *
 * NOTE: cells are collected with a descendant search for `w:tc`, so cells of
 * any table nested inside this row are included as well.
 */
export class Row {
  public _element: Element;
  public _parent: any;
  public cells: Cell[];

  constructor(_element: Element, _parent: any) {
    this._element = _element;
    this._parent = _parent;
    this.cells = [];

    const cellElements = this._element.getElementsByTagName('w:tc');
    let k = 0;
    while (k < cellElements.length) {
      this.cells.push(new Cell(cellElements[k], this));
      k += 1;
    }
  }
}
|
|
33
|
+
|
|
34
|
+
/**
 * Wrapper around a table element that eagerly builds its `rows`.
 *
 * NOTE: rows are collected with a descendant search for `w:tr`, so rows of
 * any table nested inside this one are included as well.
 */
export class Table {
  public _element: Element;
  public _parent: any;
  public rows: Row[];

  constructor(_element: Element, _parent: any) {
    this._element = _element;
    this._parent = _parent;
    this.rows = [];

    const rowElements = this._element.getElementsByTagName('w:tr');
    let k = 0;
    while (k < rowElements.length) {
      this.rows.push(new Row(rowElements[k], this));
      k += 1;
    }
  }
}
|
|
43
|
+
|
|
44
|
+
/**
 * Wrapper around a notes part, tagged as footnotes (`'fn'`) or endnotes
 * (`'en'`). `_element` is taken from the wrapped part's `_element`.
 */
export class NotesPart {
  public part: any;
  public note_type: 'fn' | 'en';
  public _element: Element;

  constructor(part: any, note_type: 'fn' | 'en') {
    this.part = part;
    this.note_type = note_type;
    this._element = part._element;
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * A single footnote/endnote element. `id` is read from the element's `w:id`
 * attribute (empty string when absent) and `part` is taken from the parent.
 */
export class FootnoteItem {
  public _element: Element;
  public _parent: any;
  public note_type: 'fn' | 'en';
  public id: string;
  public part: any;

  constructor(_element: Element, _parent: any, note_type: 'fn' | 'en') {
    this._element = _element;
    this._parent = _parent;
    this.note_type = note_type;

    const rawId = _element.getAttribute('w:id');
    this.id = rawId || '';
    this.part = _parent.part;
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Plain record describing an event found in a document: a type and an id,
 * plus optional author and date strings.
 */
export interface DocxEvent {
  /** Event type label. */
  type: string;
  /** Event identifier. */
  id: string;
  /** Author, when known. */
  author?: string;
  /** Date string, when known. */
  date?: string;
}
|
package/src/domain.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight port of domain.py (Semantic Diagnostics & Appendix).
|
|
3
|
+
 * Intended to use a simplified heuristic since full rapidfuzz isn't available;
 * currently a stub that always returns an empty appendix.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Placeholder for the structural appendix builder: always returns an empty
 * string and ignores both arguments.
 *
 * The ingestion port is kept lean to preserve parity on body text; the full
 * diagnostics port is deferred to a follow-up if Node MCPs need it.
 */
export function build_structural_appendix(doc: any, base_text: string): string {
  const emptyAppendix = '';
  return emptyAppendix;
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { createTestDocument, addParagraph } from './test-utils.js';
|
|
3
|
+
import { DocumentObject } from './docx/bridge.js';
|
|
4
|
+
import { extractTextFromBuffer } from './ingest.js';
|
|
5
|
+
import { RedlineEngine } from './engine.js';
|
|
6
|
+
import { ModifyText, AcceptChange } from './models.js';
|
|
7
|
+
|
|
8
|
+
// End-to-end check of the batch pipeline: accepting an earlier tracked change
// and applying a new edit in the same batch must both land correctly.
describe('Atomic Batch Pipeline (Node.js Port)', () => {
  it('prevents cascading misanchor when accepting changes shifts indices', async () => {
    // --- Round 1: build a three-paragraph document and track one change ---
    const doc = await createTestDocument();
    for (const line of ["First paragraph.", "Second paragraph.", "Third paragraph."]) {
      addParagraph(doc, line);
    }

    const roundOne = new RedlineEngine(doc, "Round1");
    const firstEdit = { type: 'modify', target_text: "First", new_text: "1st" } as ModifyText;
    roundOne.apply_edits([firstEdit]);

    const midBuf = await doc.save();

    // The intermediate text must show the tracked deletion and insertion.
    const midText = await extractTextFromBuffer(midBuf);
    expect(midText).toContain("{--First--}");
    expect(midText).toContain("{++1st++}");

    // Collect the generated change ids so they can be accepted below.
    const chgIds = new Set<string>();
    for (const match of midText.matchAll(/\[Chg:(\d+)(?:\s+\w+)?\]/g)) {
      chgIds.add(match[1]);
    }
    expect(chgIds.size).toBeGreaterThan(0);

    // --- Round 2: accept those changes and add a new edit in one batch ---
    const midDoc = await DocumentObject.load(midBuf);
    const roundTwo = new RedlineEngine(midDoc, "Round2");

    const actions = [...chgIds].map(
      (id) => ({ type: 'accept', target_id: `Chg:${id}` } as AcceptChange)
    );
    const edits = [{ type: 'modify', target_text: "Third", new_text: "3rd" } as ModifyText];

    const stats = roundTwo.process_batch([...actions, ...edits]);

    // Every accept action and the single edit must be reported as applied.
    expect(stats.actions_applied).toBe(actions.length);
    expect(stats.edits_applied).toBe(1);

    // --- Final document state ---
    const finalBuf = await midDoc.save();
    const final_text = await extractTextFromBuffer(finalBuf);

    // The first paragraph's change is accepted: no tracked markup remains.
    expect(final_text).toContain("1st paragraph.");
    expect(final_text).not.toContain("{--First--}");

    // The new tracked change is anchored on the third paragraph.
    expect(final_text).toContain("{--Third--}");
    expect(final_text).toContain("{++3rd++}");
  });
});
|