@adeu/core 1.6.2 → 1.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs.map +1 -1
- package/dist/index.js.map +1 -1
- package/package.json +38 -38
- package/src/comments.test.ts +37 -37
- package/src/comments.ts +450 -450
- package/src/diff.test.ts +61 -61
- package/src/diff.ts +250 -250
- package/src/docx/bridge.ts +188 -188
- package/src/docx/dom.ts +53 -53
- package/src/docx/primitives.ts +64 -64
- package/src/domain.ts +10 -10
- package/src/engine.atomic.test.ts +57 -57
- package/src/engine.batch.test.ts +92 -92
- package/src/engine.safety.test.ts +41 -41
- package/src/engine.tables.test.ts +165 -165
- package/src/engine.ts +734 -734
- package/src/index.test.ts +7 -7
- package/src/index.ts +13 -13
- package/src/ingest.test.ts +43 -43
- package/src/ingest.ts +399 -399
- package/src/mapper.test.ts +65 -65
- package/src/mapper.ts +834 -834
- package/src/markup.test.ts +149 -149
- package/src/markup.ts +322 -322
- package/src/models.ts +50 -50
- package/src/outline.ts +376 -376
- package/src/pagination.ts +238 -238
- package/src/test-utils.ts +141 -141
- package/src/utils/docx.ts +477 -477
- package/tsconfig.json +21 -21
- package/tsup.config.ts +9 -9
- package/vitest.config.ts +11 -11
package/src/ingest.ts
CHANGED
|
@@ -1,400 +1,400 @@
|
|
|
1
|
-
import { DocumentObject } from './docx/bridge.js';
|
|
2
|
-
import { Paragraph, Table, Run, DocxEvent } from './docx/primitives.js';
|
|
3
|
-
import {
|
|
4
|
-
_get_style_cache, get_paragraph_prefix, is_heading_paragraph, is_native_heading,
|
|
5
|
-
get_run_style_markers, get_run_text, apply_formatting_to_segments,
|
|
6
|
-
iter_block_items, iter_document_parts, iter_paragraph_content
|
|
7
|
-
} from './utils/docx.js';
|
|
8
|
-
import { findChild } from './docx/dom.js';
|
|
9
|
-
import { build_structural_appendix } from './domain.js';
|
|
10
|
-
import { extract_comments_data } from './comments.js';
|
|
11
|
-
|
|
12
|
-
/**
 * Loads a document from an in-memory buffer and renders it to text.
 *
 * @param buffer - Raw document bytes handed to `DocumentObject.load`.
 * @param cleanView - Forwarded unchanged to `_extractTextFromDoc`.
 * @returns The text produced by `_extractTextFromDoc` for the loaded document.
 */
export async function extractTextFromBuffer(buffer: Buffer, cleanView = false): Promise<string> {
  return _extractTextFromDoc(await DocumentObject.load(buffer), cleanView);
}
|
|
16
|
-
|
|
17
|
-
/**
 * Renders every part of a loaded document to text.
 *
 * Comment data is extracted once from `doc.pkg`; each part yielded by
 * `iter_document_parts` is rendered with `_extract_blocks`, and the non-empty
 * results are joined with a blank line (`\n\n`).
 *
 * @param doc - The loaded document.
 * @param cleanView - Forwarded unchanged to `_extract_blocks`.
 * @param includeAppendix - When true, whatever `build_structural_appendix`
 *   returns (if truthy) is appended to the joined text.
 * @returns The rendered document text.
 */
export function _extractTextFromDoc(doc: DocumentObject, cleanView = false, includeAppendix = true): string {
  const comments_map = extract_comments_data(doc.pkg);

  const rendered_parts: string[] = [];
  // Character offset, within the joined output, at which the next part would start.
  let offset = 0;

  for (const part of iter_document_parts(doc)) {
    // Every part after the first is preceded by the two-character '\n\n' separator.
    const separator_len = rendered_parts.length > 0 ? 2 : 0;
    const part_text = _extract_blocks(part, comments_map, cleanView, offset + separator_len);
    if (!part_text) continue;

    rendered_parts.push(part_text);
    offset += separator_len + part_text.length;
  }

  const body = rendered_parts.join('\n\n');
  if (!includeAppendix) return body;

  const appendix = build_structural_appendix(doc, body);
  return appendix ? body + appendix : body;
}
|
|
42
|
-
|
|
43
|
-
/**
 * Renders the block-level items of a container and joins them with a blank line.
 *
 * Items come from `iter_block_items(container)`. Items whose constructor is
 * named `FootnoteItem` are rendered recursively, paragraphs through
 * `get_paragraph_prefix` + `build_paragraph_text`, and tables through
 * `extract_table`. Paragraphs are always emitted (even when empty); footnote
 * items and tables are emitted only when they render to a non-empty string.
 *
 * @param container - Object whose block items are rendered; its `part`
 *   property (or the container itself) is used to look up the style cache.
 * @param comments_map - Comment data forwarded to the renderers.
 * @param cleanView - Forwarded unchanged to the renderers.
 * @param cursor - Character offset at which this container's text starts; it
 *   is only used to derive the offsets handed to nested calls.
 * @returns The rendered blocks joined by `\n\n`.
 */
function _extract_blocks(container: any, comments_map: any, cleanView: boolean, cursor: number): string {
  const [style_cache, default_pstyle] = _get_style_cache(container.part || container);

  const rendered: string[] = [];
  let offset = cursor;
  // True until the first paragraph or table has been visited.
  let awaiting_first_para = true;

  for (const item of iter_block_items(container)) {
    // Any block after the first emitted one sits behind a '\n\n' separator.
    const has_previous = rendered.length > 0;
    const block_start = has_previous ? offset + 2 : offset;
    offset = block_start;

    if (item.constructor.name === 'FootnoteItem') {
      const note_text = _extract_blocks(item, comments_map, cleanView, block_start);
      if (note_text) {
        rendered.push(note_text);
        offset = block_start + note_text.length;
      } else if (has_previous) {
        // Nothing was emitted, so take back the separator counted above.
        offset -= 2;
      }
      continue;
    }

    if (item instanceof Paragraph) {
      let prefix = get_paragraph_prefix(item, style_cache, default_pstyle);
      if (awaiting_first_para && container.constructor.name === 'FootnoteItem') {
        // The first paragraph of a note carries the note's definition label.
        prefix = `[^${container.note_type}-${container.id}]: ` + prefix;
      }
      const block = prefix + build_paragraph_text(item, comments_map, cleanView, style_cache, default_pstyle);
      rendered.push(block);
      offset = block_start + block.length;
      awaiting_first_para = false;
      continue;
    }

    if (item instanceof Table) {
      const table_text = extract_table(item, comments_map, cleanView, block_start);
      if (table_text) {
        rendered.push(table_text);
        offset = block_start + table_text.length;
      } else if (has_previous) {
        offset -= 2;
      }
      awaiting_first_para = false;
    }
  }

  return rendered.join('\n\n');
}
|
|
91
|
-
|
|
92
|
-
/**
 * Renders a table as text: one line per row, cells separated by ` | `.
 *
 * A row whose `w:trPr` contains `w:ins` or `w:del` is wrapped as
 * `{++ … |Chg:<id>++}` or `{-- … |Chg:<id>--}` unless `cleanView` is set; in
 * clean view rows marked `w:del` are omitted and no wrapper is added. A cell
 * object that appears more than once in `row.cells` is rendered only once.
 *
 * @param table - Table to render.
 * @param comments_map - Comment data forwarded to `_extract_blocks`.
 * @param cleanView - Selects clean view as described above.
 * @param cursor - Character offset at which the table text starts; it is only
 *   used to derive the offsets handed to the per-cell `_extract_blocks` calls.
 * @returns The rendered rows joined by `\n`.
 */
export function extract_table(table: Table, comments_map: any, cleanView: boolean, cursor: number): string {
  const lines: string[] = [];
  let offset = cursor;

  for (const row of table.rows) {
    const row_props = findChild(row._element, 'w:trPr');
    const inserted = row_props ? findChild(row_props, 'w:ins') : null;
    const deleted = row_props ? findChild(row_props, 'w:del') : null;

    if (cleanView && deleted) continue;

    // Rows after the first are preceded by a single '\n'.
    const row_start = lines.length > 0 ? offset + 1 : offset;
    // '{++ ' and '{-- ' are both four characters long.
    const wrapper_prefix_len = !cleanView && (inserted || deleted) ? 4 : 0;

    const visited = new Set();
    const cells: string[] = [];
    let cell_offset = row_start + wrapper_prefix_len;

    for (const cell of row.cells) {
      if (visited.has(cell)) continue;
      visited.add(cell);

      // Account for the three-character ' | ' separator before every cell but the first.
      if (cells.length > 0) cell_offset += 3;

      const content = _extract_blocks(cell, comments_map, cleanView, cell_offset);
      cells.push(content);
      cell_offset += content.length;
    }

    let line = cells.join(' | ');
    if (!cleanView) {
      if (inserted) {
        line = `{++ ${line} |Chg:${inserted.getAttribute('w:id')}++}`;
      } else if (deleted) {
        line = `{-- ${line} |Chg:${deleted.getAttribute('w:id')}--}`;
      }
    }

    lines.push(line);
    offset = row_start + line.length;
  }

  return lines.join('\n');
}
|
|
139
|
-
|
|
140
|
-
/**
 * Renders one paragraph as inline markup text.
 *
 * Walks the items produced by `iter_paragraph_content`. Runs contribute
 * formatted text; every other item is treated as a `DocxEvent` that either
 * updates the active insertion / deletion / format / comment state or emits an
 * inline marker (footnote or endnote reference, hyperlink, cross-reference,
 * bookmark). Consecutive runs that resolve to the same wrapper are buffered
 * and emitted as a single wrapped span. Unless `cleanView` is set, the
 * change/comment state seen while rendering is summarised in `{>>…<<}` blocks
 * built by `_build_merged_meta_block`.
 *
 * @param paragraph - Paragraph whose content is rendered.
 * @param comments_map - Comment data forwarded to `_build_merged_meta_block`.
 * @param cleanView - When true, runs inside an active deletion are skipped and
 *   no wrappers or meta blocks are emitted; inline markers are still emitted.
 * @param style_cache - Forwarded to `is_heading_paragraph` / `is_native_heading`.
 * @param default_pstyle - Forwarded to `is_heading_paragraph` / `is_native_heading`.
 * @returns The concatenated text of the paragraph.
 */
export function build_paragraph_text(
  paragraph: Paragraph, comments_map: any, cleanView: boolean,
  style_cache?: any, default_pstyle?: string | null
): string {
  const parts: string[] = [];
  const active_ins: Record<string, DocxEvent> = {};
  const active_del: Record<string, DocxEvent> = {};
  const active_comments: Set<string> = new Set();
  const active_fmt: Record<string, DocxEvent> = {};
  const deferred_meta_states: any[] = [];

  // Events of these types change redline state without interrupting the buffered span.
  const redline_event_types = ['ins_start', 'ins_end', 'del_start', 'del_end', 'fmt_start', 'fmt_end'];

  let pending_text = '';
  let current_wrappers: [string, string] = ['', ''];
  let current_style: [string, string] = ['', ''];

  const items = Array.from(iter_paragraph_content(paragraph));
  const is_heading = is_heading_paragraph(paragraph, style_cache, default_pstyle);
  const native_heading = is_native_heading(paragraph, style_cache, default_pstyle);
  // For headings, empty / whitespace-only runs are dropped until the first real text or event.
  let leading_strip_active = is_heading;

  const count = (map: Record<string, DocxEvent>): number => Object.keys(map).length;

  // Emits the buffered span (if any) inside its wrappers and resets the buffer state.
  const flush_pending = (): void => {
    if (!pending_text) return;
    parts.push(`${current_wrappers[0]}${pending_text}${current_wrappers[1]}`);
    pending_text = '';
    current_wrappers = ['', ''];
    current_style = ['', ''];
  };

  // Reports whether the next run with text (starting at index `from`) would
  // still be inside an insertion, deletion or format change, simulating the
  // start/end events that sit between here and that run.
  const next_text_run_is_redline = (from: number): boolean => {
    let ins_depth = count(active_ins);
    let del_depth = count(active_del);
    let fmt_depth = count(active_fmt);

    for (let j = from; j < items.length; j++) {
      const upcoming = items[j];
      if (upcoming instanceof Run) {
        if (!get_run_text(upcoming)) continue;
        return ins_depth > 0 || del_depth > 0 || fmt_depth > 0;
      }
      const upcoming_type = (upcoming as DocxEvent).type;
      if (upcoming_type === 'ins_start') ins_depth++;
      else if (upcoming_type === 'ins_end') ins_depth = Math.max(0, ins_depth - 1);
      else if (upcoming_type === 'del_start') del_depth++;
      else if (upcoming_type === 'del_end') del_depth = Math.max(0, del_depth - 1);
      else if (upcoming_type === 'fmt_start') fmt_depth++;
      else if (upcoming_type === 'fmt_end') fmt_depth = Math.max(0, fmt_depth - 1);
    }
    return false;
  };

  for (let i = 0; i < items.length; i++) {
    const item = items[i];

    if (item instanceof Run) {
      const [prefix, suffix] = get_run_style_markers(item, native_heading);
      const text = get_run_text(item);

      // Clean view hides text that sits inside a tracked deletion.
      if (cleanView && count(active_del) > 0) continue;

      if (leading_strip_active) {
        if (!text || !text.trim()) continue;
        leading_strip_active = false;
      }

      const seg = apply_formatting_to_segments(text, prefix, suffix);
      if (!seg) continue;

      const new_wrappers: [string, string] = cleanView
        ? ['', '']
        : _get_wrappers(active_ins, active_del, active_comments, active_fmt);
      const new_style: [string, string] = [prefix, suffix];

      const same_wrappers = new_wrappers[0] === current_wrappers[0] && new_wrappers[1] === current_wrappers[1];
      if (pending_text && same_wrappers) {
        const same_style = new_style[0] === current_style[0] && new_style[1] === current_style[1];
        if (same_style && current_style[0] !== '' &&
            pending_text.endsWith(current_style[1]) && seg.startsWith(new_style[0])) {
          // Fuse adjacent identically-styled runs by dropping the closing marker
          // of the buffer and the opening marker of the new segment. The cut
          // point is computed from the length rather than with a negative index:
          // with an empty suffix, `slice(0, -0)` is `slice(0, 0)` and would
          // discard the whole buffer.
          pending_text = pending_text.slice(0, pending_text.length - current_style[1].length)
            + seg.slice(new_style[0].length);
        } else {
          pending_text += seg;
        }
        current_style = new_style;
      } else {
        flush_pending();
        pending_text = seg;
        current_wrappers = new_wrappers;
        current_style = new_style;
      }

      if (cleanView) continue;

      const redline_active = count(active_ins) > 0 || count(active_del) > 0 || count(active_fmt) > 0;
      if (redline_active || active_comments.size > 0) {
        // Snapshot the state so later start/end events cannot alter what this run recorded.
        deferred_meta_states.push([{...active_ins}, {...active_del}, new Set(active_comments), {...active_fmt}]);
      }

      // Hold the meta block back while the next run with text is still part of a redline.
      const should_defer = redline_active && next_text_run_is_redline(i + 1);

      if (!should_defer && deferred_meta_states.length > 0) {
        const meta_block = _build_merged_meta_block(deferred_meta_states, comments_map);
        if (meta_block) {
          flush_pending();
          parts.push(`{>>${meta_block}<<}`);
        }
        deferred_meta_states.length = 0; // clear
      }
    } else {
      const ev = item as DocxEvent;
      leading_strip_active = false;

      // Every non-redline event ends the buffered span, so the marker branches
      // below always run with an empty buffer.
      if (!redline_event_types.includes(ev.type)) flush_pending();

      if (ev.type === 'start') active_comments.add(ev.id);
      else if (ev.type === 'end') active_comments.delete(ev.id);
      else if (ev.type === 'ins_start') active_ins[ev.id] = ev;
      else if (ev.type === 'ins_end') delete active_ins[ev.id];
      else if (ev.type === 'del_start') active_del[ev.id] = ev;
      else if (ev.type === 'del_end') delete active_del[ev.id];
      else if (ev.type === 'fmt_start') active_fmt[ev.id] = ev;
      else if (ev.type === 'fmt_end') delete active_fmt[ev.id];
      else if (ev.type === 'footnote' || ev.type === 'endnote') {
        parts.push(`[^${ev.type === 'footnote' ? 'fn' : 'en'}-${ev.id}]`);
      } else if (ev.type === 'hyperlink_start') {
        parts.push('[');
      } else if (ev.type === 'hyperlink_end') {
        // NOTE(review): the link target is read from `ev.date`; presumably the
        // event reuses that field for the URL - confirm against the event producer.
        parts.push(`](${ev.date})`);
      } else if (ev.type === 'xref_start') {
        parts.push('[~');
      } else if (ev.type === 'xref_end') {
        parts.push(`~](#${ev.id})`);
      } else if (ev.type === 'bookmark') {
        parts.push(`{#${ev.id}}`);
      }
    }
  }

  flush_pending();

  if (deferred_meta_states.length > 0) {
    const meta_block = _build_merged_meta_block(deferred_meta_states, comments_map);
    if (meta_block) parts.push(`{>>${meta_block}<<}`);
  }

  return parts.join('');
}
|
|
327
|
-
|
|
328
|
-
/**
 * Picks the opening/closing wrapper pair for text given the active state.
 *
 * Precedence: any active deletion wins, then any active insertion, then any
 * active comment or format change; with nothing active both strings are empty.
 *
 * @param ins - Map of active insertions (only its key count is inspected).
 * @param del - Map of active deletions (only its key count is inspected).
 * @param comments - Ids of the active comments.
 * @param fmt - Map of active format changes (only its key count is inspected).
 * @returns `[open, close]` wrapper strings.
 */
function _get_wrappers(ins: any, del: any, comments: Set<string>, fmt: any): [string, string] {
  const has_entries = (map: any): boolean => Object.keys(map).length > 0;

  if (has_entries(del)) {
    return ['{--', '--}'];
  }
  if (has_entries(ins)) {
    return ['{++', '++}'];
  }
  const highlighted = comments.size > 0 || has_entries(fmt);
  return highlighted ? ['{==', '==}'] : ['', ''];
}
|
|
334
|
-
|
|
335
|
-
function _build_merged_meta_block(states_list: any[], comments_map: any): string {
|
|
336
|
-
const change_lines: string[] = [];
|
|
337
|
-
const comment_lines: string[] = [];
|
|
338
|
-
const seen_sigs = new Set<string>();
|
|
339
|
-
|
|
340
|
-
for (const [ins_map, del_map, comments_set, fmt_map] of states_list) {
|
|
341
|
-
for (const [uid, meta] of Object.entries(ins_map as Record<string, DocxEvent>)) {
|
|
342
|
-
const sig = `Chg:${uid}`;
|
|
343
|
-
if (!seen_sigs.has(sig)) {
|
|
344
|
-
change_lines.push(`[${sig} insert] ${meta.author || 'Unknown'}`);
|
|
345
|
-
seen_sigs.add(sig);
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
for (const [uid, meta] of Object.entries(del_map as Record<string, DocxEvent>)) {
|
|
349
|
-
const sig = `Chg:${uid}`;
|
|
350
|
-
if (!seen_sigs.has(sig)) {
|
|
351
|
-
change_lines.push(`[${sig} delete] ${meta.author || 'Unknown'}`);
|
|
352
|
-
seen_sigs.add(sig);
|
|
353
|
-
}
|
|
354
|
-
}
|
|
355
|
-
for (const [uid, meta] of Object.entries(fmt_map as Record<string, DocxEvent>)) {
|
|
356
|
-
const sig = `Chg:${uid}`;
|
|
357
|
-
if (!seen_sigs.has(sig)) {
|
|
358
|
-
change_lines.push(`[${sig} format] ${meta.author || 'Unknown'}`);
|
|
359
|
-
seen_sigs.add(sig);
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
|
|
363
|
-
// Threaded Comment Resolution Tree
|
|
364
|
-
const children_map: Record<string, string[]> = {};
|
|
365
|
-
for (const [c_id, data] of Object.entries(comments_map as Record<string, any>)) {
|
|
366
|
-
const p_id = data.parent_id;
|
|
367
|
-
if (p_id) {
|
|
368
|
-
if (!children_map[p_id]) children_map[p_id] = [];
|
|
369
|
-
children_map[p_id].push(c_id);
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
|
|
373
|
-
function render_comment(cid: string) {
|
|
374
|
-
if (!comments_map[cid]) return;
|
|
375
|
-
const sig = `Com:${cid}`;
|
|
376
|
-
if (seen_sigs.has(sig)) return;
|
|
377
|
-
|
|
378
|
-
const data = comments_map[cid];
|
|
379
|
-
let header = `[${sig}] ${data.author}`;
|
|
380
|
-
if (data.date) header += ` @ ${data.date}`;
|
|
381
|
-
if (data.resolved) header += `(RESOLVED)`;
|
|
382
|
-
comment_lines.push(`${header}: ${data.text}`);
|
|
383
|
-
seen_sigs.add(sig);
|
|
384
|
-
|
|
385
|
-
if (children_map[cid]) {
|
|
386
|
-
const children = children_map[cid].sort((a, b) => (comments_map[a]?.date || '').localeCompare(comments_map[b]?.date || ''));
|
|
387
|
-
for (const child_id of children) {
|
|
388
|
-
render_comment(child_id);
|
|
389
|
-
}
|
|
390
|
-
}
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
const sorted_ids = Array.from(comments_set as Set<string>).sort();
|
|
394
|
-
for (const c_id of sorted_ids) {
|
|
395
|
-
render_comment(c_id);
|
|
396
|
-
}
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
return [...change_lines, ...comment_lines].join('\n');
|
|
1
|
+
import { DocumentObject } from './docx/bridge.js';
|
|
2
|
+
import { Paragraph, Table, Run, DocxEvent } from './docx/primitives.js';
|
|
3
|
+
import {
|
|
4
|
+
_get_style_cache, get_paragraph_prefix, is_heading_paragraph, is_native_heading,
|
|
5
|
+
get_run_style_markers, get_run_text, apply_formatting_to_segments,
|
|
6
|
+
iter_block_items, iter_document_parts, iter_paragraph_content
|
|
7
|
+
} from './utils/docx.js';
|
|
8
|
+
import { findChild } from './docx/dom.js';
|
|
9
|
+
import { build_structural_appendix } from './domain.js';
|
|
10
|
+
import { extract_comments_data } from './comments.js';
|
|
11
|
+
|
|
12
|
+
/**
 * Loads a document from an in-memory buffer and renders it to text.
 *
 * @param buffer - Raw document bytes handed to `DocumentObject.load`.
 * @param cleanView - Forwarded unchanged to `_extractTextFromDoc`.
 * @returns The text produced by `_extractTextFromDoc` for the loaded document.
 */
export async function extractTextFromBuffer(buffer: Buffer, cleanView = false): Promise<string> {
  return _extractTextFromDoc(await DocumentObject.load(buffer), cleanView);
}
|
|
16
|
+
|
|
17
|
+
/**
 * Renders every part of a loaded document to text.
 *
 * Comment data is extracted once from `doc.pkg`; each part yielded by
 * `iter_document_parts` is rendered with `_extract_blocks`, and the non-empty
 * results are joined with a blank line (`\n\n`).
 *
 * @param doc - The loaded document.
 * @param cleanView - Forwarded unchanged to `_extract_blocks`.
 * @param includeAppendix - When true, whatever `build_structural_appendix`
 *   returns (if truthy) is appended to the joined text.
 * @returns The rendered document text.
 */
export function _extractTextFromDoc(doc: DocumentObject, cleanView = false, includeAppendix = true): string {
  const comments_map = extract_comments_data(doc.pkg);

  const rendered_parts: string[] = [];
  // Character offset, within the joined output, at which the next part would start.
  let offset = 0;

  for (const part of iter_document_parts(doc)) {
    // Every part after the first is preceded by the two-character '\n\n' separator.
    const separator_len = rendered_parts.length > 0 ? 2 : 0;
    const part_text = _extract_blocks(part, comments_map, cleanView, offset + separator_len);
    if (!part_text) continue;

    rendered_parts.push(part_text);
    offset += separator_len + part_text.length;
  }

  const body = rendered_parts.join('\n\n');
  if (!includeAppendix) return body;

  const appendix = build_structural_appendix(doc, body);
  return appendix ? body + appendix : body;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Renders the block-level items of a container and joins them with a blank line.
 *
 * Items come from `iter_block_items(container)`. Items whose constructor is
 * named `FootnoteItem` are rendered recursively, paragraphs through
 * `get_paragraph_prefix` + `build_paragraph_text`, and tables through
 * `extract_table`. Paragraphs are always emitted (even when empty); footnote
 * items and tables are emitted only when they render to a non-empty string.
 *
 * @param container - Object whose block items are rendered; its `part`
 *   property (or the container itself) is used to look up the style cache.
 * @param comments_map - Comment data forwarded to the renderers.
 * @param cleanView - Forwarded unchanged to the renderers.
 * @param cursor - Character offset at which this container's text starts; it
 *   is only used to derive the offsets handed to nested calls.
 * @returns The rendered blocks joined by `\n\n`.
 */
function _extract_blocks(container: any, comments_map: any, cleanView: boolean, cursor: number): string {
  const [style_cache, default_pstyle] = _get_style_cache(container.part || container);

  const rendered: string[] = [];
  let offset = cursor;
  // True until the first paragraph or table has been visited.
  let awaiting_first_para = true;

  for (const item of iter_block_items(container)) {
    // Any block after the first emitted one sits behind a '\n\n' separator.
    const has_previous = rendered.length > 0;
    const block_start = has_previous ? offset + 2 : offset;
    offset = block_start;

    if (item.constructor.name === 'FootnoteItem') {
      const note_text = _extract_blocks(item, comments_map, cleanView, block_start);
      if (note_text) {
        rendered.push(note_text);
        offset = block_start + note_text.length;
      } else if (has_previous) {
        // Nothing was emitted, so take back the separator counted above.
        offset -= 2;
      }
      continue;
    }

    if (item instanceof Paragraph) {
      let prefix = get_paragraph_prefix(item, style_cache, default_pstyle);
      if (awaiting_first_para && container.constructor.name === 'FootnoteItem') {
        // The first paragraph of a note carries the note's definition label.
        prefix = `[^${container.note_type}-${container.id}]: ` + prefix;
      }
      const block = prefix + build_paragraph_text(item, comments_map, cleanView, style_cache, default_pstyle);
      rendered.push(block);
      offset = block_start + block.length;
      awaiting_first_para = false;
      continue;
    }

    if (item instanceof Table) {
      const table_text = extract_table(item, comments_map, cleanView, block_start);
      if (table_text) {
        rendered.push(table_text);
        offset = block_start + table_text.length;
      } else if (has_previous) {
        offset -= 2;
      }
      awaiting_first_para = false;
    }
  }

  return rendered.join('\n\n');
}
|
|
91
|
+
|
|
92
|
+
/**
 * Renders a table as text: one line per row, cells separated by ` | `.
 *
 * A row whose `w:trPr` contains `w:ins` or `w:del` is wrapped as
 * `{++ … |Chg:<id>++}` or `{-- … |Chg:<id>--}` unless `cleanView` is set; in
 * clean view rows marked `w:del` are omitted and no wrapper is added. A cell
 * object that appears more than once in `row.cells` is rendered only once.
 *
 * @param table - Table to render.
 * @param comments_map - Comment data forwarded to `_extract_blocks`.
 * @param cleanView - Selects clean view as described above.
 * @param cursor - Character offset at which the table text starts; it is only
 *   used to derive the offsets handed to the per-cell `_extract_blocks` calls.
 * @returns The rendered rows joined by `\n`.
 */
export function extract_table(table: Table, comments_map: any, cleanView: boolean, cursor: number): string {
  const lines: string[] = [];
  let offset = cursor;

  for (const row of table.rows) {
    const row_props = findChild(row._element, 'w:trPr');
    const inserted = row_props ? findChild(row_props, 'w:ins') : null;
    const deleted = row_props ? findChild(row_props, 'w:del') : null;

    if (cleanView && deleted) continue;

    // Rows after the first are preceded by a single '\n'.
    const row_start = lines.length > 0 ? offset + 1 : offset;
    // '{++ ' and '{-- ' are both four characters long.
    const wrapper_prefix_len = !cleanView && (inserted || deleted) ? 4 : 0;

    const visited = new Set();
    const cells: string[] = [];
    let cell_offset = row_start + wrapper_prefix_len;

    for (const cell of row.cells) {
      if (visited.has(cell)) continue;
      visited.add(cell);

      // Account for the three-character ' | ' separator before every cell but the first.
      if (cells.length > 0) cell_offset += 3;

      const content = _extract_blocks(cell, comments_map, cleanView, cell_offset);
      cells.push(content);
      cell_offset += content.length;
    }

    let line = cells.join(' | ');
    if (!cleanView) {
      if (inserted) {
        line = `{++ ${line} |Chg:${inserted.getAttribute('w:id')}++}`;
      } else if (deleted) {
        line = `{-- ${line} |Chg:${deleted.getAttribute('w:id')}--}`;
      }
    }

    lines.push(line);
    offset = row_start + line.length;
  }

  return lines.join('\n');
}
|
|
139
|
+
|
|
140
|
+
/**
 * Renders one paragraph as inline markup text.
 *
 * Walks the items produced by `iter_paragraph_content`. Runs contribute
 * formatted text; every other item is treated as a `DocxEvent` that either
 * updates the active insertion / deletion / format / comment state or emits an
 * inline marker (footnote or endnote reference, hyperlink, cross-reference,
 * bookmark). Consecutive runs that resolve to the same wrapper are buffered
 * and emitted as a single wrapped span. Unless `cleanView` is set, the
 * change/comment state seen while rendering is summarised in `{>>…<<}` blocks
 * built by `_build_merged_meta_block`.
 *
 * @param paragraph - Paragraph whose content is rendered.
 * @param comments_map - Comment data forwarded to `_build_merged_meta_block`.
 * @param cleanView - When true, runs inside an active deletion are skipped and
 *   no wrappers or meta blocks are emitted; inline markers are still emitted.
 * @param style_cache - Forwarded to `is_heading_paragraph` / `is_native_heading`.
 * @param default_pstyle - Forwarded to `is_heading_paragraph` / `is_native_heading`.
 * @returns The concatenated text of the paragraph.
 */
export function build_paragraph_text(
  paragraph: Paragraph, comments_map: any, cleanView: boolean,
  style_cache?: any, default_pstyle?: string | null
): string {
  const parts: string[] = [];
  const active_ins: Record<string, DocxEvent> = {};
  const active_del: Record<string, DocxEvent> = {};
  const active_comments: Set<string> = new Set();
  const active_fmt: Record<string, DocxEvent> = {};
  const deferred_meta_states: any[] = [];

  // Events of these types change redline state without interrupting the buffered span.
  const redline_event_types = ['ins_start', 'ins_end', 'del_start', 'del_end', 'fmt_start', 'fmt_end'];

  let pending_text = '';
  let current_wrappers: [string, string] = ['', ''];
  let current_style: [string, string] = ['', ''];

  const items = Array.from(iter_paragraph_content(paragraph));
  const is_heading = is_heading_paragraph(paragraph, style_cache, default_pstyle);
  const native_heading = is_native_heading(paragraph, style_cache, default_pstyle);
  // For headings, empty / whitespace-only runs are dropped until the first real text or event.
  let leading_strip_active = is_heading;

  const count = (map: Record<string, DocxEvent>): number => Object.keys(map).length;

  // Emits the buffered span (if any) inside its wrappers and resets the buffer state.
  const flush_pending = (): void => {
    if (!pending_text) return;
    parts.push(`${current_wrappers[0]}${pending_text}${current_wrappers[1]}`);
    pending_text = '';
    current_wrappers = ['', ''];
    current_style = ['', ''];
  };

  // Reports whether the next run with text (starting at index `from`) would
  // still be inside an insertion, deletion or format change, simulating the
  // start/end events that sit between here and that run.
  const next_text_run_is_redline = (from: number): boolean => {
    let ins_depth = count(active_ins);
    let del_depth = count(active_del);
    let fmt_depth = count(active_fmt);

    for (let j = from; j < items.length; j++) {
      const upcoming = items[j];
      if (upcoming instanceof Run) {
        if (!get_run_text(upcoming)) continue;
        return ins_depth > 0 || del_depth > 0 || fmt_depth > 0;
      }
      const upcoming_type = (upcoming as DocxEvent).type;
      if (upcoming_type === 'ins_start') ins_depth++;
      else if (upcoming_type === 'ins_end') ins_depth = Math.max(0, ins_depth - 1);
      else if (upcoming_type === 'del_start') del_depth++;
      else if (upcoming_type === 'del_end') del_depth = Math.max(0, del_depth - 1);
      else if (upcoming_type === 'fmt_start') fmt_depth++;
      else if (upcoming_type === 'fmt_end') fmt_depth = Math.max(0, fmt_depth - 1);
    }
    return false;
  };

  for (let i = 0; i < items.length; i++) {
    const item = items[i];

    if (item instanceof Run) {
      const [prefix, suffix] = get_run_style_markers(item, native_heading);
      const text = get_run_text(item);

      // Clean view hides text that sits inside a tracked deletion.
      if (cleanView && count(active_del) > 0) continue;

      if (leading_strip_active) {
        if (!text || !text.trim()) continue;
        leading_strip_active = false;
      }

      const seg = apply_formatting_to_segments(text, prefix, suffix);
      if (!seg) continue;

      const new_wrappers: [string, string] = cleanView
        ? ['', '']
        : _get_wrappers(active_ins, active_del, active_comments, active_fmt);
      const new_style: [string, string] = [prefix, suffix];

      const same_wrappers = new_wrappers[0] === current_wrappers[0] && new_wrappers[1] === current_wrappers[1];
      if (pending_text && same_wrappers) {
        const same_style = new_style[0] === current_style[0] && new_style[1] === current_style[1];
        if (same_style && current_style[0] !== '' &&
            pending_text.endsWith(current_style[1]) && seg.startsWith(new_style[0])) {
          // Fuse adjacent identically-styled runs by dropping the closing marker
          // of the buffer and the opening marker of the new segment. The cut
          // point is computed from the length rather than with a negative index:
          // with an empty suffix, `slice(0, -0)` is `slice(0, 0)` and would
          // discard the whole buffer.
          pending_text = pending_text.slice(0, pending_text.length - current_style[1].length)
            + seg.slice(new_style[0].length);
        } else {
          pending_text += seg;
        }
        current_style = new_style;
      } else {
        flush_pending();
        pending_text = seg;
        current_wrappers = new_wrappers;
        current_style = new_style;
      }

      if (cleanView) continue;

      const redline_active = count(active_ins) > 0 || count(active_del) > 0 || count(active_fmt) > 0;
      if (redline_active || active_comments.size > 0) {
        // Snapshot the state so later start/end events cannot alter what this run recorded.
        deferred_meta_states.push([{...active_ins}, {...active_del}, new Set(active_comments), {...active_fmt}]);
      }

      // Hold the meta block back while the next run with text is still part of a redline.
      const should_defer = redline_active && next_text_run_is_redline(i + 1);

      if (!should_defer && deferred_meta_states.length > 0) {
        const meta_block = _build_merged_meta_block(deferred_meta_states, comments_map);
        if (meta_block) {
          flush_pending();
          parts.push(`{>>${meta_block}<<}`);
        }
        deferred_meta_states.length = 0; // clear
      }
    } else {
      const ev = item as DocxEvent;
      leading_strip_active = false;

      // Every non-redline event ends the buffered span, so the marker branches
      // below always run with an empty buffer.
      if (!redline_event_types.includes(ev.type)) flush_pending();

      if (ev.type === 'start') active_comments.add(ev.id);
      else if (ev.type === 'end') active_comments.delete(ev.id);
      else if (ev.type === 'ins_start') active_ins[ev.id] = ev;
      else if (ev.type === 'ins_end') delete active_ins[ev.id];
      else if (ev.type === 'del_start') active_del[ev.id] = ev;
      else if (ev.type === 'del_end') delete active_del[ev.id];
      else if (ev.type === 'fmt_start') active_fmt[ev.id] = ev;
      else if (ev.type === 'fmt_end') delete active_fmt[ev.id];
      else if (ev.type === 'footnote' || ev.type === 'endnote') {
        parts.push(`[^${ev.type === 'footnote' ? 'fn' : 'en'}-${ev.id}]`);
      } else if (ev.type === 'hyperlink_start') {
        parts.push('[');
      } else if (ev.type === 'hyperlink_end') {
        // NOTE(review): the link target is read from `ev.date`; presumably the
        // event reuses that field for the URL - confirm against the event producer.
        parts.push(`](${ev.date})`);
      } else if (ev.type === 'xref_start') {
        parts.push('[~');
      } else if (ev.type === 'xref_end') {
        parts.push(`~](#${ev.id})`);
      } else if (ev.type === 'bookmark') {
        parts.push(`{#${ev.id}}`);
      }
    }
  }

  flush_pending();

  if (deferred_meta_states.length > 0) {
    const meta_block = _build_merged_meta_block(deferred_meta_states, comments_map);
    if (meta_block) parts.push(`{>>${meta_block}<<}`);
  }

  return parts.join('');
}
|
|
327
|
+
|
|
328
|
+
/**
 * Chooses the CriticMarkup-style delimiters that surround a run of text,
 * based on which kinds of tracked state are currently active.
 *
 * Precedence is deletion, then insertion, then highlight (any comment or any
 * format change). When nothing is active, both delimiters are empty strings.
 *
 * @param ins - Map-like object of active insertions; only its key count is read.
 * @param del - Map-like object of active deletions; only its key count is read.
 * @param comments - Ids of the comments currently open over the text.
 * @param fmt - Map-like object of active format changes; only its key count is read.
 * @returns The `[opening, closing]` delimiter pair.
 */
function _get_wrappers(ins: any, del: any, comments: Set<string>, fmt: any): [string, string] {
  // "Active" means the object carries at least one own enumerable key.
  const has_entries = (record: any): boolean => Object.keys(record).length > 0;

  if (has_entries(del)) {
    return ['{--', '--}'];
  }
  if (has_entries(ins)) {
    return ['{++', '++}'];
  }

  const highlighted = comments.size > 0 || has_entries(fmt);
  return highlighted ? ['{==', '==}'] : ['', ''];
}
|
|
334
|
+
|
|
335
|
+
/**
 * Builds the text of a merged metadata block for a group of tracked states.
 *
 * Every entry of `states_list` is destructured as
 * `[ins_map, del_map, comments_set, fmt_map]`. Each change id found in the
 * three maps produces one `[Chg:<id> insert|delete|format] <author>` line, and
 * each comment id produces a `[Com:<id>] ...` line followed, depth-first, by
 * its replies ordered by their `date` string. Ids are emitted once only, even
 * when several states mention them.
 *
 * @param states_list - Array of `[ins_map, del_map, comments_set, fmt_map]` tuples.
 * @param comments_map - Object keyed by comment id. The fields read from each
 *   entry are `author`, `date`, `resolved`, `text` and `parent_id`. A
 *   null/undefined value is treated as "no comments".
 * @returns All change lines followed by all comment lines, joined with `\n`
 *   (an empty string when there is nothing to report).
 */
function _build_merged_meta_block(states_list: any[], comments_map: any): string {
  const change_lines: string[] = [];
  const comment_lines: string[] = [];
  const seen_sigs = new Set<string>();
  const comments: Record<string, any> = comments_map ?? {};

  // Own-property test: inherited members such as "constructor" are not comments.
  const has_comment = (cid: string): boolean =>
    Object.prototype.hasOwnProperty.call(comments, cid);

  // Threaded Comment Resolution Tree.
  // Built once per call because it depends only on comments_map. A Map is used
  // so that ids cannot collide with Object.prototype members; keys are
  // stringified so numeric and string ids address the same thread.
  const children_map = new Map<string, string[]>();
  for (const [c_id, data] of Object.entries(comments)) {
    const p_id = data.parent_id;
    if (!p_id) continue;
    const key = String(p_id);
    const siblings = children_map.get(key);
    if (siblings) {
      siblings.push(c_id);
    } else {
      children_map.set(key, [c_id]);
    }
  }

  // Emits one line per not-yet-seen change id of a single kind.
  function add_changes(changes: unknown, verb: string): void {
    for (const [uid, meta] of Object.entries(changes as Record<string, DocxEvent>)) {
      const sig = `Chg:${uid}`;
      if (seen_sigs.has(sig)) continue;
      change_lines.push(`[${sig} ${verb}] ${meta.author || 'Unknown'}`);
      seen_sigs.add(sig);
    }
  }

  // Emits a comment and then, recursively, its replies. The seen_sigs check
  // also stops the recursion if parent links ever form a cycle.
  function render_comment(cid: string): void {
    if (!has_comment(cid) || !comments[cid]) return;
    const sig = `Com:${cid}`;
    if (seen_sigs.has(sig)) return;

    const data = comments[cid];
    let header = `[${sig}] ${data.author}`;
    if (data.date) header += ` @ ${data.date}`;
    if (data.resolved) header += `(RESOLVED)`;
    comment_lines.push(`${header}: ${data.text}`);
    seen_sigs.add(sig);

    const replies = children_map.get(String(cid));
    if (!replies) return;
    // In-place sort is safe: the array is owned by this call's children_map.
    replies.sort((a, b) => (comments[a]?.date || '').localeCompare(comments[b]?.date || ''));
    for (const child_id of replies) {
      render_comment(child_id);
    }
  }

  for (const [ins_map, del_map, comments_set, fmt_map] of states_list) {
    add_changes(ins_map, 'insert');
    add_changes(del_map, 'delete');
    add_changes(fmt_map, 'format');

    const sorted_ids = Array.from(comments_set as Set<string>).sort();
    for (const c_id of sorted_ids) {
      render_comment(c_id);
    }
  }

  return [...change_lines, ...comment_lines].join('\n');
}
|