@adeu/core 1.6.7 → 1.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3969 -1859
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +95 -8
- package/dist/index.d.ts +95 -8
- package/dist/index.js +3966 -1859
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/consistency.test.ts +134 -0
- package/src/diff.test.ts +13 -1
- package/src/diff.ts +220 -47
- package/src/docx/bridge.ts +111 -57
- package/src/docx/dom.ts +66 -7
- package/src/domain.test.ts +280 -0
- package/src/domain.ts +264 -10
- package/src/engine.bugs.test.ts +481 -0
- package/src/engine.ts +1346 -192
- package/src/index.ts +7 -8
- package/src/ingest.ts +8 -0
- package/src/markup.ts +160 -53
- package/src/outline.ts +199 -69
- package/src/sanitize/core.ts +130 -0
- package/src/sanitize/report.ts +125 -0
- package/src/sanitize/sanitize.test.ts +237 -0
- package/src/sanitize/transforms.ts +452 -0
- package/src/utils/docx.ts +292 -158
package/src/outline.ts
CHANGED
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
* Structural outline extractor.
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
-
import { DocumentObject } from
|
|
6
|
-
import { Paragraph, Table, DocxEvent } from
|
|
7
|
-
import { build_paragraph_text, extract_table } from
|
|
8
|
-
import { extract_comments_data } from
|
|
9
|
-
import { findChild } from
|
|
5
|
+
import { DocumentObject } from "./docx/bridge.js";
|
|
6
|
+
import { Paragraph, Table, DocxEvent } from "./docx/primitives.js";
|
|
7
|
+
import { build_paragraph_text, extract_table } from "./ingest.js";
|
|
8
|
+
import { extract_comments_data } from "./comments.js";
|
|
9
|
+
import { findChild } from "./docx/dom.js";
|
|
10
10
|
import {
|
|
11
11
|
_get_style_cache,
|
|
12
12
|
get_paragraph_prefix,
|
|
13
13
|
iter_block_items,
|
|
14
14
|
iter_document_parts,
|
|
15
15
|
iter_paragraph_content,
|
|
16
|
-
} from
|
|
16
|
+
} from "./utils/docx.js";
|
|
17
17
|
|
|
18
18
|
const _HEADING_PREFIX_RE = /^(#{1,6}) /;
|
|
19
19
|
const _HEURISTIC_MIN_WORDS = 3;
|
|
@@ -40,10 +40,10 @@ export function extract_outline(
|
|
|
40
40
|
projected_body: string,
|
|
41
41
|
body_pages: string[],
|
|
42
42
|
body_page_offsets: number[],
|
|
43
|
-
paragraph_offsets: Record<string, [number, number]> | null = null
|
|
43
|
+
paragraph_offsets: Record<string, [number, number]> | null = null,
|
|
44
44
|
): OutlineNode[] {
|
|
45
45
|
if (body_pages.length !== body_page_offsets.length) {
|
|
46
|
-
throw new Error(
|
|
46
|
+
throw new Error("body_pages and body_page_offsets length mismatch");
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
const comments_map = extract_comments_data(doc.pkg);
|
|
@@ -69,7 +69,12 @@ export function extract_outline(
|
|
|
69
69
|
const text = _heading_text(paragraph, comments_map);
|
|
70
70
|
const style = _determine_heading_style(paragraph);
|
|
71
71
|
|
|
72
|
-
const owned_end = _find_owned_end(
|
|
72
|
+
const owned_end = _find_owned_end(
|
|
73
|
+
block_records,
|
|
74
|
+
heading_indices,
|
|
75
|
+
h_pos,
|
|
76
|
+
level,
|
|
77
|
+
);
|
|
73
78
|
const owned_blocks = block_records.slice(rec_idx + 1, owned_end);
|
|
74
79
|
|
|
75
80
|
const has_table = _direct_has_table(block_records, rec_idx + 1, owned_end);
|
|
@@ -83,7 +88,11 @@ export function extract_outline(
|
|
|
83
88
|
return nodes;
|
|
84
89
|
}
|
|
85
90
|
|
|
86
|
-
function _direct_has_table(
|
|
91
|
+
function _direct_has_table(
|
|
92
|
+
block_records: _BlockRecord[],
|
|
93
|
+
range_start: number,
|
|
94
|
+
range_end: number,
|
|
95
|
+
): boolean {
|
|
87
96
|
for (let idx = range_start; idx < range_end; idx++) {
|
|
88
97
|
const rec = block_records[idx];
|
|
89
98
|
if (rec.is_paragraph && _is_heading(rec.item)) return false;
|
|
@@ -92,7 +101,10 @@ function _direct_has_table(block_records: _BlockRecord[], range_start: number, r
|
|
|
92
101
|
return false;
|
|
93
102
|
}
|
|
94
103
|
|
|
95
|
-
function _walk_doc_body(
|
|
104
|
+
function _walk_doc_body(
|
|
105
|
+
doc: DocumentObject,
|
|
106
|
+
comments_map: any,
|
|
107
|
+
): _BlockRecord[] {
|
|
96
108
|
const parts = Array.from(iter_document_parts(doc));
|
|
97
109
|
let body_start_offset = 0;
|
|
98
110
|
let body_part: any = null;
|
|
@@ -128,7 +140,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
128
140
|
|
|
129
141
|
if (!is_first_block) cursor += 2;
|
|
130
142
|
|
|
131
|
-
records.push({
|
|
143
|
+
records.push({
|
|
144
|
+
item,
|
|
145
|
+
is_paragraph: true,
|
|
146
|
+
is_table: false,
|
|
147
|
+
start_offset: cursor,
|
|
148
|
+
projected_length: block_len,
|
|
149
|
+
});
|
|
132
150
|
cursor += block_len;
|
|
133
151
|
is_first_block = false;
|
|
134
152
|
} else if (item instanceof Table) {
|
|
@@ -138,7 +156,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
138
156
|
if (!is_first_block) cursor += 2;
|
|
139
157
|
|
|
140
158
|
const table_start = cursor;
|
|
141
|
-
records.push({
|
|
159
|
+
records.push({
|
|
160
|
+
item,
|
|
161
|
+
is_paragraph: false,
|
|
162
|
+
is_table: true,
|
|
163
|
+
start_offset: table_start,
|
|
164
|
+
projected_length: block_len,
|
|
165
|
+
});
|
|
142
166
|
_record_table_inner_blocks_lite(item, table_start, records, comments_map);
|
|
143
167
|
cursor += block_len;
|
|
144
168
|
is_first_block = false;
|
|
@@ -148,7 +172,12 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
148
172
|
return records;
|
|
149
173
|
}
|
|
150
174
|
|
|
151
|
-
function _compute_inner_block_offset(
|
|
175
|
+
function _compute_inner_block_offset(
|
|
176
|
+
table: Table,
|
|
177
|
+
target_paragraph: Paragraph,
|
|
178
|
+
table_start_offset: number,
|
|
179
|
+
comments_map: any,
|
|
180
|
+
): number {
|
|
152
181
|
const target_el = target_paragraph._element;
|
|
153
182
|
let cursor = table_start_offset;
|
|
154
183
|
let rows_processed = 0;
|
|
@@ -165,7 +194,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
|
|
|
165
194
|
|
|
166
195
|
if (cells_in_row > 0) cursor += 3;
|
|
167
196
|
|
|
168
|
-
const [new_cursor, found] = _walk_cell_for_offset(
|
|
197
|
+
const [new_cursor, found] = _walk_cell_for_offset(
|
|
198
|
+
cell,
|
|
199
|
+
target_el,
|
|
200
|
+
cursor,
|
|
201
|
+
comments_map,
|
|
202
|
+
);
|
|
169
203
|
if (found) return new_cursor;
|
|
170
204
|
cursor = new_cursor;
|
|
171
205
|
|
|
@@ -177,7 +211,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
|
|
|
177
211
|
return table_start_offset;
|
|
178
212
|
}
|
|
179
213
|
|
|
180
|
-
function _walk_cell_for_offset(
|
|
214
|
+
function _walk_cell_for_offset(
|
|
215
|
+
cell: any,
|
|
216
|
+
target_el: any,
|
|
217
|
+
cell_start_cursor: number,
|
|
218
|
+
comments_map: any,
|
|
219
|
+
): [number, boolean] {
|
|
181
220
|
let cursor = cell_start_cursor;
|
|
182
221
|
let is_first_block = true;
|
|
183
222
|
|
|
@@ -190,9 +229,15 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
|
|
|
190
229
|
const p_text = build_paragraph_text(inner_item, comments_map, false);
|
|
191
230
|
cursor += (prefix + p_text).length;
|
|
192
231
|
} else if (inner_item instanceof Table) {
|
|
193
|
-
const nested_offset = _compute_inner_block_offset(
|
|
232
|
+
const nested_offset = _compute_inner_block_offset(
|
|
233
|
+
inner_item,
|
|
234
|
+
new Paragraph(target_el, null),
|
|
235
|
+
cursor,
|
|
236
|
+
comments_map,
|
|
237
|
+
);
|
|
194
238
|
if (nested_offset !== cursor) {
|
|
195
|
-
if (_element_is_descendant(target_el, inner_item._element))
|
|
239
|
+
if (_element_is_descendant(target_el, inner_item._element))
|
|
240
|
+
return [nested_offset, true];
|
|
196
241
|
}
|
|
197
242
|
const table_text = extract_table(inner_item, comments_map, false, 0);
|
|
198
243
|
cursor += table_text ? table_text.length : 0;
|
|
@@ -202,7 +247,10 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
|
|
|
202
247
|
return [cursor, false];
|
|
203
248
|
}
|
|
204
249
|
|
|
205
|
-
function _element_is_descendant(
|
|
250
|
+
function _element_is_descendant(
|
|
251
|
+
target_el: Element,
|
|
252
|
+
ancestor_el: Element,
|
|
253
|
+
): boolean {
|
|
206
254
|
let cur: Node | null = target_el.parentNode;
|
|
207
255
|
while (cur) {
|
|
208
256
|
if (cur === ancestor_el) return true;
|
|
@@ -211,7 +259,12 @@ function _element_is_descendant(target_el: Element, ancestor_el: Element): boole
|
|
|
211
259
|
return false;
|
|
212
260
|
}
|
|
213
261
|
|
|
214
|
-
function _record_table_inner_blocks_lite(
|
|
262
|
+
function _record_table_inner_blocks_lite(
|
|
263
|
+
table: Table,
|
|
264
|
+
inherited_offset: number,
|
|
265
|
+
records: _BlockRecord[],
|
|
266
|
+
comments_map: any,
|
|
267
|
+
) {
|
|
215
268
|
const seen_cells = new Set();
|
|
216
269
|
for (const row of table.rows) {
|
|
217
270
|
for (const cell of row.cells) {
|
|
@@ -220,11 +273,35 @@ function _record_table_inner_blocks_lite(table: Table, inherited_offset: number,
|
|
|
220
273
|
|
|
221
274
|
for (const inner_item of iter_block_items(cell)) {
|
|
222
275
|
if (inner_item instanceof Paragraph) {
|
|
223
|
-
const true_offset = _is_heading(inner_item)
|
|
224
|
-
|
|
276
|
+
const true_offset = _is_heading(inner_item)
|
|
277
|
+
? _compute_inner_block_offset(
|
|
278
|
+
table,
|
|
279
|
+
inner_item,
|
|
280
|
+
inherited_offset,
|
|
281
|
+
comments_map,
|
|
282
|
+
)
|
|
283
|
+
: inherited_offset;
|
|
284
|
+
records.push({
|
|
285
|
+
item: inner_item,
|
|
286
|
+
is_paragraph: true,
|
|
287
|
+
is_table: false,
|
|
288
|
+
start_offset: true_offset,
|
|
289
|
+
projected_length: 0,
|
|
290
|
+
});
|
|
225
291
|
} else if (inner_item instanceof Table) {
|
|
226
|
-
records.push({
|
|
227
|
-
|
|
292
|
+
records.push({
|
|
293
|
+
item: inner_item,
|
|
294
|
+
is_paragraph: false,
|
|
295
|
+
is_table: true,
|
|
296
|
+
start_offset: inherited_offset,
|
|
297
|
+
projected_length: 0,
|
|
298
|
+
});
|
|
299
|
+
_record_table_inner_blocks_lite(
|
|
300
|
+
inner_item,
|
|
301
|
+
inherited_offset,
|
|
302
|
+
records,
|
|
303
|
+
comments_map,
|
|
304
|
+
);
|
|
228
305
|
}
|
|
229
306
|
}
|
|
230
307
|
}
|
|
@@ -235,19 +312,20 @@ function _project_part(part: any, comments_map: any): string {
|
|
|
235
312
|
const blocks: string[] = [];
|
|
236
313
|
const c_type = part.constructor.name;
|
|
237
314
|
|
|
238
|
-
if (c_type ===
|
|
239
|
-
const header = part.note_type ===
|
|
315
|
+
if (c_type === "NotesPart") {
|
|
316
|
+
const header = part.note_type === "fn" ? "## Footnotes" : "## Endnotes";
|
|
240
317
|
blocks.push(`---\n${header}`);
|
|
241
318
|
}
|
|
242
319
|
|
|
243
320
|
let is_first_para = true;
|
|
244
321
|
for (const item of iter_block_items(part)) {
|
|
245
|
-
if (item.constructor.name ===
|
|
322
|
+
if (item.constructor.name === "FootnoteItem") {
|
|
246
323
|
const fn_text = _project_part(item, comments_map);
|
|
247
324
|
if (fn_text) blocks.push(fn_text);
|
|
248
325
|
} else if (item instanceof Paragraph) {
|
|
249
326
|
let prefix = get_paragraph_prefix(item);
|
|
250
|
-
if (is_first_para && c_type ===
|
|
327
|
+
if (is_first_para && c_type === "FootnoteItem")
|
|
328
|
+
prefix = `[^${part.note_type}-${part.id}]: ${prefix}`;
|
|
251
329
|
const p_text = build_paragraph_text(item, comments_map, false);
|
|
252
330
|
blocks.push(prefix + p_text);
|
|
253
331
|
is_first_para = false;
|
|
@@ -258,16 +336,19 @@ function _project_part(part: any, comments_map: any): string {
|
|
|
258
336
|
}
|
|
259
337
|
}
|
|
260
338
|
|
|
261
|
-
return blocks.join(
|
|
339
|
+
return blocks.join("\n\n");
|
|
262
340
|
}
|
|
263
341
|
|
|
264
342
|
/**
 * Reports whether a paragraph is projected as a markdown-style heading.
 *
 * The decision is based solely on the paragraph prefix returned by
 * `get_paragraph_prefix`: it must start with one to six `#` characters
 * followed by a single space (see `_HEADING_PREFIX_RE`).
 */
function _is_heading(paragraph: Paragraph): boolean {
  const prefix = get_paragraph_prefix(paragraph);
  return _HEADING_PREFIX_RE.test(prefix);
}
|
|
267
345
|
|
|
268
|
-
function _heading_passes_quality_filter(
|
|
346
|
+
function _heading_passes_quality_filter(
|
|
347
|
+
paragraph: Paragraph,
|
|
348
|
+
comments_map: any,
|
|
349
|
+
): boolean {
|
|
269
350
|
const style = _determine_heading_style(paragraph);
|
|
270
|
-
if (style !==
|
|
351
|
+
if (style !== "(heuristic)") return true;
|
|
271
352
|
const text = _heading_text(paragraph, comments_map);
|
|
272
353
|
if (!text) return false;
|
|
273
354
|
const word_count = (text.match(/\w+/g) || []).length;
|
|
@@ -287,60 +368,109 @@ function _heading_text(paragraph: Paragraph, comments_map: any): string {
|
|
|
287
368
|
}
|
|
288
369
|
|
|
289
370
|
/**
 * Removes CriticMarkup wrappers from `text`.
 *
 * Rules are applied one after another, in this order:
 *  - `{--…--}` deletions are dropped entirely,
 *  - `{>>…<<}` comments are dropped entirely,
 *  - `{++…++}` insertions are unwrapped (inner text kept),
 *  - `{==…==}` highlights are unwrapped (inner text kept).
 *
 * @param text Text that may contain CriticMarkup.
 * @returns The stripped text, or `""` when `text` is empty/falsy.
 */
function _strip_critic_markup(text: string): string {
  if (!text) {
    return "";
  }

  // Order matters: each rule runs on the output of the previous one.
  const rules: Array<[RegExp, string]> = [
    [/\{--[\s\S]*?--\}/g, ""],
    [/\{>>[\s\S]*?<<\}/g, ""],
    [/\{\+\+([\s\S]*?)\+\+\}/g, "$1"],
    [/\{==([\s\S]*?)==\}/g, "$1"],
  ];

  let stripped = text;
  for (const [pattern, replacement] of rules) {
    stripped = stripped.replace(pattern, replacement);
  }
  return stripped;
}
|
|
297
378
|
|
|
298
379
|
/**
 * Unwraps simple inline emphasis markers, keeping the wrapped text.
 *
 * Handles, in order: `**bold**`, `__underline__`, and `_italic_`. The
 * single-underscore rule only fires when the underscores are not adjacent
 * to word characters on the outside, so identifiers such as `snake_case`
 * are left untouched.
 *
 * @param text Text that may contain inline markers.
 * @returns The unwrapped text, or `""` when `text` is empty/falsy.
 */
function _strip_inline_formatting(text: string): string {
  if (!text) {
    return "";
  }

  const without_bold = text.replace(/\*\*(.+?)\*\*/g, "$1");
  const without_double_underscore = without_bold.replace(/__(.+?)__/g, "$1");
  const without_single_underscore = without_double_underscore.replace(
    /(?<!\w)_(\S(?:.*?\S)?)_(?!\w)/g,
    "$1",
  );
  return without_single_underscore;
}
|
|
305
386
|
|
|
306
387
|
/**
 * Produces the label describing why `paragraph` counts as a heading.
 *
 * Resolution order:
 *  1. The style id comes from `w:pPr/w:pStyle@w:val`, falling back to the
 *     default paragraph style from `_get_style_cache`.
 *  2. The outline level comes from a purely numeric `w:pPr/w:outlineLvl@w:val`,
 *     otherwise from the cached style entry's `outline_level`.
 *  3. The style name (cached `name`, else the raw id) is normalised so that
 *     a case-insensitive leading "heading" becomes "Heading" and a
 *     case-insensitive "title" becomes "Title".
 *
 * @returns The normalised style name when it starts with "Heading" or equals
 *   "Title"; otherwise `"(outline_level)"` when an outline level in 0..8 was
 *   found; otherwise the name when it contains "Heading" followed by a digit
 *   1-6; otherwise `"(heuristic)"`.
 */
function _determine_heading_style(paragraph: Paragraph): string {
  const [style_cache, default_pstyle] = _get_style_cache(
    paragraph._parent.part || paragraph._parent,
  );
  const pPr = findChild(paragraph._element, "w:pPr");

  let style_id = default_pstyle;
  let outline_level: number | null = null;

  if (pPr) {
    const pStyle = findChild(pPr, "w:pStyle");
    if (pStyle) {
      style_id = pStyle.getAttribute("w:val") || default_pstyle;
    }

    // A paragraph-level outline level only counts when it is purely numeric.
    const oLvl = findChild(pPr, "w:outlineLvl");
    if (oLvl) {
      const raw_level = oLvl.getAttribute("w:val") || "";
      if (/^\d+$/.test(raw_level)) {
        outline_level = parseInt(raw_level, 10);
      }
    }
  }

  // Fall back to the level declared on the style itself.
  if (outline_level === null && style_id && style_cache && style_cache[style_id]) {
    outline_level = style_cache[style_id].outline_level;
  }

  let normalized_style_name =
    style_id && style_cache && style_cache[style_id]
      ? style_cache[style_id].name
      : style_id;

  if (normalized_style_name && typeof normalized_style_name === "string") {
    const lowered = normalized_style_name.toLowerCase();
    if (lowered.startsWith("heading")) {
      normalized_style_name = normalized_style_name.replace(/^heading/i, "Heading");
    } else if (lowered === "title") {
      normalized_style_name = "Title";
    }
  }

  // A recognised heading/title style name wins whether or not an outline
  // level is present.
  if (
    normalized_style_name &&
    (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title")
  ) {
    return normalized_style_name;
  }

  if (outline_level !== null && outline_level >= 0 && outline_level <= 8) {
    return "(outline_level)";
  }

  // Names that merely contain "Heading N" (N in 1..6) are still reported by name.
  if (normalized_style_name && /Heading[ ]?([1-6])(?![0-9])/.test(normalized_style_name)) {
    return normalized_style_name;
  }

  return "(heuristic)";
}
|
|
329
443
|
|
|
330
|
-
/**
 * Looks up the display name of the paragraph's style.
 *
 * The style id is read from `w:pPr/w:pStyle@w:val`; when the paragraph has
 * no such element (or the attribute is empty) `default_pstyle` is used.
 *
 * @param paragraph Paragraph whose `_element` is inspected.
 * @param style_cache Map from style id to an entry exposing `name`.
 * @param default_pstyle Style id to use when the paragraph declares none.
 * @returns The cached style `name` when the id is present in `style_cache`,
 *   otherwise the style id itself.
 */
function _safe_style_name(
  paragraph: Paragraph,
  style_cache: any,
  default_pstyle: any,
): string | null {
  const pPr = findChild(paragraph._element, "w:pPr");
  const pStyle = pPr ? findChild(pPr, "w:pStyle") : null;
  const style_id = pStyle
    ? pStyle.getAttribute("w:val") || default_pstyle
    : default_pstyle;

  if (style_id && style_cache && style_cache[style_id]) {
    return style_cache[style_id].name;
  }
  return style_id;
}
|
|
339
459
|
|
|
340
|
-
/**
 * Finds the exclusive end index of the blocks owned by a heading.
 *
 * Scans the headings after position `current_h_pos` in `heading_indices`
 * and stops at the first one whose `_heading_level` is less than or equal
 * to `current_level`.
 *
 * @param block_records All block records, in document order.
 * @param heading_indices Indices into `block_records` that are headings.
 * @param current_h_pos Position of the current heading within `heading_indices`.
 * @param current_level Level of the current heading.
 * @returns The `block_records` index of that next heading, or
 *   `block_records.length` when no later heading qualifies.
 */
function _find_owned_end(
  block_records: _BlockRecord[],
  heading_indices: number[],
  current_h_pos: number,
  current_level: number,
): number {
  let probe = current_h_pos + 1;
  while (probe < heading_indices.length) {
    const candidate_idx = heading_indices[probe];
    const candidate_level = _heading_level(block_records[candidate_idx].item);
    if (candidate_level <= current_level) {
      return candidate_idx;
    }
    probe += 1;
  }
  return block_records.length;
}
|
|
@@ -351,12 +481,12 @@ function _collect_footnote_ids(owned_blocks: _BlockRecord[]): string[] {
|
|
|
351
481
|
for (const rec of owned_blocks) {
|
|
352
482
|
if (!rec.is_paragraph) continue;
|
|
353
483
|
for (const event of iter_paragraph_content(rec.item)) {
|
|
354
|
-
if (!(
|
|
355
|
-
let fn_id =
|
|
356
|
-
if (event.type ===
|
|
357
|
-
else if (event.type ===
|
|
484
|
+
if (!("type" in event)) continue;
|
|
485
|
+
let fn_id = "";
|
|
486
|
+
if (event.type === "footnote") fn_id = `fn-${event.id}`;
|
|
487
|
+
else if (event.type === "endnote") fn_id = `en-${event.id}`;
|
|
358
488
|
else continue;
|
|
359
|
-
|
|
489
|
+
|
|
360
490
|
if (!seen.has(fn_id)) {
|
|
361
491
|
seen.add(fn_id);
|
|
362
492
|
ordered.push(fn_id);
|
|
@@ -374,4 +504,4 @@ function _offset_to_page(offset: number, body_page_offsets: number[]): number {
|
|
|
374
504
|
else break;
|
|
375
505
|
}
|
|
376
506
|
return page;
|
|
377
|
-
}
|
|
507
|
+
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import { DocumentObject } from '../docx/bridge.js';
|
|
2
|
+
import { SanitizeReport } from './report.js';
|
|
3
|
+
import * as transforms from './transforms.js';
|
|
4
|
+
import { findAllDescendants } from '../docx/dom.js';
|
|
5
|
+
|
|
6
|
+
/** Options accepted by `finalize_document`. */
export interface FinalizeOptions {
  /** File name recorded in the sanitize report. */
  filename: string;

  /**
   * Sanitization mode. `'full'` checks tracked changes and removes comments;
   * `'keep-markup'` keeps tracked changes and only counts them; `'baseline'`
   * runs only the common transforms. When omitted, the report is labelled `'full'`.
   */
  sanitize_mode?: 'full' | 'keep-markup' | 'baseline';

  /**
   * In `'full'` mode, accept every tracked change instead of returning a
   * blocked report when tracked changes are present.
   */
  accept_all?: boolean;

  /**
   * Requests document protection. Both values result in a read-only lock;
   * `'encrypt'` additionally produces a warning that encryption is unsupported.
   */
  protection_mode?: 'read_only' | 'encrypt' | null;

  /** Password for protection. NOTE(review): not read by `finalize_document` in this file — confirm intended use. */
  password?: string | null;

  /**
   * Author recorded in the report; in `'keep-markup'` mode it also replaces
   * the authors of comments and tracked changes.
   */
  author?: string | null;

  /** Requests PDF export; this engine only emits a warning that it is skipped. */
  export_pdf?: boolean;
}
|
|
15
|
+
|
|
16
|
+
/** Outcome of `finalize_document`. */
export interface FinalizeResult {
  /** Rendered text of the sanitize report. */
  reportText: string;

  /** Saved document bytes; absent when finalization was blocked before saving. */
  outBuffer?: Buffer;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Sanitizes `doc` in place according to `options`, then saves it.
 *
 * Steps, in order:
 *  1. Mode-specific handling.
 *     - `full` (also the default when `sanitize_mode` is omitted): tracked
 *       changes are counted. If any exist and `accept_all` is not set, the
 *       function returns early with a `blocked` report and no buffer.
 *       Otherwise they are accepted and all comments are removed.
 *     - `keep-markup`: tracked changes are counted and recorded as kept; when
 *       `author` is given, comment and change authors are replaced.
 *     - `baseline`: nothing mode-specific.
 *  2. Common transforms (rsids, paragraph ids, proof errors, empty properties,
 *     hidden text, run coalescing, document properties, timestamps, custom
 *     XML, image alt text), a hyperlink audit, and change-date normalisation.
 *  3. Optional read-only protection written into `word/settings.xml`.
 *  4. Removal of the `xmlns:w16du` declaration from Word XML parts that do
 *     not actually use the prefix.
 *
 * @param doc Document to sanitize; it is mutated.
 * @param options See {@link FinalizeOptions}.
 * @returns The rendered report, plus the saved document unless blocked.
 */
export async function finalize_document(doc: DocumentObject, options: FinalizeOptions): Promise<FinalizeResult> {
  // Resolve the mode once so the label in the report and the branch that
  // actually runs can never disagree (an omitted mode means 'full').
  const sanitize_mode = options.sanitize_mode || 'full';
  const report = new SanitizeReport(options.filename, sanitize_mode, options.author || null);

  if (sanitize_mode === 'full') {
    const counts = transforms.count_tracked_changes(doc);
    const total = counts[0] + counts[1] + counts[2];
    report.tracked_changes_found = total;

    // Refuse to silently accept edits nobody has reviewed.
    if (total > 0 && !options.accept_all) {
      report.status = 'blocked';
      report.blocked_reason = `Document contains ${total} unresolved tracked changes (${counts[0]} insertions, ${counts[1]} deletions, ${counts[2]} formatting). Review in Word first, or set accept_all=true.`;
      return { reportText: report.render() };
    }

    if (total > 0) {
      const authors = transforms.get_track_change_authors(doc);
      if (authors.size > 1) {
        report.warnings.push(`Multiple authors detected in tracked changes: ${Array.from(authors).sort().join(', ')}. Review per-change list before sending.`);
      }
      report.add_transform_lines(transforms.accept_all_tracked_changes(doc));
      report.tracked_changes_accepted = total;
    }

    // Summarise before removal so the count reflects what was present.
    const commentsSummary = transforms.get_comments_summary(doc);
    report.comments_removed = commentsSummary.total;
    report.add_transform_lines(transforms.remove_all_comments(doc));
  } else if (sanitize_mode === 'keep-markup') {
    // Basic support for keep-markup in TS
    const counts = transforms.count_tracked_changes(doc);
    report.tracked_changes_found = counts[0] + counts[1] + counts[2];
    report.tracked_changes_kept = report.tracked_changes_found;

    if (options.author) {
      report.add_transform_lines(transforms.replace_comment_authors(doc, options.author));
      report.add_transform_lines(transforms.replace_change_authors(doc, options.author));
    }
  }

  // Common transforms
  report.add_transform_lines(transforms.strip_rsid(doc));
  report.add_transform_lines(transforms.strip_para_ids(doc));
  report.add_transform_lines(transforms.strip_proof_errors(doc));
  report.add_transform_lines(transforms.strip_empty_properties(doc));
  report.add_transform_lines(transforms.strip_hidden_text(doc));
  report.add_transform_lines(transforms.coalesce_runs(doc));
  report.add_transform_lines(transforms.scrub_doc_properties(doc));
  report.add_transform_lines(transforms.scrub_timestamps(doc));
  report.add_transform_lines(transforms.strip_custom_xml(doc));
  report.add_transform_lines(transforms.strip_image_alt_text(doc));

  const warnings = transforms.audit_hyperlinks(doc);
  for (const w of warnings) report.warnings.push(w);

  report.add_transform_lines(transforms.normalize_change_dates(doc));

  // Protection (Settings injection)
  if (options.protection_mode === 'read_only' || options.protection_mode === 'encrypt') {
    if (options.protection_mode === 'encrypt') {
      report.warnings.push("Encryption mode (AES compound wrappers) is strictly unsupported in the zero-dependency Node engine. Falling back to native Word Read-Only lock.");
    }

    const settingsPart = doc.pkg.getPartByPath('word/settings.xml');
    if (settingsPart) {
      const settingsRoot = settingsPart._element;
      let prot = transforms.findDescendantsByLocalName(settingsRoot, 'documentProtection')[0];
      if (!prot) {
        prot = settingsRoot.ownerDocument!.createElement('w:documentProtection');
        // Word expects documentProtection to be inserted before elements like w:autoFormatOverride, w:styleLockTheme, etc.
        // For standard robustness without complex XSD enforcement, appendChild generally works.
        settingsRoot.appendChild(prot);
      }
      prot.setAttribute('w:edit', 'readOnly');
      prot.setAttribute('w:enforcement', '1');
      report.structural_lines.push("Document locked (Read-Only enforcement injected into settings.xml)");
    } else {
      // The caller asked for a lock; do not let its absence go unnoticed.
      report.warnings.push("Protection was requested but word/settings.xml was not found; the document was NOT locked.");
    }
  }

  if (options.export_pdf) {
    report.warnings.push("PDF export requires the Python/Word COM environment and is skipped in this zero-dependency Node agent.");
  }

  // Clean up leaked Microsoft namespaces
  for (const part of doc.pkg.parts) {
    // Match the exact injection condition from RedlineEngine constructor
    const isWordXmlPart =
      part === doc.part ||
      (part.contentType.includes('wordprocessingml') && part.contentType.endsWith('+xml'));
    if (!isWordXmlPart) continue;

    const root = part._element;
    if (!root.hasAttribute('xmlns:w16du')) continue;

    // The declaration itself is named "xmlns:w16du", so it never matches the
    // "w16du:" prefix test and needs no special-casing.
    let hasW16du = Array.from(root.attributes || []).some(a => a.name.startsWith('w16du:'));
    if (!hasW16du) {
      for (const n of findAllDescendants(root, '*')) {
        if (n.tagName.startsWith('w16du:') || Array.from(n.attributes || []).some(a => a.name.startsWith('w16du:'))) {
          hasW16du = true;
          break;
        }
      }
    }
    if (!hasW16du) root.removeAttribute('xmlns:w16du');
  }

  if (report.warnings.length > 0) report.status = 'clean_with_warnings';

  const outBuffer = await doc.save();
  return { reportText: report.render(), outBuffer };
}
|