@adeu/core 1.6.8 → 1.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1833 -540
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +75 -1
- package/dist/index.d.ts +75 -1
- package/dist/index.js +1832 -540
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/consistency.test.ts +134 -0
- package/src/diff.test.ts +13 -1
- package/src/diff.ts +189 -70
- package/src/docx/bridge.ts +99 -57
- package/src/docx/dom.ts +66 -7
- package/src/engine.bugs.test.ts +481 -0
- package/src/engine.ts +1346 -192
- package/src/index.ts +1 -1
- package/src/markup.ts +160 -53
- package/src/outline.ts +199 -69
- package/src/sanitize/core.ts +26 -0
- package/src/sanitize/report.ts +1 -1
- package/src/sanitize/sanitize.test.ts +47 -2
- package/src/sanitize/transforms.ts +87 -0
- package/src/utils/docx.ts +282 -157
package/src/outline.ts
CHANGED
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
* Structural outline extractor.
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
-
import { DocumentObject } from
|
|
6
|
-
import { Paragraph, Table, DocxEvent } from
|
|
7
|
-
import { build_paragraph_text, extract_table } from
|
|
8
|
-
import { extract_comments_data } from
|
|
9
|
-
import { findChild } from
|
|
5
|
+
import { DocumentObject } from "./docx/bridge.js";
|
|
6
|
+
import { Paragraph, Table, DocxEvent } from "./docx/primitives.js";
|
|
7
|
+
import { build_paragraph_text, extract_table } from "./ingest.js";
|
|
8
|
+
import { extract_comments_data } from "./comments.js";
|
|
9
|
+
import { findChild } from "./docx/dom.js";
|
|
10
10
|
import {
|
|
11
11
|
_get_style_cache,
|
|
12
12
|
get_paragraph_prefix,
|
|
13
13
|
iter_block_items,
|
|
14
14
|
iter_document_parts,
|
|
15
15
|
iter_paragraph_content,
|
|
16
|
-
} from
|
|
16
|
+
} from "./utils/docx.js";
|
|
17
17
|
|
|
18
18
|
const _HEADING_PREFIX_RE = /^(#{1,6}) /;
|
|
19
19
|
const _HEURISTIC_MIN_WORDS = 3;
|
|
@@ -40,10 +40,10 @@ export function extract_outline(
|
|
|
40
40
|
projected_body: string,
|
|
41
41
|
body_pages: string[],
|
|
42
42
|
body_page_offsets: number[],
|
|
43
|
-
paragraph_offsets: Record<string, [number, number]> | null = null
|
|
43
|
+
paragraph_offsets: Record<string, [number, number]> | null = null,
|
|
44
44
|
): OutlineNode[] {
|
|
45
45
|
if (body_pages.length !== body_page_offsets.length) {
|
|
46
|
-
throw new Error(
|
|
46
|
+
throw new Error("body_pages and body_page_offsets length mismatch");
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
const comments_map = extract_comments_data(doc.pkg);
|
|
@@ -69,7 +69,12 @@ export function extract_outline(
|
|
|
69
69
|
const text = _heading_text(paragraph, comments_map);
|
|
70
70
|
const style = _determine_heading_style(paragraph);
|
|
71
71
|
|
|
72
|
-
const owned_end = _find_owned_end(
|
|
72
|
+
const owned_end = _find_owned_end(
|
|
73
|
+
block_records,
|
|
74
|
+
heading_indices,
|
|
75
|
+
h_pos,
|
|
76
|
+
level,
|
|
77
|
+
);
|
|
73
78
|
const owned_blocks = block_records.slice(rec_idx + 1, owned_end);
|
|
74
79
|
|
|
75
80
|
const has_table = _direct_has_table(block_records, rec_idx + 1, owned_end);
|
|
@@ -83,7 +88,11 @@ export function extract_outline(
|
|
|
83
88
|
return nodes;
|
|
84
89
|
}
|
|
85
90
|
|
|
86
|
-
function _direct_has_table(
|
|
91
|
+
function _direct_has_table(
|
|
92
|
+
block_records: _BlockRecord[],
|
|
93
|
+
range_start: number,
|
|
94
|
+
range_end: number,
|
|
95
|
+
): boolean {
|
|
87
96
|
for (let idx = range_start; idx < range_end; idx++) {
|
|
88
97
|
const rec = block_records[idx];
|
|
89
98
|
if (rec.is_paragraph && _is_heading(rec.item)) return false;
|
|
@@ -92,7 +101,10 @@ function _direct_has_table(block_records: _BlockRecord[], range_start: number, r
|
|
|
92
101
|
return false;
|
|
93
102
|
}
|
|
94
103
|
|
|
95
|
-
function _walk_doc_body(
|
|
104
|
+
function _walk_doc_body(
|
|
105
|
+
doc: DocumentObject,
|
|
106
|
+
comments_map: any,
|
|
107
|
+
): _BlockRecord[] {
|
|
96
108
|
const parts = Array.from(iter_document_parts(doc));
|
|
97
109
|
let body_start_offset = 0;
|
|
98
110
|
let body_part: any = null;
|
|
@@ -128,7 +140,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
128
140
|
|
|
129
141
|
if (!is_first_block) cursor += 2;
|
|
130
142
|
|
|
131
|
-
records.push({
|
|
143
|
+
records.push({
|
|
144
|
+
item,
|
|
145
|
+
is_paragraph: true,
|
|
146
|
+
is_table: false,
|
|
147
|
+
start_offset: cursor,
|
|
148
|
+
projected_length: block_len,
|
|
149
|
+
});
|
|
132
150
|
cursor += block_len;
|
|
133
151
|
is_first_block = false;
|
|
134
152
|
} else if (item instanceof Table) {
|
|
@@ -138,7 +156,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
138
156
|
if (!is_first_block) cursor += 2;
|
|
139
157
|
|
|
140
158
|
const table_start = cursor;
|
|
141
|
-
records.push({
|
|
159
|
+
records.push({
|
|
160
|
+
item,
|
|
161
|
+
is_paragraph: false,
|
|
162
|
+
is_table: true,
|
|
163
|
+
start_offset: table_start,
|
|
164
|
+
projected_length: block_len,
|
|
165
|
+
});
|
|
142
166
|
_record_table_inner_blocks_lite(item, table_start, records, comments_map);
|
|
143
167
|
cursor += block_len;
|
|
144
168
|
is_first_block = false;
|
|
@@ -148,7 +172,12 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
|
|
|
148
172
|
return records;
|
|
149
173
|
}
|
|
150
174
|
|
|
151
|
-
function _compute_inner_block_offset(
|
|
175
|
+
function _compute_inner_block_offset(
|
|
176
|
+
table: Table,
|
|
177
|
+
target_paragraph: Paragraph,
|
|
178
|
+
table_start_offset: number,
|
|
179
|
+
comments_map: any,
|
|
180
|
+
): number {
|
|
152
181
|
const target_el = target_paragraph._element;
|
|
153
182
|
let cursor = table_start_offset;
|
|
154
183
|
let rows_processed = 0;
|
|
@@ -165,7 +194,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
|
|
|
165
194
|
|
|
166
195
|
if (cells_in_row > 0) cursor += 3;
|
|
167
196
|
|
|
168
|
-
const [new_cursor, found] = _walk_cell_for_offset(
|
|
197
|
+
const [new_cursor, found] = _walk_cell_for_offset(
|
|
198
|
+
cell,
|
|
199
|
+
target_el,
|
|
200
|
+
cursor,
|
|
201
|
+
comments_map,
|
|
202
|
+
);
|
|
169
203
|
if (found) return new_cursor;
|
|
170
204
|
cursor = new_cursor;
|
|
171
205
|
|
|
@@ -177,7 +211,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
|
|
|
177
211
|
return table_start_offset;
|
|
178
212
|
}
|
|
179
213
|
|
|
180
|
-
function _walk_cell_for_offset(
|
|
214
|
+
function _walk_cell_for_offset(
|
|
215
|
+
cell: any,
|
|
216
|
+
target_el: any,
|
|
217
|
+
cell_start_cursor: number,
|
|
218
|
+
comments_map: any,
|
|
219
|
+
): [number, boolean] {
|
|
181
220
|
let cursor = cell_start_cursor;
|
|
182
221
|
let is_first_block = true;
|
|
183
222
|
|
|
@@ -190,9 +229,15 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
|
|
|
190
229
|
const p_text = build_paragraph_text(inner_item, comments_map, false);
|
|
191
230
|
cursor += (prefix + p_text).length;
|
|
192
231
|
} else if (inner_item instanceof Table) {
|
|
193
|
-
const nested_offset = _compute_inner_block_offset(
|
|
232
|
+
const nested_offset = _compute_inner_block_offset(
|
|
233
|
+
inner_item,
|
|
234
|
+
new Paragraph(target_el, null),
|
|
235
|
+
cursor,
|
|
236
|
+
comments_map,
|
|
237
|
+
);
|
|
194
238
|
if (nested_offset !== cursor) {
|
|
195
|
-
if (_element_is_descendant(target_el, inner_item._element))
|
|
239
|
+
if (_element_is_descendant(target_el, inner_item._element))
|
|
240
|
+
return [nested_offset, true];
|
|
196
241
|
}
|
|
197
242
|
const table_text = extract_table(inner_item, comments_map, false, 0);
|
|
198
243
|
cursor += table_text ? table_text.length : 0;
|
|
@@ -202,7 +247,10 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
|
|
|
202
247
|
return [cursor, false];
|
|
203
248
|
}
|
|
204
249
|
|
|
205
|
-
function _element_is_descendant(
|
|
250
|
+
function _element_is_descendant(
|
|
251
|
+
target_el: Element,
|
|
252
|
+
ancestor_el: Element,
|
|
253
|
+
): boolean {
|
|
206
254
|
let cur: Node | null = target_el.parentNode;
|
|
207
255
|
while (cur) {
|
|
208
256
|
if (cur === ancestor_el) return true;
|
|
@@ -211,7 +259,12 @@ function _element_is_descendant(target_el: Element, ancestor_el: Element): boole
|
|
|
211
259
|
return false;
|
|
212
260
|
}
|
|
213
261
|
|
|
214
|
-
function _record_table_inner_blocks_lite(
|
|
262
|
+
function _record_table_inner_blocks_lite(
|
|
263
|
+
table: Table,
|
|
264
|
+
inherited_offset: number,
|
|
265
|
+
records: _BlockRecord[],
|
|
266
|
+
comments_map: any,
|
|
267
|
+
) {
|
|
215
268
|
const seen_cells = new Set();
|
|
216
269
|
for (const row of table.rows) {
|
|
217
270
|
for (const cell of row.cells) {
|
|
@@ -220,11 +273,35 @@ function _record_table_inner_blocks_lite(table: Table, inherited_offset: number,
|
|
|
220
273
|
|
|
221
274
|
for (const inner_item of iter_block_items(cell)) {
|
|
222
275
|
if (inner_item instanceof Paragraph) {
|
|
223
|
-
const true_offset = _is_heading(inner_item)
|
|
224
|
-
|
|
276
|
+
const true_offset = _is_heading(inner_item)
|
|
277
|
+
? _compute_inner_block_offset(
|
|
278
|
+
table,
|
|
279
|
+
inner_item,
|
|
280
|
+
inherited_offset,
|
|
281
|
+
comments_map,
|
|
282
|
+
)
|
|
283
|
+
: inherited_offset;
|
|
284
|
+
records.push({
|
|
285
|
+
item: inner_item,
|
|
286
|
+
is_paragraph: true,
|
|
287
|
+
is_table: false,
|
|
288
|
+
start_offset: true_offset,
|
|
289
|
+
projected_length: 0,
|
|
290
|
+
});
|
|
225
291
|
} else if (inner_item instanceof Table) {
|
|
226
|
-
records.push({
|
|
227
|
-
|
|
292
|
+
records.push({
|
|
293
|
+
item: inner_item,
|
|
294
|
+
is_paragraph: false,
|
|
295
|
+
is_table: true,
|
|
296
|
+
start_offset: inherited_offset,
|
|
297
|
+
projected_length: 0,
|
|
298
|
+
});
|
|
299
|
+
_record_table_inner_blocks_lite(
|
|
300
|
+
inner_item,
|
|
301
|
+
inherited_offset,
|
|
302
|
+
records,
|
|
303
|
+
comments_map,
|
|
304
|
+
);
|
|
228
305
|
}
|
|
229
306
|
}
|
|
230
307
|
}
|
|
@@ -235,19 +312,20 @@ function _project_part(part: any, comments_map: any): string {
|
|
|
235
312
|
const blocks: string[] = [];
|
|
236
313
|
const c_type = part.constructor.name;
|
|
237
314
|
|
|
238
|
-
if (c_type ===
|
|
239
|
-
const header = part.note_type ===
|
|
315
|
+
if (c_type === "NotesPart") {
|
|
316
|
+
const header = part.note_type === "fn" ? "## Footnotes" : "## Endnotes";
|
|
240
317
|
blocks.push(`---\n${header}`);
|
|
241
318
|
}
|
|
242
319
|
|
|
243
320
|
let is_first_para = true;
|
|
244
321
|
for (const item of iter_block_items(part)) {
|
|
245
|
-
if (item.constructor.name ===
|
|
322
|
+
if (item.constructor.name === "FootnoteItem") {
|
|
246
323
|
const fn_text = _project_part(item, comments_map);
|
|
247
324
|
if (fn_text) blocks.push(fn_text);
|
|
248
325
|
} else if (item instanceof Paragraph) {
|
|
249
326
|
let prefix = get_paragraph_prefix(item);
|
|
250
|
-
if (is_first_para && c_type ===
|
|
327
|
+
if (is_first_para && c_type === "FootnoteItem")
|
|
328
|
+
prefix = `[^${part.note_type}-${part.id}]: ${prefix}`;
|
|
251
329
|
const p_text = build_paragraph_text(item, comments_map, false);
|
|
252
330
|
blocks.push(prefix + p_text);
|
|
253
331
|
is_first_para = false;
|
|
@@ -258,16 +336,19 @@ function _project_part(part: any, comments_map: any): string {
|
|
|
258
336
|
}
|
|
259
337
|
}
|
|
260
338
|
|
|
261
|
-
return blocks.join(
|
|
339
|
+
return blocks.join("\n\n");
|
|
262
340
|
}
|
|
263
341
|
|
|
264
342
|
/**
 * Report whether a paragraph's computed prefix is a heading marker,
 * i.e. matches `_HEADING_PREFIX_RE` (one to six `#` followed by a space).
 */
function _is_heading(paragraph: Paragraph): boolean {
  const prefix = get_paragraph_prefix(paragraph);
  return _HEADING_PREFIX_RE.test(prefix);
}
|
|
267
345
|
|
|
268
|
-
function _heading_passes_quality_filter(
|
|
346
|
+
function _heading_passes_quality_filter(
|
|
347
|
+
paragraph: Paragraph,
|
|
348
|
+
comments_map: any,
|
|
349
|
+
): boolean {
|
|
269
350
|
const style = _determine_heading_style(paragraph);
|
|
270
|
-
if (style !==
|
|
351
|
+
if (style !== "(heuristic)") return true;
|
|
271
352
|
const text = _heading_text(paragraph, comments_map);
|
|
272
353
|
if (!text) return false;
|
|
273
354
|
const word_count = (text.match(/\w+/g) || []).length;
|
|
@@ -287,60 +368,109 @@ function _heading_text(paragraph: Paragraph, comments_map: any): string {
|
|
|
287
368
|
}
|
|
288
369
|
|
|
289
370
|
/**
 * Remove CriticMarkup annotations from `text`.
 *
 * Rules are applied in this order:
 *   1. `{--…--}` deletions are dropped entirely.
 *   2. `{>>…<<}` comments are dropped entirely.
 *   3. `{++…++}` insertions are replaced by their inner text.
 *   4. `{==…==}` highlights are replaced by their inner text.
 *
 * @param text - Text that may contain CriticMarkup.
 * @returns The text without markup, or `""` when `text` is empty/falsy.
 */
function _strip_critic_markup(text: string): string {
  if (!text) return "";

  // Ordered [pattern, replacement] pairs; the order is significant because
  // deletions/comments nested inside insertions must disappear first.
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/\{--[\s\S]*?--\}/g, ""],
    [/\{>>[\s\S]*?<<\}/g, ""],
    [/\{\+\+([\s\S]*?)\+\+\}/g, "$1"],
    [/\{==([\s\S]*?)==\}/g, "$1"],
  ];

  let result = text;
  for (const [pattern, replacement] of rules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
|
|
297
378
|
|
|
298
379
|
/**
 * Strip inline emphasis markers from `text`, keeping the emphasized content.
 *
 * Handles, in order: `**bold**`, `__bold__`, and `_italic_`. The single
 * underscore form is only stripped when the underscores are not adjacent to
 * word characters on the outside, so identifiers such as `snake_case_name`
 * are left untouched.
 *
 * @param text - Text that may contain inline emphasis markers.
 * @returns The text without the markers, or `""` when `text` is empty/falsy.
 */
function _strip_inline_formatting(text: string): string {
  if (!text) return "";

  const no_star_bold = text.replace(/\*\*(.+?)\*\*/g, "$1");
  const no_underscore_bold = no_star_bold.replace(/__(.+?)__/g, "$1");
  return no_underscore_bold.replace(/(?<!\w)_(\S(?:.*?\S)?)_(?!\w)/g, "$1");
}
|
|
305
386
|
|
|
306
387
|
/**
 * Classify why a paragraph counts as a heading.
 *
 * The style id comes from the paragraph's `w:pStyle` (falling back to the
 * default paragraph style). The outline level comes from the paragraph's own
 * numeric `w:outlineLvl`, otherwise from the cached style entry.
 *
 * @param paragraph - Paragraph to inspect.
 * @returns One of:
 *   - the style name (with a leading "heading"/"title" case-normalized to
 *     "Heading"/"Title") when it starts with "Heading" or equals "Title";
 *   - `"(outline_level)"` when the name is not heading-like but an outline
 *     level in the range 0..8 was found;
 *   - the style name when it merely contains "Heading N" (N in 1..6);
 *   - `"(heuristic)"` otherwise.
 */
function _determine_heading_style(paragraph: Paragraph): string {
  const [style_cache, default_pstyle] = _get_style_cache(
    paragraph._parent.part || paragraph._parent,
  );
  const pPr = findChild(paragraph._element, "w:pPr");

  let style_id = default_pstyle;
  let outline_level: number | null = null;
  if (pPr) {
    const pStyle = findChild(pPr, "w:pStyle");
    if (pStyle) style_id = pStyle.getAttribute("w:val") || default_pstyle;

    // A paragraph-level outline level only counts when it is purely numeric.
    const oLvl = findChild(pPr, "w:outlineLvl");
    if (oLvl) {
      const raw_level = oLvl.getAttribute("w:val") || "";
      if (/^\d+$/.test(raw_level)) outline_level = parseInt(raw_level, 10);
    }
  }

  // Cached style entry for the resolved style id, if any.
  const cached_style = style_id && style_cache && style_cache[style_id];
  if (outline_level === null && cached_style) {
    outline_level = cached_style.outline_level;
  }

  let normalized_style_name = cached_style ? cached_style.name : style_id;
  if (normalized_style_name && typeof normalized_style_name === "string") {
    const lowered = normalized_style_name.toLowerCase();
    if (lowered.startsWith("heading")) {
      normalized_style_name = normalized_style_name.replace(/^heading/i, "Heading");
    } else if (lowered === "title") {
      normalized_style_name = "Title";
    }
  }

  // A heading-like style name wins regardless of the outline level.
  if (
    normalized_style_name &&
    (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title")
  ) {
    return normalized_style_name;
  }

  if (outline_level !== null && outline_level >= 0 && outline_level <= 8) {
    return "(outline_level)";
  }

  if (normalized_style_name && /Heading[ ]?([1-6])(?![0-9])/.test(normalized_style_name)) {
    return normalized_style_name;
  }

  return "(heuristic)";
}
|
|
329
443
|
|
|
330
|
-
function _safe_style_name(
|
|
331
|
-
|
|
444
|
+
/**
 * Resolve the display name of a paragraph's style.
 *
 * @param paragraph - Paragraph whose `w:pPr/w:pStyle` is inspected.
 * @param style_cache - Lookup from style id to an entry exposing `name`.
 * @param default_pstyle - Style id used when the paragraph declares none.
 * @returns The cached style name when the resolved id is in `style_cache`,
 *   otherwise the resolved style id itself.
 */
function _safe_style_name(
  paragraph: Paragraph,
  style_cache: any,
  default_pstyle: any,
): string | null {
  const pPr = findChild(paragraph._element, "w:pPr");
  const pStyle = pPr ? findChild(pPr, "w:pStyle") : null;

  // An absent element or an empty `w:val` both fall back to the default style.
  const style_id = (pStyle && pStyle.getAttribute("w:val")) || default_pstyle;

  const cached_style = style_id && style_cache && style_cache[style_id];
  return cached_style ? cached_style.name : style_id;
}
|
|
339
459
|
|
|
340
|
-
function _find_owned_end(
|
|
341
|
-
|
|
460
|
+
/**
 * Find the exclusive end index (into `block_records`) of the blocks owned by
 * the heading at `heading_indices[current_h_pos]`.
 *
 * Later headings are scanned in order; the first one whose level is less than
 * or equal to `current_level` ends the owned range.
 *
 * @param block_records - All recorded blocks.
 * @param heading_indices - Indices into `block_records` that are headings.
 * @param current_h_pos - Position of the current heading in `heading_indices`.
 * @param current_level - Level of the current heading.
 * @returns The record index of the terminating heading, or
 *   `block_records.length` when no later heading terminates the range.
 */
function _find_owned_end(
  block_records: _BlockRecord[],
  heading_indices: number[],
  current_h_pos: number,
  current_level: number,
): number {
  let probe = current_h_pos + 1;
  while (probe < heading_indices.length) {
    const candidate_idx = heading_indices[probe];
    const candidate_level = _heading_level(block_records[candidate_idx].item);
    if (candidate_level <= current_level) {
      return candidate_idx;
    }
    probe += 1;
  }
  return block_records.length;
}
|
|
@@ -351,12 +481,12 @@ function _collect_footnote_ids(owned_blocks: _BlockRecord[]): string[] {
|
|
|
351
481
|
for (const rec of owned_blocks) {
|
|
352
482
|
if (!rec.is_paragraph) continue;
|
|
353
483
|
for (const event of iter_paragraph_content(rec.item)) {
|
|
354
|
-
if (!(
|
|
355
|
-
let fn_id =
|
|
356
|
-
if (event.type ===
|
|
357
|
-
else if (event.type ===
|
|
484
|
+
if (!("type" in event)) continue;
|
|
485
|
+
let fn_id = "";
|
|
486
|
+
if (event.type === "footnote") fn_id = `fn-${event.id}`;
|
|
487
|
+
else if (event.type === "endnote") fn_id = `en-${event.id}`;
|
|
358
488
|
else continue;
|
|
359
|
-
|
|
489
|
+
|
|
360
490
|
if (!seen.has(fn_id)) {
|
|
361
491
|
seen.add(fn_id);
|
|
362
492
|
ordered.push(fn_id);
|
|
@@ -374,4 +504,4 @@ function _offset_to_page(offset: number, body_page_offsets: number[]): number {
|
|
|
374
504
|
else break;
|
|
375
505
|
}
|
|
376
506
|
return page;
|
|
377
|
-
}
|
|
507
|
+
}
|
package/src/sanitize/core.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { DocumentObject } from '../docx/bridge.js';
|
|
2
2
|
import { SanitizeReport } from './report.js';
|
|
3
3
|
import * as transforms from './transforms.js';
|
|
4
|
+
import { findAllDescendants } from '../docx/dom.js';
|
|
4
5
|
|
|
5
6
|
export interface FinalizeOptions {
|
|
6
7
|
filename: string;
|
|
@@ -61,6 +62,7 @@ export async function finalize_document(doc: DocumentObject, options: FinalizeOp
|
|
|
61
62
|
report.add_transform_lines(transforms.strip_proof_errors(doc));
|
|
62
63
|
report.add_transform_lines(transforms.strip_empty_properties(doc));
|
|
63
64
|
report.add_transform_lines(transforms.strip_hidden_text(doc));
|
|
65
|
+
report.add_transform_lines(transforms.coalesce_runs(doc));
|
|
64
66
|
report.add_transform_lines(transforms.scrub_doc_properties(doc));
|
|
65
67
|
report.add_transform_lines(transforms.scrub_timestamps(doc));
|
|
66
68
|
report.add_transform_lines(transforms.strip_custom_xml(doc));
|
|
@@ -97,6 +99,30 @@ export async function finalize_document(doc: DocumentObject, options: FinalizeOp
|
|
|
97
99
|
report.warnings.push("PDF export requires the Python/Word COM environment and is skipped in this zero-dependency Node agent.");
|
|
98
100
|
}
|
|
99
101
|
|
|
102
|
+
// Clean up leaked Microsoft namespaces
|
|
103
|
+
for (const part of doc.pkg.parts) {
|
|
104
|
+
// Match the exact injection condition from RedlineEngine constructor
|
|
105
|
+
if (part === doc.part || (part.contentType.includes('wordprocessingml') && part.contentType.endsWith('+xml'))) {
|
|
106
|
+
if (part._element.hasAttribute('xmlns:w16du')) {
|
|
107
|
+
let hasW16du = false;
|
|
108
|
+
// Check root element attributes (excluding the xmlns declaration itself)
|
|
109
|
+
if (Array.from(part._element.attributes || []).some(a => a.name.startsWith('w16du:') && a.name !== 'xmlns:w16du')) {
|
|
110
|
+
hasW16du = true;
|
|
111
|
+
}
|
|
112
|
+
if (!hasW16du) {
|
|
113
|
+
const allNodes = findAllDescendants(part._element, '*');
|
|
114
|
+
for (const n of allNodes) {
|
|
115
|
+
if (n.tagName.startsWith('w16du:') || Array.from(n.attributes || []).some(a => a.name.startsWith('w16du:'))) {
|
|
116
|
+
hasW16du = true;
|
|
117
|
+
break;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
if (!hasW16du) part._element.removeAttribute('xmlns:w16du');
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
100
126
|
if (report.warnings.length > 0) report.status = 'clean_with_warnings';
|
|
101
127
|
|
|
102
128
|
const outBuffer = await doc.save();
|
package/src/sanitize/report.ts
CHANGED
|
@@ -116,7 +116,7 @@ export class SanitizeReport {
|
|
|
116
116
|
if (this.warnings.length > 0) {
|
|
117
117
|
lines.push(`Result: CLEAN WITH WARNINGS (${this.warnings.length} warning${this.warnings.length > 1 ? 's' : ''})`);
|
|
118
118
|
} else {
|
|
119
|
-
lines.push(
|
|
119
|
+
lines.push(`Result: CLEAN (${this.tracked_changes_found} changes resolved, ${this.comments_removed} comments removed)`);
|
|
120
120
|
}
|
|
121
121
|
lines.push(sep);
|
|
122
122
|
|
|
@@ -162,8 +162,8 @@ describe('Finalize Document (Core)', () => {
|
|
|
162
162
|
});
|
|
163
163
|
|
|
164
164
|
const finalSettings = settingsPart._element.toString();
|
|
165
|
-
|
|
166
|
-
expect(res.reportText).toContain('Result:
|
|
165
|
+
|
|
166
|
+
expect(res.reportText).toContain('Result: CLEAN');
|
|
167
167
|
expect(res.reportText).toContain('Document locked (Read-Only');
|
|
168
168
|
|
|
169
169
|
// Validate mathematical injection
|
|
@@ -189,4 +189,49 @@ describe('Finalize Document (Core)', () => {
|
|
|
189
189
|
expect(res.reportText).toContain('unresolved tracked changes');
|
|
190
190
|
});
|
|
191
191
|
|
|
192
|
+
describe('Resolved Bugs Sanitize Parity Verification', () => {
|
|
193
|
+
|
|
194
|
+
it('BUG-FRAG-1: Coalesces adjacent identical runs after accepting tracked changes', async () => {
|
|
195
|
+
const doc = createMockDoc(`
|
|
196
|
+
<w:p>
|
|
197
|
+
<w:r><w:t xml:space="preserve">The term shall be </w:t></w:r>
|
|
198
|
+
<w:ins w:id="1"><w:r><w:t>five (5)</w:t></w:r></w:ins>
|
|
199
|
+
<w:r><w:t xml:space="preserve"> years from the Effective Date.</w:t></w:r>
|
|
200
|
+
</w:p>
|
|
201
|
+
`);
|
|
202
|
+
|
|
203
|
+
doc.save = vi.fn().mockResolvedValue(Buffer.from('mock'));
|
|
204
|
+
|
|
205
|
+
await finalize_document(doc, {
|
|
206
|
+
filename: 'test.docx',
|
|
207
|
+
sanitize_mode: 'full',
|
|
208
|
+
accept_all: true
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
const xml = doc.element.toString();
|
|
212
|
+
// We should see a single coalesced string rather than fragmented <w:t> nodes
|
|
213
|
+
expect(xml).toContain('The term shall be five (5) years from the Effective Date.');
|
|
214
|
+
|
|
215
|
+
const runs = doc.element.getElementsByTagName('w:r');
|
|
216
|
+
// If they are coalesced properly, there will be exactly 1 run instead of 3
|
|
217
|
+
expect(runs.length).toBe(1);
|
|
218
|
+
});
|
|
219
|
+
|
|
220
|
+
it('BUG-NS-1: Strips unused xmlns:w16du namespace declarations during finalization', async () => {
|
|
221
|
+
const doc = createMockDoc('<w:p/>');
|
|
222
|
+
// Manually inject the namespace onto the absolute root as the engine does
|
|
223
|
+
doc.part._element.setAttribute('xmlns:w16du', 'http://schemas.microsoft.com/office/word/2023/wordml/word16du');
|
|
224
|
+
|
|
225
|
+
doc.save = vi.fn().mockResolvedValue(Buffer.from('mock'));
|
|
226
|
+
|
|
227
|
+
await finalize_document(doc, {
|
|
228
|
+
filename: 'test.docx',
|
|
229
|
+
sanitize_mode: 'full'
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
// The final stringified XML of the root document should NOT contain the unused namespace
|
|
233
|
+
const xml = doc.part._element.toString();
|
|
234
|
+
expect(xml).not.toContain('xmlns:w16du');
|
|
235
|
+
});
|
|
236
|
+
});
|
|
192
237
|
});
|
|
@@ -15,6 +15,93 @@ export function findDescendantsByLocalName(element: Element, localName: string):
|
|
|
15
15
|
return result;
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
+
/**
 * Merge adjacent `w:r` runs that carry identical run properties.
 *
 * For every `w:p` under `doc.element`, and recursively inside `w:ins`,
 * `w:del`, `w:hyperlink`, `w:sdt`, `w:smartTag`, `w:fldSimple` and
 * `w:sdtContent` children, two neighbouring runs are merged when:
 *   - both contain only `w:t`, `w:tab`, `w:br`, `w:cr`, `w:delText`, `w:rPr`;
 *   - their `w:rPr` elements serialize to the same string (or both are absent).
 *
 * The second run's children (except its `w:rPr`) are moved into the first run
 * and the second run is removed. Text is concatenated into an existing text
 * element only when that element is the LAST element child of the first run
 * and has the same tag; otherwise the child is appended as-is. This keeps
 * tabs and breaks in their original position relative to the text.
 *
 * @param doc - Document whose `element` tree is rewritten in place.
 * @returns A single report line with the number of merges, or `[]` when
 *   nothing was merged.
 */
export function coalesce_runs(doc: DocumentObject): string[] {
  const TEXT_TAGS = ['w:t', 'w:delText'];
  const MERGEABLE_RUN_CHILD_TAGS = ['w:t', 'w:tab', 'w:br', 'w:cr', 'w:delText', 'w:rPr'];
  const RUN_CONTAINER_TAGS = ['w:ins', 'w:del', 'w:hyperlink', 'w:sdt', 'w:smartTag', 'w:fldSimple', 'w:sdtContent'];
  let count = 0;

  /** Element-node children of `node`, in document order. */
  function elementChildren(node: Element): Element[] {
    return Array.from(node.childNodes).filter(n => n.nodeType === 1) as Element[];
  }

  /** Two runs share formatting when their `w:rPr` serializations are equal. */
  function areRunsIdentical(rPr1: Element | null, rPr2: Element | null): boolean {
    const xml1 = rPr1 ? rPr1.toString() : '';
    const xml2 = rPr2 ? rPr2.toString() : '';
    return xml1 === xml2;
  }

  /** True when the run holds any child outside the mergeable tag list. */
  function hasSpecialContent(run: Element): boolean {
    return elementChildren(run).some(child => !MERGEABLE_RUN_CHILD_TAGS.includes(child.tagName));
  }

  /** Move the content of `source` to the end of `target`, preserving order. */
  function mergeRunInto(target: Element, source: Element): void {
    // `tail` is the text element new text may be concatenated into. It is
    // only set while it is the last element child of `target`; a trailing
    // tab/break (or w:rPr) resets it so later text is appended after it.
    const existing = elementChildren(target);
    const lastExisting = existing.length > 0 ? existing[existing.length - 1] : null;
    let tail: Element | null =
      lastExisting && TEXT_TAGS.includes(lastExisting.tagName) ? lastExisting : null;

    for (const child of elementChildren(source)) {
      if (child.tagName === 'w:rPr') continue;
      const isText = TEXT_TAGS.includes(child.tagName);

      if (isText && tail && tail.tagName === child.tagName) {
        const combined = (tail.textContent || '') + (child.textContent || '');
        tail.textContent = combined;
        // Keep whitespace significant if the result has outer whitespace or
        // the absorbed element had explicitly asked for preservation.
        if (combined.trim() !== combined || child.getAttribute('xml:space') === 'preserve') {
          tail.setAttribute('xml:space', 'preserve');
        }
      } else {
        target.appendChild(child);
        tail = isText ? child : null;
      }
    }
  }

  /** True when `curr` and `nxt` are plain runs with identical formatting. */
  function canMerge(curr: Element, nxt: Element): boolean {
    if (curr.tagName !== 'w:r' || nxt.tagName !== 'w:r') return false;
    if (hasSpecialContent(curr) || hasSpecialContent(nxt)) return false;
    return areRunsIdentical(findChild(curr, 'w:rPr'), findChild(nxt, 'w:rPr'));
  }

  function coalesceContainer(container: Element): void {
    const children = elementChildren(container);

    // Nested run containers are independent of their siblings, so they can
    // be processed up front.
    for (const child of children) {
      if (RUN_CONTAINER_TAGS.includes(child.tagName)) coalesceContainer(child);
    }

    let i = 0;
    while (i < children.length - 1) {
      const curr = children[i];
      const nxt = children[i + 1];
      if (canMerge(curr, nxt)) {
        mergeRunInto(curr, nxt);
        container.removeChild(nxt);
        children.splice(i + 1, 1);
        count++;
        // Stay on `curr`: the new neighbour may be mergeable as well.
      } else {
        i++;
      }
    }
  }

  const paragraphs = findAllDescendants(doc.element, 'w:p');
  for (const p of paragraphs) coalesceContainer(p);

  return count ? [`Adjacent identical runs coalesced: ${count}`] : [];
}
|
|
104
|
+
|
|
18
105
|
export function strip_rsid(doc: DocumentObject): string[] {
|
|
19
106
|
let count = 0;
|
|
20
107
|
const rsidAttrs = ['w:rsidR', 'w:rsidRPr', 'w:rsidRDefault', 'w:rsidP', 'w:rsidDel', 'w:rsidSect', 'w:rsidTr'];
|