@adeu/core 1.6.8 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/outline.ts CHANGED
@@ -2,18 +2,18 @@
2
2
  * Structural outline extractor.
3
3
  */
4
4
 
5
- import { DocumentObject } from './docx/bridge.js';
6
- import { Paragraph, Table, DocxEvent } from './docx/primitives.js';
7
- import { build_paragraph_text, extract_table } from './ingest.js';
8
- import { extract_comments_data } from './comments.js';
9
- import { findChild } from './docx/dom.js';
5
+ import { DocumentObject } from "./docx/bridge.js";
6
+ import { Paragraph, Table, DocxEvent } from "./docx/primitives.js";
7
+ import { build_paragraph_text, extract_table } from "./ingest.js";
8
+ import { extract_comments_data } from "./comments.js";
9
+ import { findChild } from "./docx/dom.js";
10
10
  import {
11
11
  _get_style_cache,
12
12
  get_paragraph_prefix,
13
13
  iter_block_items,
14
14
  iter_document_parts,
15
15
  iter_paragraph_content,
16
- } from './utils/docx.js';
16
+ } from "./utils/docx.js";
17
17
 
18
18
  const _HEADING_PREFIX_RE = /^(#{1,6}) /;
19
19
  const _HEURISTIC_MIN_WORDS = 3;
@@ -40,10 +40,10 @@ export function extract_outline(
40
40
  projected_body: string,
41
41
  body_pages: string[],
42
42
  body_page_offsets: number[],
43
- paragraph_offsets: Record<string, [number, number]> | null = null
43
+ paragraph_offsets: Record<string, [number, number]> | null = null,
44
44
  ): OutlineNode[] {
45
45
  if (body_pages.length !== body_page_offsets.length) {
46
- throw new Error('body_pages and body_page_offsets length mismatch');
46
+ throw new Error("body_pages and body_page_offsets length mismatch");
47
47
  }
48
48
 
49
49
  const comments_map = extract_comments_data(doc.pkg);
@@ -69,7 +69,12 @@ export function extract_outline(
69
69
  const text = _heading_text(paragraph, comments_map);
70
70
  const style = _determine_heading_style(paragraph);
71
71
 
72
- const owned_end = _find_owned_end(block_records, heading_indices, h_pos, level);
72
+ const owned_end = _find_owned_end(
73
+ block_records,
74
+ heading_indices,
75
+ h_pos,
76
+ level,
77
+ );
73
78
  const owned_blocks = block_records.slice(rec_idx + 1, owned_end);
74
79
 
75
80
  const has_table = _direct_has_table(block_records, rec_idx + 1, owned_end);
@@ -83,7 +88,11 @@ export function extract_outline(
83
88
  return nodes;
84
89
  }
85
90
 
86
- function _direct_has_table(block_records: _BlockRecord[], range_start: number, range_end: number): boolean {
91
+ function _direct_has_table(
92
+ block_records: _BlockRecord[],
93
+ range_start: number,
94
+ range_end: number,
95
+ ): boolean {
87
96
  for (let idx = range_start; idx < range_end; idx++) {
88
97
  const rec = block_records[idx];
89
98
  if (rec.is_paragraph && _is_heading(rec.item)) return false;
@@ -92,7 +101,10 @@ function _direct_has_table(block_records: _BlockRecord[], range_start: number, r
92
101
  return false;
93
102
  }
94
103
 
95
- function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[] {
104
+ function _walk_doc_body(
105
+ doc: DocumentObject,
106
+ comments_map: any,
107
+ ): _BlockRecord[] {
96
108
  const parts = Array.from(iter_document_parts(doc));
97
109
  let body_start_offset = 0;
98
110
  let body_part: any = null;
@@ -128,7 +140,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
128
140
 
129
141
  if (!is_first_block) cursor += 2;
130
142
 
131
- records.push({ item, is_paragraph: true, is_table: false, start_offset: cursor, projected_length: block_len });
143
+ records.push({
144
+ item,
145
+ is_paragraph: true,
146
+ is_table: false,
147
+ start_offset: cursor,
148
+ projected_length: block_len,
149
+ });
132
150
  cursor += block_len;
133
151
  is_first_block = false;
134
152
  } else if (item instanceof Table) {
@@ -138,7 +156,13 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
138
156
  if (!is_first_block) cursor += 2;
139
157
 
140
158
  const table_start = cursor;
141
- records.push({ item, is_paragraph: false, is_table: true, start_offset: table_start, projected_length: block_len });
159
+ records.push({
160
+ item,
161
+ is_paragraph: false,
162
+ is_table: true,
163
+ start_offset: table_start,
164
+ projected_length: block_len,
165
+ });
142
166
  _record_table_inner_blocks_lite(item, table_start, records, comments_map);
143
167
  cursor += block_len;
144
168
  is_first_block = false;
@@ -148,7 +172,12 @@ function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[]
148
172
  return records;
149
173
  }
150
174
 
151
- function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph, table_start_offset: number, comments_map: any): number {
175
+ function _compute_inner_block_offset(
176
+ table: Table,
177
+ target_paragraph: Paragraph,
178
+ table_start_offset: number,
179
+ comments_map: any,
180
+ ): number {
152
181
  const target_el = target_paragraph._element;
153
182
  let cursor = table_start_offset;
154
183
  let rows_processed = 0;
@@ -165,7 +194,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
165
194
 
166
195
  if (cells_in_row > 0) cursor += 3;
167
196
 
168
- const [new_cursor, found] = _walk_cell_for_offset(cell, target_el, cursor, comments_map);
197
+ const [new_cursor, found] = _walk_cell_for_offset(
198
+ cell,
199
+ target_el,
200
+ cursor,
201
+ comments_map,
202
+ );
169
203
  if (found) return new_cursor;
170
204
  cursor = new_cursor;
171
205
 
@@ -177,7 +211,12 @@ function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph,
177
211
  return table_start_offset;
178
212
  }
179
213
 
180
- function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: number, comments_map: any): [number, boolean] {
214
+ function _walk_cell_for_offset(
215
+ cell: any,
216
+ target_el: any,
217
+ cell_start_cursor: number,
218
+ comments_map: any,
219
+ ): [number, boolean] {
181
220
  let cursor = cell_start_cursor;
182
221
  let is_first_block = true;
183
222
 
@@ -190,9 +229,15 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
190
229
  const p_text = build_paragraph_text(inner_item, comments_map, false);
191
230
  cursor += (prefix + p_text).length;
192
231
  } else if (inner_item instanceof Table) {
193
- const nested_offset = _compute_inner_block_offset(inner_item, new Paragraph(target_el, null), cursor, comments_map);
232
+ const nested_offset = _compute_inner_block_offset(
233
+ inner_item,
234
+ new Paragraph(target_el, null),
235
+ cursor,
236
+ comments_map,
237
+ );
194
238
  if (nested_offset !== cursor) {
195
- if (_element_is_descendant(target_el, inner_item._element)) return [nested_offset, true];
239
+ if (_element_is_descendant(target_el, inner_item._element))
240
+ return [nested_offset, true];
196
241
  }
197
242
  const table_text = extract_table(inner_item, comments_map, false, 0);
198
243
  cursor += table_text ? table_text.length : 0;
@@ -202,7 +247,10 @@ function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: num
202
247
  return [cursor, false];
203
248
  }
204
249
 
205
- function _element_is_descendant(target_el: Element, ancestor_el: Element): boolean {
250
+ function _element_is_descendant(
251
+ target_el: Element,
252
+ ancestor_el: Element,
253
+ ): boolean {
206
254
  let cur: Node | null = target_el.parentNode;
207
255
  while (cur) {
208
256
  if (cur === ancestor_el) return true;
@@ -211,7 +259,12 @@ function _element_is_descendant(target_el: Element, ancestor_el: Element): boole
211
259
  return false;
212
260
  }
213
261
 
214
- function _record_table_inner_blocks_lite(table: Table, inherited_offset: number, records: _BlockRecord[], comments_map: any) {
262
+ function _record_table_inner_blocks_lite(
263
+ table: Table,
264
+ inherited_offset: number,
265
+ records: _BlockRecord[],
266
+ comments_map: any,
267
+ ) {
215
268
  const seen_cells = new Set();
216
269
  for (const row of table.rows) {
217
270
  for (const cell of row.cells) {
@@ -220,11 +273,35 @@ function _record_table_inner_blocks_lite(table: Table, inherited_offset: number,
220
273
 
221
274
  for (const inner_item of iter_block_items(cell)) {
222
275
  if (inner_item instanceof Paragraph) {
223
- const true_offset = _is_heading(inner_item) ? _compute_inner_block_offset(table, inner_item, inherited_offset, comments_map) : inherited_offset;
224
- records.push({ item: inner_item, is_paragraph: true, is_table: false, start_offset: true_offset, projected_length: 0 });
276
+ const true_offset = _is_heading(inner_item)
277
+ ? _compute_inner_block_offset(
278
+ table,
279
+ inner_item,
280
+ inherited_offset,
281
+ comments_map,
282
+ )
283
+ : inherited_offset;
284
+ records.push({
285
+ item: inner_item,
286
+ is_paragraph: true,
287
+ is_table: false,
288
+ start_offset: true_offset,
289
+ projected_length: 0,
290
+ });
225
291
  } else if (inner_item instanceof Table) {
226
- records.push({ item: inner_item, is_paragraph: false, is_table: true, start_offset: inherited_offset, projected_length: 0 });
227
- _record_table_inner_blocks_lite(inner_item, inherited_offset, records, comments_map);
292
+ records.push({
293
+ item: inner_item,
294
+ is_paragraph: false,
295
+ is_table: true,
296
+ start_offset: inherited_offset,
297
+ projected_length: 0,
298
+ });
299
+ _record_table_inner_blocks_lite(
300
+ inner_item,
301
+ inherited_offset,
302
+ records,
303
+ comments_map,
304
+ );
228
305
  }
229
306
  }
230
307
  }
@@ -235,19 +312,20 @@ function _project_part(part: any, comments_map: any): string {
235
312
  const blocks: string[] = [];
236
313
  const c_type = part.constructor.name;
237
314
 
238
- if (c_type === 'NotesPart') {
239
- const header = part.note_type === 'fn' ? '## Footnotes' : '## Endnotes';
315
+ if (c_type === "NotesPart") {
316
+ const header = part.note_type === "fn" ? "## Footnotes" : "## Endnotes";
240
317
  blocks.push(`---\n${header}`);
241
318
  }
242
319
 
243
320
  let is_first_para = true;
244
321
  for (const item of iter_block_items(part)) {
245
- if (item.constructor.name === 'FootnoteItem') {
322
+ if (item.constructor.name === "FootnoteItem") {
246
323
  const fn_text = _project_part(item, comments_map);
247
324
  if (fn_text) blocks.push(fn_text);
248
325
  } else if (item instanceof Paragraph) {
249
326
  let prefix = get_paragraph_prefix(item);
250
- if (is_first_para && c_type === 'FootnoteItem') prefix = `[^${part.note_type}-${part.id}]: ${prefix}`;
327
+ if (is_first_para && c_type === "FootnoteItem")
328
+ prefix = `[^${part.note_type}-${part.id}]: ${prefix}`;
251
329
  const p_text = build_paragraph_text(item, comments_map, false);
252
330
  blocks.push(prefix + p_text);
253
331
  is_first_para = false;
@@ -258,16 +336,19 @@ function _project_part(part: any, comments_map: any): string {
258
336
  }
259
337
  }
260
338
 
261
- return blocks.join('\n\n');
339
+ return blocks.join("\n\n");
262
340
  }
263
341
 
264
342
/**
 * Reports whether a paragraph is rendered as a markdown-style heading.
 *
 * The decision is made purely on the prefix produced by
 * `get_paragraph_prefix`: it must match `_HEADING_PREFIX_RE`
 * (one to six `#` characters followed by a space).
 */
function _is_heading(paragraph: Paragraph): boolean {
  const prefix = get_paragraph_prefix(paragraph);
  return _HEADING_PREFIX_RE.test(prefix);
}
267
345
 
268
- function _heading_passes_quality_filter(paragraph: Paragraph, comments_map: any): boolean {
346
+ function _heading_passes_quality_filter(
347
+ paragraph: Paragraph,
348
+ comments_map: any,
349
+ ): boolean {
269
350
  const style = _determine_heading_style(paragraph);
270
- if (style !== '(heuristic)') return true;
351
+ if (style !== "(heuristic)") return true;
271
352
  const text = _heading_text(paragraph, comments_map);
272
353
  if (!text) return false;
273
354
  const word_count = (text.match(/\w+/g) || []).length;
@@ -287,60 +368,109 @@ function _heading_text(paragraph: Paragraph, comments_map: any): string {
287
368
  }
288
369
 
289
370
/**
 * Removes CriticMarkup annotations from `text`.
 *
 * - `{--…--}` (deletions) and `{>>…<<}` (comments) are dropped entirely.
 * - `{++…++}` (insertions) and `{==…==}` (highlights) keep their inner text.
 *
 * Spans may cross line breaks. A falsy input yields the empty string.
 */
function _strip_critic_markup(text: string): string {
  if (!text) return "";

  // Applied in this order: deletions, comments, insertions, highlights.
  const rules: Array<[RegExp, string]> = [
    [/\{--[\s\S]*?--\}/g, ""],
    [/\{>>[\s\S]*?<<\}/g, ""],
    [/\{\+\+([\s\S]*?)\+\+\}/g, "$1"],
    [/\{==([\s\S]*?)==\}/g, "$1"],
  ];

  return rules.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    text,
  );
}
297
378
 
298
379
/**
 * Strips inline emphasis markers from `text`, keeping the wrapped content.
 *
 * Handles `**bold**`, `__double underscore__`, and `_single underscore_`
 * spans. Single underscores are only treated as markers when they are not
 * adjacent to word characters on the outside, so identifiers such as
 * `snake_case_name` are left alone. A falsy input yields the empty string.
 */
function _strip_inline_formatting(text: string): string {
  if (!text) return "";

  const without_asterisks = text.replace(/\*\*(.+?)\*\*/g, "$1");
  const without_double_underscores = without_asterisks.replace(
    /__(.+?)__/g,
    "$1",
  );
  return without_double_underscores.replace(
    /(?<!\w)_(\S(?:.*?\S)?)_(?!\w)/g,
    "$1",
  );
}
305
386
 
306
387
/**
 * Classifies how a paragraph qualifies as a heading.
 *
 * Resolution steps:
 *  1. The style id comes from the paragraph's `w:pStyle` (falling back to the
 *     default paragraph style from `_get_style_cache`).
 *  2. The outline level comes from a numeric `w:outlineLvl` on the paragraph,
 *     otherwise from the cached style's `outline_level`.
 *  3. The style name is the cached style's `name`, or the raw style id when
 *     the style is not in the cache; a leading "heading" / exact "title" is
 *     normalized to "Heading…" / "Title" regardless of case.
 *
 * @returns the normalized style name when it starts with "Heading" or is
 *   "Title"; otherwise "(outline_level)" when an outline level in 0..8 was
 *   found; otherwise the style name when it contains "Heading" followed by a
 *   single digit 1-6; otherwise "(heuristic)".
 */
function _determine_heading_style(paragraph: Paragraph): string {
  const [style_cache, default_pstyle] = _get_style_cache(
    paragraph._parent.part || paragraph._parent,
  );

  const pPr = findChild(paragraph._element, "w:pPr");
  const pStyle = pPr ? findChild(pPr, "w:pStyle") : null;
  const oLvl = pPr ? findChild(pPr, "w:outlineLvl") : null;

  const style_id = pStyle
    ? pStyle.getAttribute("w:val") || default_pstyle
    : default_pstyle;
  const cached_style =
    style_id && style_cache && style_cache[style_id]
      ? style_cache[style_id]
      : null;

  // A paragraph-level outline level wins over the one inherited from the style.
  let outline_level: number | null = null;
  if (oLvl) {
    const raw_level = oLvl.getAttribute("w:val") || "";
    if (/^\d+$/.test(raw_level)) outline_level = parseInt(raw_level, 10);
  }
  if (outline_level === null && cached_style) {
    outline_level = cached_style.outline_level;
  }

  // Fall back to the raw style id when the style is missing from the cache.
  let normalized_style_name = cached_style ? cached_style.name : style_id;
  if (normalized_style_name && typeof normalized_style_name === "string") {
    const lowered = normalized_style_name.toLowerCase();
    if (lowered.startsWith("heading")) {
      normalized_style_name = normalized_style_name.replace(
        /^heading/i,
        "Heading",
      );
    } else if (lowered === "title") {
      normalized_style_name = "Title";
    }
  }

  const is_named_heading =
    normalized_style_name &&
    (normalized_style_name.startsWith("Heading") ||
      normalized_style_name === "Title");
  if (is_named_heading) return normalized_style_name;

  const has_outline_level =
    outline_level !== null && outline_level >= 0 && outline_level <= 8;
  if (has_outline_level) return "(outline_level)";

  if (
    normalized_style_name &&
    /Heading[ ]?([1-6])(?![0-9])/.test(normalized_style_name)
  ) {
    return normalized_style_name;
  }

  return "(heuristic)";
}
329
443
 
330
/**
 * Resolves the display name of a paragraph's style.
 *
 * The style id is read from the paragraph's `w:pStyle` (`w:val`), falling
 * back to `default_pstyle` when the element or attribute is absent.
 *
 * @param paragraph - paragraph whose `w:pPr` is inspected.
 * @param style_cache - lookup from style id to an entry exposing `name`.
 * @param default_pstyle - style id used when the paragraph names none.
 * @returns the cached style's `name` when the id is present in
 *   `style_cache`; otherwise the style id itself.
 */
function _safe_style_name(
  paragraph: Paragraph,
  style_cache: any,
  default_pstyle: any,
): string | null {
  const pPr = findChild(paragraph._element, "w:pPr");
  const pStyle = pPr ? findChild(pPr, "w:pStyle") : null;
  const style_id = pStyle
    ? pStyle.getAttribute("w:val") || default_pstyle
    : default_pstyle;

  if (style_id && style_cache && style_cache[style_id]) {
    return style_cache[style_id].name;
  }
  return style_id;
}
339
459
 
340
/**
 * Finds the exclusive end index of the blocks owned by a heading.
 *
 * Scans the headings that follow position `current_h_pos` in
 * `heading_indices` and stops at the first one whose `_heading_level` is less
 * than or equal to `current_level`.
 *
 * @returns the `block_records` index of that heading, or
 *   `block_records.length` when no such heading follows.
 */
function _find_owned_end(
  block_records: _BlockRecord[],
  heading_indices: number[],
  current_h_pos: number,
  current_level: number,
): number {
  let h_pos = current_h_pos;
  while (++h_pos < heading_indices.length) {
    const candidate_idx = heading_indices[h_pos];
    const candidate_level = _heading_level(block_records[candidate_idx].item);
    if (candidate_level <= current_level) {
      return candidate_idx;
    }
  }
  return block_records.length;
}
@@ -351,12 +481,12 @@ function _collect_footnote_ids(owned_blocks: _BlockRecord[]): string[] {
351
481
  for (const rec of owned_blocks) {
352
482
  if (!rec.is_paragraph) continue;
353
483
  for (const event of iter_paragraph_content(rec.item)) {
354
- if (!('type' in event)) continue;
355
- let fn_id = '';
356
- if (event.type === 'footnote') fn_id = `fn-${event.id}`;
357
- else if (event.type === 'endnote') fn_id = `en-${event.id}`;
484
+ if (!("type" in event)) continue;
485
+ let fn_id = "";
486
+ if (event.type === "footnote") fn_id = `fn-${event.id}`;
487
+ else if (event.type === "endnote") fn_id = `en-${event.id}`;
358
488
  else continue;
359
-
489
+
360
490
  if (!seen.has(fn_id)) {
361
491
  seen.add(fn_id);
362
492
  ordered.push(fn_id);
@@ -374,4 +504,4 @@ function _offset_to_page(offset: number, body_page_offsets: number[]): number {
374
504
  else break;
375
505
  }
376
506
  return page;
377
- }
507
+ }
@@ -1,6 +1,7 @@
1
1
  import { DocumentObject } from '../docx/bridge.js';
2
2
  import { SanitizeReport } from './report.js';
3
3
  import * as transforms from './transforms.js';
4
+ import { findAllDescendants } from '../docx/dom.js';
4
5
 
5
6
  export interface FinalizeOptions {
6
7
  filename: string;
@@ -61,6 +62,7 @@ export async function finalize_document(doc: DocumentObject, options: FinalizeOp
61
62
  report.add_transform_lines(transforms.strip_proof_errors(doc));
62
63
  report.add_transform_lines(transforms.strip_empty_properties(doc));
63
64
  report.add_transform_lines(transforms.strip_hidden_text(doc));
65
+ report.add_transform_lines(transforms.coalesce_runs(doc));
64
66
  report.add_transform_lines(transforms.scrub_doc_properties(doc));
65
67
  report.add_transform_lines(transforms.scrub_timestamps(doc));
66
68
  report.add_transform_lines(transforms.strip_custom_xml(doc));
@@ -97,6 +99,30 @@ export async function finalize_document(doc: DocumentObject, options: FinalizeOp
97
99
  report.warnings.push("PDF export requires the Python/Word COM environment and is skipped in this zero-dependency Node agent.");
98
100
  }
99
101
 
102
+ // Clean up leaked Microsoft namespaces
103
+ for (const part of doc.pkg.parts) {
104
+ // Match the exact injection condition from RedlineEngine constructor
105
+ if (part === doc.part || (part.contentType.includes('wordprocessingml') && part.contentType.endsWith('+xml'))) {
106
+ if (part._element.hasAttribute('xmlns:w16du')) {
107
+ let hasW16du = false;
108
+ // Check root element attributes (excluding the xmlns declaration itself)
109
+ if (Array.from(part._element.attributes || []).some(a => a.name.startsWith('w16du:') && a.name !== 'xmlns:w16du')) {
110
+ hasW16du = true;
111
+ }
112
+ if (!hasW16du) {
113
+ const allNodes = findAllDescendants(part._element, '*');
114
+ for (const n of allNodes) {
115
+ if (n.tagName.startsWith('w16du:') || Array.from(n.attributes || []).some(a => a.name.startsWith('w16du:'))) {
116
+ hasW16du = true;
117
+ break;
118
+ }
119
+ }
120
+ }
121
+ if (!hasW16du) part._element.removeAttribute('xmlns:w16du');
122
+ }
123
+ }
124
+ }
125
+
100
126
  if (report.warnings.length > 0) report.status = 'clean_with_warnings';
101
127
 
102
128
  const outBuffer = await doc.save();
@@ -116,7 +116,7 @@ export class SanitizeReport {
116
116
  if (this.warnings.length > 0) {
117
117
  lines.push(`Result: CLEAN WITH WARNINGS (${this.warnings.length} warning${this.warnings.length > 1 ? 's' : ''})`);
118
118
  } else {
119
- lines.push("Result: SECURE & READY TO SEND");
119
+ lines.push(`Result: CLEAN (${this.tracked_changes_found} changes resolved, ${this.comments_removed} comments removed)`);
120
120
  }
121
121
  lines.push(sep);
122
122
 
@@ -162,8 +162,8 @@ describe('Finalize Document (Core)', () => {
162
162
  });
163
163
 
164
164
  const finalSettings = settingsPart._element.toString();
165
-
166
- expect(res.reportText).toContain('Result: SECURE & READY TO SEND');
165
+
166
+ expect(res.reportText).toContain('Result: CLEAN');
167
167
  expect(res.reportText).toContain('Document locked (Read-Only');
168
168
 
169
169
  // Validate mathematical injection
@@ -189,4 +189,49 @@ describe('Finalize Document (Core)', () => {
189
189
  expect(res.reportText).toContain('unresolved tracked changes');
190
190
  });
191
191
 
192
+ describe('Resolved Bugs Sanitize Parity Verification', () => {
193
+
194
+ it('BUG-FRAG-1: Coalesces adjacent identical runs after accepting tracked changes', async () => {
195
+ const doc = createMockDoc(`
196
+ <w:p>
197
+ <w:r><w:t xml:space="preserve">The term shall be </w:t></w:r>
198
+ <w:ins w:id="1"><w:r><w:t>five (5)</w:t></w:r></w:ins>
199
+ <w:r><w:t xml:space="preserve"> years from the Effective Date.</w:t></w:r>
200
+ </w:p>
201
+ `);
202
+
203
+ doc.save = vi.fn().mockResolvedValue(Buffer.from('mock'));
204
+
205
+ await finalize_document(doc, {
206
+ filename: 'test.docx',
207
+ sanitize_mode: 'full',
208
+ accept_all: true
209
+ });
210
+
211
+ const xml = doc.element.toString();
212
+ // We should see a single coalesced string rather than fragmented <w:t> nodes
213
+ expect(xml).toContain('The term shall be five (5) years from the Effective Date.');
214
+
215
+ const runs = doc.element.getElementsByTagName('w:r');
216
+ // If they are coalesced properly, there will be exactly 1 run instead of 3
217
+ expect(runs.length).toBe(1);
218
+ });
219
+
220
+ it('BUG-NS-1: Strips unused xmlns:w16du namespace declarations during finalization', async () => {
221
+ const doc = createMockDoc('<w:p/>');
222
+ // Manually inject the namespace onto the absolute root as the engine does
223
+ doc.part._element.setAttribute('xmlns:w16du', 'http://schemas.microsoft.com/office/word/2023/wordml/word16du');
224
+
225
+ doc.save = vi.fn().mockResolvedValue(Buffer.from('mock'));
226
+
227
+ await finalize_document(doc, {
228
+ filename: 'test.docx',
229
+ sanitize_mode: 'full'
230
+ });
231
+
232
+ // The final stringified XML of the root document should NOT contain the unused namespace
233
+ const xml = doc.part._element.toString();
234
+ expect(xml).not.toContain('xmlns:w16du');
235
+ });
236
+ });
192
237
  });
@@ -15,6 +15,93 @@ export function findDescendantsByLocalName(element: Element, localName: string):
15
15
  return result;
16
16
  }
17
17
 
18
/**
 * Merges adjacent `w:r` runs that have identical run properties.
 *
 * Every `w:p` found under `doc.element` is processed, recursing into inline
 * containers (`w:ins`, `w:del`, `w:hyperlink`, `w:sdt`, `w:sdtContent`,
 * `w:smartTag`, `w:fldSimple`). Two neighbouring runs are merged only when
 * both contain nothing but `w:t`, `w:delText`, `w:tab`, `w:br`, `w:cr` and
 * `w:rPr`, and their `w:rPr` elements serialize to the same string.
 *
 * Content order is preserved: text from the second run is folded into the
 * first run's text element only when that element is the first run's trailing
 * child. If a tab or break sits in between, the text element is appended
 * after it instead of being merged in front of it.
 *
 * @param doc - document whose `element` tree is rewritten in place.
 * @returns a single report line with the number of merges performed, or an
 *   empty array when nothing was merged.
 */
export function coalesce_runs(doc: DocumentObject): string[] {
  const SAFE_RUN_CHILDREN = ['w:t', 'w:tab', 'w:br', 'w:cr', 'w:delText', 'w:rPr'];
  const RUN_CONTAINERS = ['w:ins', 'w:del', 'w:hyperlink', 'w:sdt', 'w:smartTag', 'w:fldSimple', 'w:sdtContent'];
  let count = 0;

  // Snapshot of the element (nodeType 1) children; safe to iterate while moving nodes.
  function elementChildren(node: Element): Element[] {
    return Array.from(node.childNodes).filter(n => n.nodeType === 1) as Element[];
  }

  function isTextElement(el: Element): boolean {
    return el.tagName === 'w:t' || el.tagName === 'w:delText';
  }

  // Run properties are compared by their serialized XML; a missing rPr counts as ''.
  function areRunsIdentical(rPr1: Element | null, rPr2: Element | null): boolean {
    const xml1 = rPr1 ? rPr1.toString() : '';
    const xml2 = rPr2 ? rPr2.toString() : '';
    return xml1 === xml2;
  }

  function hasSpecialContent(run: Element): boolean {
    return elementChildren(run).some(child => !SAFE_RUN_CHILDREN.includes(child.tagName));
  }

  // Moves the content of `source` to the end of `target`, joining text where adjacent.
  function mergeRunInto(target: Element, source: Element): void {
    const targetChildren = elementChildren(target);
    const tail = targetChildren.length > 0 ? targetChildren[targetChildren.length - 1] : null;
    // Only the trailing child may absorb text; an earlier w:t followed by a
    // tab/break must not, or the text would jump in front of that tab/break.
    let last_t: Element | null = tail && isTextElement(tail) ? tail : null;

    for (const child of elementChildren(source)) {
      if (child.tagName === 'w:rPr') continue;

      if (isTextElement(child) && last_t && last_t.tagName === child.tagName) {
        const combined = (last_t.textContent || '') + (child.textContent || '');
        const keepSpace = child.getAttribute('xml:space') === 'preserve';
        last_t.textContent = combined;
        if (keepSpace || combined.trim() !== combined) {
          last_t.setAttribute('xml:space', 'preserve');
        }
      } else {
        target.appendChild(child);
        // A tab/break now ends the run, so later text must start a new element.
        last_t = isTextElement(child) ? child : null;
      }
    }
  }

  function coalesceContainer(container: Element): void {
    const children = elementChildren(container);
    let i = 0;
    while (i < children.length) {
      const curr = children[i];
      const nxt = children[i + 1];

      if (
        nxt &&
        curr.tagName === 'w:r' &&
        nxt.tagName === 'w:r' &&
        !hasSpecialContent(curr) &&
        !hasSpecialContent(nxt) &&
        areRunsIdentical(findChild(curr, 'w:rPr'), findChild(nxt, 'w:rPr'))
      ) {
        mergeRunInto(curr, nxt);
        container.removeChild(nxt);
        children.splice(i + 1, 1);
        count++;
        // Stay on `curr`: it may also merge with its new right-hand neighbour.
        continue;
      }

      if (RUN_CONTAINERS.includes(curr.tagName)) {
        coalesceContainer(curr);
      }
      i++;
    }
  }

  const paragraphs = findAllDescendants(doc.element, 'w:p');
  for (const p of paragraphs) coalesceContainer(p);

  return count ? [`Adjacent identical runs coalesced: ${count}`] : [];
}
104
+
18
105
  export function strip_rsid(doc: DocumentObject): string[] {
19
106
  let count = 0;
20
107
  const rsidAttrs = ['w:rsidR', 'w:rsidRPr', 'w:rsidRDefault', 'w:rsidP', 'w:rsidDel', 'w:rsidSect', 'w:rsidTr'];