@adeu/core 1.10.1 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +229 -21
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -3
- package/dist/index.d.ts +9 -3
- package/dist/index.js +228 -21
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/engine.bugs.test.ts +2 -2
- package/src/engine.ts +53 -6
- package/src/index.ts +1 -1
- package/src/ingest.ts +32 -8
- package/src/mapper.ts +14 -8
- package/src/outline.ts +196 -1
- package/src/parity_gaps.test.ts +98 -0
package/dist/index.d.cts
CHANGED
|
@@ -68,12 +68,13 @@ interface TextSpan {
|
|
|
68
68
|
declare class DocumentMapper {
|
|
69
69
|
doc: DocumentObject;
|
|
70
70
|
clean_view: boolean;
|
|
71
|
+
original_view: boolean;
|
|
71
72
|
comments_map: Record<string, any>;
|
|
72
73
|
full_text: string;
|
|
73
74
|
spans: TextSpan[];
|
|
74
75
|
appendix_start_index: number;
|
|
75
76
|
private _text_chunks;
|
|
76
|
-
constructor(doc: DocumentObject, clean_view?: boolean);
|
|
77
|
+
constructor(doc: DocumentObject, clean_view?: boolean, original_view?: boolean);
|
|
77
78
|
private _build_map;
|
|
78
79
|
private _map_blocks;
|
|
79
80
|
private _map_table;
|
|
@@ -178,6 +179,7 @@ declare class RedlineEngine {
|
|
|
178
179
|
mapper: DocumentMapper;
|
|
179
180
|
comments_manager: CommentsManager;
|
|
180
181
|
clean_mapper: DocumentMapper | null;
|
|
182
|
+
original_mapper: DocumentMapper | null;
|
|
181
183
|
skipped_details: string[];
|
|
182
184
|
constructor(doc: DocumentObject, author?: string);
|
|
183
185
|
private _check_punctuation_warning;
|
|
@@ -321,9 +323,13 @@ interface OutlineNode {
|
|
|
321
323
|
has_table: boolean;
|
|
322
324
|
footnote_ids: string[];
|
|
323
325
|
}
|
|
324
|
-
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | null): OutlineNode[];
|
|
326
|
+
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | Map<any, [number, number]> | null): OutlineNode[];
|
|
325
327
|
|
|
326
328
|
declare function extractTextFromBuffer(buffer: Buffer, cleanView?: boolean): Promise<string>;
|
|
329
|
+
declare function _extractTextFromDoc(doc: DocumentObject, cleanView?: boolean, includeAppendix?: boolean, return_paragraph_offsets?: boolean): string | {
|
|
330
|
+
text: string;
|
|
331
|
+
paragraph_offsets: Map<any, [number, number]>;
|
|
332
|
+
};
|
|
327
333
|
|
|
328
334
|
interface FinalizeOptions {
|
|
329
335
|
filename: string;
|
|
@@ -342,4 +348,4 @@ declare function finalize_document(doc: DocumentObject, options: FinalizeOptions
|
|
|
342
348
|
|
|
343
349
|
declare function identifyEngine(): string;
|
|
344
350
|
|
|
345
|
-
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
|
351
|
+
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, _extractTextFromDoc, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
package/dist/index.d.ts
CHANGED
|
@@ -68,12 +68,13 @@ interface TextSpan {
|
|
|
68
68
|
declare class DocumentMapper {
|
|
69
69
|
doc: DocumentObject;
|
|
70
70
|
clean_view: boolean;
|
|
71
|
+
original_view: boolean;
|
|
71
72
|
comments_map: Record<string, any>;
|
|
72
73
|
full_text: string;
|
|
73
74
|
spans: TextSpan[];
|
|
74
75
|
appendix_start_index: number;
|
|
75
76
|
private _text_chunks;
|
|
76
|
-
constructor(doc: DocumentObject, clean_view?: boolean);
|
|
77
|
+
constructor(doc: DocumentObject, clean_view?: boolean, original_view?: boolean);
|
|
77
78
|
private _build_map;
|
|
78
79
|
private _map_blocks;
|
|
79
80
|
private _map_table;
|
|
@@ -178,6 +179,7 @@ declare class RedlineEngine {
|
|
|
178
179
|
mapper: DocumentMapper;
|
|
179
180
|
comments_manager: CommentsManager;
|
|
180
181
|
clean_mapper: DocumentMapper | null;
|
|
182
|
+
original_mapper: DocumentMapper | null;
|
|
181
183
|
skipped_details: string[];
|
|
182
184
|
constructor(doc: DocumentObject, author?: string);
|
|
183
185
|
private _check_punctuation_warning;
|
|
@@ -321,9 +323,13 @@ interface OutlineNode {
|
|
|
321
323
|
has_table: boolean;
|
|
322
324
|
footnote_ids: string[];
|
|
323
325
|
}
|
|
324
|
-
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | null): OutlineNode[];
|
|
326
|
+
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | Map<any, [number, number]> | null): OutlineNode[];
|
|
325
327
|
|
|
326
328
|
declare function extractTextFromBuffer(buffer: Buffer, cleanView?: boolean): Promise<string>;
|
|
329
|
+
declare function _extractTextFromDoc(doc: DocumentObject, cleanView?: boolean, includeAppendix?: boolean, return_paragraph_offsets?: boolean): string | {
|
|
330
|
+
text: string;
|
|
331
|
+
paragraph_offsets: Map<any, [number, number]>;
|
|
332
|
+
};
|
|
327
333
|
|
|
328
334
|
interface FinalizeOptions {
|
|
329
335
|
filename: string;
|
|
@@ -342,4 +348,4 @@ declare function finalize_document(doc: DocumentObject, options: FinalizeOptions
|
|
|
342
348
|
|
|
343
349
|
declare function identifyEngine(): string;
|
|
344
350
|
|
|
345
|
-
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
|
351
|
+
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, _extractTextFromDoc, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
package/dist/index.js
CHANGED
|
@@ -1216,14 +1216,16 @@ function* iter_paragraph_content(paragraph) {
|
|
|
1216
1216
|
var DocumentMapper = class {
|
|
1217
1217
|
doc;
|
|
1218
1218
|
clean_view;
|
|
1219
|
+
original_view;
|
|
1219
1220
|
comments_map;
|
|
1220
1221
|
full_text = "";
|
|
1221
1222
|
spans = [];
|
|
1222
1223
|
appendix_start_index = -1;
|
|
1223
1224
|
_text_chunks = [];
|
|
1224
|
-
constructor(doc, clean_view = false) {
|
|
1225
|
+
constructor(doc, clean_view = false, original_view = false) {
|
|
1225
1226
|
this.doc = doc;
|
|
1226
1227
|
this.clean_view = clean_view;
|
|
1228
|
+
this.original_view = original_view;
|
|
1227
1229
|
this.comments_map = extract_comments_data(doc.pkg);
|
|
1228
1230
|
this._build_map();
|
|
1229
1231
|
}
|
|
@@ -1305,14 +1307,15 @@ ${header}`;
|
|
|
1305
1307
|
const ins = trPr ? findChild(trPr, "w:ins") : null;
|
|
1306
1308
|
const del_node = trPr ? findChild(trPr, "w:del") : null;
|
|
1307
1309
|
if (this.clean_view && del_node) continue;
|
|
1310
|
+
if (this.original_view && ins) continue;
|
|
1308
1311
|
if (rows_processed > 0) {
|
|
1309
1312
|
this._add_virtual_text("\n", current, null);
|
|
1310
1313
|
current += 1;
|
|
1311
1314
|
}
|
|
1312
|
-
if (ins && !this.clean_view) {
|
|
1315
|
+
if (ins && !this.clean_view && !this.original_view) {
|
|
1313
1316
|
this._add_virtual_text("{++ ", current, null);
|
|
1314
1317
|
current += 4;
|
|
1315
|
-
} else if (del_node && !this.clean_view) {
|
|
1318
|
+
} else if (del_node && !this.clean_view && !this.original_view) {
|
|
1316
1319
|
this._add_virtual_text("{-- ", current, null);
|
|
1317
1320
|
current += 4;
|
|
1318
1321
|
}
|
|
@@ -1328,11 +1331,11 @@ ${header}`;
|
|
|
1328
1331
|
current = this._map_blocks(cell, current);
|
|
1329
1332
|
cells_processed += 1;
|
|
1330
1333
|
}
|
|
1331
|
-
if (ins && !this.clean_view) {
|
|
1334
|
+
if (ins && !this.clean_view && !this.original_view) {
|
|
1332
1335
|
const suffix = ` |Chg:${ins.getAttribute("w:id")}++}`;
|
|
1333
1336
|
this._add_virtual_text(suffix, current, null);
|
|
1334
1337
|
current += suffix.length;
|
|
1335
|
-
} else if (del_node && !this.clean_view) {
|
|
1338
|
+
} else if (del_node && !this.clean_view && !this.original_view) {
|
|
1336
1339
|
const suffix = ` |Chg:${del_node.getAttribute("w:id")}--}`;
|
|
1337
1340
|
this._add_virtual_text(suffix, current, null);
|
|
1338
1341
|
current += suffix.length;
|
|
@@ -1426,11 +1429,13 @@ ${header}`;
|
|
|
1426
1429
|
}
|
|
1427
1430
|
if (this.clean_view && Object.keys(active_del).length > 0) {
|
|
1428
1431
|
}
|
|
1432
|
+
if (this.original_view && Object.keys(active_ins).length > 0) {
|
|
1433
|
+
}
|
|
1429
1434
|
const full_seg_text = run_parts.map((x) => x[1]).join("");
|
|
1430
1435
|
const curr_ins_id = Object.keys(active_ins).pop() || null;
|
|
1431
1436
|
const curr_del_id = Object.keys(active_del).pop() || null;
|
|
1432
|
-
if (full_seg_text && !(this.clean_view && curr_del_id)) {
|
|
1433
|
-
const new_wrappers = this.clean_view ? ["", ""] : this._get_wrappers(curr_ins_id, curr_del_id, active_ids, active_fmt);
|
|
1437
|
+
if (full_seg_text && !(this.clean_view && curr_del_id) && !(this.original_view && curr_ins_id)) {
|
|
1438
|
+
const new_wrappers = this.clean_view || this.original_view ? ["", ""] : this._get_wrappers(curr_ins_id, curr_del_id, active_ids, active_fmt);
|
|
1434
1439
|
const new_style = [prefix, suffix];
|
|
1435
1440
|
if (pending_runs.length > 0 && new_wrappers[0] === current_wrappers[0] && new_wrappers[1] === current_wrappers[1]) {
|
|
1436
1441
|
let skip_leading_prefix = false;
|
|
@@ -1455,7 +1460,7 @@ ${header}`;
|
|
|
1455
1460
|
}
|
|
1456
1461
|
}
|
|
1457
1462
|
}
|
|
1458
|
-
if (!this.clean_view) {
|
|
1463
|
+
if (!this.clean_view && !this.original_view) {
|
|
1459
1464
|
const has_meta = Object.keys(active_ins).length > 0 || Object.keys(active_del).length > 0 || active_ids.size > 0 || Object.keys(active_fmt).length > 0;
|
|
1460
1465
|
if (has_meta) {
|
|
1461
1466
|
deferred_meta_states.push([{ ...active_ins }, { ...active_del }, new Set(active_ids), { ...active_fmt }]);
|
|
@@ -2631,6 +2636,7 @@ var RedlineEngine = class {
|
|
|
2631
2636
|
mapper;
|
|
2632
2637
|
comments_manager;
|
|
2633
2638
|
clean_mapper = null;
|
|
2639
|
+
original_mapper = null;
|
|
2634
2640
|
skipped_details = [];
|
|
2635
2641
|
constructor(doc, author = "Adeu AI (TS)") {
|
|
2636
2642
|
this.doc = doc;
|
|
@@ -3392,13 +3398,17 @@ var RedlineEngine = class {
|
|
|
3392
3398
|
if (!edit.target_text) continue;
|
|
3393
3399
|
let matches = this.mapper.find_all_match_indices(edit.target_text);
|
|
3394
3400
|
let activeText = this.mapper.full_text;
|
|
3401
|
+
let target_mapper = this.mapper;
|
|
3395
3402
|
if (matches.length === 0) {
|
|
3396
3403
|
if (!this.clean_mapper)
|
|
3397
3404
|
this.clean_mapper = new DocumentMapper(this.doc, true);
|
|
3398
3405
|
matches = this.clean_mapper.find_all_match_indices(edit.target_text);
|
|
3399
|
-
if (matches.length > 0)
|
|
3406
|
+
if (matches.length > 0) {
|
|
3407
|
+
activeText = this.clean_mapper.full_text;
|
|
3408
|
+
target_mapper = this.clean_mapper;
|
|
3409
|
+
}
|
|
3400
3410
|
}
|
|
3401
|
-
if (activeText === this.mapper.full_text && matches.length >
|
|
3411
|
+
if (activeText === this.mapper.full_text && matches.length > 0) {
|
|
3402
3412
|
const liveMatches = matches.filter(([start, length]) => {
|
|
3403
3413
|
const realSpans = this.mapper.spans.filter(
|
|
3404
3414
|
(s) => s.run !== null && s.end > start && s.start < start + length
|
|
@@ -3406,13 +3416,51 @@ var RedlineEngine = class {
|
|
|
3406
3416
|
if (realSpans.length === 0) return true;
|
|
3407
3417
|
return realSpans.some((s) => !s.del_id);
|
|
3408
3418
|
});
|
|
3409
|
-
|
|
3419
|
+
matches = liveMatches;
|
|
3410
3420
|
}
|
|
3421
|
+
let is_deleted_text = false;
|
|
3422
|
+
const deleted_authors = /* @__PURE__ */ new Set();
|
|
3411
3423
|
if (matches.length === 0) {
|
|
3412
|
-
|
|
3413
|
-
|
|
3424
|
+
if (!this.original_mapper) {
|
|
3425
|
+
this.original_mapper = new DocumentMapper(this.doc, false, true);
|
|
3426
|
+
}
|
|
3427
|
+
const orig_matches = this.original_mapper.find_all_match_indices(edit.target_text);
|
|
3428
|
+
if (orig_matches.length > 0) {
|
|
3429
|
+
is_deleted_text = true;
|
|
3430
|
+
for (const [start, length] of orig_matches) {
|
|
3431
|
+
const spans = this.original_mapper.spans.filter(
|
|
3432
|
+
(s) => s.end > start && s.start < start + length
|
|
3433
|
+
);
|
|
3434
|
+
for (const s of spans) {
|
|
3435
|
+
if (s.run !== null) {
|
|
3436
|
+
let parent = s.run._element;
|
|
3437
|
+
while (parent) {
|
|
3438
|
+
if (parent.nodeType === 1 && parent.tagName === "w:del") {
|
|
3439
|
+
const auth = parent.getAttribute("w:author");
|
|
3440
|
+
if (auth) {
|
|
3441
|
+
deleted_authors.add(auth);
|
|
3442
|
+
}
|
|
3443
|
+
break;
|
|
3444
|
+
}
|
|
3445
|
+
parent = parent.parentNode;
|
|
3446
|
+
}
|
|
3447
|
+
}
|
|
3448
|
+
}
|
|
3449
|
+
}
|
|
3450
|
+
}
|
|
3451
|
+
}
|
|
3452
|
+
if (matches.length === 0) {
|
|
3453
|
+
if (is_deleted_text) {
|
|
3454
|
+
const author_phrase = deleted_authors.size > 0 ? `by ${Array.from(deleted_authors).sort().join(", ")}` : "by an existing revision";
|
|
3455
|
+
errors.push(
|
|
3456
|
+
`- Edit ${i + 1} Failed: Target text matches text inside a tracked deletion ${author_phrase}. Reject/accept that change first or target the active replacement text instead.`
|
|
3457
|
+
);
|
|
3458
|
+
} else {
|
|
3459
|
+
errors.push(
|
|
3460
|
+
`- Edit ${i + 1} Failed: Target text not found in document:
|
|
3414
3461
|
"${edit.target_text}"`
|
|
3415
|
-
|
|
3462
|
+
);
|
|
3463
|
+
}
|
|
3416
3464
|
} else if (matches.length > 1) {
|
|
3417
3465
|
const positions = matches.map(([start, length]) => [
|
|
3418
3466
|
start,
|
|
@@ -4834,13 +4882,14 @@ async function extractTextFromBuffer(buffer, cleanView = false) {
|
|
|
4834
4882
|
const doc = await DocumentObject.load(buffer);
|
|
4835
4883
|
return _extractTextFromDoc(doc, cleanView);
|
|
4836
4884
|
}
|
|
4837
|
-
function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true) {
|
|
4885
|
+
function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true, return_paragraph_offsets = false) {
|
|
4838
4886
|
const comments_map = extract_comments_data(doc.pkg);
|
|
4839
4887
|
const full_text = [];
|
|
4888
|
+
const paragraph_offsets = /* @__PURE__ */ new Map();
|
|
4840
4889
|
let cursor = 0;
|
|
4841
4890
|
for (const part of iter_document_parts(doc)) {
|
|
4842
4891
|
const part_cursor = full_text.length > 0 ? cursor + 2 : cursor;
|
|
4843
|
-
const part_text = _extract_blocks(part, comments_map, cleanView, part_cursor);
|
|
4892
|
+
const part_text = _extract_blocks(part, comments_map, cleanView, part_cursor, return_paragraph_offsets ? paragraph_offsets : void 0);
|
|
4844
4893
|
if (part_text) {
|
|
4845
4894
|
if (full_text.length > 0) cursor += 2;
|
|
4846
4895
|
full_text.push(part_text);
|
|
@@ -4852,9 +4901,12 @@ function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true) {
|
|
|
4852
4901
|
const appendix = build_structural_appendix(doc, base_text);
|
|
4853
4902
|
if (appendix) base_text += appendix;
|
|
4854
4903
|
}
|
|
4904
|
+
if (return_paragraph_offsets) {
|
|
4905
|
+
return { text: base_text, paragraph_offsets };
|
|
4906
|
+
}
|
|
4855
4907
|
return base_text;
|
|
4856
4908
|
}
|
|
4857
|
-
function _extract_blocks(container, comments_map, cleanView, cursor) {
|
|
4909
|
+
function _extract_blocks(container, comments_map, cleanView, cursor, paragraph_offsets) {
|
|
4858
4910
|
const part = container.part || container;
|
|
4859
4911
|
const [style_cache, default_pstyle] = _get_style_cache(part);
|
|
4860
4912
|
const blocks = [];
|
|
@@ -4873,7 +4925,7 @@ ${header}`;
|
|
|
4873
4925
|
if (!is_first_block) local_cursor += 2;
|
|
4874
4926
|
const block_start = local_cursor;
|
|
4875
4927
|
if (item.constructor.name === "FootnoteItem") {
|
|
4876
|
-
const fn_text = _extract_blocks(item, comments_map, cleanView, block_start);
|
|
4928
|
+
const fn_text = _extract_blocks(item, comments_map, cleanView, block_start, paragraph_offsets);
|
|
4877
4929
|
if (fn_text) {
|
|
4878
4930
|
blocks.push(fn_text);
|
|
4879
4931
|
local_cursor = block_start + fn_text.length;
|
|
@@ -4889,11 +4941,14 @@ ${header}`;
|
|
|
4889
4941
|
const p_text = build_paragraph_text(item, comments_map, cleanView, style_cache, default_pstyle);
|
|
4890
4942
|
const full_block = prefix + p_text;
|
|
4891
4943
|
blocks.push(full_block);
|
|
4944
|
+
if (paragraph_offsets) {
|
|
4945
|
+
paragraph_offsets.set(item._element, [block_start, full_block.length]);
|
|
4946
|
+
}
|
|
4892
4947
|
local_cursor = block_start + full_block.length;
|
|
4893
4948
|
is_first_para = false;
|
|
4894
4949
|
is_first_block = false;
|
|
4895
4950
|
} else if (item instanceof Table) {
|
|
4896
|
-
const table_text = extract_table(item, comments_map, cleanView, block_start);
|
|
4951
|
+
const table_text = extract_table(item, comments_map, cleanView, block_start, paragraph_offsets);
|
|
4897
4952
|
if (table_text) {
|
|
4898
4953
|
blocks.push(table_text);
|
|
4899
4954
|
local_cursor = block_start + table_text.length;
|
|
@@ -4906,7 +4961,7 @@ ${header}`;
|
|
|
4906
4961
|
}
|
|
4907
4962
|
return blocks.join("\n\n");
|
|
4908
4963
|
}
|
|
4909
|
-
function extract_table(table, comments_map, cleanView, cursor) {
|
|
4964
|
+
function extract_table(table, comments_map, cleanView, cursor, paragraph_offsets) {
|
|
4910
4965
|
const rows_text = [];
|
|
4911
4966
|
let rows_processed = 0;
|
|
4912
4967
|
let local_cursor = cursor;
|
|
@@ -4925,7 +4980,7 @@ function extract_table(table, comments_map, cleanView, cursor) {
|
|
|
4925
4980
|
if (seen_cells.has(cell)) continue;
|
|
4926
4981
|
seen_cells.add(cell);
|
|
4927
4982
|
if (!first_cell) cell_cursor += 3;
|
|
4928
|
-
const cell_content = _extract_blocks(cell, comments_map, cleanView, cell_cursor);
|
|
4983
|
+
const cell_content = _extract_blocks(cell, comments_map, cleanView, cell_cursor, paragraph_offsets);
|
|
4929
4984
|
cell_texts.push(cell_content);
|
|
4930
4985
|
cell_cursor += cell_content.length;
|
|
4931
4986
|
first_cell = false;
|
|
@@ -5182,6 +5237,9 @@ function extract_outline(doc, projected_body, body_pages, body_page_offsets, par
|
|
|
5182
5237
|
if (body_pages.length !== body_page_offsets.length) {
|
|
5183
5238
|
throw new Error("body_pages and body_page_offsets length mismatch");
|
|
5184
5239
|
}
|
|
5240
|
+
if (paragraph_offsets) {
|
|
5241
|
+
return _extract_outline_fast(doc, projected_body, body_page_offsets, paragraph_offsets);
|
|
5242
|
+
}
|
|
5185
5243
|
const comments_map = extract_comments_data(doc.pkg);
|
|
5186
5244
|
const block_records = _walk_doc_body(doc, comments_map);
|
|
5187
5245
|
const heading_indices = [];
|
|
@@ -5457,6 +5515,7 @@ function _determine_heading_style(paragraph) {
|
|
|
5457
5515
|
if (pStyle) style_id = pStyle.getAttribute("w:val") || default_pstyle;
|
|
5458
5516
|
}
|
|
5459
5517
|
let outline_level = null;
|
|
5518
|
+
let outline_level_from_style = false;
|
|
5460
5519
|
if (pPr) {
|
|
5461
5520
|
const oLvl = findChild(pPr, "w:outlineLvl");
|
|
5462
5521
|
if (oLvl && /^\d+$/.test(oLvl.getAttribute("w:val") || "")) {
|
|
@@ -5465,6 +5524,7 @@ function _determine_heading_style(paragraph) {
|
|
|
5465
5524
|
}
|
|
5466
5525
|
if (outline_level === null && style_id && style_cache && style_cache[style_id]) {
|
|
5467
5526
|
outline_level = style_cache[style_id].outline_level;
|
|
5527
|
+
outline_level_from_style = true;
|
|
5468
5528
|
}
|
|
5469
5529
|
const style_name = style_id && style_cache && style_cache[style_id] ? style_cache[style_id].name : style_id;
|
|
5470
5530
|
let normalized_style_name = style_name;
|
|
@@ -5475,6 +5535,12 @@ function _determine_heading_style(paragraph) {
|
|
|
5475
5535
|
normalized_style_name = "Title";
|
|
5476
5536
|
}
|
|
5477
5537
|
}
|
|
5538
|
+
if (outline_level_from_style && outline_level !== null) {
|
|
5539
|
+
const is_heading_or_title = normalized_style_name && (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title");
|
|
5540
|
+
if (!is_heading_or_title) {
|
|
5541
|
+
outline_level = null;
|
|
5542
|
+
}
|
|
5543
|
+
}
|
|
5478
5544
|
if (outline_level !== null && outline_level >= 0 && outline_level <= 8) {
|
|
5479
5545
|
if (normalized_style_name && (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title")) {
|
|
5480
5546
|
return normalized_style_name;
|
|
@@ -5523,6 +5589,146 @@ function _offset_to_page(offset, body_page_offsets) {
|
|
|
5523
5589
|
}
|
|
5524
5590
|
return page;
|
|
5525
5591
|
}
|
|
5592
|
+
function _extract_outline_fast(doc, projected_body, body_page_offsets, paragraph_offsets) {
|
|
5593
|
+
const paragraphs_and_tables = [];
|
|
5594
|
+
const seen_cells = /* @__PURE__ */ new Set();
|
|
5595
|
+
function walk(container) {
|
|
5596
|
+
for (const item of iter_block_items(container)) {
|
|
5597
|
+
const i_type = item.constructor.name;
|
|
5598
|
+
if (i_type === "FootnoteItem") {
|
|
5599
|
+
walk(item);
|
|
5600
|
+
} else if (item instanceof Paragraph) {
|
|
5601
|
+
paragraphs_and_tables.push(["p", item]);
|
|
5602
|
+
} else if (item instanceof Table) {
|
|
5603
|
+
paragraphs_and_tables.push(["t", item]);
|
|
5604
|
+
for (const row of item.rows) {
|
|
5605
|
+
for (const cell of row.cells) {
|
|
5606
|
+
if (seen_cells.has(cell._element)) {
|
|
5607
|
+
continue;
|
|
5608
|
+
}
|
|
5609
|
+
seen_cells.add(cell._element);
|
|
5610
|
+
walk(cell);
|
|
5611
|
+
}
|
|
5612
|
+
}
|
|
5613
|
+
}
|
|
5614
|
+
}
|
|
5615
|
+
}
|
|
5616
|
+
walk(doc);
|
|
5617
|
+
const heading_indices = [];
|
|
5618
|
+
for (let idx = 0; idx < paragraphs_and_tables.length; idx++) {
|
|
5619
|
+
const [kind, item] = paragraphs_and_tables[idx];
|
|
5620
|
+
if (kind !== "p") continue;
|
|
5621
|
+
let hasOffset = false;
|
|
5622
|
+
if (paragraph_offsets instanceof Map) {
|
|
5623
|
+
hasOffset = paragraph_offsets.has(item._element);
|
|
5624
|
+
} else {
|
|
5625
|
+
hasOffset = item._element in paragraph_offsets;
|
|
5626
|
+
}
|
|
5627
|
+
if (!hasOffset) {
|
|
5628
|
+
continue;
|
|
5629
|
+
}
|
|
5630
|
+
if (!_is_heading(item)) continue;
|
|
5631
|
+
if (!_heading_passes_quality_filter_fast(item, projected_body, paragraph_offsets)) continue;
|
|
5632
|
+
heading_indices.push(idx);
|
|
5633
|
+
}
|
|
5634
|
+
if (heading_indices.length === 0) return [];
|
|
5635
|
+
const nodes = [];
|
|
5636
|
+
for (let h_pos = 0; h_pos < heading_indices.length; h_pos++) {
|
|
5637
|
+
const item_idx = heading_indices[h_pos];
|
|
5638
|
+
const paragraph = paragraphs_and_tables[item_idx][1];
|
|
5639
|
+
const level = _heading_level(paragraph);
|
|
5640
|
+
const text = _heading_text_fast(paragraph, projected_body, paragraph_offsets);
|
|
5641
|
+
const style = _determine_heading_style(paragraph);
|
|
5642
|
+
let owned_end = item_idx;
|
|
5643
|
+
for (let next_h_pos = h_pos + 1; next_h_pos < heading_indices.length; next_h_pos++) {
|
|
5644
|
+
const next_idx = heading_indices[next_h_pos];
|
|
5645
|
+
const next_paragraph = paragraphs_and_tables[next_idx][1];
|
|
5646
|
+
if (_heading_level(next_paragraph) <= level) {
|
|
5647
|
+
owned_end = next_idx;
|
|
5648
|
+
break;
|
|
5649
|
+
}
|
|
5650
|
+
}
|
|
5651
|
+
if (owned_end === item_idx) {
|
|
5652
|
+
owned_end = paragraphs_and_tables.length;
|
|
5653
|
+
}
|
|
5654
|
+
const owned = paragraphs_and_tables.slice(item_idx + 1, owned_end);
|
|
5655
|
+
let has_table = false;
|
|
5656
|
+
for (const [kind2, item2] of owned) {
|
|
5657
|
+
if (kind2 === "p" && _is_heading(item2)) {
|
|
5658
|
+
break;
|
|
5659
|
+
}
|
|
5660
|
+
if (kind2 === "t") {
|
|
5661
|
+
has_table = true;
|
|
5662
|
+
break;
|
|
5663
|
+
}
|
|
5664
|
+
}
|
|
5665
|
+
const footnote_ids = _collect_footnote_ids_fast(owned);
|
|
5666
|
+
let para_offset;
|
|
5667
|
+
if (paragraph_offsets instanceof Map) {
|
|
5668
|
+
para_offset = paragraph_offsets.get(paragraph._element);
|
|
5669
|
+
} else {
|
|
5670
|
+
para_offset = paragraph_offsets[paragraph._element];
|
|
5671
|
+
}
|
|
5672
|
+
let page_num = 1;
|
|
5673
|
+
if (para_offset !== void 0) {
|
|
5674
|
+
const [start_offset] = para_offset;
|
|
5675
|
+
page_num = _offset_to_page(start_offset, body_page_offsets);
|
|
5676
|
+
}
|
|
5677
|
+
nodes.push({
|
|
5678
|
+
level,
|
|
5679
|
+
text,
|
|
5680
|
+
page: page_num,
|
|
5681
|
+
style,
|
|
5682
|
+
has_table,
|
|
5683
|
+
footnote_ids
|
|
5684
|
+
});
|
|
5685
|
+
}
|
|
5686
|
+
return nodes;
|
|
5687
|
+
}
|
|
5688
|
+
function _heading_passes_quality_filter_fast(paragraph, projected_body, paragraph_offsets) {
|
|
5689
|
+
const style = _determine_heading_style(paragraph);
|
|
5690
|
+
if (style !== "(heuristic)") return true;
|
|
5691
|
+
const text = _heading_text_fast(paragraph, projected_body, paragraph_offsets);
|
|
5692
|
+
if (!text) return false;
|
|
5693
|
+
const words = text.match(/\w+/g) || [];
|
|
5694
|
+
return words.length >= _HEURISTIC_MIN_WORDS;
|
|
5695
|
+
}
|
|
5696
|
+
function _heading_text_fast(paragraph, projected_body, paragraph_offsets) {
|
|
5697
|
+
let offset;
|
|
5698
|
+
if (paragraph_offsets instanceof Map) {
|
|
5699
|
+
offset = paragraph_offsets.get(paragraph._element);
|
|
5700
|
+
} else {
|
|
5701
|
+
offset = paragraph_offsets[paragraph._element];
|
|
5702
|
+
}
|
|
5703
|
+
if (offset === void 0) {
|
|
5704
|
+
return "";
|
|
5705
|
+
}
|
|
5706
|
+
const [start, length] = offset;
|
|
5707
|
+
const raw = projected_body.substring(start, start + length);
|
|
5708
|
+
let cleaned = _strip_critic_markup(raw);
|
|
5709
|
+
cleaned = _strip_inline_formatting(cleaned);
|
|
5710
|
+
cleaned = cleaned.replace(/^#+\s+/, "");
|
|
5711
|
+
return cleaned.trim();
|
|
5712
|
+
}
|
|
5713
|
+
function _collect_footnote_ids_fast(owned_items) {
|
|
5714
|
+
const seen = /* @__PURE__ */ new Set();
|
|
5715
|
+
const ordered = [];
|
|
5716
|
+
for (const [kind, item] of owned_items) {
|
|
5717
|
+
if (kind !== "p") continue;
|
|
5718
|
+
for (const event of iter_paragraph_content(item)) {
|
|
5719
|
+
if (!("type" in event)) continue;
|
|
5720
|
+
let fn_id = "";
|
|
5721
|
+
if (event.type === "footnote") fn_id = `fn-${event.id}`;
|
|
5722
|
+
else if (event.type === "endnote") fn_id = `en-${event.id}`;
|
|
5723
|
+
else continue;
|
|
5724
|
+
if (!seen.has(fn_id)) {
|
|
5725
|
+
seen.add(fn_id);
|
|
5726
|
+
ordered.push(fn_id);
|
|
5727
|
+
}
|
|
5728
|
+
}
|
|
5729
|
+
}
|
|
5730
|
+
return ordered;
|
|
5731
|
+
}
|
|
5526
5732
|
|
|
5527
5733
|
// src/sanitize/report.ts
|
|
5528
5734
|
var SanitizeReport = class {
|
|
@@ -6209,6 +6415,7 @@ export {
|
|
|
6209
6415
|
DocumentMapper,
|
|
6210
6416
|
DocumentObject,
|
|
6211
6417
|
RedlineEngine,
|
|
6418
|
+
_extractTextFromDoc,
|
|
6212
6419
|
apply_edits_to_markdown,
|
|
6213
6420
|
create_unified_diff,
|
|
6214
6421
|
create_word_patch_diff,
|