@adeu/core 1.10.0 → 1.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +259 -30
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +9 -3
- package/dist/index.d.ts +9 -3
- package/dist/index.js +258 -30
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/src/engine.bugs.test.ts +2 -2
- package/src/engine.ts +179 -49
- package/src/index.ts +1 -1
- package/src/ingest.ts +32 -8
- package/src/mapper.ts +14 -8
- package/src/outline.ts +196 -1
- package/src/parity_gaps.test.ts +98 -0
package/dist/index.d.cts
CHANGED
|
@@ -68,12 +68,13 @@ interface TextSpan {
|
|
|
68
68
|
declare class DocumentMapper {
|
|
69
69
|
doc: DocumentObject;
|
|
70
70
|
clean_view: boolean;
|
|
71
|
+
original_view: boolean;
|
|
71
72
|
comments_map: Record<string, any>;
|
|
72
73
|
full_text: string;
|
|
73
74
|
spans: TextSpan[];
|
|
74
75
|
appendix_start_index: number;
|
|
75
76
|
private _text_chunks;
|
|
76
|
-
constructor(doc: DocumentObject, clean_view?: boolean);
|
|
77
|
+
constructor(doc: DocumentObject, clean_view?: boolean, original_view?: boolean);
|
|
77
78
|
private _build_map;
|
|
78
79
|
private _map_blocks;
|
|
79
80
|
private _map_table;
|
|
@@ -178,6 +179,7 @@ declare class RedlineEngine {
|
|
|
178
179
|
mapper: DocumentMapper;
|
|
179
180
|
comments_manager: CommentsManager;
|
|
180
181
|
clean_mapper: DocumentMapper | null;
|
|
182
|
+
original_mapper: DocumentMapper | null;
|
|
181
183
|
skipped_details: string[];
|
|
182
184
|
constructor(doc: DocumentObject, author?: string);
|
|
183
185
|
private _check_punctuation_warning;
|
|
@@ -321,9 +323,13 @@ interface OutlineNode {
|
|
|
321
323
|
has_table: boolean;
|
|
322
324
|
footnote_ids: string[];
|
|
323
325
|
}
|
|
324
|
-
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | null): OutlineNode[];
|
|
326
|
+
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | Map<any, [number, number]> | null): OutlineNode[];
|
|
325
327
|
|
|
326
328
|
declare function extractTextFromBuffer(buffer: Buffer, cleanView?: boolean): Promise<string>;
|
|
329
|
+
declare function _extractTextFromDoc(doc: DocumentObject, cleanView?: boolean, includeAppendix?: boolean, return_paragraph_offsets?: boolean): string | {
|
|
330
|
+
text: string;
|
|
331
|
+
paragraph_offsets: Map<any, [number, number]>;
|
|
332
|
+
};
|
|
327
333
|
|
|
328
334
|
interface FinalizeOptions {
|
|
329
335
|
filename: string;
|
|
@@ -342,4 +348,4 @@ declare function finalize_document(doc: DocumentObject, options: FinalizeOptions
|
|
|
342
348
|
|
|
343
349
|
declare function identifyEngine(): string;
|
|
344
350
|
|
|
345
|
-
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
|
351
|
+
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, _extractTextFromDoc, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
package/dist/index.d.ts
CHANGED
|
@@ -68,12 +68,13 @@ interface TextSpan {
|
|
|
68
68
|
declare class DocumentMapper {
|
|
69
69
|
doc: DocumentObject;
|
|
70
70
|
clean_view: boolean;
|
|
71
|
+
original_view: boolean;
|
|
71
72
|
comments_map: Record<string, any>;
|
|
72
73
|
full_text: string;
|
|
73
74
|
spans: TextSpan[];
|
|
74
75
|
appendix_start_index: number;
|
|
75
76
|
private _text_chunks;
|
|
76
|
-
constructor(doc: DocumentObject, clean_view?: boolean);
|
|
77
|
+
constructor(doc: DocumentObject, clean_view?: boolean, original_view?: boolean);
|
|
77
78
|
private _build_map;
|
|
78
79
|
private _map_blocks;
|
|
79
80
|
private _map_table;
|
|
@@ -178,6 +179,7 @@ declare class RedlineEngine {
|
|
|
178
179
|
mapper: DocumentMapper;
|
|
179
180
|
comments_manager: CommentsManager;
|
|
180
181
|
clean_mapper: DocumentMapper | null;
|
|
182
|
+
original_mapper: DocumentMapper | null;
|
|
181
183
|
skipped_details: string[];
|
|
182
184
|
constructor(doc: DocumentObject, author?: string);
|
|
183
185
|
private _check_punctuation_warning;
|
|
@@ -321,9 +323,13 @@ interface OutlineNode {
|
|
|
321
323
|
has_table: boolean;
|
|
322
324
|
footnote_ids: string[];
|
|
323
325
|
}
|
|
324
|
-
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | null): OutlineNode[];
|
|
326
|
+
declare function extract_outline(doc: DocumentObject, projected_body: string, body_pages: string[], body_page_offsets: number[], paragraph_offsets?: Record<string, [number, number]> | Map<any, [number, number]> | null): OutlineNode[];
|
|
325
327
|
|
|
326
328
|
declare function extractTextFromBuffer(buffer: Buffer, cleanView?: boolean): Promise<string>;
|
|
329
|
+
declare function _extractTextFromDoc(doc: DocumentObject, cleanView?: boolean, includeAppendix?: boolean, return_paragraph_offsets?: boolean): string | {
|
|
330
|
+
text: string;
|
|
331
|
+
paragraph_offsets: Map<any, [number, number]>;
|
|
332
|
+
};
|
|
327
333
|
|
|
328
334
|
interface FinalizeOptions {
|
|
329
335
|
filename: string;
|
|
@@ -342,4 +348,4 @@ declare function finalize_document(doc: DocumentObject, options: FinalizeOptions
|
|
|
342
348
|
|
|
343
349
|
declare function identifyEngine(): string;
|
|
344
350
|
|
|
345
|
-
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
|
351
|
+
export { BatchValidationError, DocumentMapper, DocumentObject, type FinalizeOptions, type FinalizeResult, type OutlineNode, type PageInfo, type PaginationResult, RedlineEngine, type TextSpan, _extractTextFromDoc, apply_edits_to_markdown, create_unified_diff, create_word_patch_diff, extractTextFromBuffer, extract_outline, finalize_document, generate_edits_from_text, identifyEngine, paginate, split_structural_appendix, trim_common_context };
|
package/dist/index.js
CHANGED
|
@@ -1216,14 +1216,16 @@ function* iter_paragraph_content(paragraph) {
|
|
|
1216
1216
|
var DocumentMapper = class {
|
|
1217
1217
|
doc;
|
|
1218
1218
|
clean_view;
|
|
1219
|
+
original_view;
|
|
1219
1220
|
comments_map;
|
|
1220
1221
|
full_text = "";
|
|
1221
1222
|
spans = [];
|
|
1222
1223
|
appendix_start_index = -1;
|
|
1223
1224
|
_text_chunks = [];
|
|
1224
|
-
constructor(doc, clean_view = false) {
|
|
1225
|
+
constructor(doc, clean_view = false, original_view = false) {
|
|
1225
1226
|
this.doc = doc;
|
|
1226
1227
|
this.clean_view = clean_view;
|
|
1228
|
+
this.original_view = original_view;
|
|
1227
1229
|
this.comments_map = extract_comments_data(doc.pkg);
|
|
1228
1230
|
this._build_map();
|
|
1229
1231
|
}
|
|
@@ -1305,14 +1307,15 @@ ${header}`;
|
|
|
1305
1307
|
const ins = trPr ? findChild(trPr, "w:ins") : null;
|
|
1306
1308
|
const del_node = trPr ? findChild(trPr, "w:del") : null;
|
|
1307
1309
|
if (this.clean_view && del_node) continue;
|
|
1310
|
+
if (this.original_view && ins) continue;
|
|
1308
1311
|
if (rows_processed > 0) {
|
|
1309
1312
|
this._add_virtual_text("\n", current, null);
|
|
1310
1313
|
current += 1;
|
|
1311
1314
|
}
|
|
1312
|
-
if (ins && !this.clean_view) {
|
|
1315
|
+
if (ins && !this.clean_view && !this.original_view) {
|
|
1313
1316
|
this._add_virtual_text("{++ ", current, null);
|
|
1314
1317
|
current += 4;
|
|
1315
|
-
} else if (del_node && !this.clean_view) {
|
|
1318
|
+
} else if (del_node && !this.clean_view && !this.original_view) {
|
|
1316
1319
|
this._add_virtual_text("{-- ", current, null);
|
|
1317
1320
|
current += 4;
|
|
1318
1321
|
}
|
|
@@ -1328,11 +1331,11 @@ ${header}`;
|
|
|
1328
1331
|
current = this._map_blocks(cell, current);
|
|
1329
1332
|
cells_processed += 1;
|
|
1330
1333
|
}
|
|
1331
|
-
if (ins && !this.clean_view) {
|
|
1334
|
+
if (ins && !this.clean_view && !this.original_view) {
|
|
1332
1335
|
const suffix = ` |Chg:${ins.getAttribute("w:id")}++}`;
|
|
1333
1336
|
this._add_virtual_text(suffix, current, null);
|
|
1334
1337
|
current += suffix.length;
|
|
1335
|
-
} else if (del_node && !this.clean_view) {
|
|
1338
|
+
} else if (del_node && !this.clean_view && !this.original_view) {
|
|
1336
1339
|
const suffix = ` |Chg:${del_node.getAttribute("w:id")}--}`;
|
|
1337
1340
|
this._add_virtual_text(suffix, current, null);
|
|
1338
1341
|
current += suffix.length;
|
|
@@ -1426,11 +1429,13 @@ ${header}`;
|
|
|
1426
1429
|
}
|
|
1427
1430
|
if (this.clean_view && Object.keys(active_del).length > 0) {
|
|
1428
1431
|
}
|
|
1432
|
+
if (this.original_view && Object.keys(active_ins).length > 0) {
|
|
1433
|
+
}
|
|
1429
1434
|
const full_seg_text = run_parts.map((x) => x[1]).join("");
|
|
1430
1435
|
const curr_ins_id = Object.keys(active_ins).pop() || null;
|
|
1431
1436
|
const curr_del_id = Object.keys(active_del).pop() || null;
|
|
1432
|
-
if (full_seg_text && !(this.clean_view && curr_del_id)) {
|
|
1433
|
-
const new_wrappers = this.clean_view ? ["", ""] : this._get_wrappers(curr_ins_id, curr_del_id, active_ids, active_fmt);
|
|
1437
|
+
if (full_seg_text && !(this.clean_view && curr_del_id) && !(this.original_view && curr_ins_id)) {
|
|
1438
|
+
const new_wrappers = this.clean_view || this.original_view ? ["", ""] : this._get_wrappers(curr_ins_id, curr_del_id, active_ids, active_fmt);
|
|
1434
1439
|
const new_style = [prefix, suffix];
|
|
1435
1440
|
if (pending_runs.length > 0 && new_wrappers[0] === current_wrappers[0] && new_wrappers[1] === current_wrappers[1]) {
|
|
1436
1441
|
let skip_leading_prefix = false;
|
|
@@ -1455,7 +1460,7 @@ ${header}`;
|
|
|
1455
1460
|
}
|
|
1456
1461
|
}
|
|
1457
1462
|
}
|
|
1458
|
-
if (!this.clean_view) {
|
|
1463
|
+
if (!this.clean_view && !this.original_view) {
|
|
1459
1464
|
const has_meta = Object.keys(active_ins).length > 0 || Object.keys(active_del).length > 0 || active_ids.size > 0 || Object.keys(active_fmt).length > 0;
|
|
1460
1465
|
if (has_meta) {
|
|
1461
1466
|
deferred_meta_states.push([{ ...active_ins }, { ...active_del }, new Set(active_ids), { ...active_fmt }]);
|
|
@@ -2631,6 +2636,7 @@ var RedlineEngine = class {
|
|
|
2631
2636
|
mapper;
|
|
2632
2637
|
comments_manager;
|
|
2633
2638
|
clean_mapper = null;
|
|
2639
|
+
original_mapper = null;
|
|
2634
2640
|
skipped_details = [];
|
|
2635
2641
|
constructor(doc, author = "Adeu AI (TS)") {
|
|
2636
2642
|
this.doc = doc;
|
|
@@ -2670,7 +2676,10 @@ var RedlineEngine = class {
|
|
|
2670
2676
|
if (!full_text) return [null, null];
|
|
2671
2677
|
const before_start = Math.max(0, start_idx - 30);
|
|
2672
2678
|
const context_before = full_text.substring(before_start, start_idx);
|
|
2673
|
-
const context_after = full_text.substring(
|
|
2679
|
+
const context_after = full_text.substring(
|
|
2680
|
+
start_idx + length,
|
|
2681
|
+
start_idx + length + 30
|
|
2682
|
+
);
|
|
2674
2683
|
const critic_markup = `${context_before}{--${target_text}--}{++${new_text}++}${context_after}`;
|
|
2675
2684
|
let clean_text = critic_markup;
|
|
2676
2685
|
clean_text = clean_text.replace(/\{>>.*?<<\}/gs, "");
|
|
@@ -2842,7 +2851,9 @@ var RedlineEngine = class {
|
|
|
2842
2851
|
overrideEl.parentNode?.removeChild(overrideEl);
|
|
2843
2852
|
}
|
|
2844
2853
|
}
|
|
2845
|
-
pkg.parts = pkg.parts.filter(
|
|
2854
|
+
pkg.parts = pkg.parts.filter(
|
|
2855
|
+
(p) => !p.partname.toLowerCase().includes("comments")
|
|
2856
|
+
);
|
|
2846
2857
|
for (const key of Object.keys(pkg.unzipped)) {
|
|
2847
2858
|
if (key.toLowerCase().includes("comments")) {
|
|
2848
2859
|
delete pkg.unzipped[key];
|
|
@@ -3387,13 +3398,17 @@ var RedlineEngine = class {
|
|
|
3387
3398
|
if (!edit.target_text) continue;
|
|
3388
3399
|
let matches = this.mapper.find_all_match_indices(edit.target_text);
|
|
3389
3400
|
let activeText = this.mapper.full_text;
|
|
3401
|
+
let target_mapper = this.mapper;
|
|
3390
3402
|
if (matches.length === 0) {
|
|
3391
3403
|
if (!this.clean_mapper)
|
|
3392
3404
|
this.clean_mapper = new DocumentMapper(this.doc, true);
|
|
3393
3405
|
matches = this.clean_mapper.find_all_match_indices(edit.target_text);
|
|
3394
|
-
if (matches.length > 0)
|
|
3406
|
+
if (matches.length > 0) {
|
|
3407
|
+
activeText = this.clean_mapper.full_text;
|
|
3408
|
+
target_mapper = this.clean_mapper;
|
|
3409
|
+
}
|
|
3395
3410
|
}
|
|
3396
|
-
if (activeText === this.mapper.full_text && matches.length >
|
|
3411
|
+
if (activeText === this.mapper.full_text && matches.length > 0) {
|
|
3397
3412
|
const liveMatches = matches.filter(([start, length]) => {
|
|
3398
3413
|
const realSpans = this.mapper.spans.filter(
|
|
3399
3414
|
(s) => s.run !== null && s.end > start && s.start < start + length
|
|
@@ -3401,13 +3416,51 @@ var RedlineEngine = class {
|
|
|
3401
3416
|
if (realSpans.length === 0) return true;
|
|
3402
3417
|
return realSpans.some((s) => !s.del_id);
|
|
3403
3418
|
});
|
|
3404
|
-
|
|
3419
|
+
matches = liveMatches;
|
|
3405
3420
|
}
|
|
3421
|
+
let is_deleted_text = false;
|
|
3422
|
+
const deleted_authors = /* @__PURE__ */ new Set();
|
|
3406
3423
|
if (matches.length === 0) {
|
|
3407
|
-
|
|
3408
|
-
|
|
3424
|
+
if (!this.original_mapper) {
|
|
3425
|
+
this.original_mapper = new DocumentMapper(this.doc, false, true);
|
|
3426
|
+
}
|
|
3427
|
+
const orig_matches = this.original_mapper.find_all_match_indices(edit.target_text);
|
|
3428
|
+
if (orig_matches.length > 0) {
|
|
3429
|
+
is_deleted_text = true;
|
|
3430
|
+
for (const [start, length] of orig_matches) {
|
|
3431
|
+
const spans = this.original_mapper.spans.filter(
|
|
3432
|
+
(s) => s.end > start && s.start < start + length
|
|
3433
|
+
);
|
|
3434
|
+
for (const s of spans) {
|
|
3435
|
+
if (s.run !== null) {
|
|
3436
|
+
let parent = s.run._element;
|
|
3437
|
+
while (parent) {
|
|
3438
|
+
if (parent.nodeType === 1 && parent.tagName === "w:del") {
|
|
3439
|
+
const auth = parent.getAttribute("w:author");
|
|
3440
|
+
if (auth) {
|
|
3441
|
+
deleted_authors.add(auth);
|
|
3442
|
+
}
|
|
3443
|
+
break;
|
|
3444
|
+
}
|
|
3445
|
+
parent = parent.parentNode;
|
|
3446
|
+
}
|
|
3447
|
+
}
|
|
3448
|
+
}
|
|
3449
|
+
}
|
|
3450
|
+
}
|
|
3451
|
+
}
|
|
3452
|
+
if (matches.length === 0) {
|
|
3453
|
+
if (is_deleted_text) {
|
|
3454
|
+
const author_phrase = deleted_authors.size > 0 ? `by ${Array.from(deleted_authors).sort().join(", ")}` : "by an existing revision";
|
|
3455
|
+
errors.push(
|
|
3456
|
+
`- Edit ${i + 1} Failed: Target text matches text inside a tracked deletion ${author_phrase}. Reject/accept that change first or target the active replacement text instead.`
|
|
3457
|
+
);
|
|
3458
|
+
} else {
|
|
3459
|
+
errors.push(
|
|
3460
|
+
`- Edit ${i + 1} Failed: Target text not found in document:
|
|
3409
3461
|
"${edit.target_text}"`
|
|
3410
|
-
|
|
3462
|
+
);
|
|
3463
|
+
}
|
|
3411
3464
|
} else if (matches.length > 1) {
|
|
3412
3465
|
const positions = matches.map(([start, length]) => [
|
|
3413
3466
|
start,
|
|
@@ -3428,7 +3481,10 @@ var RedlineEngine = class {
|
|
|
3428
3481
|
const [pfx, sfx] = trim_common_context(matched, edit.new_text || "");
|
|
3429
3482
|
const t_end = matched.length - sfx;
|
|
3430
3483
|
const final_target = matched.substring(pfx, t_end);
|
|
3431
|
-
const final_new = (edit.new_text || "").substring(
|
|
3484
|
+
const final_new = (edit.new_text || "").substring(
|
|
3485
|
+
pfx,
|
|
3486
|
+
(edit.new_text || "").length - sfx
|
|
3487
|
+
);
|
|
3432
3488
|
if (final_target.includes("\n\n")) {
|
|
3433
3489
|
if (final_new.includes("\n\n")) {
|
|
3434
3490
|
const parts = matched.split("\n\n");
|
|
@@ -3584,7 +3640,9 @@ var RedlineEngine = class {
|
|
|
3584
3640
|
if (dry_run_mode) {
|
|
3585
3641
|
for (const edit of edits) {
|
|
3586
3642
|
const single_errors = this.validate_edits([edit]);
|
|
3587
|
-
const warning = this._check_punctuation_warning(
|
|
3643
|
+
const warning = this._check_punctuation_warning(
|
|
3644
|
+
edit.target_text || ""
|
|
3645
|
+
);
|
|
3588
3646
|
if (single_errors.length > 0) {
|
|
3589
3647
|
skipped_edits++;
|
|
3590
3648
|
edits_reports.push({
|
|
@@ -3638,7 +3696,9 @@ var RedlineEngine = class {
|
|
|
3638
3696
|
for (const edit of cloned_edits) {
|
|
3639
3697
|
const success = edit._applied_status || false;
|
|
3640
3698
|
const error_msg = edit._error_msg || null;
|
|
3641
|
-
const warning = this._check_punctuation_warning(
|
|
3699
|
+
const warning = this._check_punctuation_warning(
|
|
3700
|
+
edit.target_text || ""
|
|
3701
|
+
);
|
|
3642
3702
|
let critic_markup = null;
|
|
3643
3703
|
let clean_text = null;
|
|
3644
3704
|
if (success) {
|
|
@@ -3666,7 +3726,7 @@ var RedlineEngine = class {
|
|
|
3666
3726
|
skipped_details: this.skipped_details,
|
|
3667
3727
|
edits: edits_reports,
|
|
3668
3728
|
engine: "node",
|
|
3669
|
-
version: "1.
|
|
3729
|
+
version: "1.10.0"
|
|
3670
3730
|
};
|
|
3671
3731
|
}
|
|
3672
3732
|
apply_edits(edits) {
|
|
@@ -4189,7 +4249,10 @@ var RedlineEngine = class {
|
|
|
4189
4249
|
if (result.first_node.tagName === "w:p") {
|
|
4190
4250
|
first_anchor_target = findAllDescendants(result.first_node, "w:ins")[0] || result.first_node;
|
|
4191
4251
|
}
|
|
4192
|
-
const anchor = ascend_to_paragraph_child(
|
|
4252
|
+
const anchor = ascend_to_paragraph_child(
|
|
4253
|
+
first_anchor_target,
|
|
4254
|
+
host_p
|
|
4255
|
+
);
|
|
4193
4256
|
this._attach_comment(host_p, anchor, anchor, edit.comment);
|
|
4194
4257
|
}
|
|
4195
4258
|
}
|
|
@@ -4201,7 +4264,10 @@ var RedlineEngine = class {
|
|
|
4201
4264
|
length,
|
|
4202
4265
|
rebuild_map
|
|
4203
4266
|
);
|
|
4204
|
-
const virtual_spans = active_mapper.get_virtual_spans_in_range(
|
|
4267
|
+
const virtual_spans = active_mapper.get_virtual_spans_in_range(
|
|
4268
|
+
start_idx,
|
|
4269
|
+
length
|
|
4270
|
+
);
|
|
4205
4271
|
if (target_runs.length === 0 && virtual_spans.length === 0) return false;
|
|
4206
4272
|
const affected_ps = /* @__PURE__ */ new Set();
|
|
4207
4273
|
for (const run of target_runs) {
|
|
@@ -4284,7 +4350,10 @@ var RedlineEngine = class {
|
|
|
4284
4350
|
let pPr = findChild(p1_element, "w:pPr");
|
|
4285
4351
|
if (!pPr) {
|
|
4286
4352
|
pPr = p1_element.ownerDocument.createElement("w:pPr");
|
|
4287
|
-
p1_element.insertBefore(
|
|
4353
|
+
p1_element.insertBefore(
|
|
4354
|
+
pPr,
|
|
4355
|
+
p1_element.firstChild
|
|
4356
|
+
);
|
|
4288
4357
|
}
|
|
4289
4358
|
let rPr = findChild(pPr, "w:rPr");
|
|
4290
4359
|
if (!rPr) {
|
|
@@ -4813,13 +4882,14 @@ async function extractTextFromBuffer(buffer, cleanView = false) {
|
|
|
4813
4882
|
const doc = await DocumentObject.load(buffer);
|
|
4814
4883
|
return _extractTextFromDoc(doc, cleanView);
|
|
4815
4884
|
}
|
|
4816
|
-
function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true) {
|
|
4885
|
+
function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true, return_paragraph_offsets = false) {
|
|
4817
4886
|
const comments_map = extract_comments_data(doc.pkg);
|
|
4818
4887
|
const full_text = [];
|
|
4888
|
+
const paragraph_offsets = /* @__PURE__ */ new Map();
|
|
4819
4889
|
let cursor = 0;
|
|
4820
4890
|
for (const part of iter_document_parts(doc)) {
|
|
4821
4891
|
const part_cursor = full_text.length > 0 ? cursor + 2 : cursor;
|
|
4822
|
-
const part_text = _extract_blocks(part, comments_map, cleanView, part_cursor);
|
|
4892
|
+
const part_text = _extract_blocks(part, comments_map, cleanView, part_cursor, return_paragraph_offsets ? paragraph_offsets : void 0);
|
|
4823
4893
|
if (part_text) {
|
|
4824
4894
|
if (full_text.length > 0) cursor += 2;
|
|
4825
4895
|
full_text.push(part_text);
|
|
@@ -4831,9 +4901,12 @@ function _extractTextFromDoc(doc, cleanView = false, includeAppendix = true) {
|
|
|
4831
4901
|
const appendix = build_structural_appendix(doc, base_text);
|
|
4832
4902
|
if (appendix) base_text += appendix;
|
|
4833
4903
|
}
|
|
4904
|
+
if (return_paragraph_offsets) {
|
|
4905
|
+
return { text: base_text, paragraph_offsets };
|
|
4906
|
+
}
|
|
4834
4907
|
return base_text;
|
|
4835
4908
|
}
|
|
4836
|
-
function _extract_blocks(container, comments_map, cleanView, cursor) {
|
|
4909
|
+
function _extract_blocks(container, comments_map, cleanView, cursor, paragraph_offsets) {
|
|
4837
4910
|
const part = container.part || container;
|
|
4838
4911
|
const [style_cache, default_pstyle] = _get_style_cache(part);
|
|
4839
4912
|
const blocks = [];
|
|
@@ -4852,7 +4925,7 @@ ${header}`;
|
|
|
4852
4925
|
if (!is_first_block) local_cursor += 2;
|
|
4853
4926
|
const block_start = local_cursor;
|
|
4854
4927
|
if (item.constructor.name === "FootnoteItem") {
|
|
4855
|
-
const fn_text = _extract_blocks(item, comments_map, cleanView, block_start);
|
|
4928
|
+
const fn_text = _extract_blocks(item, comments_map, cleanView, block_start, paragraph_offsets);
|
|
4856
4929
|
if (fn_text) {
|
|
4857
4930
|
blocks.push(fn_text);
|
|
4858
4931
|
local_cursor = block_start + fn_text.length;
|
|
@@ -4868,11 +4941,14 @@ ${header}`;
|
|
|
4868
4941
|
const p_text = build_paragraph_text(item, comments_map, cleanView, style_cache, default_pstyle);
|
|
4869
4942
|
const full_block = prefix + p_text;
|
|
4870
4943
|
blocks.push(full_block);
|
|
4944
|
+
if (paragraph_offsets) {
|
|
4945
|
+
paragraph_offsets.set(item._element, [block_start, full_block.length]);
|
|
4946
|
+
}
|
|
4871
4947
|
local_cursor = block_start + full_block.length;
|
|
4872
4948
|
is_first_para = false;
|
|
4873
4949
|
is_first_block = false;
|
|
4874
4950
|
} else if (item instanceof Table) {
|
|
4875
|
-
const table_text = extract_table(item, comments_map, cleanView, block_start);
|
|
4951
|
+
const table_text = extract_table(item, comments_map, cleanView, block_start, paragraph_offsets);
|
|
4876
4952
|
if (table_text) {
|
|
4877
4953
|
blocks.push(table_text);
|
|
4878
4954
|
local_cursor = block_start + table_text.length;
|
|
@@ -4885,7 +4961,7 @@ ${header}`;
|
|
|
4885
4961
|
}
|
|
4886
4962
|
return blocks.join("\n\n");
|
|
4887
4963
|
}
|
|
4888
|
-
function extract_table(table, comments_map, cleanView, cursor) {
|
|
4964
|
+
function extract_table(table, comments_map, cleanView, cursor, paragraph_offsets) {
|
|
4889
4965
|
const rows_text = [];
|
|
4890
4966
|
let rows_processed = 0;
|
|
4891
4967
|
let local_cursor = cursor;
|
|
@@ -4904,7 +4980,7 @@ function extract_table(table, comments_map, cleanView, cursor) {
|
|
|
4904
4980
|
if (seen_cells.has(cell)) continue;
|
|
4905
4981
|
seen_cells.add(cell);
|
|
4906
4982
|
if (!first_cell) cell_cursor += 3;
|
|
4907
|
-
const cell_content = _extract_blocks(cell, comments_map, cleanView, cell_cursor);
|
|
4983
|
+
const cell_content = _extract_blocks(cell, comments_map, cleanView, cell_cursor, paragraph_offsets);
|
|
4908
4984
|
cell_texts.push(cell_content);
|
|
4909
4985
|
cell_cursor += cell_content.length;
|
|
4910
4986
|
first_cell = false;
|
|
@@ -5161,6 +5237,9 @@ function extract_outline(doc, projected_body, body_pages, body_page_offsets, par
|
|
|
5161
5237
|
if (body_pages.length !== body_page_offsets.length) {
|
|
5162
5238
|
throw new Error("body_pages and body_page_offsets length mismatch");
|
|
5163
5239
|
}
|
|
5240
|
+
if (paragraph_offsets) {
|
|
5241
|
+
return _extract_outline_fast(doc, projected_body, body_page_offsets, paragraph_offsets);
|
|
5242
|
+
}
|
|
5164
5243
|
const comments_map = extract_comments_data(doc.pkg);
|
|
5165
5244
|
const block_records = _walk_doc_body(doc, comments_map);
|
|
5166
5245
|
const heading_indices = [];
|
|
@@ -5436,6 +5515,7 @@ function _determine_heading_style(paragraph) {
|
|
|
5436
5515
|
if (pStyle) style_id = pStyle.getAttribute("w:val") || default_pstyle;
|
|
5437
5516
|
}
|
|
5438
5517
|
let outline_level = null;
|
|
5518
|
+
let outline_level_from_style = false;
|
|
5439
5519
|
if (pPr) {
|
|
5440
5520
|
const oLvl = findChild(pPr, "w:outlineLvl");
|
|
5441
5521
|
if (oLvl && /^\d+$/.test(oLvl.getAttribute("w:val") || "")) {
|
|
@@ -5444,6 +5524,7 @@ function _determine_heading_style(paragraph) {
|
|
|
5444
5524
|
}
|
|
5445
5525
|
if (outline_level === null && style_id && style_cache && style_cache[style_id]) {
|
|
5446
5526
|
outline_level = style_cache[style_id].outline_level;
|
|
5527
|
+
outline_level_from_style = true;
|
|
5447
5528
|
}
|
|
5448
5529
|
const style_name = style_id && style_cache && style_cache[style_id] ? style_cache[style_id].name : style_id;
|
|
5449
5530
|
let normalized_style_name = style_name;
|
|
@@ -5454,6 +5535,12 @@ function _determine_heading_style(paragraph) {
|
|
|
5454
5535
|
normalized_style_name = "Title";
|
|
5455
5536
|
}
|
|
5456
5537
|
}
|
|
5538
|
+
if (outline_level_from_style && outline_level !== null) {
|
|
5539
|
+
const is_heading_or_title = normalized_style_name && (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title");
|
|
5540
|
+
if (!is_heading_or_title) {
|
|
5541
|
+
outline_level = null;
|
|
5542
|
+
}
|
|
5543
|
+
}
|
|
5457
5544
|
if (outline_level !== null && outline_level >= 0 && outline_level <= 8) {
|
|
5458
5545
|
if (normalized_style_name && (normalized_style_name.startsWith("Heading") || normalized_style_name === "Title")) {
|
|
5459
5546
|
return normalized_style_name;
|
|
@@ -5502,6 +5589,146 @@ function _offset_to_page(offset, body_page_offsets) {
|
|
|
5502
5589
|
}
|
|
5503
5590
|
return page;
|
|
5504
5591
|
}
|
|
5592
|
+
function _extract_outline_fast(doc, projected_body, body_page_offsets, paragraph_offsets) {
|
|
5593
|
+
const paragraphs_and_tables = [];
|
|
5594
|
+
const seen_cells = /* @__PURE__ */ new Set();
|
|
5595
|
+
function walk(container) {
|
|
5596
|
+
for (const item of iter_block_items(container)) {
|
|
5597
|
+
const i_type = item.constructor.name;
|
|
5598
|
+
if (i_type === "FootnoteItem") {
|
|
5599
|
+
walk(item);
|
|
5600
|
+
} else if (item instanceof Paragraph) {
|
|
5601
|
+
paragraphs_and_tables.push(["p", item]);
|
|
5602
|
+
} else if (item instanceof Table) {
|
|
5603
|
+
paragraphs_and_tables.push(["t", item]);
|
|
5604
|
+
for (const row of item.rows) {
|
|
5605
|
+
for (const cell of row.cells) {
|
|
5606
|
+
if (seen_cells.has(cell._element)) {
|
|
5607
|
+
continue;
|
|
5608
|
+
}
|
|
5609
|
+
seen_cells.add(cell._element);
|
|
5610
|
+
walk(cell);
|
|
5611
|
+
}
|
|
5612
|
+
}
|
|
5613
|
+
}
|
|
5614
|
+
}
|
|
5615
|
+
}
|
|
5616
|
+
walk(doc);
|
|
5617
|
+
const heading_indices = [];
|
|
5618
|
+
for (let idx = 0; idx < paragraphs_and_tables.length; idx++) {
|
|
5619
|
+
const [kind, item] = paragraphs_and_tables[idx];
|
|
5620
|
+
if (kind !== "p") continue;
|
|
5621
|
+
let hasOffset = false;
|
|
5622
|
+
if (paragraph_offsets instanceof Map) {
|
|
5623
|
+
hasOffset = paragraph_offsets.has(item._element);
|
|
5624
|
+
} else {
|
|
5625
|
+
hasOffset = item._element in paragraph_offsets;
|
|
5626
|
+
}
|
|
5627
|
+
if (!hasOffset) {
|
|
5628
|
+
continue;
|
|
5629
|
+
}
|
|
5630
|
+
if (!_is_heading(item)) continue;
|
|
5631
|
+
if (!_heading_passes_quality_filter_fast(item, projected_body, paragraph_offsets)) continue;
|
|
5632
|
+
heading_indices.push(idx);
|
|
5633
|
+
}
|
|
5634
|
+
if (heading_indices.length === 0) return [];
|
|
5635
|
+
const nodes = [];
|
|
5636
|
+
for (let h_pos = 0; h_pos < heading_indices.length; h_pos++) {
|
|
5637
|
+
const item_idx = heading_indices[h_pos];
|
|
5638
|
+
const paragraph = paragraphs_and_tables[item_idx][1];
|
|
5639
|
+
const level = _heading_level(paragraph);
|
|
5640
|
+
const text = _heading_text_fast(paragraph, projected_body, paragraph_offsets);
|
|
5641
|
+
const style = _determine_heading_style(paragraph);
|
|
5642
|
+
let owned_end = item_idx;
|
|
5643
|
+
for (let next_h_pos = h_pos + 1; next_h_pos < heading_indices.length; next_h_pos++) {
|
|
5644
|
+
const next_idx = heading_indices[next_h_pos];
|
|
5645
|
+
const next_paragraph = paragraphs_and_tables[next_idx][1];
|
|
5646
|
+
if (_heading_level(next_paragraph) <= level) {
|
|
5647
|
+
owned_end = next_idx;
|
|
5648
|
+
break;
|
|
5649
|
+
}
|
|
5650
|
+
}
|
|
5651
|
+
if (owned_end === item_idx) {
|
|
5652
|
+
owned_end = paragraphs_and_tables.length;
|
|
5653
|
+
}
|
|
5654
|
+
const owned = paragraphs_and_tables.slice(item_idx + 1, owned_end);
|
|
5655
|
+
let has_table = false;
|
|
5656
|
+
for (const [kind2, item2] of owned) {
|
|
5657
|
+
if (kind2 === "p" && _is_heading(item2)) {
|
|
5658
|
+
break;
|
|
5659
|
+
}
|
|
5660
|
+
if (kind2 === "t") {
|
|
5661
|
+
has_table = true;
|
|
5662
|
+
break;
|
|
5663
|
+
}
|
|
5664
|
+
}
|
|
5665
|
+
const footnote_ids = _collect_footnote_ids_fast(owned);
|
|
5666
|
+
let para_offset;
|
|
5667
|
+
if (paragraph_offsets instanceof Map) {
|
|
5668
|
+
para_offset = paragraph_offsets.get(paragraph._element);
|
|
5669
|
+
} else {
|
|
5670
|
+
para_offset = paragraph_offsets[paragraph._element];
|
|
5671
|
+
}
|
|
5672
|
+
let page_num = 1;
|
|
5673
|
+
if (para_offset !== void 0) {
|
|
5674
|
+
const [start_offset] = para_offset;
|
|
5675
|
+
page_num = _offset_to_page(start_offset, body_page_offsets);
|
|
5676
|
+
}
|
|
5677
|
+
nodes.push({
|
|
5678
|
+
level,
|
|
5679
|
+
text,
|
|
5680
|
+
page: page_num,
|
|
5681
|
+
style,
|
|
5682
|
+
has_table,
|
|
5683
|
+
footnote_ids
|
|
5684
|
+
});
|
|
5685
|
+
}
|
|
5686
|
+
return nodes;
|
|
5687
|
+
}
|
|
5688
|
+
function _heading_passes_quality_filter_fast(paragraph, projected_body, paragraph_offsets) {
|
|
5689
|
+
const style = _determine_heading_style(paragraph);
|
|
5690
|
+
if (style !== "(heuristic)") return true;
|
|
5691
|
+
const text = _heading_text_fast(paragraph, projected_body, paragraph_offsets);
|
|
5692
|
+
if (!text) return false;
|
|
5693
|
+
const words = text.match(/\w+/g) || [];
|
|
5694
|
+
return words.length >= _HEURISTIC_MIN_WORDS;
|
|
5695
|
+
}
|
|
5696
|
+
function _heading_text_fast(paragraph, projected_body, paragraph_offsets) {
|
|
5697
|
+
let offset;
|
|
5698
|
+
if (paragraph_offsets instanceof Map) {
|
|
5699
|
+
offset = paragraph_offsets.get(paragraph._element);
|
|
5700
|
+
} else {
|
|
5701
|
+
offset = paragraph_offsets[paragraph._element];
|
|
5702
|
+
}
|
|
5703
|
+
if (offset === void 0) {
|
|
5704
|
+
return "";
|
|
5705
|
+
}
|
|
5706
|
+
const [start, length] = offset;
|
|
5707
|
+
const raw = projected_body.substring(start, start + length);
|
|
5708
|
+
let cleaned = _strip_critic_markup(raw);
|
|
5709
|
+
cleaned = _strip_inline_formatting(cleaned);
|
|
5710
|
+
cleaned = cleaned.replace(/^#+\s+/, "");
|
|
5711
|
+
return cleaned.trim();
|
|
5712
|
+
}
|
|
5713
|
+
function _collect_footnote_ids_fast(owned_items) {
|
|
5714
|
+
const seen = /* @__PURE__ */ new Set();
|
|
5715
|
+
const ordered = [];
|
|
5716
|
+
for (const [kind, item] of owned_items) {
|
|
5717
|
+
if (kind !== "p") continue;
|
|
5718
|
+
for (const event of iter_paragraph_content(item)) {
|
|
5719
|
+
if (!("type" in event)) continue;
|
|
5720
|
+
let fn_id = "";
|
|
5721
|
+
if (event.type === "footnote") fn_id = `fn-${event.id}`;
|
|
5722
|
+
else if (event.type === "endnote") fn_id = `en-${event.id}`;
|
|
5723
|
+
else continue;
|
|
5724
|
+
if (!seen.has(fn_id)) {
|
|
5725
|
+
seen.add(fn_id);
|
|
5726
|
+
ordered.push(fn_id);
|
|
5727
|
+
}
|
|
5728
|
+
}
|
|
5729
|
+
}
|
|
5730
|
+
return ordered;
|
|
5731
|
+
}
|
|
5505
5732
|
|
|
5506
5733
|
// src/sanitize/report.ts
|
|
5507
5734
|
var SanitizeReport = class {
|
|
@@ -6188,6 +6415,7 @@ export {
|
|
|
6188
6415
|
DocumentMapper,
|
|
6189
6416
|
DocumentObject,
|
|
6190
6417
|
RedlineEngine,
|
|
6418
|
+
_extractTextFromDoc,
|
|
6191
6419
|
apply_edits_to_markdown,
|
|
6192
6420
|
create_unified_diff,
|
|
6193
6421
|
create_word_patch_diff,
|