@adeu/core 1.6.2 → 1.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/outline.ts CHANGED
@@ -1,377 +1,377 @@
1
- /**
2
- * Structural outline extractor.
3
- */
4
-
5
- import { DocumentObject } from './docx/bridge.js';
6
- import { Paragraph, Table, DocxEvent } from './docx/primitives.js';
7
- import { build_paragraph_text, extract_table } from './ingest.js';
8
- import { extract_comments_data } from './comments.js';
9
- import { findChild } from './docx/dom.js';
10
- import {
11
- _get_style_cache,
12
- get_paragraph_prefix,
13
- iter_block_items,
14
- iter_document_parts,
15
- iter_paragraph_content,
16
- } from './utils/docx.js';
17
-
18
- const _HEADING_PREFIX_RE = /^(#{1,6}) /;
19
- const _HEURISTIC_MIN_WORDS = 3;
20
-
21
- export interface OutlineNode {
22
- level: number;
23
- text: string;
24
- page: number;
25
- style: string;
26
- has_table: boolean;
27
- footnote_ids: string[];
28
- }
29
-
30
- interface _BlockRecord {
31
- item: any;
32
- is_paragraph: boolean;
33
- is_table: boolean;
34
- start_offset: number;
35
- projected_length: number;
36
- }
37
-
38
- export function extract_outline(
39
- doc: DocumentObject,
40
- projected_body: string,
41
- body_pages: string[],
42
- body_page_offsets: number[],
43
- paragraph_offsets: Record<string, [number, number]> | null = null
44
- ): OutlineNode[] {
45
- if (body_pages.length !== body_page_offsets.length) {
46
- throw new Error('body_pages and body_page_offsets length mismatch');
47
- }
48
-
49
- const comments_map = extract_comments_data(doc.pkg);
50
- const block_records = _walk_doc_body(doc, comments_map);
51
-
52
- const heading_indices: number[] = [];
53
- for (let idx = 0; idx < block_records.length; idx++) {
54
- const rec = block_records[idx];
55
- if (!(rec.is_paragraph && _is_heading(rec.item))) continue;
56
- if (!_heading_passes_quality_filter(rec.item, comments_map)) continue;
57
- heading_indices.push(idx);
58
- }
59
-
60
- if (heading_indices.length === 0) return [];
61
-
62
- const nodes: OutlineNode[] = [];
63
- for (let h_pos = 0; h_pos < heading_indices.length; h_pos++) {
64
- const rec_idx = heading_indices[h_pos];
65
- const rec = block_records[rec_idx];
66
- const paragraph = rec.item as Paragraph;
67
-
68
- const level = _heading_level(paragraph);
69
- const text = _heading_text(paragraph, comments_map);
70
- const style = _determine_heading_style(paragraph);
71
-
72
- const owned_end = _find_owned_end(block_records, heading_indices, h_pos, level);
73
- const owned_blocks = block_records.slice(rec_idx + 1, owned_end);
74
-
75
- const has_table = _direct_has_table(block_records, rec_idx + 1, owned_end);
76
- const footnote_ids = _collect_footnote_ids(owned_blocks);
77
-
78
- const page_num = _offset_to_page(rec.start_offset, body_page_offsets);
79
-
80
- nodes.push({ level, text, page: page_num, style, has_table, footnote_ids });
81
- }
82
-
83
- return nodes;
84
- }
85
-
86
- function _direct_has_table(block_records: _BlockRecord[], range_start: number, range_end: number): boolean {
87
- for (let idx = range_start; idx < range_end; idx++) {
88
- const rec = block_records[idx];
89
- if (rec.is_paragraph && _is_heading(rec.item)) return false;
90
- if (rec.is_table) return true;
91
- }
92
- return false;
93
- }
94
-
95
- function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[] {
96
- const parts = Array.from(iter_document_parts(doc));
97
- let body_start_offset = 0;
98
- let body_part: any = null;
99
-
100
- for (const part of parts) {
101
- if (part === doc) {
102
- body_part = part;
103
- break;
104
- }
105
- const part_text = _project_part(part, comments_map);
106
- if (part_text) {
107
- if (body_start_offset > 0) body_start_offset += 2;
108
- body_start_offset += part_text.length;
109
- }
110
- }
111
-
112
- if (!body_part) {
113
- body_part = doc;
114
- body_start_offset = 0;
115
- } else {
116
- if (body_start_offset > 0) body_start_offset += 2;
117
- }
118
-
119
- const records: _BlockRecord[] = [];
120
- let cursor = body_start_offset;
121
- let is_first_block = true;
122
-
123
- for (const item of iter_block_items(body_part)) {
124
- if (item instanceof Paragraph) {
125
- const prefix = get_paragraph_prefix(item);
126
- const p_text = build_paragraph_text(item, comments_map, false);
127
- const block_len = (prefix + p_text).length;
128
-
129
- if (!is_first_block) cursor += 2;
130
-
131
- records.push({ item, is_paragraph: true, is_table: false, start_offset: cursor, projected_length: block_len });
132
- cursor += block_len;
133
- is_first_block = false;
134
- } else if (item instanceof Table) {
135
- const table_text = extract_table(item, comments_map, false, 0);
136
- const block_len = table_text ? table_text.length : 0;
137
-
138
- if (!is_first_block) cursor += 2;
139
-
140
- const table_start = cursor;
141
- records.push({ item, is_paragraph: false, is_table: true, start_offset: table_start, projected_length: block_len });
142
- _record_table_inner_blocks_lite(item, table_start, records, comments_map);
143
- cursor += block_len;
144
- is_first_block = false;
145
- }
146
- }
147
-
148
- return records;
149
- }
150
-
151
- function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph, table_start_offset: number, comments_map: any): number {
152
- const target_el = target_paragraph._element;
153
- let cursor = table_start_offset;
154
- let rows_processed = 0;
155
-
156
- for (const row of table.rows) {
157
- if (rows_processed > 0) cursor += 1;
158
-
159
- const seen_cells = new Set();
160
- let cells_in_row = 0;
161
-
162
- for (const cell of row.cells) {
163
- if (seen_cells.has(cell)) continue;
164
- seen_cells.add(cell);
165
-
166
- if (cells_in_row > 0) cursor += 3;
167
-
168
- const [new_cursor, found] = _walk_cell_for_offset(cell, target_el, cursor, comments_map);
169
- if (found) return new_cursor;
170
- cursor = new_cursor;
171
-
172
- cells_in_row++;
173
- }
174
- rows_processed++;
175
- }
176
-
177
- return table_start_offset;
178
- }
179
-
180
- function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: number, comments_map: any): [number, boolean] {
181
- let cursor = cell_start_cursor;
182
- let is_first_block = true;
183
-
184
- for (const inner_item of iter_block_items(cell)) {
185
- if (!is_first_block) cursor += 2;
186
-
187
- if (inner_item instanceof Paragraph) {
188
- if (inner_item._element === target_el) return [cursor, true];
189
- const prefix = get_paragraph_prefix(inner_item);
190
- const p_text = build_paragraph_text(inner_item, comments_map, false);
191
- cursor += (prefix + p_text).length;
192
- } else if (inner_item instanceof Table) {
193
- const nested_offset = _compute_inner_block_offset(inner_item, new Paragraph(target_el, null), cursor, comments_map);
194
- if (nested_offset !== cursor) {
195
- if (_element_is_descendant(target_el, inner_item._element)) return [nested_offset, true];
196
- }
197
- const table_text = extract_table(inner_item, comments_map, false, 0);
198
- cursor += table_text ? table_text.length : 0;
199
- }
200
- is_first_block = false;
201
- }
202
- return [cursor, false];
203
- }
204
-
205
- function _element_is_descendant(target_el: Element, ancestor_el: Element): boolean {
206
- let cur: Node | null = target_el.parentNode;
207
- while (cur) {
208
- if (cur === ancestor_el) return true;
209
- cur = cur.parentNode;
210
- }
211
- return false;
212
- }
213
-
214
- function _record_table_inner_blocks_lite(table: Table, inherited_offset: number, records: _BlockRecord[], comments_map: any) {
215
- const seen_cells = new Set();
216
- for (const row of table.rows) {
217
- for (const cell of row.cells) {
218
- if (seen_cells.has(cell)) continue;
219
- seen_cells.add(cell);
220
-
221
- for (const inner_item of iter_block_items(cell)) {
222
- if (inner_item instanceof Paragraph) {
223
- const true_offset = _is_heading(inner_item) ? _compute_inner_block_offset(table, inner_item, inherited_offset, comments_map) : inherited_offset;
224
- records.push({ item: inner_item, is_paragraph: true, is_table: false, start_offset: true_offset, projected_length: 0 });
225
- } else if (inner_item instanceof Table) {
226
- records.push({ item: inner_item, is_paragraph: false, is_table: true, start_offset: inherited_offset, projected_length: 0 });
227
- _record_table_inner_blocks_lite(inner_item, inherited_offset, records, comments_map);
228
- }
229
- }
230
- }
231
- }
232
- }
233
-
234
- function _project_part(part: any, comments_map: any): string {
235
- const blocks: string[] = [];
236
- const c_type = part.constructor.name;
237
-
238
- if (c_type === 'NotesPart') {
239
- const header = part.note_type === 'fn' ? '## Footnotes' : '## Endnotes';
240
- blocks.push(`---\n${header}`);
241
- }
242
-
243
- let is_first_para = true;
244
- for (const item of iter_block_items(part)) {
245
- if (item.constructor.name === 'FootnoteItem') {
246
- const fn_text = _project_part(item, comments_map);
247
- if (fn_text) blocks.push(fn_text);
248
- } else if (item instanceof Paragraph) {
249
- let prefix = get_paragraph_prefix(item);
250
- if (is_first_para && c_type === 'FootnoteItem') prefix = `[^${part.note_type}-${part.id}]: ${prefix}`;
251
- const p_text = build_paragraph_text(item, comments_map, false);
252
- blocks.push(prefix + p_text);
253
- is_first_para = false;
254
- } else if (item instanceof Table) {
255
- const table_text = extract_table(item, comments_map, false, 0);
256
- if (table_text) blocks.push(table_text);
257
- is_first_para = false;
258
- }
259
- }
260
-
261
- return blocks.join('\n\n');
262
- }
263
-
264
- function _is_heading(paragraph: Paragraph): boolean {
265
- return _HEADING_PREFIX_RE.test(get_paragraph_prefix(paragraph));
266
- }
267
-
268
- function _heading_passes_quality_filter(paragraph: Paragraph, comments_map: any): boolean {
269
- const style = _determine_heading_style(paragraph);
270
- if (style !== '(heuristic)') return true;
271
- const text = _heading_text(paragraph, comments_map);
272
- if (!text) return false;
273
- const word_count = (text.match(/\w+/g) || []).length;
274
- return word_count >= _HEURISTIC_MIN_WORDS;
275
- }
276
-
277
- function _heading_level(paragraph: Paragraph): number {
278
- const match = _HEADING_PREFIX_RE.exec(get_paragraph_prefix(paragraph));
279
- return match ? Math.min(match[1].length, 6) : 1;
280
- }
281
-
282
- function _heading_text(paragraph: Paragraph, comments_map: any): string {
283
- const p_text = build_paragraph_text(paragraph, comments_map, false);
284
- let cleaned = _strip_critic_markup(p_text);
285
- cleaned = _strip_inline_formatting(cleaned);
286
- return cleaned.trim();
287
- }
288
-
289
- function _strip_critic_markup(text: string): string {
290
- if (!text) return '';
291
- text = text.replace(/\{--[\s\S]*?--\}/g, '');
292
- text = text.replace(/\{>>[\s\S]*?<<\}/g, '');
293
- text = text.replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1');
294
- text = text.replace(/\{==([\s\S]*?)==\}/g, '$1');
295
- return text;
296
- }
297
-
298
- function _strip_inline_formatting(text: string): string {
299
- if (!text) return '';
300
- text = text.replace(/\*\*(.+?)\*\*/g, '$1');
301
- text = text.replace(/__(.+?)__/g, '$1');
302
- text = text.replace(/(?<!\w)_(\S(?:.*?\S)?)_(?!\w)/g, '$1');
303
- return text;
304
- }
305
-
306
- function _determine_heading_style(paragraph: Paragraph): string {
307
- const [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);
308
- const pPr = findChild(paragraph._element, 'w:pPr');
309
- let style_id = default_pstyle;
310
-
311
- if (pPr) {
312
- const oLvl = findChild(pPr, 'w:outlineLvl');
313
- if (oLvl && /^\d+$/.test(oLvl.getAttribute('w:val') || '')) {
314
- const style = _safe_style_name(paragraph, style_cache, default_pstyle);
315
- if (style && (style.startsWith('Heading') || style === 'Title')) return style;
316
- return '(outline_level)';
317
- }
318
- const pStyle = findChild(pPr, 'w:pStyle');
319
- if (pStyle) style_id = pStyle.getAttribute('w:val') || default_pstyle;
320
- }
321
-
322
- const style_name = (style_id && style_cache && style_cache[style_id]) ? style_cache[style_id].name : null;
323
- if (style_name && (style_name.startsWith('Heading') || style_name === 'Title')) return style_name;
324
-
325
- if (style_name && /Heading[ ]?([1-6])(?![0-9])/.test(style_name)) return style_name;
326
-
327
- return '(heuristic)';
328
- }
329
-
330
- function _safe_style_name(paragraph: Paragraph, style_cache: any, default_pstyle: any): string | null {
331
- const pPr = findChild(paragraph._element, 'w:pPr');
332
- let style_id = default_pstyle;
333
- if (pPr) {
334
- const pStyle = findChild(pPr, 'w:pStyle');
335
- if (pStyle) style_id = pStyle.getAttribute('w:val') || default_pstyle;
336
- }
337
- return (style_id && style_cache && style_cache[style_id]) ? style_cache[style_id].name : null;
338
- }
339
-
340
- function _find_owned_end(block_records: _BlockRecord[], heading_indices: number[], current_h_pos: number, current_level: number): number {
341
- for (let next_h_pos = current_h_pos + 1; next_h_pos < heading_indices.length; next_h_pos++) {
342
- const next_idx = heading_indices[next_h_pos];
343
- if (_heading_level(block_records[next_idx].item) <= current_level) return next_idx;
344
- }
345
- return block_records.length;
346
- }
347
-
348
- function _collect_footnote_ids(owned_blocks: _BlockRecord[]): string[] {
349
- const seen = new Set<string>();
350
- const ordered: string[] = [];
351
- for (const rec of owned_blocks) {
352
- if (!rec.is_paragraph) continue;
353
- for (const event of iter_paragraph_content(rec.item)) {
354
- if (!('type' in event)) continue;
355
- let fn_id = '';
356
- if (event.type === 'footnote') fn_id = `fn-${event.id}`;
357
- else if (event.type === 'endnote') fn_id = `en-${event.id}`;
358
- else continue;
359
-
360
- if (!seen.has(fn_id)) {
361
- seen.add(fn_id);
362
- ordered.push(fn_id);
363
- }
364
- }
365
- }
366
- return ordered;
367
- }
368
-
369
- function _offset_to_page(offset: number, body_page_offsets: number[]): number {
370
- if (!body_page_offsets || body_page_offsets.length === 0) return 1;
371
- let page = 1;
372
- for (let i = 0; i < body_page_offsets.length; i++) {
373
- if (offset >= body_page_offsets[i]) page = i + 1;
374
- else break;
375
- }
376
- return page;
1
+ /**
2
+ * Structural outline extractor.
3
+ */
4
+
5
+ import { DocumentObject } from './docx/bridge.js';
6
+ import { Paragraph, Table, DocxEvent } from './docx/primitives.js';
7
+ import { build_paragraph_text, extract_table } from './ingest.js';
8
+ import { extract_comments_data } from './comments.js';
9
+ import { findChild } from './docx/dom.js';
10
+ import {
11
+ _get_style_cache,
12
+ get_paragraph_prefix,
13
+ iter_block_items,
14
+ iter_document_parts,
15
+ iter_paragraph_content,
16
+ } from './utils/docx.js';
17
+
18
/**
 * Recognises a heading marker at the very start of a paragraph prefix:
 * one to six `#` characters followed by a single space. Capture group 1
 * holds the run of `#`, whose length is used as the heading level.
 */
const _HEADING_PREFIX_RE = /^(#{1,6}) /;

/**
 * Minimum number of words a heading classified as `(heuristic)` must
 * contain before it is accepted into the outline.
 */
const _HEURISTIC_MIN_WORDS = 3;
20
+
21
/**
 * One heading entry of the outline produced by `extract_outline`.
 */
export interface OutlineNode {
  /** Heading depth (1–6), derived from the number of `#` in the paragraph prefix. */
  level: number;
  /** Heading text with CriticMarkup and inline emphasis markers removed, trimmed. */
  text: string;
  /** 1-based page number resolved from the heading's start offset. */
  page: number;
  /** Style name of the heading, or the marker `(outline_level)` / `(heuristic)`. */
  style: string;
  /** True when a table occurs in the owned range before the next heading paragraph. */
  has_table: boolean;
  /** De-duplicated `fn-<id>` / `en-<id>` references found in owned paragraphs, in first-seen order. */
  footnote_ids: string[];
}
29
+
30
/**
 * Flat bookkeeping entry for one block (paragraph or table) met while
 * walking the document body, including blocks nested inside tables.
 */
interface _BlockRecord {
  /** The underlying Paragraph or Table object. */
  item: any;
  /** True when `item` is a Paragraph. */
  is_paragraph: boolean;
  /** True when `item` is a Table. */
  is_table: boolean;
  /** Character offset at which the block starts in the projected text. */
  start_offset: number;
  /** Length of the block's projected text; recorded as 0 for blocks nested inside a table. */
  projected_length: number;
}
37
+
38
/**
 * Builds the structural outline (one node per accepted heading) of a document.
 *
 * @param doc - Document whose body is walked for headings.
 * @param projected_body - Not read by this function; kept for interface compatibility.
 * @param body_pages - Only its length is checked against `body_page_offsets`.
 * @param body_page_offsets - Start offset of each page; used to map a heading's offset to a page number.
 * @param paragraph_offsets - Not read by this function; kept for interface compatibility.
 * @returns Outline nodes in document order; empty when no heading qualifies.
 * @throws Error when `body_pages` and `body_page_offsets` differ in length.
 */
export function extract_outline(
  doc: DocumentObject,
  projected_body: string,
  body_pages: string[],
  body_page_offsets: number[],
  paragraph_offsets: Record<string, [number, number]> | null = null
): OutlineNode[] {
  if (body_pages.length !== body_page_offsets.length) {
    throw new Error('body_pages and body_page_offsets length mismatch');
  }

  const comments_map = extract_comments_data(doc.pkg);
  const block_records = _walk_doc_body(doc, comments_map);

  // Positions (within block_records) of paragraphs that are headings and survive the quality filter.
  const heading_indices: number[] = [];
  block_records.forEach((record, position) => {
    if (!record.is_paragraph || !_is_heading(record.item)) return;
    if (!_heading_passes_quality_filter(record.item, comments_map)) return;
    heading_indices.push(position);
  });

  return heading_indices.map((record_index, heading_pos): OutlineNode => {
    const record = block_records[record_index];
    const heading = record.item as Paragraph;

    const level = _heading_level(heading);
    const text = _heading_text(heading, comments_map);
    const style = _determine_heading_style(heading);

    // A heading owns every block up to the next heading of the same or a shallower level.
    const first_owned = record_index + 1;
    const owned_end = _find_owned_end(block_records, heading_indices, heading_pos, level);
    const owned_blocks = block_records.slice(first_owned, owned_end);

    const has_table = _direct_has_table(block_records, first_owned, owned_end);
    const footnote_ids = _collect_footnote_ids(owned_blocks);
    const page = _offset_to_page(record.start_offset, body_page_offsets);

    return { level, text, page, style, has_table, footnote_ids };
  });
}
85
+
86
/**
 * Reports whether a table appears in `block_records[range_start, range_end)`
 * before any heading paragraph does. The scan stops at the first heading
 * paragraph, so tables that follow a sub-heading are not counted.
 */
function _direct_has_table(block_records: _BlockRecord[], range_start: number, range_end: number): boolean {
  let position = range_start;
  while (position < range_end) {
    const current = block_records[position];
    // Reaching a heading ends the "direct" content of the section.
    if (current.is_paragraph && _is_heading(current.item)) break;
    if (current.is_table) return true;
    position += 1;
  }
  return false;
}
94
+
95
/**
 * Walks the document body and returns one record per block, each tagged
 * with the offset at which it starts in the projected text.
 *
 * Parts yielded before the body contribute their projected length to the
 * body's starting offset, with 2 characters added per separator between
 * non-empty parts (presumably the blank-line join; confirm against the
 * projection code). Blocks nested in tables are appended right after their
 * table's own record.
 */
function _walk_doc_body(doc: DocumentObject, comments_map: any): _BlockRecord[] {
  let body_start_offset = 0;
  let body_part: any = null;

  for (const part of Array.from(iter_document_parts(doc))) {
    if (part === doc) {
      body_part = part;
      break;
    }
    const projected = _project_part(part, comments_map);
    if (!projected) continue;
    if (body_start_offset > 0) body_start_offset += 2;
    body_start_offset += projected.length;
  }

  if (body_part) {
    // Separator between the preceding parts and the body itself.
    if (body_start_offset > 0) body_start_offset += 2;
  } else {
    // The body was never yielded: fall back to the document at offset zero.
    body_part = doc;
    body_start_offset = 0;
  }

  const records: _BlockRecord[] = [];
  let cursor = body_start_offset;
  let seen_block = false;

  for (const item of iter_block_items(body_part)) {
    let projected_length: number;
    if (item instanceof Paragraph) {
      const rendered = get_paragraph_prefix(item) + build_paragraph_text(item, comments_map, false);
      projected_length = rendered.length;
    } else if (item instanceof Table) {
      const table_text = extract_table(item, comments_map, false, 0);
      projected_length = table_text ? table_text.length : 0;
    } else {
      // Other block kinds are ignored and do not move the cursor.
      continue;
    }

    if (seen_block) cursor += 2;

    const is_paragraph = item instanceof Paragraph;
    records.push({ item, is_paragraph, is_table: !is_paragraph, start_offset: cursor, projected_length });
    if (!(item instanceof Paragraph)) {
      _record_table_inner_blocks_lite(item, cursor, records, comments_map);
    }

    cursor += projected_length;
    seen_block = true;
  }

  return records;
}
150
+
151
/**
 * Computes the projected offset of `target_paragraph` inside `table`.
 *
 * The cursor advances by 1 between rows and by 3 between distinct cells of
 * a row (presumably the row and cell separators emitted by `extract_table`;
 * confirm). Repeated cell objects within a row are visited once.
 *
 * @returns The paragraph's offset, or `table_start_offset` when it is not found.
 */
function _compute_inner_block_offset(table: Table, target_paragraph: Paragraph, table_start_offset: number, comments_map: any): number {
  const wanted_el = target_paragraph._element;
  let position = table_start_offset;
  let on_first_row = true;

  for (const row of table.rows) {
    if (!on_first_row) position += 1;
    on_first_row = false;

    // NOTE(review): de-duplication is per row here but per table in
    // _record_table_inner_blocks_lite; confirm which matches extract_table.
    const visited = new Set();
    for (const cell of row.cells) {
      if (visited.has(cell)) continue;
      // visited.size equals the number of distinct cells already walked in this row.
      if (visited.size > 0) position += 3;
      visited.add(cell);

      const [after_cell, hit] = _walk_cell_for_offset(cell, wanted_el, position, comments_map);
      if (hit) return after_cell;
      position = after_cell;
    }
  }

  return table_start_offset;
}
179
+
180
/**
 * Walks the blocks of one table cell looking for the paragraph whose DOM
 * element is `target_el`, tracking the projected offset as it goes.
 *
 * @param cell - Cell whose block items are iterated.
 * @param target_el - DOM element of the paragraph being located.
 * @param cell_start_cursor - Projected offset at which the cell's content starts.
 * @param comments_map - Passed through to the text projection helpers.
 * @returns `[offset, true]` with the paragraph's offset when found, otherwise
 *          `[cursor_after_cell, false]`.
 */
function _walk_cell_for_offset(cell: any, target_el: any, cell_start_cursor: number, comments_map: any): [number, boolean] {
  let cursor = cell_start_cursor;
  let is_first_block = true;

  for (const inner_item of iter_block_items(cell)) {
    // Two characters separate consecutive blocks inside a cell.
    if (!is_first_block) cursor += 2;

    if (inner_item instanceof Paragraph) {
      if (inner_item._element === target_el) return [cursor, true];
      const rendered = get_paragraph_prefix(inner_item) + build_paragraph_text(inner_item, comments_map, false);
      cursor += rendered.length;
    } else if (inner_item instanceof Table) {
      // Decide containment from the DOM, not from the returned offset:
      // _compute_inner_block_offset returns its start offset both for "not
      // found" and for a target that is the first paragraph of the first
      // cell, so comparing the result with `cursor` misreports that hit.
      if (_element_is_descendant(target_el, inner_item._element)) {
        const nested_offset = _compute_inner_block_offset(inner_item, new Paragraph(target_el, null), cursor, comments_map);
        return [nested_offset, true];
      }
      const table_text = extract_table(inner_item, comments_map, false, 0);
      cursor += table_text ? table_text.length : 0;
    }
    is_first_block = false;
  }
  return [cursor, false];
}
204
+
205
/**
 * Returns true when `ancestor_el` is reached by following `parentNode`
 * links upward from `target_el`. An element is not its own descendant.
 */
function _element_is_descendant(target_el: Element, ancestor_el: Element): boolean {
  for (let node: Node | null = target_el.parentNode; node; node = node.parentNode) {
    if (node === ancestor_el) return true;
  }
  return false;
}
213
+
214
/**
 * Appends a record for every paragraph and nested table found in the cells
 * of `table`, recursing into nested tables.
 *
 * Only heading paragraphs get a precisely computed offset; every other
 * inner block simply inherits `inherited_offset`. All inner records carry a
 * `projected_length` of 0. Each distinct cell object is visited once per table.
 */
function _record_table_inner_blocks_lite(table: Table, inherited_offset: number, records: _BlockRecord[], comments_map: any) {
  const visited_cells = new Set();

  for (const row of table.rows) {
    for (const cell of row.cells) {
      if (visited_cells.has(cell)) continue;
      visited_cells.add(cell);

      for (const inner_item of iter_block_items(cell)) {
        if (inner_item instanceof Paragraph) {
          let start_offset = inherited_offset;
          if (_is_heading(inner_item)) {
            start_offset = _compute_inner_block_offset(table, inner_item, inherited_offset, comments_map);
          }
          records.push({ item: inner_item, is_paragraph: true, is_table: false, start_offset, projected_length: 0 });
          continue;
        }

        if (!(inner_item instanceof Table)) continue;

        records.push({ item: inner_item, is_paragraph: false, is_table: true, start_offset: inherited_offset, projected_length: 0 });
        _record_table_inner_blocks_lite(inner_item, inherited_offset, records, comments_map);
      }
    }
  }
}
233
+
234
/**
 * Renders a document part (or a single footnote item) to text, joining its
 * blocks with a blank line.
 *
 * - A `NotesPart` starts with a rule and a `## Footnotes` / `## Endnotes` header.
 * - `FootnoteItem` children are rendered recursively.
 * - When `part` itself is a `FootnoteItem`, the first paragraph, if it comes
 *   before any table, is prefixed with its `[^<note_type>-<id>]: ` label.
 *
 * NOTE(review): kinds are detected via `constructor.name`, which depends on
 * class names surviving the build; confirm no minifier renames them.
 */
function _project_part(part: any, comments_map: any): string {
  const part_kind = part.constructor.name;
  const blocks: string[] = [];

  if (part_kind === 'NotesPart') {
    const header = part.note_type === 'fn' ? '## Footnotes' : '## Endnotes';
    blocks.push(`---\n${header}`);
  }

  let awaiting_first_para = true;
  for (const item of iter_block_items(part)) {
    if (item.constructor.name === 'FootnoteItem') {
      const nested = _project_part(item, comments_map);
      if (nested) blocks.push(nested);
      continue;
    }

    if (item instanceof Paragraph) {
      const base_prefix = get_paragraph_prefix(item);
      const prefix =
        awaiting_first_para && part_kind === 'FootnoteItem'
          ? `[^${part.note_type}-${part.id}]: ${base_prefix}`
          : base_prefix;
      blocks.push(prefix + build_paragraph_text(item, comments_map, false));
      awaiting_first_para = false;
    } else if (item instanceof Table) {
      const table_text = extract_table(item, comments_map, false, 0);
      if (table_text) blocks.push(table_text);
      awaiting_first_para = false;
    }
  }

  return blocks.join('\n\n');
}
263
+
264
/** True when the paragraph's prefix starts with a heading marker (`#`…`######` plus a space). */
function _is_heading(paragraph: Paragraph): boolean {
  const prefix = get_paragraph_prefix(paragraph);
  return _HEADING_PREFIX_RE.exec(prefix) !== null;
}
267
+
268
/**
 * Word matcher for the heuristic filter. JavaScript's `\w` only covers
 * `[A-Za-z0-9_]`, so counting with `\w+` sees zero words in non-Latin
 * headings and splits accented words into fragments. This class counts
 * runs of Unicode letters, combining marks, digits and underscores.
 */
const _HEURISTIC_WORD_RE = /[\p{L}\p{M}\p{N}_]+/gu;

/**
 * Decides whether a detected heading is kept in the outline.
 *
 * Headings backed by a real style or an outline level always pass. Headings
 * classified as `(heuristic)` must have non-empty cleaned text containing at
 * least `_HEURISTIC_MIN_WORDS` words.
 *
 * @param paragraph - Heading paragraph under test.
 * @param comments_map - Passed through to the text projection.
 * @returns True when the heading should appear in the outline.
 */
function _heading_passes_quality_filter(paragraph: Paragraph, comments_map: any): boolean {
  if (_determine_heading_style(paragraph) !== '(heuristic)') return true;

  const text = _heading_text(paragraph, comments_map);
  if (!text) return false;

  // String.prototype.match with a global regex always scans from index 0,
  // so sharing the module-level regex is safe.
  const words = text.match(_HEURISTIC_WORD_RE) ?? [];
  return words.length >= _HEURISTIC_MIN_WORDS;
}
276
+
277
/**
 * Heading depth taken from the number of `#` in the paragraph prefix,
 * capped at 6. Falls back to 1 when the prefix carries no heading marker.
 */
function _heading_level(paragraph: Paragraph): number {
  const marker = _HEADING_PREFIX_RE.exec(get_paragraph_prefix(paragraph));
  if (!marker) return 1;
  return Math.min(marker[1].length, 6);
}
281
+
282
/**
 * Plain heading text: the projected paragraph text with CriticMarkup
 * removed first, then inline emphasis markers, then surrounding whitespace.
 */
function _heading_text(paragraph: Paragraph, comments_map: any): string {
  const raw = build_paragraph_text(paragraph, comments_map, false);
  const without_markup = _strip_critic_markup(raw);
  return _strip_inline_formatting(without_markup).trim();
}
288
+
289
/**
 * Removes CriticMarkup from `text`:
 * deletions `{--…--}` and comments `{>>…<<}` are dropped entirely, while
 * insertions `{++…++}` and highlights `{==…==}` keep their inner text.
 * Spans may cross line breaks. Empty or missing input yields `''`.
 */
function _strip_critic_markup(text: string): string {
  if (!text) return '';
  return text
    .replace(/\{--[\s\S]*?--\}/g, '')
    .replace(/\{>>[\s\S]*?<<\}/g, '')
    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1')
    .replace(/\{==([\s\S]*?)==\}/g, '$1');
}
297
+
298
/**
 * Removes inline emphasis markers from `text`, keeping the emphasised text:
 * `**bold**`, `__bold__` and `_italic_`.
 *
 * A single underscore only counts as an emphasis delimiter when it is not
 * glued to a word character on its outer side, so identifiers such as
 * `snake_case_name` are left intact. "Word character" is Unicode-aware
 * (letters, combining marks, digits, underscore): JavaScript's `\w` is
 * ASCII-only, which made underscores next to non-ASCII letters look like
 * emphasis and silently removed them.
 *
 * @param text - Text possibly containing emphasis markers.
 * @returns The text without those markers; `''` for empty or missing input.
 */
function _strip_inline_formatting(text: string): string {
  if (!text) return '';
  text = text.replace(/\*\*(.+?)\*\*/g, '$1');
  text = text.replace(/__(.+?)__/g, '$1');
  text = text.replace(/(?<![\p{L}\p{M}\p{N}_])_(\S(?:.*?\S)?)_(?![\p{L}\p{M}\p{N}_])/gu, '$1');
  return text;
}
305
+
306
/**
 * Classifies where a heading's "heading-ness" comes from.
 *
 * @returns
 * - the resolved style name when it starts with `Heading` or equals `Title`;
 * - `(outline_level)` when the paragraph has a numeric `w:outlineLvl` but no such style;
 * - the style name when it merely contains `Heading` plus a level digit 1–6
 *   (optionally separated by one space), for paragraphs without an outline level;
 * - `(heuristic)` otherwise.
 */
function _determine_heading_style(paragraph: Paragraph): string {
  const [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);

  // Resolves the paragraph's own w:pStyle (or the default style) to a name.
  const style_name = _safe_style_name(paragraph, style_cache, default_pstyle);

  const pPr = findChild(paragraph._element, 'w:pPr');
  const outline = pPr ? findChild(pPr, 'w:outlineLvl') : null;
  if (outline && /^\d+$/.test(outline.getAttribute('w:val') || '')) {
    // An explicit outline level wins; only a built-in heading name is reported over it.
    if (style_name && (style_name.startsWith('Heading') || style_name === 'Title')) return style_name;
    return '(outline_level)';
  }

  if (!style_name) return '(heuristic)';
  if (style_name.startsWith('Heading') || style_name === 'Title') return style_name;

  // Custom names that embed a heading level somewhere in the name.
  return /Heading[ ]?([1-6])(?![0-9])/.test(style_name) ? style_name : '(heuristic)';
}
329
+
330
/**
 * Resolves the display name of the paragraph's style.
 *
 * The style id comes from `w:pPr/w:pStyle@w:val`, falling back to
 * `default_pstyle` when the element or attribute value is missing.
 *
 * @returns The `name` of the matching `style_cache` entry, or `null` when
 *          there is no id, no cache, or no entry for the id.
 */
function _safe_style_name(paragraph: Paragraph, style_cache: any, default_pstyle: any): string | null {
  const pPr = findChild(paragraph._element, 'w:pPr');
  const pStyle = pPr ? findChild(pPr, 'w:pStyle') : null;
  const style_id = pStyle ? pStyle.getAttribute('w:val') || default_pstyle : default_pstyle;

  if (!style_id || !style_cache) return null;
  const entry = style_cache[style_id];
  return entry ? entry.name : null;
}
339
+
340
/**
 * Finds where the section owned by the heading at `current_h_pos` ends.
 *
 * @returns The `block_records` index of the next accepted heading whose
 *          level is the same or shallower (numerically `<= current_level`),
 *          or `block_records.length` when no such heading follows.
 */
function _find_owned_end(block_records: _BlockRecord[], heading_indices: number[], current_h_pos: number, current_level: number): number {
  const boundary = heading_indices.find(
    (record_index, heading_pos) =>
      heading_pos > current_h_pos && _heading_level(block_records[record_index].item) <= current_level
  );
  return boundary === undefined ? block_records.length : boundary;
}
347
+
348
/**
 * Collects the note references found in the paragraphs of `owned_blocks`.
 *
 * Footnote events become `fn-<id>`, endnote events become `en-<id>`; every
 * other event, and every non-paragraph block, is ignored.
 *
 * @returns Unique ids in order of first appearance.
 */
function _collect_footnote_ids(owned_blocks: _BlockRecord[]): string[] {
  // A Set iterates in insertion order, giving de-duplication and ordering at once.
  const unique_ids = new Set<string>();

  for (const rec of owned_blocks) {
    if (!rec.is_paragraph) continue;

    for (const event of iter_paragraph_content(rec.item)) {
      if (!('type' in event)) continue;

      switch (event.type) {
        case 'footnote':
          unique_ids.add(`fn-${event.id}`);
          break;
        case 'endnote':
          unique_ids.add(`en-${event.id}`);
          break;
        default:
          break;
      }
    }
  }

  return Array.from(unique_ids);
}
368
+
369
/**
 * Maps a character offset to a 1-based page number.
 *
 * Page start offsets are scanned from the front and the scan stops at the
 * first one that `offset` has not reached; the number of offsets passed is
 * the page. The result is never below 1, and is 1 when no offsets are given.
 */
function _offset_to_page(offset: number, body_page_offsets: number[]): number {
  if (!body_page_offsets || body_page_offsets.length === 0) return 1;

  // Index of the first page start not yet reached, i.e. how many pages have begun.
  const first_unreached = body_page_offsets.findIndex((page_start) => !(offset >= page_start));
  const pages_begun = first_unreached === -1 ? body_page_offsets.length : first_unreached;

  return Math.max(1, pages_begun);
}