@adeu/core 1.6.2 → 1.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/utils/docx.ts CHANGED
@@ -1,478 +1,478 @@
1
- import { qn, findChild, findAllDescendants } from '../docx/dom.js';
2
- import { Paragraph, Table, Run, NotesPart, FootnoteItem, DocxEvent } from '../docx/primitives.js';
3
-
4
- export const QN_W_P = 'w:p';
5
- export const QN_W_R = 'w:r';
6
- export const QN_W_T = 'w:t';
7
- export const QN_W_DELTEXT = 'w:delText';
8
- export const QN_W_TAB = 'w:tab';
9
- export const QN_W_BR = 'w:br';
10
- export const QN_W_CR = 'w:cr';
11
- export const QN_W_RPR = 'w:rPr';
12
- export const QN_W_RPRCHANGE = 'w:rPrChange';
13
- export const QN_W_COMMENTREFERENCE = 'w:commentReference';
14
- export const QN_W_FOOTNOTEREFERENCE = 'w:footnoteReference';
15
- export const QN_W_ENDNOTEREFERENCE = 'w:endnoteReference';
16
- export const QN_W_FLDCHAR = 'w:fldChar';
17
- export const QN_W_FLDCHARTYPE = 'w:fldCharType';
18
- export const QN_W_INSTRTEXT = 'w:instrText';
19
- export const QN_W_INS = 'w:ins';
20
- export const QN_W_DEL = 'w:del';
21
- export const QN_W_ID = 'w:id';
22
- export const QN_W_AUTHOR = 'w:author';
23
- export const QN_W_DATE = 'w:date';
24
- export const QN_W_COMMENTRANGESTART = 'w:commentRangeStart';
25
- export const QN_W_COMMENTRANGEEND = 'w:commentRangeEnd';
26
- export const QN_W_HYPERLINK = 'w:hyperlink';
27
- export const QN_R_ID = 'r:id';
28
- export const QN_W_FLDSIMPLE = 'w:fldSimple';
29
- export const QN_W_INSTR = 'w:instr';
30
- export const QN_W_BOOKMARKSTART = 'w:bookmarkStart';
31
- export const QN_W_NAME = 'w:name';
32
- export const QN_W_SDT = 'w:sdt';
33
- export const QN_W_SMARTTAG = 'w:smartTag';
34
- export const QN_W_SDTCONTENT = 'w:sdtContent';
35
- export const QN_W_B = 'w:b';
36
- export const QN_W_I = 'w:i';
37
- export const QN_W_VAL = 'w:val';
38
- export const QN_W_PPR = 'w:pPr';
39
- export const QN_W_PSTYLE = 'w:pStyle';
40
- export const QN_W_OUTLINELVL = 'w:outlineLvl';
41
- export const QN_W_NUMPR = 'w:numPr';
42
- export const QN_W_NUMID = 'w:numId';
43
- export const QN_W_ILVL = 'w:ilvl';
44
-
45
- const _CUSTOM_HEADING_NAME_RE = /Heading[ ]?([1-6])(?![0-9])/;
46
-
47
- export function _get_style_cache(part: any): [Record<string, any>, string | null] {
48
- const pkg = part.package || part.pkg || (part.part ? part.part.pkg : null);
49
- if (pkg && pkg._adeu_style_cache) {
50
- return pkg._adeu_style_cache;
51
- }
52
-
53
- const cache: Record<string, any> = {};
54
- let default_pstyle: string | null = null;
55
- const raw_styles: Record<string, any> = {};
56
-
57
- const stylesPart = pkg?.getPartByPath('word/styles.xml');
58
- if (!stylesPart) {
59
- const result: [Record<string, any>, string | null] = [cache, null];
60
- if (pkg) pkg._adeu_style_cache = result;
61
- return result;
62
- }
63
-
64
- const styles = findAllDescendants(stylesPart._element, 'w:style');
65
- for (const s of styles) {
66
- const s_id = s.getAttribute('w:styleId');
67
- if (!s_id) continue;
68
-
69
- const s_type = s.getAttribute('w:type');
70
- const is_default = s.getAttribute('w:default') === '1' || s.getAttribute('w:default') === 'true';
71
-
72
- if (s_type === 'paragraph' && is_default) default_pstyle = s_id;
73
-
74
- const name_el = findChild(s, 'w:name');
75
- const name = name_el ? name_el.getAttribute('w:val') : s_id;
76
-
77
- const based_on_el = findChild(s, 'w:basedOn');
78
- const based_on = based_on_el ? based_on_el.getAttribute('w:val') : null;
79
-
80
- let outline_lvl: number | null = null;
81
- const pPr = findChild(s, 'w:pPr');
82
- if (pPr) {
83
- const oLvl = findChild(pPr, 'w:outlineLvl');
84
- if (oLvl) {
85
- const val = oLvl.getAttribute('w:val');
86
- if (val && /^\d+$/.test(val)) outline_lvl = parseInt(val, 10);
87
- }
88
- }
89
-
90
- let bold: boolean | null = null;
91
- const rPr = findChild(s, 'w:rPr');
92
- if (rPr) {
93
- const b = findChild(rPr, 'w:b');
94
- if (b) {
95
- const val = b.getAttribute('w:val');
96
- bold = val !== '0' && val !== 'false' && val !== 'off';
97
- }
98
- }
99
-
100
- raw_styles[s_id] = { name, based_on, outline_level: outline_lvl, bold };
101
- }
102
-
103
- const resolve_style = (s_id: string, visited: Set<string>): any => {
104
- if (cache[s_id]) return cache[s_id];
105
- if (visited.has(s_id) || !raw_styles[s_id]) return { name: s_id, outline_level: null, bold: false };
106
-
107
- visited.add(s_id);
108
- const raw = raw_styles[s_id];
109
- const based_on_id = raw.based_on;
110
-
111
- let o_lvl = raw.outline_level;
112
- let bold_val = raw.bold !== null ? raw.bold : false;
113
-
114
- if (based_on_id) {
115
- const parent = resolve_style(based_on_id, visited);
116
- if (o_lvl === null) o_lvl = parent.outline_level;
117
- if (raw.bold === null) bold_val = parent.bold;
118
- }
119
-
120
- const resolved = { name: raw.name, outline_level: o_lvl, bold: bold_val };
121
- cache[s_id] = resolved;
122
- return resolved;
123
- };
124
-
125
- for (const s_id in raw_styles) resolve_style(s_id, new Set());
126
-
127
- const result: [Record<string, any>, string | null] = [cache, default_pstyle];
128
- if (pkg) pkg._adeu_style_cache = result;
129
- return result;
130
- }
131
-
132
- function _detect_heading_level_from_name(name: string): number | null {
133
- if (!name) return null;
134
- const match = name.match(_CUSTOM_HEADING_NAME_RE);
135
- return match ? parseInt(match[1], 10) : null;
136
- }
137
-
138
- export function is_native_heading(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): boolean {
139
- if (!style_cache) {
140
- [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);
141
- }
142
- const pPr = findChild(paragraph._element, QN_W_PPR);
143
-
144
- if (pPr) {
145
- const oLvl = findChild(pPr, QN_W_OUTLINELVL);
146
- if (oLvl) {
147
- const val = oLvl.getAttribute(QN_W_VAL);
148
- if (val && /^\d+$/.test(val)) {
149
- const lvl = parseInt(val, 10);
150
- if (lvl >= 0 && lvl <= 8) return true;
151
- }
152
- }
153
- }
154
-
155
- let style_id = default_pstyle;
156
- if (pPr) {
157
- const pStyle = findChild(pPr, QN_W_PSTYLE);
158
- if (pStyle) style_id = pStyle.getAttribute(QN_W_VAL) || default_pstyle;
159
- }
160
-
161
- const style_info = style_id && style_cache ? style_cache[style_id] : null;
162
- if (style_info && style_info.outline_level !== null && style_info.outline_level >= 0 && style_info.outline_level <= 8) {
163
- return true;
164
- }
165
-
166
- const style_name = style_info ? style_info.name : null;
167
- if (style_name?.startsWith('Heading')) return true;
168
- if (style_name === 'Title') return true;
169
- if (style_name && style_name !== 'Normal') {
170
- if (_detect_heading_level_from_name(style_name) !== null) return true;
171
- }
172
-
173
- return false;
174
- }
175
-
176
- export function get_paragraph_prefix(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): string {
177
- if (!style_cache) {
178
- [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);
179
- }
180
- const pPr = findChild(paragraph._element, QN_W_PPR);
181
-
182
- if (pPr) {
183
- const oLvl = findChild(pPr, QN_W_OUTLINELVL);
184
- if (oLvl) {
185
- const val = oLvl.getAttribute(QN_W_VAL);
186
- if (val && /^\d+$/.test(val)) {
187
- const lvl = parseInt(val, 10);
188
- if (lvl >= 0 && lvl <= 8) return '#'.repeat(lvl + 1) + ' ';
189
- }
190
- }
191
- }
192
-
193
- let style_id = default_pstyle;
194
- if (pPr) {
195
- const pStyle = findChild(pPr, QN_W_PSTYLE);
196
- if (pStyle) style_id = pStyle.getAttribute(QN_W_VAL) || default_pstyle;
197
- }
198
-
199
- const style_info = style_id && style_cache ? style_cache[style_id] : null;
200
- if (style_info && style_info.outline_level !== null && style_info.outline_level >= 0 && style_info.outline_level <= 8) {
201
- return '#'.repeat(style_info.outline_level + 1) + ' ';
202
- }
203
-
204
- const style_name = style_info ? style_info.name : null;
205
- if (style_name?.startsWith('Heading')) {
206
- const match = style_name.replace('Heading', '').trim();
207
- if (/^\d+$/.test(match)) return '#'.repeat(parseInt(match, 10)) + ' ';
208
- }
209
-
210
- if (style_name === 'Title') return '# ';
211
-
212
- if (pPr) {
213
- const numPr = findChild(pPr, QN_W_NUMPR);
214
- if (numPr) {
215
- const numId = findChild(numPr, QN_W_NUMID);
216
- if (numId && numId.getAttribute(QN_W_VAL) !== '0') {
217
- let level = 0;
218
- const ilvl = findChild(numPr, QN_W_ILVL);
219
- if (ilvl) {
220
- const valAttr = ilvl.getAttribute(QN_W_VAL);
221
- if (valAttr) level = parseInt(valAttr, 10) || 0;
222
- }
223
- return ' '.repeat(level) + '* ';
224
- }
225
- }
226
- }
227
-
228
- if (style_name && style_name !== 'Normal') {
229
- const custom_level = _detect_heading_level_from_name(style_name);
230
- if (custom_level !== null) return '#'.repeat(custom_level) + ' ';
231
- }
232
-
233
- if (!style_name || style_name === 'Normal') {
234
- const text = paragraph.text.trim();
235
- if (text && text.length < 100 && text === text.toUpperCase()) {
236
- let is_bold = false;
237
- if (style_info?.bold) {
238
- is_bold = true;
239
- } else {
240
- const runs = findAllDescendants(paragraph._element, QN_W_R);
241
- for (const r of runs) {
242
- const tList = findAllDescendants(r, QN_W_T);
243
- const tText = tList.map(t => t.textContent || '').join('');
244
- if (tText.trim()) {
245
- const rPr_run = findChild(r, QN_W_RPR);
246
- if (rPr_run) {
247
- const b = findChild(rPr_run, QN_W_B);
248
- if (b && b.getAttribute(QN_W_VAL) !== '0' && b.getAttribute(QN_W_VAL) !== 'false') {
249
- is_bold = true;
250
- }
251
- }
252
- break;
253
- }
254
- }
255
- }
256
- if (is_bold) return '## ';
257
- }
258
- }
259
-
260
- return '';
261
- }
262
-
263
- export function is_heading_paragraph(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): boolean {
264
- const prefix = get_paragraph_prefix(paragraph, style_cache, default_pstyle);
265
- if (!prefix) return false;
266
- const stripped = prefix.trimEnd();
267
- return stripped.length > 0 && stripped === '#'.repeat(stripped.length);
268
- }
269
-
270
- export function get_run_style_markers(run: Run, is_heading: boolean | null = null): [string, string] {
271
- let prefix = '';
272
- let suffix = '';
273
-
274
- const rPr = findChild(run._element, QN_W_RPR);
275
- let is_bold = false;
276
- let is_italic = false;
277
-
278
- if (rPr) {
279
- const b = findChild(rPr, QN_W_B);
280
- if (b && b.getAttribute(QN_W_VAL) !== '0' && b.getAttribute(QN_W_VAL) !== 'false') is_bold = true;
281
-
282
- const i = findChild(rPr, QN_W_I);
283
- if (i && i.getAttribute(QN_W_VAL) !== '0' && i.getAttribute(QN_W_VAL) !== 'false') is_italic = true;
284
- }
285
-
286
- if (is_heading === null) {
287
- const parent = run._parent;
288
- is_heading = parent instanceof Paragraph ? is_native_heading(parent) : false;
289
- }
290
-
291
- if (is_bold && !is_heading) {
292
- prefix += '**';
293
- suffix = '**' + suffix;
294
- }
295
-
296
- if (is_italic) {
297
- prefix += '_';
298
- suffix = '_' + suffix;
299
- }
300
-
301
- return [prefix, suffix];
302
- }
303
-
304
- export function apply_formatting_to_segments(text: string, prefix: string, suffix: string): string {
305
- if (!prefix && !suffix) return text;
306
- if (!text) return '';
307
- if (!text.includes('\n')) return `${prefix}${text}${suffix}`;
308
-
309
- const parts = text.split('\n');
310
- return parts.map(p => p ? `${prefix}${p}${suffix}` : '').join('\n');
311
- }
312
-
313
- export function get_run_text(run: Run): string {
314
- let text = '';
315
- for (let i = 0; i < run._element.childNodes.length; i++) {
316
- const child = run._element.childNodes[i] as Element;
317
- if (child.nodeType !== 1) continue;
318
-
319
- if (child.tagName === QN_W_T || child.tagName === QN_W_DELTEXT) {
320
- const raw = child.textContent || '';
321
- text += raw.replace(/\t/g, ' ');
322
- } else if (child.tagName === QN_W_TAB) {
323
- text += ' ';
324
- } else if (child.tagName === QN_W_BR || child.tagName === QN_W_CR) {
325
- text += '\n';
326
- }
327
- }
328
- return text;
329
- }
330
-
331
- export function* iter_block_items(parent: any): Generator<Paragraph | Table | FootnoteItem> {
332
- const parent_elm = parent._element || parent.element || parent;
333
-
334
- if (parent.constructor.name === 'NotesPart') {
335
- const tag = parent.note_type === 'fn' ? 'w:footnote' : 'w:endnote';
336
- const notes = findAllDescendants(parent_elm, tag);
337
- for (const child of notes) {
338
- if (child.getAttribute('w:type') === 'separator' || child.getAttribute('w:type') === 'continuationSeparator') continue;
339
- yield new FootnoteItem(child, parent, parent.note_type);
340
- }
341
- return;
342
- }
343
-
344
- for (let i = 0; i < parent_elm.childNodes.length; i++) {
345
- const child = parent_elm.childNodes[i] as Element;
346
- if (child.nodeType !== 1) continue;
347
-
348
- if (child.tagName === QN_W_P) {
349
- yield new Paragraph(child, parent);
350
- } else if (child.tagName === 'w:tbl') {
351
- yield new Table(child, parent);
352
- }
353
- }
354
- }
355
-
356
- export function* iter_document_parts(doc: any): Generator<any> {
357
- // Simplified for TS port - just yield main document and notes for ingestion
358
- yield doc;
359
-
360
- const fnPart = doc.pkg.getPartByPath('word/footnotes.xml');
361
- const enPart = doc.pkg.getPartByPath('word/endnotes.xml');
362
-
363
- if (fnPart) yield new NotesPart(fnPart, 'fn');
364
- if (enPart) yield new NotesPart(enPart, 'en');
365
- }
366
-
367
- function _is_page_instr(instr: string): boolean {
368
- if (!instr) return false;
369
- const parts = instr.toUpperCase().trim().split(/\s+/);
370
- return parts.length > 0 && (parts[0] === 'PAGE' || parts[0] === 'NUMPAGES');
371
- }
372
-
373
- export function* iter_paragraph_content(paragraph: Paragraph): Generator<Run | DocxEvent> {
374
- let in_complex_field = false;
375
- let current_instr = '';
376
- let hide_result = false;
377
-
378
- function* process_run_element(r_element: Element): Generator<Run | DocxEvent> {
379
- let c_id: string | null = null;
380
- const rPr = findChild(r_element, QN_W_RPR);
381
- if (rPr) {
382
- const rPrChange = findChild(rPr, QN_W_RPRCHANGE);
383
- if (rPrChange) {
384
- c_id = rPrChange.getAttribute(QN_W_ID);
385
- yield { type: 'fmt_start', id: c_id!, author: rPrChange.getAttribute(QN_W_AUTHOR) || undefined, date: rPrChange.getAttribute(QN_W_DATE) || undefined };
386
- }
387
- }
388
-
389
- for (let i = 0; i < r_element.childNodes.length; i++) {
390
- const child = r_element.childNodes[i] as Element;
391
- if (child.nodeType !== 1) continue;
392
-
393
- const tag = child.tagName;
394
- if (tag === QN_W_COMMENTREFERENCE) {
395
- const ref_id = child.getAttribute(QN_W_ID);
396
- if (ref_id) yield { type: 'ref', id: ref_id };
397
- } else if (tag === QN_W_FOOTNOTEREFERENCE) {
398
- const f_id = child.getAttribute(QN_W_ID);
399
- if (f_id) yield { type: 'footnote', id: f_id };
400
- } else if (tag === QN_W_ENDNOTEREFERENCE) {
401
- const e_id = child.getAttribute(QN_W_ID);
402
- if (e_id) yield { type: 'endnote', id: e_id };
403
- } else if (tag === QN_W_FLDCHAR) {
404
- const fld_type = child.getAttribute(QN_W_FLDCHARTYPE);
405
- if (fld_type === 'begin') {
406
- in_complex_field = true;
407
- current_instr = '';
408
- } else if (fld_type === 'separate') {
409
- if (_is_page_instr(current_instr)) hide_result = true;
410
- else {
411
- const parts = current_instr.trim().split(/\s+/);
412
- if (parts.length > 1 && parts[0] === 'REF') yield { type: 'xref_start', id: parts[1] };
413
- }
414
- } else if (fld_type === 'end') {
415
- if (!hide_result) {
416
- const parts = current_instr.trim().split(/\s+/);
417
- if (parts.length > 1 && parts[0] === 'REF') yield { type: 'xref_end', id: parts[1] };
418
- }
419
- in_complex_field = false;
420
- current_instr = '';
421
- hide_result = false;
422
- }
423
- } else if (tag === QN_W_INSTRTEXT && in_complex_field && !hide_result) {
424
- current_instr += child.textContent || '';
425
- }
426
- }
427
-
428
- if (!hide_result) yield new Run(r_element, paragraph);
429
- if (c_id !== null) yield { type: 'fmt_end', id: c_id };
430
- }
431
-
432
- function* traverse_node(node: Element): Generator<Run | DocxEvent> {
433
- for (let i = 0; i < node.childNodes.length; i++) {
434
- const child = node.childNodes[i] as Element;
435
- if (child.nodeType !== 1) continue;
436
-
437
- const tag = child.tagName;
438
- if (tag === QN_W_R) yield* process_run_element(child);
439
- else if (tag === QN_W_INS) {
440
- const i_id = child.getAttribute(QN_W_ID)!;
441
- yield { type: 'ins_start', id: i_id, author: child.getAttribute(QN_W_AUTHOR) || undefined, date: child.getAttribute(QN_W_DATE) || undefined };
442
- yield* traverse_node(child);
443
- yield { type: 'ins_end', id: i_id };
444
- } else if (tag === QN_W_DEL) {
445
- const d_id = child.getAttribute(QN_W_ID)!;
446
- yield { type: 'del_start', id: d_id, author: child.getAttribute(QN_W_AUTHOR) || undefined, date: child.getAttribute(QN_W_DATE) || undefined };
447
- yield* traverse_node(child);
448
- yield { type: 'del_end', id: d_id };
449
- } else if (tag === QN_W_COMMENTRANGESTART) yield { type: 'start', id: child.getAttribute(QN_W_ID)! };
450
- else if (tag === QN_W_COMMENTRANGEEND) yield { type: 'end', id: child.getAttribute(QN_W_ID)! };
451
- else if (tag === QN_W_HYPERLINK) {
452
- const rId = child.getAttribute(QN_R_ID);
453
- let url = '';
454
- if (rId && paragraph._parent.part) {
455
- const rel = paragraph._parent.part.rels.get(rId);
456
- if (rel && rel.isExternal) url = rel.target;
457
- }
458
- if (url) yield { type: 'hyperlink_start', id: rId!, date: url };
459
- yield* traverse_node(child);
460
- if (url) yield { type: 'hyperlink_end', id: rId!, date: url };
461
- } else if (tag === QN_W_FLDSIMPLE) {
462
- const instr = child.getAttribute(QN_W_INSTR) || '';
463
- const parts = instr.trim().split(/\s+/);
464
- const target = (parts.length > 1 && parts[0] === 'REF') ? parts[1] : '';
465
- if (target) yield { type: 'xref_start', id: target };
466
- yield* traverse_node(child);
467
- if (target) yield { type: 'xref_end', id: target };
468
- } else if (tag === QN_W_BOOKMARKSTART) {
469
- const b_name = child.getAttribute(QN_W_NAME);
470
- if (b_name && (!b_name.startsWith('_') || b_name.startsWith('_Ref'))) yield { type: 'bookmark', id: b_name };
471
- } else if (tag === QN_W_SDT || tag === QN_W_SMARTTAG || tag === QN_W_SDTCONTENT) {
472
- yield* traverse_node(child);
473
- }
474
- }
475
- }
476
-
477
- yield* traverse_node(paragraph._element);
1
+ import { qn, findChild, findAllDescendants } from '../docx/dom.js';
2
+ import { Paragraph, Table, Run, NotesPart, FootnoteItem, DocxEvent } from '../docx/primitives.js';
3
+
4
+ export const QN_W_P = 'w:p';
5
+ export const QN_W_R = 'w:r';
6
+ export const QN_W_T = 'w:t';
7
+ export const QN_W_DELTEXT = 'w:delText';
8
+ export const QN_W_TAB = 'w:tab';
9
+ export const QN_W_BR = 'w:br';
10
+ export const QN_W_CR = 'w:cr';
11
+ export const QN_W_RPR = 'w:rPr';
12
+ export const QN_W_RPRCHANGE = 'w:rPrChange';
13
+ export const QN_W_COMMENTREFERENCE = 'w:commentReference';
14
+ export const QN_W_FOOTNOTEREFERENCE = 'w:footnoteReference';
15
+ export const QN_W_ENDNOTEREFERENCE = 'w:endnoteReference';
16
+ export const QN_W_FLDCHAR = 'w:fldChar';
17
+ export const QN_W_FLDCHARTYPE = 'w:fldCharType';
18
+ export const QN_W_INSTRTEXT = 'w:instrText';
19
+ export const QN_W_INS = 'w:ins';
20
+ export const QN_W_DEL = 'w:del';
21
+ export const QN_W_ID = 'w:id';
22
+ export const QN_W_AUTHOR = 'w:author';
23
+ export const QN_W_DATE = 'w:date';
24
+ export const QN_W_COMMENTRANGESTART = 'w:commentRangeStart';
25
+ export const QN_W_COMMENTRANGEEND = 'w:commentRangeEnd';
26
+ export const QN_W_HYPERLINK = 'w:hyperlink';
27
+ export const QN_R_ID = 'r:id';
28
+ export const QN_W_FLDSIMPLE = 'w:fldSimple';
29
+ export const QN_W_INSTR = 'w:instr';
30
+ export const QN_W_BOOKMARKSTART = 'w:bookmarkStart';
31
+ export const QN_W_NAME = 'w:name';
32
+ export const QN_W_SDT = 'w:sdt';
33
+ export const QN_W_SMARTTAG = 'w:smartTag';
34
+ export const QN_W_SDTCONTENT = 'w:sdtContent';
35
+ export const QN_W_B = 'w:b';
36
+ export const QN_W_I = 'w:i';
37
+ export const QN_W_VAL = 'w:val';
38
+ export const QN_W_PPR = 'w:pPr';
39
+ export const QN_W_PSTYLE = 'w:pStyle';
40
+ export const QN_W_OUTLINELVL = 'w:outlineLvl';
41
+ export const QN_W_NUMPR = 'w:numPr';
42
+ export const QN_W_NUMID = 'w:numId';
43
+ export const QN_W_ILVL = 'w:ilvl';
44
+
45
+ const _CUSTOM_HEADING_NAME_RE = /Heading[ ]?([1-6])(?![0-9])/;
46
+
47
+ export function _get_style_cache(part: any): [Record<string, any>, string | null] {
48
+ const pkg = part.package || part.pkg || (part.part ? part.part.pkg : null);
49
+ if (pkg && pkg._adeu_style_cache) {
50
+ return pkg._adeu_style_cache;
51
+ }
52
+
53
+ const cache: Record<string, any> = {};
54
+ let default_pstyle: string | null = null;
55
+ const raw_styles: Record<string, any> = {};
56
+
57
+ const stylesPart = pkg?.getPartByPath('word/styles.xml');
58
+ if (!stylesPart) {
59
+ const result: [Record<string, any>, string | null] = [cache, null];
60
+ if (pkg) pkg._adeu_style_cache = result;
61
+ return result;
62
+ }
63
+
64
+ const styles = findAllDescendants(stylesPart._element, 'w:style');
65
+ for (const s of styles) {
66
+ const s_id = s.getAttribute('w:styleId');
67
+ if (!s_id) continue;
68
+
69
+ const s_type = s.getAttribute('w:type');
70
+ const is_default = s.getAttribute('w:default') === '1' || s.getAttribute('w:default') === 'true';
71
+
72
+ if (s_type === 'paragraph' && is_default) default_pstyle = s_id;
73
+
74
+ const name_el = findChild(s, 'w:name');
75
+ const name = name_el ? name_el.getAttribute('w:val') : s_id;
76
+
77
+ const based_on_el = findChild(s, 'w:basedOn');
78
+ const based_on = based_on_el ? based_on_el.getAttribute('w:val') : null;
79
+
80
+ let outline_lvl: number | null = null;
81
+ const pPr = findChild(s, 'w:pPr');
82
+ if (pPr) {
83
+ const oLvl = findChild(pPr, 'w:outlineLvl');
84
+ if (oLvl) {
85
+ const val = oLvl.getAttribute('w:val');
86
+ if (val && /^\d+$/.test(val)) outline_lvl = parseInt(val, 10);
87
+ }
88
+ }
89
+
90
+ let bold: boolean | null = null;
91
+ const rPr = findChild(s, 'w:rPr');
92
+ if (rPr) {
93
+ const b = findChild(rPr, 'w:b');
94
+ if (b) {
95
+ const val = b.getAttribute('w:val');
96
+ bold = val !== '0' && val !== 'false' && val !== 'off';
97
+ }
98
+ }
99
+
100
+ raw_styles[s_id] = { name, based_on, outline_level: outline_lvl, bold };
101
+ }
102
+
103
+ const resolve_style = (s_id: string, visited: Set<string>): any => {
104
+ if (cache[s_id]) return cache[s_id];
105
+ if (visited.has(s_id) || !raw_styles[s_id]) return { name: s_id, outline_level: null, bold: false };
106
+
107
+ visited.add(s_id);
108
+ const raw = raw_styles[s_id];
109
+ const based_on_id = raw.based_on;
110
+
111
+ let o_lvl = raw.outline_level;
112
+ let bold_val = raw.bold !== null ? raw.bold : false;
113
+
114
+ if (based_on_id) {
115
+ const parent = resolve_style(based_on_id, visited);
116
+ if (o_lvl === null) o_lvl = parent.outline_level;
117
+ if (raw.bold === null) bold_val = parent.bold;
118
+ }
119
+
120
+ const resolved = { name: raw.name, outline_level: o_lvl, bold: bold_val };
121
+ cache[s_id] = resolved;
122
+ return resolved;
123
+ };
124
+
125
+ for (const s_id in raw_styles) resolve_style(s_id, new Set());
126
+
127
+ const result: [Record<string, any>, string | null] = [cache, default_pstyle];
128
+ if (pkg) pkg._adeu_style_cache = result;
129
+ return result;
130
+ }
131
+
132
+ function _detect_heading_level_from_name(name: string): number | null {
133
+ if (!name) return null;
134
+ const match = name.match(_CUSTOM_HEADING_NAME_RE);
135
+ return match ? parseInt(match[1], 10) : null;
136
+ }
137
+
138
+ export function is_native_heading(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): boolean {
139
+ if (!style_cache) {
140
+ [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);
141
+ }
142
+ const pPr = findChild(paragraph._element, QN_W_PPR);
143
+
144
+ if (pPr) {
145
+ const oLvl = findChild(pPr, QN_W_OUTLINELVL);
146
+ if (oLvl) {
147
+ const val = oLvl.getAttribute(QN_W_VAL);
148
+ if (val && /^\d+$/.test(val)) {
149
+ const lvl = parseInt(val, 10);
150
+ if (lvl >= 0 && lvl <= 8) return true;
151
+ }
152
+ }
153
+ }
154
+
155
+ let style_id = default_pstyle;
156
+ if (pPr) {
157
+ const pStyle = findChild(pPr, QN_W_PSTYLE);
158
+ if (pStyle) style_id = pStyle.getAttribute(QN_W_VAL) || default_pstyle;
159
+ }
160
+
161
+ const style_info = style_id && style_cache ? style_cache[style_id] : null;
162
+ if (style_info && style_info.outline_level !== null && style_info.outline_level >= 0 && style_info.outline_level <= 8) {
163
+ return true;
164
+ }
165
+
166
+ const style_name = style_info ? style_info.name : null;
167
+ if (style_name?.startsWith('Heading')) return true;
168
+ if (style_name === 'Title') return true;
169
+ if (style_name && style_name !== 'Normal') {
170
+ if (_detect_heading_level_from_name(style_name) !== null) return true;
171
+ }
172
+
173
+ return false;
174
+ }
175
+
176
+ export function get_paragraph_prefix(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): string {
177
+ if (!style_cache) {
178
+ [style_cache, default_pstyle] = _get_style_cache(paragraph._parent.part || paragraph._parent);
179
+ }
180
+ const pPr = findChild(paragraph._element, QN_W_PPR);
181
+
182
+ if (pPr) {
183
+ const oLvl = findChild(pPr, QN_W_OUTLINELVL);
184
+ if (oLvl) {
185
+ const val = oLvl.getAttribute(QN_W_VAL);
186
+ if (val && /^\d+$/.test(val)) {
187
+ const lvl = parseInt(val, 10);
188
+ if (lvl >= 0 && lvl <= 8) return '#'.repeat(lvl + 1) + ' ';
189
+ }
190
+ }
191
+ }
192
+
193
+ let style_id = default_pstyle;
194
+ if (pPr) {
195
+ const pStyle = findChild(pPr, QN_W_PSTYLE);
196
+ if (pStyle) style_id = pStyle.getAttribute(QN_W_VAL) || default_pstyle;
197
+ }
198
+
199
+ const style_info = style_id && style_cache ? style_cache[style_id] : null;
200
+ if (style_info && style_info.outline_level !== null && style_info.outline_level >= 0 && style_info.outline_level <= 8) {
201
+ return '#'.repeat(style_info.outline_level + 1) + ' ';
202
+ }
203
+
204
+ const style_name = style_info ? style_info.name : null;
205
+ if (style_name?.startsWith('Heading')) {
206
+ const match = style_name.replace('Heading', '').trim();
207
+ if (/^\d+$/.test(match)) return '#'.repeat(parseInt(match, 10)) + ' ';
208
+ }
209
+
210
+ if (style_name === 'Title') return '# ';
211
+
212
+ if (pPr) {
213
+ const numPr = findChild(pPr, QN_W_NUMPR);
214
+ if (numPr) {
215
+ const numId = findChild(numPr, QN_W_NUMID);
216
+ if (numId && numId.getAttribute(QN_W_VAL) !== '0') {
217
+ let level = 0;
218
+ const ilvl = findChild(numPr, QN_W_ILVL);
219
+ if (ilvl) {
220
+ const valAttr = ilvl.getAttribute(QN_W_VAL);
221
+ if (valAttr) level = parseInt(valAttr, 10) || 0;
222
+ }
223
+ return ' '.repeat(level) + '* ';
224
+ }
225
+ }
226
+ }
227
+
228
+ if (style_name && style_name !== 'Normal') {
229
+ const custom_level = _detect_heading_level_from_name(style_name);
230
+ if (custom_level !== null) return '#'.repeat(custom_level) + ' ';
231
+ }
232
+
233
+ if (!style_name || style_name === 'Normal') {
234
+ const text = paragraph.text.trim();
235
+ if (text && text.length < 100 && text === text.toUpperCase()) {
236
+ let is_bold = false;
237
+ if (style_info?.bold) {
238
+ is_bold = true;
239
+ } else {
240
+ const runs = findAllDescendants(paragraph._element, QN_W_R);
241
+ for (const r of runs) {
242
+ const tList = findAllDescendants(r, QN_W_T);
243
+ const tText = tList.map(t => t.textContent || '').join('');
244
+ if (tText.trim()) {
245
+ const rPr_run = findChild(r, QN_W_RPR);
246
+ if (rPr_run) {
247
+ const b = findChild(rPr_run, QN_W_B);
248
+ if (b && b.getAttribute(QN_W_VAL) !== '0' && b.getAttribute(QN_W_VAL) !== 'false') {
249
+ is_bold = true;
250
+ }
251
+ }
252
+ break;
253
+ }
254
+ }
255
+ }
256
+ if (is_bold) return '## ';
257
+ }
258
+ }
259
+
260
+ return '';
261
+ }
262
+
263
+ export function is_heading_paragraph(paragraph: Paragraph, style_cache?: Record<string, any>, default_pstyle?: string | null): boolean {
264
+ const prefix = get_paragraph_prefix(paragraph, style_cache, default_pstyle);
265
+ if (!prefix) return false;
266
+ const stripped = prefix.trimEnd();
267
+ return stripped.length > 0 && stripped === '#'.repeat(stripped.length);
268
+ }
269
+
270
+ export function get_run_style_markers(run: Run, is_heading: boolean | null = null): [string, string] {
271
+ let prefix = '';
272
+ let suffix = '';
273
+
274
+ const rPr = findChild(run._element, QN_W_RPR);
275
+ let is_bold = false;
276
+ let is_italic = false;
277
+
278
+ if (rPr) {
279
+ const b = findChild(rPr, QN_W_B);
280
+ if (b && b.getAttribute(QN_W_VAL) !== '0' && b.getAttribute(QN_W_VAL) !== 'false') is_bold = true;
281
+
282
+ const i = findChild(rPr, QN_W_I);
283
+ if (i && i.getAttribute(QN_W_VAL) !== '0' && i.getAttribute(QN_W_VAL) !== 'false') is_italic = true;
284
+ }
285
+
286
+ if (is_heading === null) {
287
+ const parent = run._parent;
288
+ is_heading = parent instanceof Paragraph ? is_native_heading(parent) : false;
289
+ }
290
+
291
+ if (is_bold && !is_heading) {
292
+ prefix += '**';
293
+ suffix = '**' + suffix;
294
+ }
295
+
296
+ if (is_italic) {
297
+ prefix += '_';
298
+ suffix = '_' + suffix;
299
+ }
300
+
301
+ return [prefix, suffix];
302
+ }
303
+
304
+ export function apply_formatting_to_segments(text: string, prefix: string, suffix: string): string {
305
+ if (!prefix && !suffix) return text;
306
+ if (!text) return '';
307
+ if (!text.includes('\n')) return `${prefix}${text}${suffix}`;
308
+
309
+ const parts = text.split('\n');
310
+ return parts.map(p => p ? `${prefix}${p}${suffix}` : '').join('\n');
311
+ }
312
+
313
+ export function get_run_text(run: Run): string {
314
+ let text = '';
315
+ for (let i = 0; i < run._element.childNodes.length; i++) {
316
+ const child = run._element.childNodes[i] as Element;
317
+ if (child.nodeType !== 1) continue;
318
+
319
+ if (child.tagName === QN_W_T || child.tagName === QN_W_DELTEXT) {
320
+ const raw = child.textContent || '';
321
+ text += raw.replace(/\t/g, ' ');
322
+ } else if (child.tagName === QN_W_TAB) {
323
+ text += ' ';
324
+ } else if (child.tagName === QN_W_BR || child.tagName === QN_W_CR) {
325
+ text += '\n';
326
+ }
327
+ }
328
+ return text;
329
+ }
330
+
331
+ export function* iter_block_items(parent: any): Generator<Paragraph | Table | FootnoteItem> {
332
+ const parent_elm = parent._element || parent.element || parent;
333
+
334
+ if (parent.constructor.name === 'NotesPart') {
335
+ const tag = parent.note_type === 'fn' ? 'w:footnote' : 'w:endnote';
336
+ const notes = findAllDescendants(parent_elm, tag);
337
+ for (const child of notes) {
338
+ if (child.getAttribute('w:type') === 'separator' || child.getAttribute('w:type') === 'continuationSeparator') continue;
339
+ yield new FootnoteItem(child, parent, parent.note_type);
340
+ }
341
+ return;
342
+ }
343
+
344
+ for (let i = 0; i < parent_elm.childNodes.length; i++) {
345
+ const child = parent_elm.childNodes[i] as Element;
346
+ if (child.nodeType !== 1) continue;
347
+
348
+ if (child.tagName === QN_W_P) {
349
+ yield new Paragraph(child, parent);
350
+ } else if (child.tagName === 'w:tbl') {
351
+ yield new Table(child, parent);
352
+ }
353
+ }
354
+ }
355
+
356
/**
 * Yields the parts of a document that are ingested: the document itself
 * first, then a `NotesPart` for `word/footnotes.xml` (type `'fn'`) and one
 * for `word/endnotes.xml` (type `'en'`), each only when the package returns
 * a part for that path.
 *
 * Simplified for the TS port - only the main document and the notes parts
 * are produced.
 *
 * @param doc Document object exposing `pkg.getPartByPath(path)`.
 */
export function* iter_document_parts(doc: any): Generator<any> {
  yield doc;

  // Both lookups happen before either notes part is yielded.
  const note_sources = [
    { kind: 'fn', part: doc.pkg.getPartByPath('word/footnotes.xml') },
    { kind: 'en', part: doc.pkg.getPartByPath('word/endnotes.xml') },
  ] as const;

  for (const { kind, part } of note_sources) {
    if (part) yield new NotesPart(part, kind);
  }
}
366
+
367
/**
 * Tells whether a field instruction is a page-number field.
 *
 * The instruction is upper-cased and trimmed, and its first
 * whitespace-delimited token is compared against `PAGE` and `NUMPAGES`.
 *
 * @param instr Raw field instruction text.
 * @returns `true` when the first token is `PAGE` or `NUMPAGES` (any case);
 *          `false` otherwise, including for an empty instruction.
 */
function _is_page_instr(instr: string): boolean {
  if (!instr) return false;

  const [keyword] = instr.toUpperCase().trim().split(/\s+/);
  switch (keyword) {
    case 'PAGE':
    case 'NUMPAGES':
      return true;
    default:
      return false;
  }
}
372
+
373
/**
 * Walks a paragraph's XML and yields, in document order, its runs
 * interleaved with marker events.
 *
 * Events produced (by `type`):
 * - `fmt_start` / `fmt_end` around a run whose `w:rPr` has a `w:rPrChange`;
 * - `ref`, `footnote`, `endnote` for comment / footnote / endnote references
 *   found inside a run;
 * - `xref_start` / `xref_end` for `REF` fields (complex `w:fldChar` fields
 *   and `w:fldSimple`);
 * - `ins_start` / `ins_end` and `del_start` / `del_end` around `w:ins` / `w:del`;
 * - `start` / `end` for comment range markers;
 * - `hyperlink_start` / `hyperlink_end` around hyperlinks whose relationship
 *   is external;
 * - `bookmark` for bookmark starts whose name does not begin with `_`, or
 *   begins with `_Ref`.
 *
 * Runs that belong to the result of a `PAGE` / `NUMPAGES` complex field are
 * not yielded.
 *
 * @param paragraph Paragraph wrapper; `_element` is traversed and
 *                  `_parent.part.rels` is used to resolve hyperlink targets.
 * @returns A generator of `Run` objects and `DocxEvent` markers.
 */
export function* iter_paragraph_content(paragraph: Paragraph): Generator<Run | DocxEvent> {
  // Complex-field state, shared by every run of the paragraph.
  let in_complex_field = false;
  let current_instr = '';
  let hide_result = false;

  // Returns the bookmark name targeted by a `REF <name> ...` instruction, or
  // '' when the instruction is not a REF field. The field keyword is matched
  // case-insensitively, consistent with `_is_page_instr`.
  const ref_target = (instr: string): string => {
    const [keyword, target] = instr.trim().split(/\s+/);
    return keyword?.toUpperCase() === 'REF' && target ? target : '';
  };

  function* process_run_element(r_element: Element): Generator<Run | DocxEvent> {
    const rPr = findChild(r_element, QN_W_RPR);
    const rPrChange = rPr ? findChild(rPr, QN_W_RPRCHANGE) : null;
    const fmt_id = rPrChange ? rPrChange.getAttribute(QN_W_ID) : null;

    if (rPrChange) {
      yield {
        type: 'fmt_start',
        id: fmt_id!,
        author: rPrChange.getAttribute(QN_W_AUTHOR) || undefined,
        date: rPrChange.getAttribute(QN_W_DATE) || undefined,
      };
    }

    for (let i = 0; i < r_element.childNodes.length; i++) {
      const child = r_element.childNodes[i] as Element;
      if (child.nodeType !== 1) continue;

      const tag = child.tagName;
      if (tag === QN_W_COMMENTREFERENCE) {
        const ref_id = child.getAttribute(QN_W_ID);
        if (ref_id) yield { type: 'ref', id: ref_id };
      } else if (tag === QN_W_FOOTNOTEREFERENCE) {
        const f_id = child.getAttribute(QN_W_ID);
        if (f_id) yield { type: 'footnote', id: f_id };
      } else if (tag === QN_W_ENDNOTEREFERENCE) {
        const e_id = child.getAttribute(QN_W_ID);
        if (e_id) yield { type: 'endnote', id: e_id };
      } else if (tag === QN_W_FLDCHAR) {
        const fld_type = child.getAttribute(QN_W_FLDCHARTYPE);
        if (fld_type === 'begin') {
          in_complex_field = true;
          current_instr = '';
        } else if (fld_type === 'separate') {
          // The instruction is complete here: either start hiding the
          // page-number result or open a cross-reference.
          if (_is_page_instr(current_instr)) {
            hide_result = true;
          } else {
            const target = ref_target(current_instr);
            if (target) yield { type: 'xref_start', id: target };
          }
        } else if (fld_type === 'end') {
          if (!hide_result) {
            const target = ref_target(current_instr);
            if (target) yield { type: 'xref_end', id: target };
          }
          in_complex_field = false;
          current_instr = '';
          hide_result = false;
        }
      } else if (tag === QN_W_INSTRTEXT && in_complex_field && !hide_result) {
        current_instr += child.textContent || '';
      }
    }

    // The run is suppressed only while a page-number result is being hidden.
    if (!hide_result) yield new Run(r_element, paragraph);

    // Close the formatting change whenever it was opened, so fmt_start and
    // fmt_end stay balanced even if w:rPrChange carries no w:id.
    if (rPrChange) yield { type: 'fmt_end', id: fmt_id! };
  }

  function* traverse_node(node: Element): Generator<Run | DocxEvent> {
    for (let i = 0; i < node.childNodes.length; i++) {
      const child = node.childNodes[i] as Element;
      if (child.nodeType !== 1) continue;

      const tag = child.tagName;
      if (tag === QN_W_R) {
        yield* process_run_element(child);
      } else if (tag === QN_W_INS) {
        const i_id = child.getAttribute(QN_W_ID)!;
        yield {
          type: 'ins_start',
          id: i_id,
          author: child.getAttribute(QN_W_AUTHOR) || undefined,
          date: child.getAttribute(QN_W_DATE) || undefined,
        };
        yield* traverse_node(child);
        yield { type: 'ins_end', id: i_id };
      } else if (tag === QN_W_DEL) {
        const d_id = child.getAttribute(QN_W_ID)!;
        yield {
          type: 'del_start',
          id: d_id,
          author: child.getAttribute(QN_W_AUTHOR) || undefined,
          date: child.getAttribute(QN_W_DATE) || undefined,
        };
        yield* traverse_node(child);
        yield { type: 'del_end', id: d_id };
      } else if (tag === QN_W_COMMENTRANGESTART) {
        yield { type: 'start', id: child.getAttribute(QN_W_ID)! };
      } else if (tag === QN_W_COMMENTRANGEEND) {
        yield { type: 'end', id: child.getAttribute(QN_W_ID)! };
      } else if (tag === QN_W_HYPERLINK) {
        const rId = child.getAttribute(QN_R_ID);
        let url = '';
        if (rId && paragraph._parent.part) {
          const rel = paragraph._parent.part.rels.get(rId);
          if (rel && rel.isExternal) url = rel.target;
        }
        // The resolved URL travels in the event's `date` field.
        if (url) yield { type: 'hyperlink_start', id: rId!, date: url };
        yield* traverse_node(child);
        if (url) yield { type: 'hyperlink_end', id: rId!, date: url };
      } else if (tag === QN_W_FLDSIMPLE) {
        const target = ref_target(child.getAttribute(QN_W_INSTR) || '');
        if (target) yield { type: 'xref_start', id: target };
        yield* traverse_node(child);
        if (target) yield { type: 'xref_end', id: target };
      } else if (tag === QN_W_BOOKMARKSTART) {
        const b_name = child.getAttribute(QN_W_NAME);
        // Underscore-prefixed bookmarks are skipped, except `_Ref...` ones.
        if (b_name && (!b_name.startsWith('_') || b_name.startsWith('_Ref'))) {
          yield { type: 'bookmark', id: b_name };
        }
      } else if (tag === QN_W_SDT || tag === QN_W_SMARTTAG || tag === QN_W_SDTCONTENT) {
        // Transparent containers: descend without emitting anything.
        yield* traverse_node(child);
      }
    }
  }

  yield* traverse_node(paragraph._element);
}