@usejunior/docx-core 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. package/dist/.tsbuildinfo +1 -1
  2. package/dist/atomizer.d.ts +15 -1
  3. package/dist/atomizer.d.ts.map +1 -1
  4. package/dist/atomizer.js +37 -1
  5. package/dist/atomizer.js.map +1 -1
  6. package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -1
  7. package/dist/baselines/atomizer/documentReconstructor.js +218 -90
  8. package/dist/baselines/atomizer/documentReconstructor.js.map +1 -1
  9. package/dist/baselines/atomizer/formattingFidelity.d.ts +99 -0
  10. package/dist/baselines/atomizer/formattingFidelity.d.ts.map +1 -0
  11. package/dist/baselines/atomizer/formattingFidelity.js +449 -0
  12. package/dist/baselines/atomizer/formattingFidelity.js.map +1 -0
  13. package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts +37 -0
  14. package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts.map +1 -0
  15. package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js +189 -0
  16. package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js.map +1 -0
  17. package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts +74 -0
  18. package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts.map +1 -0
  19. package/dist/baselines/atomizer/inPlaceModifier-containers.js +171 -0
  20. package/dist/baselines/atomizer/inPlaceModifier-containers.js.map +1 -0
  21. package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts +88 -0
  22. package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts.map +1 -0
  23. package/dist/baselines/atomizer/inPlaceModifier-deletion.js +326 -0
  24. package/dist/baselines/atomizer/inPlaceModifier-deletion.js.map +1 -0
  25. package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts +85 -0
  26. package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts.map +1 -0
  27. package/dist/baselines/atomizer/inPlaceModifier-postprocess.js +402 -0
  28. package/dist/baselines/atomizer/inPlaceModifier-postprocess.js.map +1 -0
  29. package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts +39 -0
  30. package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts.map +1 -0
  31. package/dist/baselines/atomizer/inPlaceModifier-presplit.js +265 -0
  32. package/dist/baselines/atomizer/inPlaceModifier-presplit.js.map +1 -0
  33. package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts +62 -0
  34. package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts.map +1 -0
  35. package/dist/baselines/atomizer/inPlaceModifier-shared.js +139 -0
  36. package/dist/baselines/atomizer/inPlaceModifier-shared.js.map +1 -0
  37. package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts +189 -0
  38. package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts.map +1 -0
  39. package/dist/baselines/atomizer/inPlaceModifier-wrappers.js +427 -0
  40. package/dist/baselines/atomizer/inPlaceModifier-wrappers.js.map +1 -0
  41. package/dist/baselines/atomizer/inPlaceModifier.d.ts +6 -290
  42. package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -1
  43. package/dist/baselines/atomizer/inPlaceModifier.js +23 -1828
  44. package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -1
  45. package/dist/baselines/atomizer/pipeline.d.ts +76 -1
  46. package/dist/baselines/atomizer/pipeline.d.ts.map +1 -1
  47. package/dist/baselines/atomizer/pipeline.js +204 -27
  48. package/dist/baselines/atomizer/pipeline.js.map +1 -1
  49. package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -1
  50. package/dist/baselines/atomizer/trackChangesAcceptorAst.js +56 -160
  51. package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -1
  52. package/dist/compare-types.d.ts +151 -0
  53. package/dist/compare-types.d.ts.map +1 -0
  54. package/dist/compare-types.js +2 -0
  55. package/dist/compare-types.js.map +1 -0
  56. package/dist/core-types.d.ts +5 -1
  57. package/dist/core-types.d.ts.map +1 -1
  58. package/dist/core-types.js +5 -1
  59. package/dist/core-types.js.map +1 -1
  60. package/dist/footnotes.d.ts +8 -3
  61. package/dist/footnotes.d.ts.map +1 -1
  62. package/dist/footnotes.js +8 -3
  63. package/dist/footnotes.js.map +1 -1
  64. package/dist/index.d.ts +6 -150
  65. package/dist/index.d.ts.map +1 -1
  66. package/dist/index.js +6 -0
  67. package/dist/index.js.map +1 -1
  68. package/dist/integration/libreoffice-oracle.d.ts +41 -0
  69. package/dist/integration/libreoffice-oracle.d.ts.map +1 -0
  70. package/dist/integration/libreoffice-oracle.js +282 -0
  71. package/dist/integration/libreoffice-oracle.js.map +1 -0
  72. package/dist/primitives/accept_changes.d.ts +2 -2
  73. package/dist/primitives/accept_changes.d.ts.map +1 -1
  74. package/dist/primitives/accept_changes.js +24 -79
  75. package/dist/primitives/accept_changes.js.map +1 -1
  76. package/dist/primitives/comments.d.ts +12 -3
  77. package/dist/primitives/comments.d.ts.map +1 -1
  78. package/dist/primitives/comments.js +374 -97
  79. package/dist/primitives/comments.js.map +1 -1
  80. package/dist/primitives/content_fingerprint.d.ts +29 -0
  81. package/dist/primitives/content_fingerprint.d.ts.map +1 -0
  82. package/dist/primitives/content_fingerprint.js +63 -0
  83. package/dist/primitives/content_fingerprint.js.map +1 -0
  84. package/dist/primitives/document.d.ts +56 -15
  85. package/dist/primitives/document.d.ts.map +1 -1
  86. package/dist/primitives/document.js +303 -32
  87. package/dist/primitives/document.js.map +1 -1
  88. package/dist/primitives/document_view-comments.d.ts +18 -0
  89. package/dist/primitives/document_view-comments.d.ts.map +1 -0
  90. package/dist/primitives/document_view-comments.js +159 -0
  91. package/dist/primitives/document_view-comments.js.map +1 -0
  92. package/dist/primitives/document_view-headings.d.ts +45 -0
  93. package/dist/primitives/document_view-headings.d.ts.map +1 -0
  94. package/dist/primitives/document_view-headings.js +247 -0
  95. package/dist/primitives/document_view-headings.js.map +1 -0
  96. package/dist/primitives/document_view-styles.d.ts +11 -0
  97. package/dist/primitives/document_view-styles.d.ts.map +1 -0
  98. package/dist/primitives/document_view-styles.js +104 -0
  99. package/dist/primitives/document_view-styles.js.map +1 -0
  100. package/dist/primitives/document_view-toon.d.ts +37 -0
  101. package/dist/primitives/document_view-toon.d.ts.map +1 -0
  102. package/dist/primitives/document_view-toon.js +199 -0
  103. package/dist/primitives/document_view-toon.js.map +1 -0
  104. package/dist/primitives/document_view-types.d.ts +137 -0
  105. package/dist/primitives/document_view-types.d.ts.map +1 -0
  106. package/dist/primitives/document_view-types.js +2 -0
  107. package/dist/primitives/document_view-types.js.map +1 -0
  108. package/dist/primitives/document_view.d.ts +8 -106
  109. package/dist/primitives/document_view.d.ts.map +1 -1
  110. package/dist/primitives/document_view.js +134 -301
  111. package/dist/primitives/document_view.js.map +1 -1
  112. package/dist/primitives/dom-helpers.d.ts +9 -0
  113. package/dist/primitives/dom-helpers.d.ts.map +1 -1
  114. package/dist/primitives/dom-helpers.js +10 -1
  115. package/dist/primitives/dom-helpers.js.map +1 -1
  116. package/dist/primitives/footnotes.d.ts +4 -3
  117. package/dist/primitives/footnotes.d.ts.map +1 -1
  118. package/dist/primitives/footnotes.js +232 -44
  119. package/dist/primitives/footnotes.js.map +1 -1
  120. package/dist/primitives/formatting_tags.d.ts +6 -0
  121. package/dist/primitives/formatting_tags.d.ts.map +1 -1
  122. package/dist/primitives/formatting_tags.js +6 -1
  123. package/dist/primitives/formatting_tags.js.map +1 -1
  124. package/dist/primitives/index.d.ts +6 -0
  125. package/dist/primitives/index.d.ts.map +1 -1
  126. package/dist/primitives/index.js +5 -0
  127. package/dist/primitives/index.js.map +1 -1
  128. package/dist/primitives/layout.d.ts +4 -3
  129. package/dist/primitives/layout.d.ts.map +1 -1
  130. package/dist/primitives/layout.js +32 -3
  131. package/dist/primitives/layout.js.map +1 -1
  132. package/dist/primitives/merge_runs.d.ts +21 -3
  133. package/dist/primitives/merge_runs.d.ts.map +1 -1
  134. package/dist/primitives/merge_runs.js +32 -10
  135. package/dist/primitives/merge_runs.js.map +1 -1
  136. package/dist/primitives/namespaces.d.ts +6 -0
  137. package/dist/primitives/namespaces.d.ts.map +1 -1
  138. package/dist/primitives/namespaces.js +9 -0
  139. package/dist/primitives/namespaces.js.map +1 -1
  140. package/dist/primitives/reject_changes.d.ts +2 -2
  141. package/dist/primitives/reject_changes.d.ts.map +1 -1
  142. package/dist/primitives/reject_changes.js +24 -81
  143. package/dist/primitives/reject_changes.js.map +1 -1
  144. package/dist/primitives/semantic_tags.d.ts +7 -0
  145. package/dist/primitives/semantic_tags.d.ts.map +1 -1
  146. package/dist/primitives/semantic_tags.js +21 -3
  147. package/dist/primitives/semantic_tags.js.map +1 -1
  148. package/dist/primitives/serialize_html.d.ts +36 -0
  149. package/dist/primitives/serialize_html.d.ts.map +1 -0
  150. package/dist/primitives/serialize_html.js +393 -0
  151. package/dist/primitives/serialize_html.js.map +1 -0
  152. package/dist/primitives/serialize_markdown.d.ts +16 -0
  153. package/dist/primitives/serialize_markdown.d.ts.map +1 -0
  154. package/dist/primitives/serialize_markdown.js +300 -0
  155. package/dist/primitives/serialize_markdown.js.map +1 -0
  156. package/dist/primitives/serialize_plaintext.d.ts +15 -0
  157. package/dist/primitives/serialize_plaintext.d.ts.map +1 -0
  158. package/dist/primitives/serialize_plaintext.js +154 -0
  159. package/dist/primitives/serialize_plaintext.js.map +1 -0
  160. package/dist/primitives/styles.js +22 -22
  161. package/dist/primitives/styles.js.map +1 -1
  162. package/dist/primitives/tables.d.ts.map +1 -1
  163. package/dist/primitives/tables.js +13 -3
  164. package/dist/primitives/tables.js.map +1 -1
  165. package/dist/primitives/text.d.ts +2 -1
  166. package/dist/primitives/text.d.ts.map +1 -1
  167. package/dist/primitives/text.js +116 -12
  168. package/dist/primitives/text.js.map +1 -1
  169. package/dist/primitives/track-changes-emitter.d.ts +139 -0
  170. package/dist/primitives/track-changes-emitter.d.ts.map +1 -0
  171. package/dist/primitives/track-changes-emitter.js +241 -0
  172. package/dist/primitives/track-changes-emitter.js.map +1 -0
  173. package/dist/primitives/xml-helpers.d.ts +29 -0
  174. package/dist/primitives/xml-helpers.d.ts.map +1 -0
  175. package/dist/primitives/xml-helpers.js +35 -0
  176. package/dist/primitives/xml-helpers.js.map +1 -0
  177. package/dist/shared/ooxml/namespaces.d.ts +4 -1
  178. package/dist/shared/ooxml/namespaces.d.ts.map +1 -1
  179. package/dist/shared/ooxml/namespaces.js +4 -1
  180. package/dist/shared/ooxml/namespaces.js.map +1 -1
  181. package/package.json +7 -6
@@ -1,22 +1,26 @@
1
1
  import { OOXML, W } from './namespaces.js';
2
+ import { getAttributeSafe, getFirstChild } from './xml-helpers.js';
2
3
  import { getParagraphText, getParagraphRuns } from './text.js';
3
- import { extractListLabel, stripListLabel, LabelType } from './list_labels.js';
4
+ import { extractListLabel, stripListLabel } from './list_labels.js';
4
5
  import { parseNumberingXml, computeListLabelForParagraph } from './numbering.js';
5
6
  import { parseStylesXml, extractParagraphFormatting, extractEffectiveRunFormatting } from './styles.js';
6
7
  import { HIGHLIGHT_TAG } from './semantic_tags.js';
7
8
  import { computeModalBaseline, computeParagraphFontBaseline, emitFormattingTags, mergeAdjacentTags } from './formatting_tags.js';
8
9
  import { isReservedFootnote } from './footnotes.js';
9
- const SHORT_HEADER_MAX_LENGTH = 50;
10
- const MAX_HEADER_TEXT_LENGTH = 60;
11
- const STYLE_EXAMPLE_TEXT_PREVIEW_LENGTH = 50;
10
+ import { deriveHeading, detectRunInHeader, detectTitleCapsCentered, extractHeaderInfo, suppressSignatureClusters, } from './document_view-headings.js';
11
+ import { discoverStyles, fingerprintKey } from './document_view-styles.js';
12
+ import { findTaggedTextInsertionIndex } from './document_view-comments.js';
13
+ export { discoverStyles } from './document_view-styles.js';
14
+ export { INLINE_COMMENT_MARKER_RUNTIME, TOON_INLINE_TAG_RE, collectInlineCommentMarkers, tokenizeToonInline } from './document_view-comments.js';
15
+ export { collectTableMarkerInfo, formatTableMarker, formatToonCommentEndnoteLines, formatToonCommentLines, formatToonCommentsEndnotesBlock, formatToonDataLine, renderToon, renderToonWithCommentEndnotes, } from './document_view-toon.js';
12
16
  function getWAttr(el, localName) {
13
- return el.getAttributeNS(OOXML.W_NS, localName) ?? el.getAttribute(`w:${localName}`) ?? el.getAttribute(localName);
17
+ return getAttributeSafe(el, OOXML.W_NS, localName, 'w');
14
18
  }
15
19
  function runHighlightVal(run) {
16
- const rPr = run.getElementsByTagNameNS(OOXML.W_NS, W.rPr).item(0);
20
+ const rPr = getFirstChild(run, OOXML.W_NS, W.rPr);
17
21
  if (!rPr)
18
22
  return null;
19
- const h = rPr.getElementsByTagNameNS(OOXML.W_NS, W.highlight).item(0);
23
+ const h = getFirstChild(rPr, OOXML.W_NS, W.highlight);
20
24
  if (!h)
21
25
  return null;
22
26
  const v = getWAttr(h, 'val');
@@ -46,284 +50,6 @@ function emitHighlightTagsFromParagraph(p) {
46
50
  out.push(`</${HIGHLIGHT_TAG}>`);
47
51
  return out.join('');
48
52
  }
49
- function fingerprintKey(fp) {
50
- // Stable JSON-ish key used for Map lookups.
51
- return `${fp.list_level}|${fp.left_indent_pt.toFixed(1)}|${fp.first_line_indent_pt.toFixed(1)}|${fp.style_name}|${fp.alignment}`;
52
- }
53
- /**
54
- * v0.3: Compact style fingerprint token.
55
- * Concatenates style name, list level, alignment, and indentation for token-efficient LLM context.
56
- * Example: "Normal:L-1:LEFT:I0:H0"
57
- */
58
- function computeFingerprintToken(fp, styleId) {
59
- const name = styleId || fp.style_name || 'body';
60
- const level = `L${fp.list_level}`;
61
- const align = fp.alignment;
62
- const indent = `I${Math.round(fp.left_indent_pt)}`;
63
- const hanging = `H${Math.round(fp.first_line_indent_pt)}`;
64
- return `${name}:${level}:${align}:${indent}:${hanging}`;
65
- }
66
- // Pattern-based header detection fallback (ported from Python ingestor._extract_header_info).
67
- const HEADER_PATTERN = /^([A-Z][^.!?:]*(?:\s+[A-Z][^.!?:]*)*)([.:]?)(?:\s|$)/;
68
- function extractHeaderInfo(cleanText) {
69
- if (!cleanText || cleanText.length < 2)
70
- return { header_text: null, header_style: null };
71
- if (!/^[A-Z]/.test(cleanText))
72
- return { header_text: null, header_style: null };
73
- const stripped = cleanText.trim();
74
- if (stripped.length <= SHORT_HEADER_MAX_LENGTH) {
75
- if (stripped.endsWith('.'))
76
- return { header_text: stripped.slice(0, -1), header_style: 'title_with_period' };
77
- if (stripped.endsWith(':'))
78
- return { header_text: stripped.slice(0, -1), header_style: 'title_with_colon' };
79
- const words = stripped.split(/\s+/);
80
- if (words.length <= 5)
81
- return { header_text: stripped, header_style: 'title_bare' };
82
- return { header_text: null, header_style: null };
83
- }
84
- const m = HEADER_PATTERN.exec(stripped);
85
- if (!m)
86
- return { header_text: null, header_style: null };
87
- const headerText = (m[1] ?? '').trim();
88
- const terminator = m[2] ?? '';
89
- const remaining = stripped.slice(m[0].length);
90
- if (!remaining || headerText.length > MAX_HEADER_TEXT_LENGTH)
91
- return { header_text: null, header_style: null };
92
- if (terminator === '.')
93
- return { header_text: headerText, header_style: 'title_with_period' };
94
- if (terminator === ':')
95
- return { header_text: headerText, header_style: 'title_with_colon' };
96
- return { header_text: headerText, header_style: 'title_bare' };
97
- }
98
- function detectRunInHeader(params) {
99
- const { paragraph, paragraphPPr, paragraphStyleId, styles } = params;
100
- const punct = new Set(['.', ':', '-']);
101
- // Use visible runs only (field code text stripped in getParagraphRuns()).
102
- const runs = getParagraphRuns(paragraph);
103
- if (runs.length === 0)
104
- return null;
105
- // Group by run element, preserving order.
106
- const orderedUniqueRuns = [];
107
- const seen = new Set();
108
- for (const tr of runs) {
109
- if (!seen.has(tr.r)) {
110
- seen.add(tr.r);
111
- orderedUniqueRuns.push(tr.r);
112
- }
113
- }
114
- let headerText = '';
115
- let formatting = null;
116
- let headerCharCount = 0;
117
- for (const r of orderedUniqueRuns) {
118
- const fmt = extractEffectiveRunFormatting({ run: r, paragraphPPr, paragraphStyleId, styles });
119
- const isHeaderStyle = fmt.bold || fmt.underline;
120
- if (!isHeaderStyle)
121
- break;
122
- // Accumulate run text.
123
- const ts = Array.from(r.getElementsByTagNameNS(OOXML.W_NS, W.t));
124
- for (const t of ts) {
125
- const tc = t.textContent ?? '';
126
- headerText += tc;
127
- headerCharCount += tc.length;
128
- }
129
- if (!formatting)
130
- formatting = { bold: fmt.bold, italic: fmt.italic, underline: fmt.underline };
131
- }
132
- const trimmed = headerText.trim();
133
- if (!trimmed)
134
- return null;
135
- if (!punct.has(trimmed[trimmed.length - 1]))
136
- return null;
137
- if (!formatting)
138
- return null;
139
- return { raw_text: trimmed, formatting, headerCharCount };
140
- }
141
- function inferSemanticName(params) {
142
- const { fp, nodes } = params;
143
- // Find first label_type if present.
144
- let labelType = null;
145
- for (const n of nodes) {
146
- if (n.list_metadata.label_type) {
147
- labelType = n.list_metadata.label_type;
148
- break;
149
- }
150
- }
151
- const listLevel = fp.list_level;
152
- if (listLevel >= 0) {
153
- if (listLevel === 0) {
154
- if (labelType === LabelType.ARTICLE)
155
- return { base_id: 'article', display_name: 'Article Heading' };
156
- if (labelType === LabelType.SECTION)
157
- return { base_id: 'section', display_name: 'Section Heading' };
158
- if (labelType === LabelType.ROMAN)
159
- return { base_id: 'roman_section', display_name: 'Roman Numeral Section' };
160
- return { base_id: 'top_level', display_name: 'Top-Level List Item' };
161
- }
162
- if (listLevel === 1) {
163
- if (labelType === LabelType.LETTER)
164
- return { base_id: 'subsection', display_name: 'Subsection (a)/(A)' };
165
- if (labelType === LabelType.NUMBER)
166
- return { base_id: 'subsection_number', display_name: 'Numbered Subsection' };
167
- if (labelType === LabelType.ROMAN)
168
- return { base_id: 'subsection_roman', display_name: 'Roman Subsection' };
169
- return { base_id: 'level_1', display_name: `Level ${listLevel} List Item` };
170
- }
171
- if (labelType === LabelType.ROMAN)
172
- return { base_id: `level_${listLevel}_roman`, display_name: `Level ${listLevel} Roman` };
173
- if (labelType === LabelType.LETTER)
174
- return { base_id: `level_${listLevel}_letter`, display_name: `Level ${listLevel} Letter` };
175
- return { base_id: `level_${listLevel}`, display_name: `Level ${listLevel} List Item` };
176
- }
177
- // Non-list.
178
- const styleName = fp.style_name.toLowerCase().replace(/\s+/g, '_');
179
- if (fp.left_indent_pt > 0)
180
- return { base_id: 'indent_block', display_name: 'Indented Block' };
181
- if (styleName.includes('heading') || styleName.includes('title'))
182
- return { base_id: 'heading', display_name: 'Heading' };
183
- if (styleName.includes('quote') || styleName.includes('block'))
184
- return { base_id: 'block_quote', display_name: 'Block Quote' };
185
- return { base_id: 'body', display_name: 'Body Text' };
186
- }
187
- export function discoverStyles(nodes) {
188
- const groups = new Map();
189
- for (const n of nodes) {
190
- const key = fingerprintKey(n.style_fingerprint);
191
- const g = groups.get(key);
192
- if (g)
193
- g.nodes.push(n);
194
- else
195
- groups.set(key, { fp: n.style_fingerprint, nodes: [n] });
196
- }
197
- const used = {};
198
- const styles = new Map();
199
- const fpToStyle = new Map();
200
- for (const [fpKey, g] of groups.entries()) {
201
- const { base_id, display_name } = inferSemanticName({ fp: g.fp, nodes: g.nodes });
202
- let styleId = base_id;
203
- if (used[base_id] !== undefined) {
204
- used[base_id] += 1;
205
- styleId = `${base_id}_${used[base_id]}`;
206
- }
207
- else {
208
- used[base_id] = 0;
209
- }
210
- const median = g.nodes[Math.floor(g.nodes.length / 2)];
211
- const info = {
212
- style_id: styleId,
213
- display_name,
214
- fingerprint: g.fp,
215
- example_node_id: median.id,
216
- example_text: median.clean_text.slice(0, STYLE_EXAMPLE_TEXT_PREVIEW_LENGTH),
217
- count: g.nodes.length,
218
- dominant_alignment: g.fp.alignment,
219
- };
220
- styles.set(styleId, info);
221
- fpToStyle.set(fpKey, styleId);
222
- }
223
- return { styles, fingerprint_to_style: fpToStyle };
224
- }
225
- function headerStripFromText(params) {
226
- // Mirrors Python TOONRenderer header stripping.
227
- const { header } = params;
228
- let { text } = params;
229
- if (!header)
230
- return text;
231
- const headerNorm = header.trim().toLowerCase();
232
- const textLower = text.toLowerCase();
233
- for (const punct of [':', '.', '-', ';', '']) {
234
- const testPrefix = `${headerNorm}${punct}`;
235
- if (textLower.startsWith(testPrefix)) {
236
- text = text.slice(testPrefix.length).trimStart();
237
- return text;
238
- }
239
- }
240
- if (text.startsWith(header)) {
241
- text = text.slice(header.length).replace(/^[.:\-;]+/, '').trimStart();
242
- }
243
- return text;
244
- }
245
- /**
246
- * Format a single toon data line for one DocumentViewNode.
247
- * Handles table-context-aware style (th/td) and header stripping.
248
- */
249
- export function formatToonDataLine(n, options) {
250
- let text = n.tagged_text;
251
- if (n.header)
252
- text = headerStripFromText({ header: n.header, text });
253
- let header = n.header;
254
- if (header && !text) {
255
- text = header;
256
- header = '';
257
- }
258
- const tc = n.table_context;
259
- let style;
260
- if (tc) {
261
- style = tc.is_header_row
262
- ? `th(${tc.row_index},${tc.col_index})`
263
- : `td(${tc.row_index},${tc.col_index})`;
264
- }
265
- else {
266
- style = options?.compact
267
- ? computeFingerprintToken(n.style_fingerprint, n.style)
268
- : n.style;
269
- }
270
- return `${n.id} | ${n.list_label} | ${header} | ${style} | ${text}`;
271
- }
272
- /**
273
- * Collect table marker info (dimensions) from nodes for #TABLE markers.
274
- * Column headers are NOT included in the marker — they appear once in the th() rows.
275
- */
276
- export function collectTableMarkerInfo(nodes) {
277
- const info = new Map();
278
- for (const n of nodes) {
279
- const tc = n.table_context;
280
- if (!tc)
281
- continue;
282
- if (!info.has(tc.table_index)) {
283
- info.set(tc.table_index, {
284
- id: tc.table_id,
285
- totalRows: tc.total_rows,
286
- totalCols: tc.total_cols,
287
- });
288
- }
289
- }
290
- return info;
291
- }
292
- /**
293
- * Format a #TABLE marker line from collected table info.
294
- * Headers are omitted — they appear exactly once in the th(0,N) data rows.
295
- */
296
- export function formatTableMarker(info) {
297
- return `#TABLE ${info.id} | ${info.totalRows} rows × ${info.totalCols} cols`;
298
- }
299
- export function renderToon(nodes, options = {}) {
300
- const lines = ['#SCHEMA id | list_label | header | style | text'];
301
- // Pre-scan: collect table marker info for #TABLE lines
302
- const tableInfo = collectTableMarkerInfo(nodes);
303
- let currentTableIndex = null;
304
- for (const n of nodes) {
305
- const tc = n.table_context;
306
- const nodeTableIndex = tc ? tc.table_index : null;
307
- // Close previous table if we left it or moved to a different table
308
- if (currentTableIndex !== null && nodeTableIndex !== currentTableIndex) {
309
- lines.push('#END_TABLE');
310
- currentTableIndex = null;
311
- }
312
- // Open new table if entering one
313
- if (nodeTableIndex !== null && currentTableIndex === null) {
314
- const info = tableInfo.get(nodeTableIndex);
315
- if (info)
316
- lines.push(formatTableMarker(info));
317
- currentTableIndex = nodeTableIndex;
318
- }
319
- lines.push(formatToonDataLine(n, options));
320
- }
321
- // Close any open table at end
322
- if (currentTableIndex !== null) {
323
- lines.push('#END_TABLE');
324
- }
325
- return lines.join('\n');
326
- }
327
53
  export function buildDocumentView(params) {
328
54
  const { documentXml, stylesXml, numberingXml, opts } = params;
329
55
  const includeSemantic = opts?.include_semantic_tags ?? true;
@@ -334,7 +60,7 @@ export function buildDocumentView(params) {
334
60
  void numberingModel;
335
61
  const counters = new Map();
336
62
  void counters;
337
- const body = documentXml.getElementsByTagNameNS(OOXML.W_NS, W.body).item(0);
63
+ const body = getFirstChild(documentXml, OOXML.W_NS, W.body);
338
64
  if (!body)
339
65
  return { nodes: [], styles: { styles: new Map(), fingerprint_to_style: new Map() } };
340
66
  const paragraphs = Array.from(body.getElementsByTagNameNS(OOXML.W_NS, W.p));
@@ -357,9 +83,7 @@ function resolveRunHyperlinkUrl(runEl, relsMap) {
357
83
  if (!parent || parent.localName !== W.hyperlink)
358
84
  return null;
359
85
  // r:id attribute can be namespaced or prefixed.
360
- const rId = parent.getAttributeNS(OOXML.R_NS, 'id') ??
361
- parent.getAttribute('r:id') ??
362
- null;
86
+ const rId = getAttributeSafe(parent, OOXML.R_NS, 'id', 'r', { bareFallback: false });
363
87
  if (!rId)
364
88
  return null;
365
89
  return relsMap.get(rId) ?? null;
@@ -510,18 +234,74 @@ function getFootnoteMarkersForParagraph(p, displayMap) {
510
234
  markers.sort((a, b) => b.offset - a.offset);
511
235
  return markers;
512
236
  }
237
+ /**
238
+ * Paragraph content that makes a text-empty paragraph meaningful on its own:
239
+ * an endnote or comment anchored to the paragraph (the comment range markers
240
+ * are what `getComments` resolves `anchored_paragraph_id`/`end_paragraph_id`
241
+ * from, so dropping their paragraph leaves a dangling anchor ID no node_ids
242
+ * probe can resolve), or embedded visual content (DrawingML drawing, VML
243
+ * picture, embedded object). Dropping such a paragraph from the document view
244
+ * severs the anchored note/comment from every read surface and silently
245
+ * hides images.
246
+ *
247
+ * Footnote references are handled separately via the display map so their
248
+ * [^N] markers render; the shapes here only need the node to exist.
249
+ * @see #383
250
+ */
251
+ const ANCHORING_CONTENT = [
252
+ W.endnoteReference,
253
+ W.commentReference,
254
+ W.commentRangeStart,
255
+ W.commentRangeEnd,
256
+ W.drawing,
257
+ W.pict,
258
+ W.object,
259
+ ];
260
+ /**
261
+ * True when `el` sits inside a `w:del` or `w:moveFrom` revision wrapper below
262
+ * the paragraph. Deleted/moved-from content is invisible to the view's text
263
+ * extraction (`getParagraphText` reads `w:t`, never `w:delText`), so an
264
+ * anchor that only survives inside a tracked deletion — e.g. the
265
+ * `w:commentReference` a tracked comment-delete leaves under `w:del` — must
266
+ * not resurrect its paragraph as a blank visible node.
267
+ */
268
+ function isInsideRemovedRevisionWrapper(el, paragraph) {
269
+ let cur = el.parentNode;
270
+ while (cur && cur !== paragraph) {
271
+ if (cur.namespaceURI === OOXML.W_NS && (cur.localName === W.del || cur.localName === W.moveFrom)) {
272
+ return true;
273
+ }
274
+ cur = cur.parentNode;
275
+ }
276
+ return false;
277
+ }
278
+ function paragraphHasAnchoringContent(p) {
279
+ return ANCHORING_CONTENT.some((localName) => {
280
+ const els = p.getElementsByTagNameNS(OOXML.W_NS, localName);
281
+ for (let i = 0; i < els.length; i++) {
282
+ if (!isInsideRemovedRevisionWrapper(els.item(i), p))
283
+ return true;
284
+ }
285
+ return false;
286
+ });
287
+ }
513
288
  /**
514
289
  * Inject footnote markers into a text string at the given offsets.
515
290
  * Markers must be sorted descending by offset.
291
+ *
292
+ * Offsets are *visible*-character offsets (they count document text, not the inline
293
+ * formatting tags emitted by `emitFormattingTags`). When `text` carries formatting tags
294
+ * we therefore map each visible offset to a tag-aware insertion index, exactly as the
295
+ * comment-marker path does (`findTaggedTextInsertionIndex`). A naive `slice(offset)` would
296
+ * land the `[^n]` marker inside a tag or mid-word once formatting is present.
516
297
  */
517
298
  function injectFootnoteMarkers(text, markers) {
518
299
  if (markers.length === 0)
519
300
  return text;
520
301
  let result = text;
521
302
  for (const { offset, marker } of markers) {
522
- // Clamp offset to text length
523
- const pos = Math.min(offset, result.length);
524
- result = result.slice(0, pos) + marker + result.slice(pos);
303
+ const insertionIndex = findTaggedTextInsertionIndex(result, offset);
304
+ result = result.slice(0, insertionIndex) + marker + result.slice(insertionIndex);
525
305
  }
526
306
  return result;
527
307
  }
@@ -544,7 +324,7 @@ export function buildNodesForDocumentView(params) {
544
324
  const allBodyRuns = [];
545
325
  if (showFormatting) {
546
326
  for (const { p } of paragraphs) {
547
- const paraPPr = p.getElementsByTagNameNS(OOXML.W_NS, W.pPr).item(0);
327
+ const paraPPr = getFirstChild(p, OOXML.W_NS, W.pPr);
548
328
  const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
549
329
  const runs = buildAnnotatedRuns({
550
330
  p,
@@ -593,20 +373,31 @@ export function buildNodesForDocumentView(params) {
593
373
  const nodes = [];
594
374
  for (let idx = 0; idx < paragraphs.length; idx++) {
595
375
  const { id, p, tableContext } = paragraphs[idx];
596
- const paraPPr = p.getElementsByTagNameNS(OOXML.W_NS, W.pPr).item(0);
376
+ const paraPPr = getFirstChild(p, OOXML.W_NS, W.pPr);
597
377
  const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
598
378
  // Visible clean text (field codes stripped).
599
379
  const fullText = getParagraphText(p).replace(/\r/g, '').replace(/\n/g, '').trim();
600
- // Preserve empty table cell paragraphs for structural completeness.
601
- if (!fullText && !tableContext)
380
+ // Preserve empty table cell paragraphs for structural completeness, and
381
+ // text-empty paragraphs that carry anchoring content — a visible footnote
382
+ // reference (its [^N] marker renders via the injection pass below), an
383
+ // endnote reference, a comment reference or comment range marker, or an
384
+ // embedded drawing/picture/object. Dropping those loses the anchored
385
+ // note/comment/image from every rendering of the document view. Anchors
386
+ // that survive only inside a tracked deletion don't count, and paragraphs
387
+ // that are empty for spacing only are still skipped.
388
+ // @see #185, #383
389
+ if (!fullText &&
390
+ !tableContext &&
391
+ getFootnoteMarkersForParagraph(p, footnoteDisplayMap).length === 0 &&
392
+ !paragraphHasAnchoringContent(p))
602
393
  continue;
603
394
  // Numbering (auto-numbered) info from numPr.
604
395
  let numId = null;
605
396
  let ilvl = null;
606
- const numPr = paraPPr ? paraPPr.getElementsByTagNameNS(OOXML.W_NS, W.numPr).item(0) : null;
397
+ const numPr = paraPPr ? getFirstChild(paraPPr, OOXML.W_NS, W.numPr) : null;
607
398
  if (numPr) {
608
- const numIdEl = numPr.getElementsByTagNameNS(OOXML.W_NS, W.numId).item(0);
609
- const ilvlEl = numPr.getElementsByTagNameNS(OOXML.W_NS, W.ilvl).item(0);
399
+ const numIdEl = getFirstChild(numPr, OOXML.W_NS, W.numId);
400
+ const ilvlEl = getFirstChild(numPr, OOXML.W_NS, W.ilvl);
610
401
  const numIdVal = numIdEl ? getWAttr(numIdEl, 'val') : null;
611
402
  const ilvlVal = ilvlEl ? getWAttr(ilvlEl, 'val') : null;
612
403
  if (numIdVal)
@@ -649,7 +440,13 @@ export function buildNodesForDocumentView(params) {
649
440
  let headerFormatting = null;
650
441
  let headerCharCount = 0;
651
442
  try {
652
- const hdr = detectRunInHeader({ paragraph: p, paragraphPPr: paraPPr ?? null, paragraphStyleId: paraFmt.styleId, styles: stylesModel });
443
+ // Skip in-table run-in header detection — table cells are key/value
444
+ // layout and a bold prefix is a label, not a section heading.
445
+ // Mirrors the !tableContext gates on detectTitleCapsCentered and
446
+ // extractHeaderInfo below.
447
+ const hdr = tableContext
448
+ ? null
449
+ : detectRunInHeader({ paragraph: p, paragraphPPr: paraPPr ?? null, paragraphStyleId: paraFmt.styleId, styles: stylesModel });
653
450
  if (hdr) {
654
451
  headerText = hdr.raw_text.replace(/[.:\-]+$/g, '');
655
452
  headerStyle = 'run_in_header';
@@ -660,11 +457,39 @@ export function buildNodesForDocumentView(params) {
660
457
  catch {
661
458
  // ignore
662
459
  }
663
- if (!headerText) {
460
+ // Centered ALL-CAPS bold standalone titles (e.g. an NVCA SPA's
461
+ // `SERIES […] PREFERRED STOCK PURCHASE AGREEMENT`). Runs before
462
+ // extractHeaderInfo so the documented precedence (title_caps_centered
463
+ // outranks short standalone title_bare/title_with_period/title_with_colon)
464
+ // matches the implementation. Only fires when run_in_header did not match
465
+ // AND the paragraph has no list label AND is not in a table cell. The
466
+ // try/catch is defensive against malformed XML in user documents.
467
+ if (!headerText && !labelString && !tableContext) {
468
+ try {
469
+ const titleHdr = detectTitleCapsCentered({
470
+ paragraph: p,
471
+ paragraphPPr: paraPPr ?? null,
472
+ paragraphStyleId: paraFmt.styleId,
473
+ alignment: paraFmt.alignment,
474
+ cleanTextNoLabel,
475
+ styles: stylesModel,
476
+ });
477
+ if (titleHdr) {
478
+ headerText = titleHdr.raw_text;
479
+ headerStyle = 'title_caps_centered';
480
+ headerFormatting = titleHdr.formatting;
481
+ }
482
+ }
483
+ catch {
484
+ // ignore: malformed run/style data falls through to extractHeaderInfo.
485
+ }
486
+ }
487
+ if (!headerText && !tableContext) {
664
488
  const fallback = extractHeaderInfo(cleanTextNoLabel);
665
489
  headerText = fallback.header_text;
666
490
  headerStyle = fallback.header_style;
667
491
  }
492
+ const heading = deriveHeading(paraFmt.styleId, cleanTextNoLabel, headerText, headerStyle, tableContext != null);
668
493
  // ── Tag emission ──
669
494
  let tagged = cleanTextNoLabel;
670
495
  if (showFormatting) {
@@ -778,6 +603,10 @@ export function buildNodesForDocumentView(params) {
778
603
  if (fnMarkers.length > 0) {
779
604
  tagged = injectFootnoteMarkers(tagged, fnMarkers);
780
605
  }
606
+ // Visible characters stripped from the raw paragraph head when extracting a manual
607
+ // label (label text + trailing whitespace). Auto-numbered paragraphs leave fullText
608
+ // intact, so this is 0 for them.
609
+ const visibleOffsetCorrection = isAutoNumbered ? 0 : Math.max(0, fullText.length - cleanTextNoLabel.length);
781
610
  const node = {
782
611
  id,
783
612
  list_label: labelString,
@@ -786,6 +615,7 @@ export function buildNodesForDocumentView(params) {
786
615
  text: tagged, // filled after header stripping at render time
787
616
  clean_text: cleanTextNoLabel,
788
617
  tagged_text: tagged,
618
+ visible_offset_correction: visibleOffsetCorrection > 0 ? visibleOffsetCorrection : undefined,
789
619
  list_metadata: {
790
620
  list_level: listLevel,
791
621
  label_type: labelType,
@@ -804,10 +634,13 @@ export function buildNodesForDocumentView(params) {
804
634
  header_formatting: headerFormatting,
805
635
  body_run_formatting: bodyFmt,
806
636
  };
637
+ if (heading)
638
+ node.heading = heading;
807
639
  if (tableContext)
808
640
  node.table_context = tableContext;
809
641
  nodes.push(node);
810
642
  }
643
+ suppressSignatureClusters(nodes);
811
644
  const styles = discoverStyles(nodes);
812
645
  for (const n of nodes) {
813
646
  const sid = styles.fingerprint_to_style.get(fingerprintKey(n.style_fingerprint));