@usejunior/docx-core 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.tsbuildinfo +1 -1
- package/dist/atomizer.d.ts +15 -1
- package/dist/atomizer.d.ts.map +1 -1
- package/dist/atomizer.js +37 -1
- package/dist/atomizer.js.map +1 -1
- package/dist/baselines/atomizer/documentReconstructor.d.ts.map +1 -1
- package/dist/baselines/atomizer/documentReconstructor.js +218 -90
- package/dist/baselines/atomizer/documentReconstructor.js.map +1 -1
- package/dist/baselines/atomizer/formattingFidelity.d.ts +99 -0
- package/dist/baselines/atomizer/formattingFidelity.d.ts.map +1 -0
- package/dist/baselines/atomizer/formattingFidelity.js +449 -0
- package/dist/baselines/atomizer/formattingFidelity.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts +37 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js +189 -0
- package/dist/baselines/atomizer/inPlaceModifier-bookmarks.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts +74 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.js +171 -0
- package/dist/baselines/atomizer/inPlaceModifier-containers.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts +88 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.js +326 -0
- package/dist/baselines/atomizer/inPlaceModifier-deletion.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts +85 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.js +402 -0
- package/dist/baselines/atomizer/inPlaceModifier-postprocess.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts +39 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.js +265 -0
- package/dist/baselines/atomizer/inPlaceModifier-presplit.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts +62 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.js +139 -0
- package/dist/baselines/atomizer/inPlaceModifier-shared.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts +189 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.d.ts.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.js +427 -0
- package/dist/baselines/atomizer/inPlaceModifier-wrappers.js.map +1 -0
- package/dist/baselines/atomizer/inPlaceModifier.d.ts +6 -290
- package/dist/baselines/atomizer/inPlaceModifier.d.ts.map +1 -1
- package/dist/baselines/atomizer/inPlaceModifier.js +23 -1828
- package/dist/baselines/atomizer/inPlaceModifier.js.map +1 -1
- package/dist/baselines/atomizer/pipeline.d.ts +76 -1
- package/dist/baselines/atomizer/pipeline.d.ts.map +1 -1
- package/dist/baselines/atomizer/pipeline.js +204 -27
- package/dist/baselines/atomizer/pipeline.js.map +1 -1
- package/dist/baselines/atomizer/trackChangesAcceptorAst.d.ts.map +1 -1
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js +56 -160
- package/dist/baselines/atomizer/trackChangesAcceptorAst.js.map +1 -1
- package/dist/compare-types.d.ts +151 -0
- package/dist/compare-types.d.ts.map +1 -0
- package/dist/compare-types.js +2 -0
- package/dist/compare-types.js.map +1 -0
- package/dist/core-types.d.ts +5 -1
- package/dist/core-types.d.ts.map +1 -1
- package/dist/core-types.js +5 -1
- package/dist/core-types.js.map +1 -1
- package/dist/footnotes.d.ts +8 -3
- package/dist/footnotes.d.ts.map +1 -1
- package/dist/footnotes.js +8 -3
- package/dist/footnotes.js.map +1 -1
- package/dist/index.d.ts +6 -150
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/integration/libreoffice-oracle.d.ts +41 -0
- package/dist/integration/libreoffice-oracle.d.ts.map +1 -0
- package/dist/integration/libreoffice-oracle.js +282 -0
- package/dist/integration/libreoffice-oracle.js.map +1 -0
- package/dist/primitives/accept_changes.d.ts +2 -2
- package/dist/primitives/accept_changes.d.ts.map +1 -1
- package/dist/primitives/accept_changes.js +24 -79
- package/dist/primitives/accept_changes.js.map +1 -1
- package/dist/primitives/comments.d.ts +12 -3
- package/dist/primitives/comments.d.ts.map +1 -1
- package/dist/primitives/comments.js +374 -97
- package/dist/primitives/comments.js.map +1 -1
- package/dist/primitives/content_fingerprint.d.ts +29 -0
- package/dist/primitives/content_fingerprint.d.ts.map +1 -0
- package/dist/primitives/content_fingerprint.js +63 -0
- package/dist/primitives/content_fingerprint.js.map +1 -0
- package/dist/primitives/document.d.ts +56 -15
- package/dist/primitives/document.d.ts.map +1 -1
- package/dist/primitives/document.js +303 -32
- package/dist/primitives/document.js.map +1 -1
- package/dist/primitives/document_view-comments.d.ts +18 -0
- package/dist/primitives/document_view-comments.d.ts.map +1 -0
- package/dist/primitives/document_view-comments.js +159 -0
- package/dist/primitives/document_view-comments.js.map +1 -0
- package/dist/primitives/document_view-headings.d.ts +45 -0
- package/dist/primitives/document_view-headings.d.ts.map +1 -0
- package/dist/primitives/document_view-headings.js +247 -0
- package/dist/primitives/document_view-headings.js.map +1 -0
- package/dist/primitives/document_view-styles.d.ts +11 -0
- package/dist/primitives/document_view-styles.d.ts.map +1 -0
- package/dist/primitives/document_view-styles.js +104 -0
- package/dist/primitives/document_view-styles.js.map +1 -0
- package/dist/primitives/document_view-toon.d.ts +37 -0
- package/dist/primitives/document_view-toon.d.ts.map +1 -0
- package/dist/primitives/document_view-toon.js +199 -0
- package/dist/primitives/document_view-toon.js.map +1 -0
- package/dist/primitives/document_view-types.d.ts +137 -0
- package/dist/primitives/document_view-types.d.ts.map +1 -0
- package/dist/primitives/document_view-types.js +2 -0
- package/dist/primitives/document_view-types.js.map +1 -0
- package/dist/primitives/document_view.d.ts +8 -106
- package/dist/primitives/document_view.d.ts.map +1 -1
- package/dist/primitives/document_view.js +134 -301
- package/dist/primitives/document_view.js.map +1 -1
- package/dist/primitives/dom-helpers.d.ts +9 -0
- package/dist/primitives/dom-helpers.d.ts.map +1 -1
- package/dist/primitives/dom-helpers.js +10 -1
- package/dist/primitives/dom-helpers.js.map +1 -1
- package/dist/primitives/footnotes.d.ts +4 -3
- package/dist/primitives/footnotes.d.ts.map +1 -1
- package/dist/primitives/footnotes.js +232 -44
- package/dist/primitives/footnotes.js.map +1 -1
- package/dist/primitives/formatting_tags.d.ts +6 -0
- package/dist/primitives/formatting_tags.d.ts.map +1 -1
- package/dist/primitives/formatting_tags.js +6 -1
- package/dist/primitives/formatting_tags.js.map +1 -1
- package/dist/primitives/index.d.ts +6 -0
- package/dist/primitives/index.d.ts.map +1 -1
- package/dist/primitives/index.js +5 -0
- package/dist/primitives/index.js.map +1 -1
- package/dist/primitives/layout.d.ts +4 -3
- package/dist/primitives/layout.d.ts.map +1 -1
- package/dist/primitives/layout.js +32 -3
- package/dist/primitives/layout.js.map +1 -1
- package/dist/primitives/merge_runs.d.ts +21 -3
- package/dist/primitives/merge_runs.d.ts.map +1 -1
- package/dist/primitives/merge_runs.js +32 -10
- package/dist/primitives/merge_runs.js.map +1 -1
- package/dist/primitives/namespaces.d.ts +6 -0
- package/dist/primitives/namespaces.d.ts.map +1 -1
- package/dist/primitives/namespaces.js +9 -0
- package/dist/primitives/namespaces.js.map +1 -1
- package/dist/primitives/reject_changes.d.ts +2 -2
- package/dist/primitives/reject_changes.d.ts.map +1 -1
- package/dist/primitives/reject_changes.js +24 -81
- package/dist/primitives/reject_changes.js.map +1 -1
- package/dist/primitives/semantic_tags.d.ts +7 -0
- package/dist/primitives/semantic_tags.d.ts.map +1 -1
- package/dist/primitives/semantic_tags.js +21 -3
- package/dist/primitives/semantic_tags.js.map +1 -1
- package/dist/primitives/serialize_html.d.ts +36 -0
- package/dist/primitives/serialize_html.d.ts.map +1 -0
- package/dist/primitives/serialize_html.js +393 -0
- package/dist/primitives/serialize_html.js.map +1 -0
- package/dist/primitives/serialize_markdown.d.ts +16 -0
- package/dist/primitives/serialize_markdown.d.ts.map +1 -0
- package/dist/primitives/serialize_markdown.js +300 -0
- package/dist/primitives/serialize_markdown.js.map +1 -0
- package/dist/primitives/serialize_plaintext.d.ts +15 -0
- package/dist/primitives/serialize_plaintext.d.ts.map +1 -0
- package/dist/primitives/serialize_plaintext.js +154 -0
- package/dist/primitives/serialize_plaintext.js.map +1 -0
- package/dist/primitives/styles.js +22 -22
- package/dist/primitives/styles.js.map +1 -1
- package/dist/primitives/tables.d.ts.map +1 -1
- package/dist/primitives/tables.js +13 -3
- package/dist/primitives/tables.js.map +1 -1
- package/dist/primitives/text.d.ts +2 -1
- package/dist/primitives/text.d.ts.map +1 -1
- package/dist/primitives/text.js +116 -12
- package/dist/primitives/text.js.map +1 -1
- package/dist/primitives/track-changes-emitter.d.ts +139 -0
- package/dist/primitives/track-changes-emitter.d.ts.map +1 -0
- package/dist/primitives/track-changes-emitter.js +241 -0
- package/dist/primitives/track-changes-emitter.js.map +1 -0
- package/dist/primitives/xml-helpers.d.ts +29 -0
- package/dist/primitives/xml-helpers.d.ts.map +1 -0
- package/dist/primitives/xml-helpers.js +35 -0
- package/dist/primitives/xml-helpers.js.map +1 -0
- package/dist/shared/ooxml/namespaces.d.ts +4 -1
- package/dist/shared/ooxml/namespaces.d.ts.map +1 -1
- package/dist/shared/ooxml/namespaces.js +4 -1
- package/dist/shared/ooxml/namespaces.js.map +1 -1
- package/package.json +7 -6
|
@@ -1,22 +1,26 @@
|
|
|
1
1
|
import { OOXML, W } from './namespaces.js';
|
|
2
|
+
import { getAttributeSafe, getFirstChild } from './xml-helpers.js';
|
|
2
3
|
import { getParagraphText, getParagraphRuns } from './text.js';
|
|
3
|
-
import { extractListLabel, stripListLabel
|
|
4
|
+
import { extractListLabel, stripListLabel } from './list_labels.js';
|
|
4
5
|
import { parseNumberingXml, computeListLabelForParagraph } from './numbering.js';
|
|
5
6
|
import { parseStylesXml, extractParagraphFormatting, extractEffectiveRunFormatting } from './styles.js';
|
|
6
7
|
import { HIGHLIGHT_TAG } from './semantic_tags.js';
|
|
7
8
|
import { computeModalBaseline, computeParagraphFontBaseline, emitFormattingTags, mergeAdjacentTags } from './formatting_tags.js';
|
|
8
9
|
import { isReservedFootnote } from './footnotes.js';
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
import { deriveHeading, detectRunInHeader, detectTitleCapsCentered, extractHeaderInfo, suppressSignatureClusters, } from './document_view-headings.js';
|
|
11
|
+
import { discoverStyles, fingerprintKey } from './document_view-styles.js';
|
|
12
|
+
import { findTaggedTextInsertionIndex } from './document_view-comments.js';
|
|
13
|
+
export { discoverStyles } from './document_view-styles.js';
|
|
14
|
+
export { INLINE_COMMENT_MARKER_RUNTIME, TOON_INLINE_TAG_RE, collectInlineCommentMarkers, tokenizeToonInline } from './document_view-comments.js';
|
|
15
|
+
export { collectTableMarkerInfo, formatTableMarker, formatToonCommentEndnoteLines, formatToonCommentLines, formatToonCommentsEndnotesBlock, formatToonDataLine, renderToon, renderToonWithCommentEndnotes, } from './document_view-toon.js';
|
|
12
16
|
function getWAttr(el, localName) {
|
|
13
|
-
return el
|
|
17
|
+
return getAttributeSafe(el, OOXML.W_NS, localName, 'w');
|
|
14
18
|
}
|
|
15
19
|
function runHighlightVal(run) {
|
|
16
|
-
const rPr = run
|
|
20
|
+
const rPr = getFirstChild(run, OOXML.W_NS, W.rPr);
|
|
17
21
|
if (!rPr)
|
|
18
22
|
return null;
|
|
19
|
-
const h = rPr
|
|
23
|
+
const h = getFirstChild(rPr, OOXML.W_NS, W.highlight);
|
|
20
24
|
if (!h)
|
|
21
25
|
return null;
|
|
22
26
|
const v = getWAttr(h, 'val');
|
|
@@ -46,284 +50,6 @@ function emitHighlightTagsFromParagraph(p) {
|
|
|
46
50
|
out.push(`</${HIGHLIGHT_TAG}>`);
|
|
47
51
|
return out.join('');
|
|
48
52
|
}
|
|
49
|
-
function fingerprintKey(fp) {
|
|
50
|
-
// Stable JSON-ish key used for Map lookups.
|
|
51
|
-
return `${fp.list_level}|${fp.left_indent_pt.toFixed(1)}|${fp.first_line_indent_pt.toFixed(1)}|${fp.style_name}|${fp.alignment}`;
|
|
52
|
-
}
|
|
53
|
-
/**
|
|
54
|
-
* v0.3: Compact style fingerprint token.
|
|
55
|
-
* Concatenates style name, list level, alignment, and indentation for token-efficient LLM context.
|
|
56
|
-
* Example: "Normal:L-1:LEFT:I0:H0"
|
|
57
|
-
*/
|
|
58
|
-
function computeFingerprintToken(fp, styleId) {
|
|
59
|
-
const name = styleId || fp.style_name || 'body';
|
|
60
|
-
const level = `L${fp.list_level}`;
|
|
61
|
-
const align = fp.alignment;
|
|
62
|
-
const indent = `I${Math.round(fp.left_indent_pt)}`;
|
|
63
|
-
const hanging = `H${Math.round(fp.first_line_indent_pt)}`;
|
|
64
|
-
return `${name}:${level}:${align}:${indent}:${hanging}`;
|
|
65
|
-
}
|
|
66
|
-
// Pattern-based header detection fallback (ported from Python ingestor._extract_header_info).
|
|
67
|
-
const HEADER_PATTERN = /^([A-Z][^.!?:]*(?:\s+[A-Z][^.!?:]*)*)([.:]?)(?:\s|$)/;
|
|
68
|
-
function extractHeaderInfo(cleanText) {
|
|
69
|
-
if (!cleanText || cleanText.length < 2)
|
|
70
|
-
return { header_text: null, header_style: null };
|
|
71
|
-
if (!/^[A-Z]/.test(cleanText))
|
|
72
|
-
return { header_text: null, header_style: null };
|
|
73
|
-
const stripped = cleanText.trim();
|
|
74
|
-
if (stripped.length <= SHORT_HEADER_MAX_LENGTH) {
|
|
75
|
-
if (stripped.endsWith('.'))
|
|
76
|
-
return { header_text: stripped.slice(0, -1), header_style: 'title_with_period' };
|
|
77
|
-
if (stripped.endsWith(':'))
|
|
78
|
-
return { header_text: stripped.slice(0, -1), header_style: 'title_with_colon' };
|
|
79
|
-
const words = stripped.split(/\s+/);
|
|
80
|
-
if (words.length <= 5)
|
|
81
|
-
return { header_text: stripped, header_style: 'title_bare' };
|
|
82
|
-
return { header_text: null, header_style: null };
|
|
83
|
-
}
|
|
84
|
-
const m = HEADER_PATTERN.exec(stripped);
|
|
85
|
-
if (!m)
|
|
86
|
-
return { header_text: null, header_style: null };
|
|
87
|
-
const headerText = (m[1] ?? '').trim();
|
|
88
|
-
const terminator = m[2] ?? '';
|
|
89
|
-
const remaining = stripped.slice(m[0].length);
|
|
90
|
-
if (!remaining || headerText.length > MAX_HEADER_TEXT_LENGTH)
|
|
91
|
-
return { header_text: null, header_style: null };
|
|
92
|
-
if (terminator === '.')
|
|
93
|
-
return { header_text: headerText, header_style: 'title_with_period' };
|
|
94
|
-
if (terminator === ':')
|
|
95
|
-
return { header_text: headerText, header_style: 'title_with_colon' };
|
|
96
|
-
return { header_text: headerText, header_style: 'title_bare' };
|
|
97
|
-
}
|
|
98
|
-
function detectRunInHeader(params) {
|
|
99
|
-
const { paragraph, paragraphPPr, paragraphStyleId, styles } = params;
|
|
100
|
-
const punct = new Set(['.', ':', '-']);
|
|
101
|
-
// Use visible runs only (field code text stripped in getParagraphRuns()).
|
|
102
|
-
const runs = getParagraphRuns(paragraph);
|
|
103
|
-
if (runs.length === 0)
|
|
104
|
-
return null;
|
|
105
|
-
// Group by run element, preserving order.
|
|
106
|
-
const orderedUniqueRuns = [];
|
|
107
|
-
const seen = new Set();
|
|
108
|
-
for (const tr of runs) {
|
|
109
|
-
if (!seen.has(tr.r)) {
|
|
110
|
-
seen.add(tr.r);
|
|
111
|
-
orderedUniqueRuns.push(tr.r);
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
let headerText = '';
|
|
115
|
-
let formatting = null;
|
|
116
|
-
let headerCharCount = 0;
|
|
117
|
-
for (const r of orderedUniqueRuns) {
|
|
118
|
-
const fmt = extractEffectiveRunFormatting({ run: r, paragraphPPr, paragraphStyleId, styles });
|
|
119
|
-
const isHeaderStyle = fmt.bold || fmt.underline;
|
|
120
|
-
if (!isHeaderStyle)
|
|
121
|
-
break;
|
|
122
|
-
// Accumulate run text.
|
|
123
|
-
const ts = Array.from(r.getElementsByTagNameNS(OOXML.W_NS, W.t));
|
|
124
|
-
for (const t of ts) {
|
|
125
|
-
const tc = t.textContent ?? '';
|
|
126
|
-
headerText += tc;
|
|
127
|
-
headerCharCount += tc.length;
|
|
128
|
-
}
|
|
129
|
-
if (!formatting)
|
|
130
|
-
formatting = { bold: fmt.bold, italic: fmt.italic, underline: fmt.underline };
|
|
131
|
-
}
|
|
132
|
-
const trimmed = headerText.trim();
|
|
133
|
-
if (!trimmed)
|
|
134
|
-
return null;
|
|
135
|
-
if (!punct.has(trimmed[trimmed.length - 1]))
|
|
136
|
-
return null;
|
|
137
|
-
if (!formatting)
|
|
138
|
-
return null;
|
|
139
|
-
return { raw_text: trimmed, formatting, headerCharCount };
|
|
140
|
-
}
|
|
141
|
-
function inferSemanticName(params) {
|
|
142
|
-
const { fp, nodes } = params;
|
|
143
|
-
// Find first label_type if present.
|
|
144
|
-
let labelType = null;
|
|
145
|
-
for (const n of nodes) {
|
|
146
|
-
if (n.list_metadata.label_type) {
|
|
147
|
-
labelType = n.list_metadata.label_type;
|
|
148
|
-
break;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
const listLevel = fp.list_level;
|
|
152
|
-
if (listLevel >= 0) {
|
|
153
|
-
if (listLevel === 0) {
|
|
154
|
-
if (labelType === LabelType.ARTICLE)
|
|
155
|
-
return { base_id: 'article', display_name: 'Article Heading' };
|
|
156
|
-
if (labelType === LabelType.SECTION)
|
|
157
|
-
return { base_id: 'section', display_name: 'Section Heading' };
|
|
158
|
-
if (labelType === LabelType.ROMAN)
|
|
159
|
-
return { base_id: 'roman_section', display_name: 'Roman Numeral Section' };
|
|
160
|
-
return { base_id: 'top_level', display_name: 'Top-Level List Item' };
|
|
161
|
-
}
|
|
162
|
-
if (listLevel === 1) {
|
|
163
|
-
if (labelType === LabelType.LETTER)
|
|
164
|
-
return { base_id: 'subsection', display_name: 'Subsection (a)/(A)' };
|
|
165
|
-
if (labelType === LabelType.NUMBER)
|
|
166
|
-
return { base_id: 'subsection_number', display_name: 'Numbered Subsection' };
|
|
167
|
-
if (labelType === LabelType.ROMAN)
|
|
168
|
-
return { base_id: 'subsection_roman', display_name: 'Roman Subsection' };
|
|
169
|
-
return { base_id: 'level_1', display_name: `Level ${listLevel} List Item` };
|
|
170
|
-
}
|
|
171
|
-
if (labelType === LabelType.ROMAN)
|
|
172
|
-
return { base_id: `level_${listLevel}_roman`, display_name: `Level ${listLevel} Roman` };
|
|
173
|
-
if (labelType === LabelType.LETTER)
|
|
174
|
-
return { base_id: `level_${listLevel}_letter`, display_name: `Level ${listLevel} Letter` };
|
|
175
|
-
return { base_id: `level_${listLevel}`, display_name: `Level ${listLevel} List Item` };
|
|
176
|
-
}
|
|
177
|
-
// Non-list.
|
|
178
|
-
const styleName = fp.style_name.toLowerCase().replace(/\s+/g, '_');
|
|
179
|
-
if (fp.left_indent_pt > 0)
|
|
180
|
-
return { base_id: 'indent_block', display_name: 'Indented Block' };
|
|
181
|
-
if (styleName.includes('heading') || styleName.includes('title'))
|
|
182
|
-
return { base_id: 'heading', display_name: 'Heading' };
|
|
183
|
-
if (styleName.includes('quote') || styleName.includes('block'))
|
|
184
|
-
return { base_id: 'block_quote', display_name: 'Block Quote' };
|
|
185
|
-
return { base_id: 'body', display_name: 'Body Text' };
|
|
186
|
-
}
|
|
187
|
-
export function discoverStyles(nodes) {
|
|
188
|
-
const groups = new Map();
|
|
189
|
-
for (const n of nodes) {
|
|
190
|
-
const key = fingerprintKey(n.style_fingerprint);
|
|
191
|
-
const g = groups.get(key);
|
|
192
|
-
if (g)
|
|
193
|
-
g.nodes.push(n);
|
|
194
|
-
else
|
|
195
|
-
groups.set(key, { fp: n.style_fingerprint, nodes: [n] });
|
|
196
|
-
}
|
|
197
|
-
const used = {};
|
|
198
|
-
const styles = new Map();
|
|
199
|
-
const fpToStyle = new Map();
|
|
200
|
-
for (const [fpKey, g] of groups.entries()) {
|
|
201
|
-
const { base_id, display_name } = inferSemanticName({ fp: g.fp, nodes: g.nodes });
|
|
202
|
-
let styleId = base_id;
|
|
203
|
-
if (used[base_id] !== undefined) {
|
|
204
|
-
used[base_id] += 1;
|
|
205
|
-
styleId = `${base_id}_${used[base_id]}`;
|
|
206
|
-
}
|
|
207
|
-
else {
|
|
208
|
-
used[base_id] = 0;
|
|
209
|
-
}
|
|
210
|
-
const median = g.nodes[Math.floor(g.nodes.length / 2)];
|
|
211
|
-
const info = {
|
|
212
|
-
style_id: styleId,
|
|
213
|
-
display_name,
|
|
214
|
-
fingerprint: g.fp,
|
|
215
|
-
example_node_id: median.id,
|
|
216
|
-
example_text: median.clean_text.slice(0, STYLE_EXAMPLE_TEXT_PREVIEW_LENGTH),
|
|
217
|
-
count: g.nodes.length,
|
|
218
|
-
dominant_alignment: g.fp.alignment,
|
|
219
|
-
};
|
|
220
|
-
styles.set(styleId, info);
|
|
221
|
-
fpToStyle.set(fpKey, styleId);
|
|
222
|
-
}
|
|
223
|
-
return { styles, fingerprint_to_style: fpToStyle };
|
|
224
|
-
}
|
|
225
|
-
function headerStripFromText(params) {
|
|
226
|
-
// Mirrors Python TOONRenderer header stripping.
|
|
227
|
-
const { header } = params;
|
|
228
|
-
let { text } = params;
|
|
229
|
-
if (!header)
|
|
230
|
-
return text;
|
|
231
|
-
const headerNorm = header.trim().toLowerCase();
|
|
232
|
-
const textLower = text.toLowerCase();
|
|
233
|
-
for (const punct of [':', '.', '-', ';', '']) {
|
|
234
|
-
const testPrefix = `${headerNorm}${punct}`;
|
|
235
|
-
if (textLower.startsWith(testPrefix)) {
|
|
236
|
-
text = text.slice(testPrefix.length).trimStart();
|
|
237
|
-
return text;
|
|
238
|
-
}
|
|
239
|
-
}
|
|
240
|
-
if (text.startsWith(header)) {
|
|
241
|
-
text = text.slice(header.length).replace(/^[.:\-;]+/, '').trimStart();
|
|
242
|
-
}
|
|
243
|
-
return text;
|
|
244
|
-
}
|
|
245
|
-
/**
|
|
246
|
-
* Format a single toon data line for one DocumentViewNode.
|
|
247
|
-
* Handles table-context-aware style (th/td) and header stripping.
|
|
248
|
-
*/
|
|
249
|
-
export function formatToonDataLine(n, options) {
|
|
250
|
-
let text = n.tagged_text;
|
|
251
|
-
if (n.header)
|
|
252
|
-
text = headerStripFromText({ header: n.header, text });
|
|
253
|
-
let header = n.header;
|
|
254
|
-
if (header && !text) {
|
|
255
|
-
text = header;
|
|
256
|
-
header = '';
|
|
257
|
-
}
|
|
258
|
-
const tc = n.table_context;
|
|
259
|
-
let style;
|
|
260
|
-
if (tc) {
|
|
261
|
-
style = tc.is_header_row
|
|
262
|
-
? `th(${tc.row_index},${tc.col_index})`
|
|
263
|
-
: `td(${tc.row_index},${tc.col_index})`;
|
|
264
|
-
}
|
|
265
|
-
else {
|
|
266
|
-
style = options?.compact
|
|
267
|
-
? computeFingerprintToken(n.style_fingerprint, n.style)
|
|
268
|
-
: n.style;
|
|
269
|
-
}
|
|
270
|
-
return `${n.id} | ${n.list_label} | ${header} | ${style} | ${text}`;
|
|
271
|
-
}
|
|
272
|
-
/**
|
|
273
|
-
* Collect table marker info (dimensions) from nodes for #TABLE markers.
|
|
274
|
-
* Column headers are NOT included in the marker — they appear once in the th() rows.
|
|
275
|
-
*/
|
|
276
|
-
export function collectTableMarkerInfo(nodes) {
|
|
277
|
-
const info = new Map();
|
|
278
|
-
for (const n of nodes) {
|
|
279
|
-
const tc = n.table_context;
|
|
280
|
-
if (!tc)
|
|
281
|
-
continue;
|
|
282
|
-
if (!info.has(tc.table_index)) {
|
|
283
|
-
info.set(tc.table_index, {
|
|
284
|
-
id: tc.table_id,
|
|
285
|
-
totalRows: tc.total_rows,
|
|
286
|
-
totalCols: tc.total_cols,
|
|
287
|
-
});
|
|
288
|
-
}
|
|
289
|
-
}
|
|
290
|
-
return info;
|
|
291
|
-
}
|
|
292
|
-
/**
|
|
293
|
-
* Format a #TABLE marker line from collected table info.
|
|
294
|
-
* Headers are omitted — they appear exactly once in the th(0,N) data rows.
|
|
295
|
-
*/
|
|
296
|
-
export function formatTableMarker(info) {
|
|
297
|
-
return `#TABLE ${info.id} | ${info.totalRows} rows × ${info.totalCols} cols`;
|
|
298
|
-
}
|
|
299
|
-
export function renderToon(nodes, options = {}) {
|
|
300
|
-
const lines = ['#SCHEMA id | list_label | header | style | text'];
|
|
301
|
-
// Pre-scan: collect table marker info for #TABLE lines
|
|
302
|
-
const tableInfo = collectTableMarkerInfo(nodes);
|
|
303
|
-
let currentTableIndex = null;
|
|
304
|
-
for (const n of nodes) {
|
|
305
|
-
const tc = n.table_context;
|
|
306
|
-
const nodeTableIndex = tc ? tc.table_index : null;
|
|
307
|
-
// Close previous table if we left it or moved to a different table
|
|
308
|
-
if (currentTableIndex !== null && nodeTableIndex !== currentTableIndex) {
|
|
309
|
-
lines.push('#END_TABLE');
|
|
310
|
-
currentTableIndex = null;
|
|
311
|
-
}
|
|
312
|
-
// Open new table if entering one
|
|
313
|
-
if (nodeTableIndex !== null && currentTableIndex === null) {
|
|
314
|
-
const info = tableInfo.get(nodeTableIndex);
|
|
315
|
-
if (info)
|
|
316
|
-
lines.push(formatTableMarker(info));
|
|
317
|
-
currentTableIndex = nodeTableIndex;
|
|
318
|
-
}
|
|
319
|
-
lines.push(formatToonDataLine(n, options));
|
|
320
|
-
}
|
|
321
|
-
// Close any open table at end
|
|
322
|
-
if (currentTableIndex !== null) {
|
|
323
|
-
lines.push('#END_TABLE');
|
|
324
|
-
}
|
|
325
|
-
return lines.join('\n');
|
|
326
|
-
}
|
|
327
53
|
export function buildDocumentView(params) {
|
|
328
54
|
const { documentXml, stylesXml, numberingXml, opts } = params;
|
|
329
55
|
const includeSemantic = opts?.include_semantic_tags ?? true;
|
|
@@ -334,7 +60,7 @@ export function buildDocumentView(params) {
|
|
|
334
60
|
void numberingModel;
|
|
335
61
|
const counters = new Map();
|
|
336
62
|
void counters;
|
|
337
|
-
const body = documentXml
|
|
63
|
+
const body = getFirstChild(documentXml, OOXML.W_NS, W.body);
|
|
338
64
|
if (!body)
|
|
339
65
|
return { nodes: [], styles: { styles: new Map(), fingerprint_to_style: new Map() } };
|
|
340
66
|
const paragraphs = Array.from(body.getElementsByTagNameNS(OOXML.W_NS, W.p));
|
|
@@ -357,9 +83,7 @@ function resolveRunHyperlinkUrl(runEl, relsMap) {
|
|
|
357
83
|
if (!parent || parent.localName !== W.hyperlink)
|
|
358
84
|
return null;
|
|
359
85
|
// r:id attribute can be namespaced or prefixed.
|
|
360
|
-
const rId = parent
|
|
361
|
-
parent.getAttribute('r:id') ??
|
|
362
|
-
null;
|
|
86
|
+
const rId = getAttributeSafe(parent, OOXML.R_NS, 'id', 'r', { bareFallback: false });
|
|
363
87
|
if (!rId)
|
|
364
88
|
return null;
|
|
365
89
|
return relsMap.get(rId) ?? null;
|
|
@@ -510,18 +234,74 @@ function getFootnoteMarkersForParagraph(p, displayMap) {
|
|
|
510
234
|
markers.sort((a, b) => b.offset - a.offset);
|
|
511
235
|
return markers;
|
|
512
236
|
}
|
|
237
|
+
/**
|
|
238
|
+
* Paragraph content that makes a text-empty paragraph meaningful on its own:
|
|
239
|
+
* an endnote or comment anchored to the paragraph (the comment range markers
|
|
240
|
+
* are what `getComments` resolves `anchored_paragraph_id`/`end_paragraph_id`
|
|
241
|
+
* from, so dropping their paragraph leaves a dangling anchor ID no node_ids
|
|
242
|
+
* probe can resolve), or embedded visual content (DrawingML drawing, VML
|
|
243
|
+
* picture, embedded object). Dropping such a paragraph from the document view
|
|
244
|
+
* severs the anchored note/comment from every read surface and silently
|
|
245
|
+
* hides images.
|
|
246
|
+
*
|
|
247
|
+
* Footnote references are handled separately via the display map so their
|
|
248
|
+
* [^N] markers render; the shapes here only need the node to exist.
|
|
249
|
+
* @see #383
|
|
250
|
+
*/
|
|
251
|
+
const ANCHORING_CONTENT = [
|
|
252
|
+
W.endnoteReference,
|
|
253
|
+
W.commentReference,
|
|
254
|
+
W.commentRangeStart,
|
|
255
|
+
W.commentRangeEnd,
|
|
256
|
+
W.drawing,
|
|
257
|
+
W.pict,
|
|
258
|
+
W.object,
|
|
259
|
+
];
|
|
260
|
+
/**
|
|
261
|
+
* True when `el` sits inside a `w:del` or `w:moveFrom` revision wrapper below
|
|
262
|
+
* the paragraph. Deleted/moved-from content is invisible to the view's text
|
|
263
|
+
* extraction (`getParagraphText` reads `w:t`, never `w:delText`), so an
|
|
264
|
+
* anchor that only survives inside a tracked deletion — e.g. the
|
|
265
|
+
* `w:commentReference` a tracked comment-delete leaves under `w:del` — must
|
|
266
|
+
* not resurrect its paragraph as a blank visible node.
|
|
267
|
+
*/
|
|
268
|
+
function isInsideRemovedRevisionWrapper(el, paragraph) {
|
|
269
|
+
let cur = el.parentNode;
|
|
270
|
+
while (cur && cur !== paragraph) {
|
|
271
|
+
if (cur.namespaceURI === OOXML.W_NS && (cur.localName === W.del || cur.localName === W.moveFrom)) {
|
|
272
|
+
return true;
|
|
273
|
+
}
|
|
274
|
+
cur = cur.parentNode;
|
|
275
|
+
}
|
|
276
|
+
return false;
|
|
277
|
+
}
|
|
278
|
+
function paragraphHasAnchoringContent(p) {
|
|
279
|
+
return ANCHORING_CONTENT.some((localName) => {
|
|
280
|
+
const els = p.getElementsByTagNameNS(OOXML.W_NS, localName);
|
|
281
|
+
for (let i = 0; i < els.length; i++) {
|
|
282
|
+
if (!isInsideRemovedRevisionWrapper(els.item(i), p))
|
|
283
|
+
return true;
|
|
284
|
+
}
|
|
285
|
+
return false;
|
|
286
|
+
});
|
|
287
|
+
}
|
|
513
288
|
/**
|
|
514
289
|
* Inject footnote markers into a text string at the given offsets.
|
|
515
290
|
* Markers must be sorted descending by offset.
|
|
291
|
+
*
|
|
292
|
+
* Offsets are *visible*-character offsets (they count document text, not the inline
|
|
293
|
+
* formatting tags emitted by `emitFormattingTags`). When `text` carries formatting tags
|
|
294
|
+
* we therefore map each visible offset to a tag-aware insertion index, exactly as the
|
|
295
|
+
* comment-marker path does (`findTaggedTextInsertionIndex`). A naive `slice(offset)` would
|
|
296
|
+
* land the `[^n]` marker inside a tag or mid-word once formatting is present.
|
|
516
297
|
*/
|
|
517
298
|
function injectFootnoteMarkers(text, markers) {
|
|
518
299
|
if (markers.length === 0)
|
|
519
300
|
return text;
|
|
520
301
|
let result = text;
|
|
521
302
|
for (const { offset, marker } of markers) {
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
result = result.slice(0, pos) + marker + result.slice(pos);
|
|
303
|
+
const insertionIndex = findTaggedTextInsertionIndex(result, offset);
|
|
304
|
+
result = result.slice(0, insertionIndex) + marker + result.slice(insertionIndex);
|
|
525
305
|
}
|
|
526
306
|
return result;
|
|
527
307
|
}
|
|
@@ -544,7 +324,7 @@ export function buildNodesForDocumentView(params) {
|
|
|
544
324
|
const allBodyRuns = [];
|
|
545
325
|
if (showFormatting) {
|
|
546
326
|
for (const { p } of paragraphs) {
|
|
547
|
-
const paraPPr = p
|
|
327
|
+
const paraPPr = getFirstChild(p, OOXML.W_NS, W.pPr);
|
|
548
328
|
const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
|
|
549
329
|
const runs = buildAnnotatedRuns({
|
|
550
330
|
p,
|
|
@@ -593,20 +373,31 @@ export function buildNodesForDocumentView(params) {
|
|
|
593
373
|
const nodes = [];
|
|
594
374
|
for (let idx = 0; idx < paragraphs.length; idx++) {
|
|
595
375
|
const { id, p, tableContext } = paragraphs[idx];
|
|
596
|
-
const paraPPr = p
|
|
376
|
+
const paraPPr = getFirstChild(p, OOXML.W_NS, W.pPr);
|
|
597
377
|
const paraFmt = extractParagraphFormatting(paraPPr ?? null, stylesModel);
|
|
598
378
|
// Visible clean text (field codes stripped).
|
|
599
379
|
const fullText = getParagraphText(p).replace(/\r/g, '').replace(/\n/g, '').trim();
|
|
600
|
-
// Preserve empty table cell paragraphs for structural completeness
|
|
601
|
-
|
|
380
|
+
// Preserve empty table cell paragraphs for structural completeness, and
|
|
381
|
+
// text-empty paragraphs that carry anchoring content — a visible footnote
|
|
382
|
+
// reference (its [^N] marker renders via the injection pass below), an
|
|
383
|
+
// endnote reference, a comment reference or comment range marker, or an
|
|
384
|
+
// embedded drawing/picture/object. Dropping those loses the anchored
|
|
385
|
+
// note/comment/image from every rendering of the document view. Anchors
|
|
386
|
+
// that survive only inside a tracked deletion don't count, and paragraphs
|
|
387
|
+
// that are empty for spacing only are still skipped.
|
|
388
|
+
// @see #185, #383
|
|
389
|
+
if (!fullText &&
|
|
390
|
+
!tableContext &&
|
|
391
|
+
getFootnoteMarkersForParagraph(p, footnoteDisplayMap).length === 0 &&
|
|
392
|
+
!paragraphHasAnchoringContent(p))
|
|
602
393
|
continue;
|
|
603
394
|
// Numbering (auto-numbered) info from numPr.
|
|
604
395
|
let numId = null;
|
|
605
396
|
let ilvl = null;
|
|
606
|
-
const numPr = paraPPr ? paraPPr
|
|
397
|
+
const numPr = paraPPr ? getFirstChild(paraPPr, OOXML.W_NS, W.numPr) : null;
|
|
607
398
|
if (numPr) {
|
|
608
|
-
const numIdEl = numPr
|
|
609
|
-
const ilvlEl = numPr
|
|
399
|
+
const numIdEl = getFirstChild(numPr, OOXML.W_NS, W.numId);
|
|
400
|
+
const ilvlEl = getFirstChild(numPr, OOXML.W_NS, W.ilvl);
|
|
610
401
|
const numIdVal = numIdEl ? getWAttr(numIdEl, 'val') : null;
|
|
611
402
|
const ilvlVal = ilvlEl ? getWAttr(ilvlEl, 'val') : null;
|
|
612
403
|
if (numIdVal)
|
|
@@ -649,7 +440,13 @@ export function buildNodesForDocumentView(params) {
|
|
|
649
440
|
let headerFormatting = null;
|
|
650
441
|
let headerCharCount = 0;
|
|
651
442
|
try {
|
|
652
|
-
|
|
443
|
+
// Skip in-table run-in header detection — table cells are key/value
|
|
444
|
+
// layout and a bold prefix is a label, not a section heading.
|
|
445
|
+
// Mirrors the !tableContext gates on detectTitleCapsCentered and
|
|
446
|
+
// extractHeaderInfo below.
|
|
447
|
+
const hdr = tableContext
|
|
448
|
+
? null
|
|
449
|
+
: detectRunInHeader({ paragraph: p, paragraphPPr: paraPPr ?? null, paragraphStyleId: paraFmt.styleId, styles: stylesModel });
|
|
653
450
|
if (hdr) {
|
|
654
451
|
headerText = hdr.raw_text.replace(/[.:\-]+$/g, '');
|
|
655
452
|
headerStyle = 'run_in_header';
|
|
@@ -660,11 +457,39 @@ export function buildNodesForDocumentView(params) {
|
|
|
660
457
|
catch {
|
|
661
458
|
// ignore
|
|
662
459
|
}
|
|
663
|
-
|
|
460
|
+
// Centered ALL-CAPS bold standalone titles (e.g. an NVCA SPA's
|
|
461
|
+
// `SERIES […] PREFERRED STOCK PURCHASE AGREEMENT`). Runs before
|
|
462
|
+
// extractHeaderInfo so the documented precedence (title_caps_centered
|
|
463
|
+
// outranks short standalone title_bare/title_with_period/title_with_colon)
|
|
464
|
+
// matches the implementation. Only fires when run_in_header did not match
|
|
465
|
+
// AND the paragraph has no list label AND is not in a table cell. The
|
|
466
|
+
// try/catch is defensive against malformed XML in user documents.
|
|
467
|
+
if (!headerText && !labelString && !tableContext) {
|
|
468
|
+
try {
|
|
469
|
+
const titleHdr = detectTitleCapsCentered({
|
|
470
|
+
paragraph: p,
|
|
471
|
+
paragraphPPr: paraPPr ?? null,
|
|
472
|
+
paragraphStyleId: paraFmt.styleId,
|
|
473
|
+
alignment: paraFmt.alignment,
|
|
474
|
+
cleanTextNoLabel,
|
|
475
|
+
styles: stylesModel,
|
|
476
|
+
});
|
|
477
|
+
if (titleHdr) {
|
|
478
|
+
headerText = titleHdr.raw_text;
|
|
479
|
+
headerStyle = 'title_caps_centered';
|
|
480
|
+
headerFormatting = titleHdr.formatting;
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
catch {
|
|
484
|
+
// ignore: malformed run/style data falls through to extractHeaderInfo.
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
if (!headerText && !tableContext) {
|
|
664
488
|
const fallback = extractHeaderInfo(cleanTextNoLabel);
|
|
665
489
|
headerText = fallback.header_text;
|
|
666
490
|
headerStyle = fallback.header_style;
|
|
667
491
|
}
|
|
492
|
+
const heading = deriveHeading(paraFmt.styleId, cleanTextNoLabel, headerText, headerStyle, tableContext != null);
|
|
668
493
|
// ── Tag emission ──
|
|
669
494
|
let tagged = cleanTextNoLabel;
|
|
670
495
|
if (showFormatting) {
|
|
@@ -778,6 +603,10 @@ export function buildNodesForDocumentView(params) {
|
|
|
778
603
|
if (fnMarkers.length > 0) {
|
|
779
604
|
tagged = injectFootnoteMarkers(tagged, fnMarkers);
|
|
780
605
|
}
|
|
606
|
+
// Visible characters stripped from the raw paragraph head when extracting a manual
|
|
607
|
+
// label (label text + trailing whitespace). Auto-numbered paragraphs leave fullText
|
|
608
|
+
// intact, so this is 0 for them.
|
|
609
|
+
const visibleOffsetCorrection = isAutoNumbered ? 0 : Math.max(0, fullText.length - cleanTextNoLabel.length);
|
|
781
610
|
const node = {
|
|
782
611
|
id,
|
|
783
612
|
list_label: labelString,
|
|
@@ -786,6 +615,7 @@ export function buildNodesForDocumentView(params) {
|
|
|
786
615
|
text: tagged, // filled after header stripping at render time
|
|
787
616
|
clean_text: cleanTextNoLabel,
|
|
788
617
|
tagged_text: tagged,
|
|
618
|
+
visible_offset_correction: visibleOffsetCorrection > 0 ? visibleOffsetCorrection : undefined,
|
|
789
619
|
list_metadata: {
|
|
790
620
|
list_level: listLevel,
|
|
791
621
|
label_type: labelType,
|
|
@@ -804,10 +634,13 @@ export function buildNodesForDocumentView(params) {
|
|
|
804
634
|
header_formatting: headerFormatting,
|
|
805
635
|
body_run_formatting: bodyFmt,
|
|
806
636
|
};
|
|
637
|
+
if (heading)
|
|
638
|
+
node.heading = heading;
|
|
807
639
|
if (tableContext)
|
|
808
640
|
node.table_context = tableContext;
|
|
809
641
|
nodes.push(node);
|
|
810
642
|
}
|
|
643
|
+
suppressSignatureClusters(nodes);
|
|
811
644
|
const styles = discoverStyles(nodes);
|
|
812
645
|
for (const n of nodes) {
|
|
813
646
|
const sid = styles.fingerprint_to_style.get(fingerprintKey(n.style_fingerprint));
|