odf-kit 0.9.9 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/README.md +49 -2
- package/dist/docx/body-reader.d.ts +54 -0
- package/dist/docx/body-reader.d.ts.map +1 -0
- package/dist/docx/body-reader.js +1124 -0
- package/dist/docx/body-reader.js.map +1 -0
- package/dist/docx/converter.d.ts +51 -0
- package/dist/docx/converter.d.ts.map +1 -0
- package/dist/docx/converter.js +799 -0
- package/dist/docx/converter.js.map +1 -0
- package/dist/docx/index.d.ts +81 -0
- package/dist/docx/index.d.ts.map +1 -0
- package/dist/docx/index.js +69 -0
- package/dist/docx/index.js.map +1 -0
- package/dist/docx/numbering.d.ts +42 -0
- package/dist/docx/numbering.d.ts.map +1 -0
- package/dist/docx/numbering.js +236 -0
- package/dist/docx/numbering.js.map +1 -0
- package/dist/docx/reader.d.ts +38 -0
- package/dist/docx/reader.d.ts.map +1 -0
- package/dist/docx/reader.js +512 -0
- package/dist/docx/reader.js.map +1 -0
- package/dist/docx/relationships.d.ts +27 -0
- package/dist/docx/relationships.d.ts.map +1 -0
- package/dist/docx/relationships.js +89 -0
- package/dist/docx/relationships.js.map +1 -0
- package/dist/docx/styles.d.ts +46 -0
- package/dist/docx/styles.d.ts.map +1 -0
- package/dist/docx/styles.js +383 -0
- package/dist/docx/styles.js.map +1 -0
- package/dist/docx/types.d.ts +266 -0
- package/dist/docx/types.d.ts.map +1 -0
- package/dist/docx/types.js +38 -0
- package/dist/docx/types.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
|
@@ -0,0 +1,1124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* odf-kit — DOCX body reader
|
|
3
|
+
*
|
|
4
|
+
* Walks the w:body element from word/document.xml and converts it into the
|
|
5
|
+
* DocxBodyElement[] model. Also used to parse footnote/endnote/header/footer
|
|
6
|
+
* XML bodies, which share the same paragraph/table structure.
|
|
7
|
+
*
|
|
8
|
+
* Design decisions:
|
|
9
|
+
* - Only explicitly present XML properties are stored — no style inheritance
|
|
10
|
+
* resolution. The converter walks the basedOn chain at conversion time.
|
|
11
|
+
* - Mid-paragraph page breaks split the paragraph:
|
|
12
|
+
* [DocxParagraph (before), DocxPageBreak, DocxParagraph (after)]
|
|
13
|
+
* - Field state machine runs on every paragraph — handles both w:hyperlink
|
|
14
|
+
* elements and complex HYPERLINK fields (w:fldChar / w:instrText).
|
|
15
|
+
* - w:pict (legacy VML) images are fully handled via v:imagedata + v:shape.
|
|
16
|
+
* - w:sdt content is always processed; checkboxes get special rendering.
|
|
17
|
+
* - Tracked changes: w:ins / w:moveTo children are processed;
|
|
18
|
+
* w:del / w:moveFrom children are skipped.
|
|
19
|
+
* - Two-pass bookmark resolution: pass 1 collects all bookmark id→name
|
|
20
|
+
* mappings across the entire body; pass 2 emits bookmark elements using
|
|
21
|
+
* the pre-built map. This correctly handles cross-paragraph bookmarks.
|
|
22
|
+
* - Spec reference: ECMA-376 5th edition Part 1, §17 (WordprocessingML).
|
|
23
|
+
* CT_Body child elements validated against §17.2.2.
|
|
24
|
+
*/
|
|
25
|
+
import { parseXml } from "../reader/xml-parser.js";
|
|
26
|
+
import { parseRPr, parsePPr } from "./styles.js";
|
|
27
|
+
import { DEFAULT_RUN_PROPS, DEFAULT_PARA_PROPS } from "./types.js";
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
// Public API
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
/**
 * Convert the XML of a body-like part into a DocxBodyElement array.
 *
 * Suitable for word/document.xml (rootTag="body"), word/header*.xml
 * (rootTag="hdr") and word/footer*.xml (rootTag="ftr"). Footnote and endnote
 * parts go through readNotes() instead.
 *
 * @param xml - Raw XML string.
 * @param rootTag - Local name of the container element, e.g. "body", "hdr", "ftr".
 * @param ctx - Shared reader context; ctx.bookmarkNames is filled in before
 *   the element walk starts.
 * @returns The elements produced by walkBodyChildren() for the container.
 */
export function readBody(xml, rootTag, ctx) {
    const { children } = findContainer(parseXml(xml), rootTag);
    // First pass: register every bookmark id→name pair found in this part.
    collectBookmarkNames(children, ctx.bookmarkNames);
    // Second pass: build the body model from the same children.
    return walkBodyChildren(children, ctx);
}
|
|
49
|
+
/**
 * Parse footnote/endnote XML into a Map<id, DocxNote>.
 * Handles both word/footnotes.xml and word/endnotes.xml.
 *
 * Only direct children of the parsed root whose local name equals noteTag
 * are considered. Notes without a w:id, and the pseudo-notes that merely
 * describe separators or continuation notices (w:type = "separator",
 * "continuationSeparator" or "continuationNotice"), are left out of the map.
 *
 * @param xml - Raw XML string.
 * @param noteTag - "footnote" | "endnote".
 * @param ctx - Shared reader context; ctx.bookmarkNames is extended with the
 *   bookmarks found inside the notes.
 * @returns Map from the note's w:id to { id, body }.
 */
export function readNotes(xml, noteTag, ctx) {
    // ST_FtnEdn values that do not represent user-authored note content.
    const pseudoNoteTypes = new Set(["separator", "continuationSeparator", "continuationNotice"]);
    const root = parseXml(xml);
    const isNote = (node) => node.type === "element" && localName(node.tag) === noteTag;
    // Pass 1: collect all bookmark names across all notes in this file, so
    // bookmark ends can be resolved regardless of which note they sit in.
    for (const child of root.children) {
        if (isNote(child))
            collectBookmarkNames(child.children, ctx.bookmarkNames);
    }
    // Pass 2: parse each real note.
    const map = new Map();
    for (const child of root.children) {
        if (!isNote(child))
            continue;
        const id = child.attrs["w:id"];
        if (!id || pseudoNoteTypes.has(child.attrs["w:type"]))
            continue;
        map.set(id, { id, body: walkBodyChildren(child.children, ctx) });
    }
    return map;
}
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// Pass 1: bookmark name collection (recursive over entire XML tree)
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
/**
 * Build the id→name table for every w:bookmarkStart found anywhere below
 * the given nodes (depth-first, document order).
 *
 * Runs ahead of the main walk because w:bookmarkEnd carries only an id; with
 * the table in place an end marker can be named no matter which paragraph
 * the matching start lives in.
 *
 * @param children - Nodes to scan; non-element nodes are ignored.
 * @param map - Map that receives id→name entries (later entries overwrite
 *   earlier ones with the same id).
 */
function collectBookmarkNames(children, map) {
    const visit = (nodes) => {
        for (const node of nodes) {
            if (node.type !== "element")
                continue;
            if (localName(node.tag) === "bookmarkStart") {
                const id = node.attrs["w:id"];
                const name = node.attrs["w:name"];
                // Both attributes are required to record a mapping.
                if (id && name)
                    map.set(id, name);
            }
            // Descend into anything that has content of its own.
            if (node.children.length > 0)
                visit(node.children);
        }
    };
    visit(children);
}
|
|
110
|
+
// ---------------------------------------------------------------------------
|
|
111
|
+
// Container finder
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
/**
 * Locate the element whose local name is containerTag.
 *
 * The root itself is checked first, then its direct element children in
 * order. When nothing matches, the root is returned as a fallback.
 */
function findContainer(root, containerTag) {
    const isContainer = (node) => localName(node.tag) === containerTag;
    if (isContainer(root))
        return root;
    for (const candidate of root.children) {
        if (candidate.type === "element" && isContainer(candidate))
            return candidate;
    }
    // Nothing matched — fall back to the root.
    return root;
}
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
// Pass 2: body element walker
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
/**
 * Turn a list of block-level XML nodes into DocxBodyElement values.
 *
 * Elements that produce output:
 *   - p                      → readParagraph() (may yield several elements)
 *   - tbl                    → readTable()
 *   - sdt                    → readBlockSdt()
 *   - customXml, ins, moveTo → unwrapped; their children are walked recursively
 *   - bookmarkStart / bookmarkEnd → a paragraph holding just the bookmark marker
 *   - altChunk               → nothing emitted, a warning is recorded
 *
 * Everything else produces nothing and no warning. That includes tracked
 * deletions (del, moveFrom), sectPr, proofErr, permStart/permEnd, comment
 * range markers, the customXml*Range* and move*Range* markers, oMath,
 * oMathPara, and any unknown element.
 */
function walkBodyChildren(children, ctx) {
    const elements = [];
    // A body-level bookmark marker has no paragraph of its own, so it is
    // wrapped in a minimal one. An id without a registered name falls back
    // to the id itself.
    const pushBookmark = (node, position) => {
        const id = node.attrs["w:id"];
        const name = id ? (ctx.bookmarkNames.get(id) ?? id) : null;
        if (name) {
            elements.push(makeSingleRunParagraph({ type: "bookmark", name, position }));
        }
    };
    for (const node of children) {
        if (node.type !== "element")
            continue;
        const tag = localName(node.tag);
        if (tag === "p") {
            elements.push(...readParagraph(node, ctx));
        }
        else if (tag === "tbl") {
            elements.push(readTable(node, ctx));
        }
        else if (tag === "sdt") {
            elements.push(...readBlockSdt(node, ctx));
        }
        else if (tag === "customXml" || tag === "ins" || tag === "moveTo") {
            // Wrapper elements: custom XML, tracked insertion, move destination.
            elements.push(...walkBodyChildren(node.children, ctx));
        }
        else if (tag === "bookmarkStart") {
            pushBookmark(node, "start");
        }
        else if (tag === "bookmarkEnd") {
            pushBookmark(node, "end");
        }
        else if (tag === "altChunk") {
            // Imported external content (HTML, RTF, etc.) cannot be converted.
            ctx.warnings.push("w:altChunk (imported external content) is not supported and was skipped");
        }
        // Any other tag is skipped silently (see the list in the doc comment).
    }
    return elements;
}
|
|
217
|
+
/**
 * Build a minimal paragraph whose only content is the given inline element.
 * Body-level bookmarkStart/End markers use this because they have no
 * paragraph container of their own.
 *
 * @param inline - The inline element to wrap.
 * @returns A paragraph with no style, no heading level, a fresh copy of the
 *   default paragraph properties, and `inline` as its single run.
 */
function makeSingleRunParagraph(inline) {
    const props = { ...DEFAULT_PARA_PROPS };
    const runs = [inline];
    return { type: "paragraph", headingLevel: null, styleId: null, props, runs };
}
|
|
230
|
+
// ---------------------------------------------------------------------------
|
|
231
|
+
// Paragraph
|
|
232
|
+
// ---------------------------------------------------------------------------
|
|
233
|
+
/**
 * Read a w:p element.
 *
 * The result is an array because page breaks inside the paragraph split it:
 *   [DocxParagraph (before), DocxPageBreak, DocxParagraph (after)]
 * and because w:pageBreakBefore puts an extra DocxPageBreak in front.
 * All inline content goes through processInlineChildren(), i.e. through the
 * field state machine.
 */
function readParagraph(el, ctx) {
    // Only the first w:pPr child is consulted.
    let pPrEl = null;
    for (const child of el.children) {
        if (child.type === "element" && localName(child.tag) === "pPr") {
            pPrEl = child;
            break;
        }
    }
    // readPPr() also records the mid-document w:sectPr warning when needed.
    const { styleId, headingLevel, props: paraProps } = pPrEl !== null
        ? readPPr(pPrEl, ctx)
        : { styleId: null, headingLevel: null, props: { ...DEFAULT_PARA_PROPS } };
    const inlineItems = processInlineChildren(el.children, ctx);
    const results = splitOnPageBreaks(inlineItems, styleId, headingLevel, paraProps);
    // w:pageBreakBefore (spec §17.3.1.23): the paragraph starts on a new
    // page, expressed here as a leading DocxPageBreak.
    if (paraProps.pageBreakBefore && results.length > 0) {
        results.unshift({ type: "pageBreak" });
    }
    return results;
}
|
|
266
|
+
/**
 * Read a w:pPr element.
 *
 * - w:pStyle supplies styleId and, when ctx.styles knows the style, that
 *   style's headingLevel.
 * - w:outlineLvl with a value of 0–5 sets headingLevel to value + 1,
 *   replacing whatever the style supplied; other values leave it untouched.
 * - The remaining properties come from parsePPr(); anything it does not
 *   supply becomes null (pageBreakBefore becomes false).
 * - A w:sectPr inside the pPr is reported through ctx.warnings.
 *
 * @returns { styleId, headingLevel, props, hasSectPr }
 */
function readPPr(el, ctx) {
    let styleId = null;
    let headingLevel = null;
    for (const child of el.children) {
        if (child.type !== "element")
            continue;
        switch (localName(child.tag)) {
            case "pStyle": {
                styleId = child.attrs["w:val"] ?? null;
                const entry = styleId ? ctx.styles.get(styleId) : undefined;
                if (entry)
                    headingLevel = entry.headingLevel;
                break;
            }
            case "outlineLvl": {
                // A missing w:val is read as level 0.
                const level = Number(child.attrs["w:val"] ?? "0");
                if (level >= 0 && level <= 5)
                    headingLevel = level + 1;
                break;
            }
            default:
                break;
        }
    }
    const { props: parsed, hasSectPr } = parsePPr(el);
    if (hasSectPr) {
        ctx.warnings.push("Mid-document section break (w:sectPr in w:pPr) detected — " +
            "multi-section page layout changes are not fully supported; " +
            "final section layout is used for the whole document.");
    }
    // Property name → value used when parsePPr() did not supply one.
    const fallbacks = [
        ["alignment", null],
        ["pageBreakBefore", false],
        ["spaceBefore", null],
        ["spaceAfter", null],
        ["lineHeight", null],
        ["indentLeft", null],
        ["indentRight", null],
        ["indentFirstLine", null],
        ["list", null],
        ["borderBottom", null],
    ];
    const props = {};
    for (const [key, fallback] of fallbacks) {
        props[key] = parsed[key] ?? fallback;
    }
    return { styleId, headingLevel, props, hasSectPr };
}
|
|
309
|
+
/** True when the inline item is the internal "pageBreakMarker" placeholder. */
function isPageBreakMarker(el) {
    const { type } = el;
    return type === "pageBreakMarker";
}
|
|
312
|
+
/**
 * Cut a paragraph's inline items into paragraphs at every page break marker.
 *
 * Each stretch of items between markers becomes one DocxParagraph (even when
 * the stretch is empty), and each marker becomes a DocxPageBreak. A list
 * without markers therefore yields exactly one paragraph. Every paragraph
 * produced shares the same styleId, headingLevel and props object.
 */
function splitOnPageBreaks(allInline, styleId, headingLevel, props) {
    const results = [];
    let segment = [];
    const closeSegment = () => {
        results.push({ type: "paragraph", headingLevel, styleId, props, runs: segment });
        segment = [];
    };
    for (const item of allInline) {
        if (!isPageBreakMarker(item)) {
            segment.push(item);
            continue;
        }
        closeSegment();
        results.push({ type: "pageBreak" });
    }
    // The trailing segment is always emitted: it is the whole paragraph in
    // the common no-break case, or the content after the last break.
    closeSegment();
    return results;
}
|
|
344
|
+
/**
 * Convert paragraph-level child nodes into inline elements.
 *
 * One complex-field state object is created per call and shared by every
 * w:r processed directly here (including runs inside w:ins / w:moveTo), so a
 * w:fldChar begin…end sequence spread over several runs is resolved as one
 * field. Wrappers that are unwrapped recursively (smartTag, dir, bdo,
 * customXml) get a fresh state of their own.
 *
 * Handled tags: r, hyperlink, fldSimple, smartTag, dir, bdo, ins, moveTo,
 * bookmarkStart, bookmarkEnd, sdt, customXml. All other tags — del, moveFrom,
 * proofErr, permStart/permEnd, comment range markers, pPr, and unknown
 * elements — produce nothing.
 */
function processInlineChildren(children, ctx) {
    const results = [];
    const field = {
        active: false,
        instrText: "",
        displayRuns: [],
        phase: "before-separate",
    };
    // Bookmark marker; an id with no registered name falls back to the id.
    const addBookmark = (node, position) => {
        const id = node.attrs["w:id"];
        const name = id ? (ctx.bookmarkNames.get(id) ?? id) : null;
        if (name)
            results.push({ type: "bookmark", name, position });
    };
    // Transparent wrapper: process its children as if they were inline here.
    const unwrap = (node) => {
        results.push(...processInlineChildren(node.children, ctx));
    };
    // Tracked insertion / move destination: only direct w:r children are read.
    const readTrackedRuns = (node) => {
        for (const inner of node.children) {
            if (inner.type === "element" && localName(inner.tag) === "r") {
                processRunElement(inner, field, ctx, results);
            }
        }
    };
    // Simple field (ECMA-376 §17.16.19): the instruction is in w:instr and the
    // child runs are the display content. Only text runs are kept.
    const readSimpleField = (node) => {
        const instr = (node.attrs["w:instr"] ?? "").trim();
        const displayRuns = [];
        for (const fc of node.children) {
            if (fc.type !== "element" || localName(fc.tag) !== "r")
                continue;
            for (const item of readRun(fc, ctx)) {
                if (item.type === "run")
                    displayRuns.push(item);
            }
        }
        const resolved = resolveField({ active: true, instrText: instr, displayRuns, phase: "after-separate" }, ctx);
        if (resolved)
            results.push(resolved);
    };
    const handlers = new Map([
        ["r", (node) => { processRunElement(node, field, ctx, results); }],
        ["hyperlink", (node) => {
                const link = readHyperlink(node, ctx);
                if (link)
                    results.push(link);
            }],
        ["fldSimple", readSimpleField],
        // Semantic annotation wrapper (§17.5.1.9).
        ["smartTag", unwrap],
        // Bidirectional direction wrappers (§17.3.2.8 dir, §17.3.2.3 bdo).
        ["dir", unwrap],
        ["bdo", unwrap],
        ["customXml", unwrap],
        ["ins", readTrackedRuns],
        ["moveTo", readTrackedRuns],
        ["bookmarkStart", (node) => { addBookmark(node, "start"); }],
        ["bookmarkEnd", (node) => { addBookmark(node, "end"); }],
        ["sdt", (node) => { results.push(...readInlineSdt(node, ctx)); }],
    ]);
    for (const node of children) {
        if (node.type !== "element")
            continue;
        // Tags without a handler are intentionally ignored.
        handlers.get(localName(node.tag))?.(node);
    }
    // A field left open at the end of the list (malformed DOCX): keep the
    // display runs gathered so far rather than losing the text.
    if (field.active && field.displayRuns.length > 0) {
        ctx.warnings.push("Unclosed complex field at end of paragraph — display text recovered");
        results.push(...field.displayRuns);
    }
    return results;
}
|
|
462
|
+
/**
 * Process a single w:r element, routing it through the complex-field state
 * machine when necessary.
 *
 * - A run without any w:fldChar, seen while no field is open, is read with
 *   readRun() and appended to `results` unchanged.
 * - Otherwise the run's children are handled one by one:
 *     fldChar "begin"    → opens a field and clears the collected state
 *     fldChar "separate" → switches from instruction to display phase
 *     fldChar "end"      → resolves the field via resolveField(), appends
 *                          the result (if any) and closes the field
 *     instrText          → appended to the instruction (instruction phase only)
 *     t                  → collected as a display run with default run
 *                          properties (display phase only)
 *     anything else while no field is open → read as ordinary run content
 *
 * Ordinary content inside such a mixed run is read through a synthetic run
 * that carries the run's own w:rPr, so text placed before a field begins or
 * after it ends keeps its formatting.
 *
 * @param runEl - The w:r element.
 * @param field - Mutable field state shared across the runs of a paragraph.
 * @param ctx - Shared reader context.
 * @param results - Output array; inline elements are appended in place.
 */
function processRunElement(runEl, field, ctx, results) {
    // One scan finds both facts needed below: whether the run touches the
    // field state machine, and which element holds the run properties.
    let hasFldChar = false;
    let rPrEl = null;
    for (const child of runEl.children) {
        if (child.type !== "element")
            continue;
        const tag = localName(child.tag);
        if (tag === "fldChar") {
            hasFldChar = true;
        }
        else if (tag === "rPr" && rPrEl === null) {
            rPrEl = child;
        }
    }
    if (!hasFldChar && !field.active) {
        // Common case: normal run outside any field.
        results.push(...readRun(runEl, ctx));
        return;
    }
    for (const child of runEl.children) {
        if (child.type !== "element")
            continue;
        const tag = localName(child.tag);
        if (tag === "fldChar") {
            const fldCharType = child.attrs["w:fldCharType"];
            if (fldCharType === "begin") {
                field.active = true;
                field.instrText = "";
                field.displayRuns = [];
                field.phase = "before-separate";
            }
            else if (fldCharType === "separate") {
                field.phase = "after-separate";
            }
            else if (fldCharType === "end") {
                const resolved = resolveField(field, ctx);
                if (resolved)
                    results.push(resolved);
                field.active = false;
                field.instrText = "";
                field.displayRuns = [];
            }
        }
        else if (tag === "instrText" && field.active && field.phase === "before-separate") {
            for (const n of child.children) {
                if (n.type === "text")
                    field.instrText += n.text;
            }
        }
        else if (field.active && field.phase === "after-separate") {
            // Display phase: only w:t contributes; other children are dropped.
            if (tag === "t") {
                let text = "";
                for (const n of child.children) {
                    if (n.type === "text")
                        text += n.text;
                }
                if (text) {
                    field.displayRuns.push({ type: "run", text, props: { ...DEFAULT_RUN_PROPS } });
                }
            }
        }
        else if (!field.active) {
            // Ordinary content sharing a run with field markers. The rPr
            // element yields no output of its own; every other child is
            // read together with the rPr so its formatting is preserved.
            if (tag === "rPr")
                continue;
            const syntheticRun = {
                type: "element",
                tag: runEl.tag,
                attrs: runEl.attrs,
                children: rPrEl !== null ? [rPrEl, child] : [child],
            };
            results.push(...readRun(syntheticRun, ctx));
        }
    }
}
|
|
535
|
+
/**
 * Resolve a completed field (complex or simple) into one inline element.
 *
 * HYPERLINK fields become a hyperlink element wrapping the display runs:
 *   HYPERLINK "url"                → external link to url
 *   HYPERLINK \l "anchor"          → internal link to "#anchor"
 *   HYPERLINK "url" \l "anchor"    → external link to "url#anchor"
 * The \l switch is only recognised as a switch followed by its own quoted
 * argument, so a backslash-l sequence inside the URL is not mistaken for it.
 *
 * Every other field is replaced by its display text. When the display text
 * is spread over several runs, the runs are merged into a single run that
 * keeps the first run's properties, so no text is lost. Fields other than
 * PAGE additionally record an "Unrecognized field instruction" warning.
 *
 * @param field - Field state; instrText and displayRuns are read.
 * @param ctx - Shared reader context (only ctx.warnings is used).
 * @returns A hyperlink element, a run, or null when there is nothing to show.
 */
function resolveField(field, ctx) {
    const instr = field.instrText.trim();
    const displayRuns = field.displayRuns;
    const keyword = /HYPERLINK\s+/.exec(instr);
    if (keyword) {
        const args = instr.slice(keyword.index + keyword[0].length);
        // The first argument, when quoted, is the link target.
        const target = /^"([^"]+)"/.exec(args);
        // \l "name" designates a location (bookmark) in the target.
        const location = /(?:^|\s)\\l\s+"([^"]+)"/.exec(args);
        if (target || location) {
            const internal = !target;
            let url;
            if (internal) {
                url = "#" + location[1];
            }
            else if (location && !target[1].includes("#")) {
                url = target[1] + "#" + location[1];
            }
            else {
                url = target[1];
            }
            return {
                type: "hyperlink",
                url,
                internal,
                runs: displayRuns,
            };
        }
    }
    if (displayRuns.length === 0)
        return null;
    // PAGE is a known placeholder; anything else (NUMPAGES, REF, …) is
    // emitted as plain display text and reported.
    if (!/^\s*PAGE\s*$/.test(instr)) {
        ctx.warnings.push(`Unrecognized field instruction: "${instr.slice(0, 80).trim()}"`);
    }
    if (displayRuns.length === 1)
        return displayRuns[0];
    return {
        ...displayRuns[0],
        text: displayRuns.map((run) => run.text).join(""),
    };
}
|
|
567
|
+
// ---------------------------------------------------------------------------
|
|
568
|
+
// Run
|
|
569
|
+
// ---------------------------------------------------------------------------
|
|
570
|
+
/**
 * Read a w:r element and return its inline content in document order.
 *
 * The result is an array because one run can yield several items: text is
 * accumulated across consecutive w:t / hyphen children and emitted as a
 * single text run whenever a non-text item (break, tab, image, note
 * reference, symbol) interrupts it, and once more at the end.
 *
 * Run properties come from the first w:rPr child merged over
 * DEFAULT_RUN_PROPS; without one, a copy of the defaults is used. All text
 * runs produced from the same w:r share that one properties object.
 *
 * @param el - The w:r element.
 * @param ctx - Shared reader context (passed to the image readers; its
 *   warnings list receives a message for an unusable w:sym).
 * @returns Inline items: runs, line breaks, tabs, page break markers,
 *   images and footnote/endnote references.
 */
function readRun(el, ctx) {
    const results = [];
    let runProps = { ...DEFAULT_RUN_PROPS };
    for (const child of el.children) {
        if (child.type !== "element")
            continue;
        if (localName(child.tag) === "rPr") {
            runProps = mergeRunProps(DEFAULT_RUN_PROPS, parseRPr(child));
            break;
        }
    }
    let pendingText = "";
    // Emit the text gathered so far (if any) as one run.
    const flushText = () => {
        if (pendingText.length > 0) {
            results.push({ type: "run", text: pendingText, props: runProps });
            pendingText = "";
        }
    };
    // Emit a non-text item, keeping it after any text that preceded it.
    const emit = (item) => {
        flushText();
        results.push(item);
    };
    for (const child of el.children) {
        if (child.type !== "element")
            continue;
        switch (localName(child.tag)) {
            case "t":
                for (const n of child.children) {
                    if (n.type === "text")
                        pendingText += n.text;
                }
                break;
            case "br": {
                // Page and column breaks both become page break markers; any
                // other type (or none) is a soft line break.
                const brType = child.attrs["w:type"];
                if (brType === "page" || brType === "column") {
                    emit({ type: "pageBreakMarker" });
                }
                else {
                    emit({ type: "lineBreak" });
                }
                break;
            }
            case "cr":
                emit({ type: "lineBreak" });
                break;
            case "tab":
            case "ptab":
                // ptab (absolute position tab, §17.3.3.23) is treated as a
                // regular tab for conversion purposes.
                emit({ type: "tab" });
                break;
            case "drawing": {
                flushText();
                const img = readDrawing(child, ctx);
                if (img)
                    results.push(img);
                break;
            }
            case "pict": {
                flushText();
                const img = readPict(child, ctx);
                if (img)
                    results.push(img);
                break;
            }
            case "footnoteReference": {
                flushText();
                const id = child.attrs["w:id"];
                if (id)
                    results.push({ type: "footnoteReference", id });
                break;
            }
            case "endnoteReference": {
                flushText();
                const id = child.attrs["w:id"];
                if (id)
                    results.push({ type: "endnoteReference", id });
                break;
            }
            case "sym": {
                // Symbol character — w:char is a code point written in hex.
                flushText();
                const charCode = child.attrs["w:char"];
                if (charCode) {
                    const codePoint = Number.parseInt(charCode, 16);
                    // String.fromCodePoint throws RangeError for NaN, negative
                    // or > 0x10FFFF values; a malformed attribute must not
                    // abort the whole conversion.
                    if (Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff) {
                        results.push({ type: "run", text: String.fromCodePoint(codePoint), props: runProps });
                    }
                    else {
                        ctx.warnings.push(`w:sym has an invalid w:char value "${charCode}" — symbol skipped`);
                    }
                }
                break;
            }
            case "noBreakHyphen":
                pendingText += "\u2011"; // non-breaking hyphen
                break;
            case "softHyphen":
                pendingText += "\u00AD"; // soft hyphen
                break;
            case "lastRenderedPageBreak":
                // Word-inserted rendering hint — treated as a page break for
                // fidelity.
                // NOTE(review): this splits the paragraph wherever Word last
                // paginated; confirm that is the intended behaviour.
                emit({ type: "pageBreakMarker" });
                break;
            default:
                // rPr was consumed above; fldChar / instrText belong to the
                // field state machine in the caller; other low-level run
                // elements (rPrChange, annotationRef, etc.) are skipped
                // silently.
                break;
        }
    }
    flushText();
    return results;
}
|
|
697
|
+
// ---------------------------------------------------------------------------
|
|
698
|
+
// Hyperlink (simple — w:hyperlink element)
|
|
699
|
+
// ---------------------------------------------------------------------------
|
|
700
|
+
/**
 * Read a w:hyperlink element.
 *
 * Target resolution:
 *   - r:id resolving to an external relationship → the relationship target;
 *     when w:anchor is also present it is appended as "#anchor" (unless the
 *     target already contains a fragment).
 *   - r:id resolving to a non-external relationship → "#" + target, internal.
 *   - no usable relationship target, but w:anchor present → "#anchor",
 *     internal. This also covers an r:id that is not in ctx.relationships.
 *   - otherwise → null (the hyperlink is dropped).
 *
 * Only text runs from direct w:r children are kept as the link's content.
 *
 * @param el - The w:hyperlink element.
 * @param ctx - Shared reader context; ctx.relationships is consulted.
 * @returns { type: "hyperlink", url, internal, runs } or null.
 */
function readHyperlink(el, ctx) {
    const rId = el.attrs["r:id"];
    const anchor = el.attrs["w:anchor"];
    let url = "";
    let internal = false;
    const rel = rId ? ctx.relationships.get(rId) : undefined;
    if (rel) {
        internal = !rel.external;
        url = internal ? "#" + rel.target : rel.target;
        // With both r:id and w:anchor, the anchor names a location inside
        // the external target.
        if (!internal && url && anchor && !url.includes("#")) {
            url = url + "#" + anchor;
        }
    }
    if (!url && anchor) {
        url = "#" + anchor;
        internal = true;
    }
    if (!url)
        return null;
    const runs = [];
    for (const child of el.children) {
        if (child.type !== "element")
            continue;
        if (localName(child.tag) !== "r")
            continue;
        for (const item of readRun(child, ctx)) {
            if (item.type === "run")
                runs.push(item);
        }
    }
    return { type: "hyperlink", url, internal, runs };
}
|
|
735
|
+
// ---------------------------------------------------------------------------
|
|
736
|
+
// Modern drawing images (w:drawing → wp:inline / wp:anchor → a:blip)
|
|
737
|
+
// ---------------------------------------------------------------------------
|
|
738
|
+
/**
 * Convert a w:drawing element into an inline image record.
 *
 * Only the first wp:inline / wp:anchor child is examined. Its extent gives
 * the size (in EMU, converted to cm), docPr gives the alt text (descr, then
 * title), and the graphic subtree gives the image relationship id.
 *
 * @param el  The w:drawing element.
 * @param ctx Reader context; a warning is appended to `ctx.warnings` when
 *            no relationship id is found.
 * @returns An `inlineImage` record, or null when nothing usable is found.
 */
function readDrawing(el, ctx) {
    for (const container of el.children) {
        if (container.type !== "element") {
            continue;
        }
        const containerTag = localName(container.tag);
        if (!(containerTag === "inline" || containerTag === "anchor")) {
            continue;
        }
        let widthEmu = 0;
        let heightEmu = 0;
        let rId = null;
        let altText = null;
        for (const node of container.children) {
            if (node.type !== "element") {
                continue;
            }
            switch (localName(node.tag)) {
                case "extent":
                    widthEmu = Number(node.attrs["cx"] ?? "0");
                    heightEmu = Number(node.attrs["cy"] ?? "0");
                    break;
                case "docPr":
                    altText = node.attrs["descr"] ?? node.attrs["title"] ?? null;
                    break;
                case "graphic":
                    rId = findBlipRId(node);
                    break;
                default:
                    break;
            }
        }
        if (rId) {
            return {
                type: "inlineImage",
                rId,
                widthCm: emuToCm(widthEmu),
                heightCm: emuToCm(heightEmu),
                altText,
            };
        }
        ctx.warnings.push("w:drawing found but no image relationship could be resolved — image skipped");
        return null;
    }
    return null;
}
|
|
778
|
+
/**
 * Locate the image relationship id beneath an a:graphic element.
 *
 * Delegates to findBlipInGraphicData for the first graphicData child and
 * returns whatever that yields; later graphicData children are not examined.
 *
 * @param graphicEl The a:graphic element.
 * @returns The value found by findBlipInGraphicData, or null when there is
 *          no graphicData child.
 */
function findBlipRId(graphicEl) {
    for (const node of graphicEl.children) {
        if (node.type === "element" && localName(node.tag) === "graphicData") {
            return findBlipInGraphicData(node);
        }
    }
    return null;
}
|
|
789
|
+
/**
 * Walk graphicData → pic → blipFill → blip and return the blip's r:embed.
 *
 * The first blip reached decides the result: its r:embed value, or null if
 * the attribute is absent.
 *
 * @param el The a:graphicData element.
 * @returns The r:embed value, or null when no blip is reached.
 */
function findBlipInGraphicData(el) {
    const isElementNamed = (node, name) => node.type === "element" && localName(node.tag) === name;
    for (const pic of el.children) {
        if (!isElementNamed(pic, "pic")) {
            continue;
        }
        for (const fill of pic.children) {
            if (!isElementNamed(fill, "blipFill")) {
                continue;
            }
            for (const blip of fill.children) {
                if (isElementNamed(blip, "blip")) {
                    return blip.attrs["r:embed"] ?? null;
                }
            }
        }
    }
    return null;
}
|
|
811
|
+
// ---------------------------------------------------------------------------
|
|
812
|
+
// Legacy VML images (w:pict → v:shape → v:imagedata)
|
|
813
|
+
// ---------------------------------------------------------------------------
|
|
814
|
+
/**
 * Convert a legacy w:pict element into an inline image record.
 *
 * Only the first v:shape child is examined. Its style attribute supplies the
 * size and its v:imagedata child supplies the relationship id (r:id, then
 * r:href) and alt text (o:title). If several imagedata children exist, the
 * last one wins.
 *
 * @param el  The w:pict element.
 * @param ctx Reader context; a warning is appended to `ctx.warnings` when
 *            no relationship id is found.
 * @returns An `inlineImage` record, or null when nothing usable is found.
 */
function readPict(el, ctx) {
    for (const shape of el.children) {
        if (shape.type !== "element" || localName(shape.tag) !== "shape") {
            continue;
        }
        const { widthCm, heightCm } = parseVmlStyle(shape.attrs["style"] ?? "");
        let rId = null;
        let altText = null;
        for (const node of shape.children) {
            if (node.type !== "element" || localName(node.tag) !== "imagedata") {
                continue;
            }
            rId = node.attrs["r:id"] ?? node.attrs["r:href"] ?? null;
            altText = node.attrs["o:title"] ?? null;
        }
        if (rId) {
            return { type: "inlineImage", rId, widthCm, heightCm, altText };
        }
        ctx.warnings.push("w:pict found but no r:id on v:imagedata — image skipped");
        return null;
    }
    return null;
}
|
|
840
|
+
// Dimension patterns for VML style strings. The property name must start the
// string or follow ";" / whitespace so that e.g. "line-height" is not read as
// "height", and the number must be a single well-formed decimal so that
// malformed values fall back to the default instead of producing NaN.
const VML_WIDTH_RE = /(?:^|[;\s])width\s*:\s*(\d+\.?\d*|\.\d+)(pt|cm|mm|in|px)/i;
const VML_HEIGHT_RE = /(?:^|[;\s])height\s*:\s*(\d+\.?\d*|\.\d+)(pt|cm|mm|in|px)/i;
/**
 * Parse VML shape style string for width and height.
 * Handles pt, cm, mm, in, px units (case-insensitive). Each dimension falls
 * back to 2.54cm (1 inch) independently if it is missing or unparseable.
 *
 * @param style The raw `style` attribute value of a v:shape.
 * @returns `{ widthCm, heightCm }`, both finite numbers.
 */
function parseVmlStyle(style) {
    const DEFAULT_CM = 2.54;
    const toCm = (match) => {
        if (!match) {
            return DEFAULT_CM;
        }
        const cm = vmlUnitToCm(Number(match[1]), match[2].toLowerCase());
        return Number.isFinite(cm) ? cm : DEFAULT_CM;
    };
    return {
        widthCm: toCm(VML_WIDTH_RE.exec(style)),
        heightCm: toCm(VML_HEIGHT_RE.exec(style)),
    };
}
/**
 * Convert a length in the given unit to centimetres.
 *
 * @param value Numeric magnitude.
 * @param unit  One of "cm", "mm", "pt", "in", "px"; any other unit returns
 *              `value` unchanged.
 * @returns The length in centimetres.
 */
function vmlUnitToCm(value, unit) {
    switch (unit) {
        case "cm":
            return value;
        case "mm":
            return value / 10;
        case "pt":
            return (value / 72) * 2.54;
        case "in":
            return value * 2.54;
        case "px":
            return (value / 96) * 2.54; // assumes 96dpi
        default:
            return value;
    }
}
|
|
870
|
+
// ---------------------------------------------------------------------------
|
|
871
|
+
// Table
|
|
872
|
+
// ---------------------------------------------------------------------------
|
|
873
|
+
/**
 * Convert a w:tbl element into a table record.
 *
 * Column widths come from tblGrid/gridCol (w:w converted with twipsToCm).
 * Rows come from direct w:tr children and from w:tr children of a w:sdt's
 * content. Other children are ignored here.
 *
 * @param el  The w:tbl element.
 * @param ctx Reader context, passed through to readTableRow.
 * @returns `{ type: "table", columnWidths, rows }`.
 */
function readTable(el, ctx) {
    const columnWidths = [];
    const rows = [];
    for (const node of el.children) {
        if (node.type !== "element") {
            continue;
        }
        switch (localName(node.tag)) {
            case "tblGrid":
                for (const col of node.children) {
                    if (col.type === "element" && localName(col.tag) === "gridCol") {
                        columnWidths.push(twipsToCm(Number(col.attrs["w:w"] ?? "0")));
                    }
                }
                break;
            case "tr":
                rows.push(readTableRow(node, ctx));
                break;
            case "sdt": {
                // SDT wrapping one or more rows
                const wrapped = findSdtContent(node);
                if (!wrapped) {
                    break;
                }
                for (const inner of wrapped.children) {
                    if (inner.type === "element" && localName(inner.tag) === "tr") {
                        rows.push(readTableRow(inner, ctx));
                    }
                }
                break;
            }
            default:
                // tblPr and anything else is not read here
                break;
        }
    }
    return { type: "table", columnWidths, rows };
}
|
|
910
|
+
/**
 * Convert a w:tr element into a row record.
 *
 * Cells come from direct w:tc children and from w:tc children of a w:sdt's
 * content. Other children are ignored here.
 *
 * @param el  The w:tr element.
 * @param ctx Reader context, passed through to readTableCell.
 * @returns `{ cells }`.
 */
function readTableRow(el, ctx) {
    const cells = [];
    for (const node of el.children) {
        if (node.type !== "element") {
            continue;
        }
        switch (localName(node.tag)) {
            case "tc":
                cells.push(readTableCell(node, ctx));
                break;
            case "sdt": {
                const wrapped = findSdtContent(node);
                if (!wrapped) {
                    break;
                }
                for (const inner of wrapped.children) {
                    if (inner.type === "element" && localName(inner.tag) === "tc") {
                        cells.push(readTableCell(inner, ctx));
                    }
                }
                break;
            }
            default:
                // trPr and anything else is not read here
                break;
        }
    }
    return { cells };
}
|
|
935
|
+
/**
 * Convert a w:tc element into a cell record.
 *
 * Reads gridSpan, vMerge, shd and vAlign from the first w:tcPr child, then
 * hands all children to walkBodyChildren to build the cell body.
 *
 * @param el  The w:tc element.
 * @param ctx Reader context, passed through to walkBodyChildren.
 * @returns `{ colSpan, vMerge, backgroundColor, verticalAlign, body }` where
 *          colSpan is always an integer >= 1, vMerge is "restart",
 *          "continue" or null, backgroundColor is an upper-cased fill value
 *          or null, and verticalAlign is whatever normalizeVAlign returns.
 */
function readTableCell(el, ctx) {
    let colSpan = 1;
    let vMerge = null;
    let backgroundColor = null;
    let verticalAlign = null;
    // Only the first tcPr is consulted (the original note cites ECMA-376
    // §17.4.4: tcPr is the first child and appears once).
    for (const child of el.children) {
        if (child.type !== "element" || localName(child.tag) !== "tcPr") {
            continue;
        }
        for (const prop of child.children) {
            if (prop.type !== "element") {
                continue;
            }
            switch (localName(prop.tag)) {
                case "gridSpan": {
                    // A malformed value (non-numeric, fractional, zero or
                    // negative) must not leak NaN/0 into the cell; treat it
                    // as an unmerged cell.
                    const span = Number(prop.attrs["w:val"] ?? "1");
                    colSpan = Number.isInteger(span) && span >= 1 ? span : 1;
                    break;
                }
                case "vMerge":
                    // w:val="restart" = first cell; absent or other value = continuation
                    vMerge = prop.attrs["w:val"] === "restart" ? "restart" : "continue";
                    break;
                case "shd": {
                    const fill = prop.attrs["w:fill"];
                    if (fill && fill !== "auto") {
                        backgroundColor = fill.toUpperCase();
                    }
                    break;
                }
                case "vAlign":
                    verticalAlign = normalizeVAlign(prop.attrs["w:val"]);
                    break;
                default:
                    break;
            }
        }
        break;
    }
    // All children, tcPr included, go to walkBodyChildren. Per the original
    // note it skips tags it does not recognise — not verifiable from this block.
    const body = walkBodyChildren(el.children, ctx);
    return { colSpan, vMerge, backgroundColor, verticalAlign, body };
}
|
|
975
|
+
// ---------------------------------------------------------------------------
|
|
976
|
+
// Structured document tags (w:sdt)
|
|
977
|
+
// ---------------------------------------------------------------------------
|
|
978
|
+
/**
 * Convert a block-level w:sdt into body elements.
 *
 * A checkbox control becomes a single paragraph holding one run with "☑"
 * (checked) or "☐" (unchecked), using copies of the default paragraph and
 * run properties. Any other control is unwrapped: its sdtContent children
 * are walked as body content after warnUnknownSdtType has been called.
 *
 * @param el  The w:sdt element.
 * @param ctx Reader context.
 * @returns An array of body elements (empty when there is no sdtContent).
 */
function readBlockSdt(el, ctx) {
    const sdt = readSdtPr(el);
    if (sdt.checkboxState === null) {
        warnUnknownSdtType(sdt.controlType, ctx);
        const wrapped = findSdtContent(el);
        return wrapped ? walkBodyChildren(wrapped.children, ctx) : [];
    }
    const glyphRun = {
        type: "run",
        text: sdt.checkboxState ? "☑" : "☐",
        props: { ...DEFAULT_RUN_PROPS },
    };
    const paragraph = {
        type: "paragraph",
        headingLevel: null,
        styleId: null,
        props: { ...DEFAULT_PARA_PROPS },
        runs: [glyphRun],
    };
    return [paragraph];
}
|
|
999
|
+
/**
 * Convert an inline w:sdt into inline items.
 *
 * A checkbox control becomes a single run with "☑" (checked) or "☐"
 * (unchecked) and a copy of the default run properties. Any other control
 * is unwrapped: its sdtContent children go through processInlineChildren
 * after warnUnknownSdtType has been called.
 *
 * @param el  The w:sdt element.
 * @param ctx Reader context.
 * @returns An array of inline items (empty when there is no sdtContent).
 */
function readInlineSdt(el, ctx) {
    const sdt = readSdtPr(el);
    if (sdt.checkboxState === null) {
        warnUnknownSdtType(sdt.controlType, ctx);
        const wrapped = findSdtContent(el);
        return wrapped ? processInlineChildren(wrapped.children, ctx) : [];
    }
    const glyph = sdt.checkboxState ? "☑" : "☐";
    return [{ type: "run", text: glyph, props: { ...DEFAULT_RUN_PROPS } }];
}
|
|
1011
|
+
/**
 * Local names of w:sdtPr children that are treated as recognised control
 * types; controls of these types do not trigger an "unknown type" warning.
 */
const KNOWN_SDT_TYPES = new Set([
    "richText", "text", "date",
    "dropDownList", "comboBox", "picture",
    "docPart", "docPartObj", "docPartList",
    "citation", "bibliography", "group",
    "checkbox",
]);
|
|
1026
|
+
/**
 * Append a warning to ctx.warnings when controlType is non-null and is not
 * one of KNOWN_SDT_TYPES. Does nothing otherwise.
 *
 * @param controlType Control type name as reported by readSdtPr, or null.
 * @param ctx         Reader context whose `warnings` array is appended to.
 */
function warnUnknownSdtType(controlType, ctx) {
    if (controlType === null || KNOWN_SDT_TYPES.has(controlType)) {
        return;
    }
    ctx.warnings.push(`w:sdt control type "${controlType}" — content processed as plain text`);
}
|
|
1031
|
+
// sdtPr children that describe the control (metadata, binding, appearance)
// rather than naming its type; they never count as a control type.
const SDT_NON_CONTROL_TAGS = new Set([
    "alias",
    "tag",
    "id",
    "lock",
    "placeholder",
    "temporary",
    "showingPlcHdr",
    "dataBinding",
    "label",
    "tabIndex",
    "rPr",
    "color",
    "appearance",
]);
/**
 * Inspect the w:sdtPr child(ren) of a w:sdt and report its control type.
 *
 * Every property is scanned before deciding, so a non-type property that
 * precedes the type element cannot hide it:
 *   - a checkbox element wins immediately and yields its checked state;
 *   - otherwise the first tag found in KNOWN_SDT_TYPES is reported;
 *   - otherwise the first tag that is not in SDT_NON_CONTROL_TAGS is
 *     reported (the caller warns about it);
 *   - otherwise controlType is null.
 *
 * @param el The w:sdt element.
 * @returns `{ checkboxState, controlType }` — checkboxState is a boolean for
 *          checkbox controls and null for everything else.
 */
function readSdtPr(el) {
    let knownType = null;
    let unknownType = null;
    for (const child of el.children) {
        if (child.type !== "element" || localName(child.tag) !== "sdtPr") {
            continue;
        }
        for (const prop of child.children) {
            if (prop.type !== "element") {
                continue;
            }
            const tag = localName(prop.tag);
            if (tag === "checkbox") {
                let checked = false;
                for (const cb of prop.children) {
                    if (cb.type === "element" && localName(cb.tag) === "checked") {
                        // On/off values may be spelled "0"/"1" or
                        // "false"/"true"; a missing value counts as on.
                        const val = cb.attrs["w14:val"];
                        checked = val !== "0" && val !== "false";
                    }
                }
                return { checkboxState: checked, controlType: "checkbox" };
            }
            if (KNOWN_SDT_TYPES.has(tag)) {
                if (knownType === null) {
                    knownType = tag;
                }
            }
            else if (!SDT_NON_CONTROL_TAGS.has(tag) && unknownType === null) {
                unknownType = tag;
            }
        }
    }
    return { checkboxState: null, controlType: knownType ?? unknownType };
}
|
|
1074
|
+
/**
 * Return the first sdtContent child element of `el`, or null if there is none.
 *
 * @param el An element whose `children` are searched (direct children only).
 */
function findSdtContent(el) {
    for (const node of el.children) {
        if (node.type === "element" && localName(node.tag) === "sdtContent") {
            return node;
        }
    }
    return null;
}
|
|
1083
|
+
// ---------------------------------------------------------------------------
|
|
1084
|
+
// Unit conversions
|
|
1085
|
+
// ---------------------------------------------------------------------------
|
|
1086
|
+
/**
 * Convert English Metric Units to centimetres, rounded to 4 decimal places.
 * 914400 EMU make one inch; one inch is 2.54 cm.
 *
 * @param emu Length in EMU.
 * @returns Length in centimetres.
 */
function emuToCm(emu) {
    const inches = emu / 914400;
    const centimetres = inches * 2.54;
    return Number(centimetres.toFixed(4));
}
|
|
1090
|
+
/**
 * Convert twips to centimetres, rounded to 4 decimal places.
 * 1440 twips make one inch; one inch is 2.54 cm.
 *
 * @param twips Length in twips.
 * @returns Length in centimetres.
 */
function twipsToCm(twips) {
    const inches = twips / 1440;
    const centimetres = inches * 2.54;
    return Number(centimetres.toFixed(4));
}
|
|
1094
|
+
// ---------------------------------------------------------------------------
|
|
1095
|
+
// Run props merge
|
|
1096
|
+
// ---------------------------------------------------------------------------
|
|
1097
|
+
/**
 * Shallow-merge two run property objects into a new object.
 *
 * Starts from a copy of `base`, then applies every own enumerable property
 * of `override` whose value is not `undefined` (null values do override).
 * Neither argument is modified.
 *
 * @param base     Properties to start from.
 * @param override Properties that take precedence when defined.
 * @returns A new merged object.
 */
function mergeRunProps(base, override) {
    const merged = { ...base };
    for (const key of Object.keys(override)) {
        const value = override[key];
        if (value === undefined) {
            continue;
        }
        merged[key] = value;
    }
    return merged;
}
|
|
1105
|
+
// ---------------------------------------------------------------------------
|
|
1106
|
+
// Normalisation helpers
|
|
1107
|
+
// ---------------------------------------------------------------------------
|
|
1108
|
+
/**
 * Pass through the vertical alignment values "top", "center" and "bottom";
 * map anything else (including undefined) to null.
 *
 * @param val Raw alignment value.
 * @returns "top", "center", "bottom", or null.
 */
function normalizeVAlign(val) {
    const accepted = ["top", "center", "bottom"];
    return accepted.includes(val) ? val : null;
}
|
|
1120
|
+
/**
 * Strip the namespace prefix from a qualified tag name: everything up to and
 * including the first ":" is removed. Names without a colon are returned as is.
 *
 * @param tag Qualified or unqualified tag name.
 * @returns The part after the first colon, or the whole name.
 */
function localName(tag) {
    // indexOf yields -1 when there is no colon, so the slice starts at 0
    // and returns the whole name.
    const start = tag.indexOf(":") + 1;
    return tag.slice(start);
}
|
|
1124
|
+
//# sourceMappingURL=body-reader.js.map
|