@wonderwhy-er/desktop-commander 0.2.34 → 0.2.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tools/docx/builders/html-builder.d.ts +17 -0
- package/dist/tools/docx/builders/html-builder.js +92 -0
- package/dist/tools/docx/builders/index.d.ts +5 -0
- package/dist/tools/docx/builders/index.js +5 -0
- package/dist/tools/docx/builders/markdown-builder.d.ts +2 -0
- package/dist/tools/docx/builders/markdown-builder.js +260 -0
- package/dist/tools/docx/constants.d.ts +36 -0
- package/dist/tools/docx/constants.js +57 -0
- package/dist/tools/docx/converters/markdown-to-html.d.ts +17 -0
- package/dist/tools/docx/converters/markdown-to-html.js +111 -0
- package/dist/tools/docx/errors.d.ts +28 -0
- package/dist/tools/docx/errors.js +48 -0
- package/dist/tools/docx/extractors/images.d.ts +14 -0
- package/dist/tools/docx/extractors/images.js +40 -0
- package/dist/tools/docx/extractors/metadata.d.ts +14 -0
- package/dist/tools/docx/extractors/metadata.js +64 -0
- package/dist/tools/docx/extractors/sections.d.ts +14 -0
- package/dist/tools/docx/extractors/sections.js +61 -0
- package/dist/tools/docx/html.d.ts +17 -0
- package/dist/tools/docx/html.js +111 -0
- package/dist/tools/docx/index.d.ts +14 -0
- package/dist/tools/docx/index.js +16 -0
- package/dist/tools/docx/markdown.d.ts +84 -0
- package/dist/tools/docx/markdown.js +507 -0
- package/dist/tools/docx/operations/handlers/index.d.ts +39 -0
- package/dist/tools/docx/operations/handlers/index.js +152 -0
- package/dist/tools/docx/operations/html-manipulator.d.ts +24 -0
- package/dist/tools/docx/operations/html-manipulator.js +352 -0
- package/dist/tools/docx/operations/index.d.ts +14 -0
- package/dist/tools/docx/operations/index.js +61 -0
- package/dist/tools/docx/operations/operation-handlers.d.ts +3 -0
- package/dist/tools/docx/operations/operation-handlers.js +67 -0
- package/dist/tools/docx/operations/preprocessor.d.ts +14 -0
- package/dist/tools/docx/operations/preprocessor.js +44 -0
- package/dist/tools/docx/operations/xml-replacer.d.ts +9 -0
- package/dist/tools/docx/operations/xml-replacer.js +35 -0
- package/dist/tools/docx/operations.d.ts +13 -0
- package/dist/tools/docx/operations.js +13 -0
- package/dist/tools/docx/parsers/image-extractor.d.ts +18 -0
- package/dist/tools/docx/parsers/image-extractor.js +61 -0
- package/dist/tools/docx/parsers/index.d.ts +9 -0
- package/dist/tools/docx/parsers/index.js +9 -0
- package/dist/tools/docx/parsers/paragraph-parser.d.ts +2 -0
- package/dist/tools/docx/parsers/paragraph-parser.js +88 -0
- package/dist/tools/docx/parsers/table-parser.d.ts +9 -0
- package/dist/tools/docx/parsers/table-parser.js +72 -0
- package/dist/tools/docx/parsers/xml-parser.d.ts +25 -0
- package/dist/tools/docx/parsers/xml-parser.js +71 -0
- package/dist/tools/docx/parsers/zip-reader.d.ts +23 -0
- package/dist/tools/docx/parsers/zip-reader.js +52 -0
- package/dist/tools/docx/structure.d.ts +25 -0
- package/dist/tools/docx/structure.js +102 -0
- package/dist/tools/docx/styled-html-parser.d.ts +23 -0
- package/dist/tools/docx/styled-html-parser.js +1262 -0
- package/dist/tools/docx/types.d.ts +114 -0
- package/dist/tools/docx/types.js +8 -0
- package/dist/tools/docx/utils/escaping.d.ts +13 -0
- package/dist/tools/docx/utils/escaping.js +26 -0
- package/dist/tools/docx/utils/images.d.ts +9 -0
- package/dist/tools/docx/utils/images.js +26 -0
- package/dist/tools/docx/utils/index.d.ts +12 -0
- package/dist/tools/docx/utils/index.js +17 -0
- package/dist/tools/docx/utils/markdown.d.ts +13 -0
- package/dist/tools/docx/utils/markdown.js +32 -0
- package/dist/tools/docx/utils/paths.d.ts +15 -0
- package/dist/tools/docx/utils/paths.js +27 -0
- package/dist/tools/docx/utils/versioning.d.ts +25 -0
- package/dist/tools/docx/utils/versioning.js +55 -0
- package/dist/tools/docx/utils.d.ts +101 -0
- package/dist/tools/docx/utils.js +299 -0
- package/dist/tools/docx/validators.d.ts +13 -0
- package/dist/tools/docx/validators.js +40 -0
- package/dist/utils/capture.js +4 -4
- package/dist/utils/files/docx.d.ts +41 -0
- package/dist/utils/files/docx.js +245 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,1262 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Direct DOCX XML → Styled HTML Parser
|
|
3
|
+
*
|
|
4
|
+
* Parses the raw DOCX XML and produces HTML with full inline style preservation
|
|
5
|
+
* (font colours, sizes, families, text alignment, highlights, bold/italic/underline,
|
|
6
|
+
* images, hyperlinks, tables, and lists).
|
|
7
|
+
*
|
|
8
|
+
* mammoth.js deliberately strips visual styling; this parser fills that gap.
|
|
9
|
+
*
|
|
10
|
+
* @module docx/styled-html-parser
|
|
11
|
+
*/
|
|
12
|
+
import { createRequire } from 'module';
|
|
13
|
+
import { IMAGE_MIME_TYPES } from './constants.js';
|
|
14
|
+
import { escapeHtml } from './utils/escaping.js';
|
|
15
|
+
const require = createRequire(import.meta.url);
|
|
16
|
+
const { DOMParser } = require('@xmldom/xmldom');
|
|
17
|
+
// ─── OOXML Namespace Constants ───────────────────────────────────────────────
|
|
18
|
+
/**
 * XML namespace URIs used when matching elements and attributes in the
 * DOCX parts. Keys are short aliases used throughout this module.
 */
const NS = {
    // WordprocessingML main namespace (the `w:` elements)
    W: 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    // Relationship attributes such as `r:embed` / `r:id`
    R: 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    // DrawingML wordprocessing drawing (e.g. `wp:extent`, `wp:docPr`)
    WP: 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    // DrawingML main (e.g. `a:blip`)
    A: 'http://schemas.openxmlformats.org/drawingml/2006/main',
    // DrawingML picture
    PIC: 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    // Markup compatibility (`mc:AlternateContent`)
    MC: 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    // Legacy VML
    V: 'urn:schemas-microsoft-com:vml',
};
|
|
27
|
+
// ─── Highlight Colour Map ────────────────────────────────────────────────────
|
|
28
|
+
/** Maps `w:highlight` colour names to the CSS hex colour emitted for them. */
const HIGHLIGHT_COLORS = {
    yellow: '#FFFF00',
    green: '#00FF00',
    cyan: '#00FFFF',
    magenta: '#FF00FF',
    blue: '#0000FF',
    red: '#FF0000',
    darkBlue: '#000080',
    darkCyan: '#008080',
    darkGreen: '#008000',
    darkMagenta: '#800080',
    darkRed: '#800000',
    darkYellow: '#808000',
    darkGray: '#808080',
    lightGray: '#C0C0C0',
    black: '#000000',
    white: '#FFFFFF',
};
|
|
35
|
+
// ─── Heading Detection ───────────────────────────────────────────────────────
|
|
36
|
+
/**
 * Lookup from an OOXML tab `w:leader` value to the single character that
 * is repeated to draw that leader (as used for dot/dash leaders in TOC entries).
 */
const TAB_LEADER_CHARS = {
    dot: '.',
    hyphen: '-',
    underscore: '_',
    middleDot: '·',
    heavy: '━',
};
|
|
44
|
+
/**
 * Ordered list of style-name patterns and the HTML heading tag each maps to.
 * "Heading N" (optional whitespace, case-insensitive) maps to `hN` for N = 1..6;
 * "Title" maps to `h1` and "Subtitle" to `h2`. The first matching entry wins.
 */
const HEADING_PATTERNS = [
    ...[1, 2, 3, 4, 5, 6].map((level) => ({
        pattern: new RegExp(`^Heading\\s*${level}$`, 'i'),
        tag: `h${level}`,
    })),
    { pattern: /^Title$/i, tag: 'h1' },
    { pattern: /^Subtitle$/i, tag: 'h2' },
];
|
|
54
|
+
// ─── ZIP Helpers ─────────────────────────────────────────────────────────────
|
|
55
|
+
/**
 * Look up an entry in a JSZip instance by path.
 *
 * The exact path is tried first. If that yields nothing, the archive's entry
 * names are compared case-insensitively, because some DOCX generators write
 * e.g. `Word/Document.xml` instead of `word/document.xml`.
 *
 * @param {object} zip - Object exposing `file(path)` and a `files` map keyed by entry name.
 * @param {string} path - Entry path to look up.
 * @returns {object|null} The entry returned by `zip.file`, or `null` when nothing matches.
 */
function findZipFile(zip, path) {
    const exact = zip.file(path);
    if (exact) {
        return exact;
    }
    // Slow path: scan every entry name ignoring case.
    const wanted = path.toLowerCase();
    let match;
    for (const candidate of Object.keys(zip.files)) {
        if (candidate.toLowerCase() === wanted) {
            match = candidate;
            break;
        }
    }
    return match ? zip.file(match) : null;
}
|
|
71
|
+
// ─── DOM Helpers ─────────────────────────────────────────────────────────────
|
|
72
|
+
/**
 * Return the first direct element child of `parent` with the given
 * namespace URI and local name.
 *
 * @param {object} parent - Node whose `childNodes` are scanned (not descendants).
 * @param {string} ns - Namespace URI the child must have.
 * @param {string} localName - Local (unprefixed) element name.
 * @returns {object|null} The matching element, or `null` if there is none.
 */
function getDirectChild(parent, ns, localName) {
    const isWanted = (node) => node.nodeType === 1 // element nodes only
        && node.localName === localName
        && node.namespaceURI === ns;
    const found = Array.from(parent.childNodes).find(isWanted);
    return found ?? null;
}
|
|
83
|
+
/**
 * Collect every direct element child of `parent` with the given
 * namespace URI and local name, in document order.
 *
 * @param {object} parent - Node whose `childNodes` are scanned (not descendants).
 * @param {string} ns - Namespace URI the children must have.
 * @param {string} localName - Local (unprefixed) element name.
 * @returns {object[]} Matching elements; empty when none match.
 */
function getDirectChildren(parent, ns, localName) {
    return Array.from(parent.childNodes).filter((node) => node.nodeType === 1 // element nodes only
        && node.localName === localName
        && node.namespaceURI === ns);
}
|
|
95
|
+
// ─── Style Extraction ────────────────────────────────────────────────────────
|
|
96
|
+
/**
 * Extract run-level (character) formatting from a `w:rPr` element.
 *
 * Only direct children of `rPr` are inspected, and only properties that are
 * actually present are set on the result.
 *
 * @param {object} rPr - The `w:rPr` element.
 * @param {object} [themeFonts] - Theme fonts with `minor` / `major` names, used when
 *   `w:rFonts` carries only theme references (`w:asciiTheme`, `w:hAnsiTheme`, `w:cstheme`).
 * @returns {object} Style record with any of: `color`, `fontSize`, `fontFamily`,
 *   `bold`, `italic`, `underline`, `strikethrough`, `backgroundColor`, `verticalAlign`.
 */
function extractRunStyles(rPr, themeFonts) {
    const HEX_RGB = /^[0-9A-Fa-f]{6}$/;
    const prop = (name) => getDirectChild(rPr, NS.W, name);
    // Toggle properties are "on" when the element is present unless w:val
    // explicitly switches them off ('0', 'false' or 'off').
    const toggleIsOn = (el) => {
        const val = el.getAttribute('w:val');
        return val !== '0' && val !== 'false' && val !== 'off';
    };
    // Accept only 6-digit hex RGB; 'auto' and anything else yields undefined.
    const toCssHex = (raw) => (raw && raw !== 'auto' && HEX_RGB.test(raw)
        ? `#${raw.toUpperCase()}`
        : undefined);
    const style = {};

    // Font colour (w:color)
    const colorEl = prop('color');
    if (colorEl) {
        const color = toCssHex(colorEl.getAttribute('w:val'));
        if (color) {
            style.color = color;
        }
    }

    // Font size (w:sz — value is half-points, e.g. 24 = 12pt)
    const szEl = prop('sz');
    if (szEl) {
        const val = szEl.getAttribute('w:val');
        if (val) {
            const pts = parseInt(val, 10) / 2;
            if (!isNaN(pts) && pts > 0) {
                style.fontSize = `${pts}pt`;
            }
        }
    }

    // Font family (w:rFonts) — explicit name first, then theme-font resolution
    const rFontsEl = prop('rFonts');
    if (rFontsEl) {
        let font = rFontsEl.getAttribute('w:ascii')
            || rFontsEl.getAttribute('w:hAnsi')
            || rFontsEl.getAttribute('w:cs');
        if (!font && themeFonts) {
            const themeAttr = rFontsEl.getAttribute('w:asciiTheme')
                || rFontsEl.getAttribute('w:hAnsiTheme')
                || rFontsEl.getAttribute('w:cstheme');
            if (themeAttr) {
                if (themeAttr.includes('minor')) {
                    font = themeFonts.minor;
                }
                else if (themeAttr.includes('major')) {
                    font = themeFonts.major;
                }
                else {
                    font = null;
                }
            }
        }
        if (font) {
            style.fontFamily = font;
        }
    }

    // Bold / italic / strikethrough toggles
    const bEl = prop('b');
    if (bEl) {
        style.bold = toggleIsOn(bEl);
    }
    const iEl = prop('i');
    if (iEl) {
        style.italic = toggleIsOn(iEl);
    }

    // Underline: any w:val other than 'none' counts; a missing w:val is treated as off
    const uEl = prop('u');
    if (uEl) {
        const val = uEl.getAttribute('w:val');
        style.underline = !!val && val !== 'none';
    }

    const strikeEl = prop('strike');
    if (strikeEl) {
        style.strikethrough = toggleIsOn(strikeEl);
    }

    // Highlight — look up own keys only, so names such as 'constructor' or
    // 'toString' cannot resolve to inherited Object.prototype members.
    const highlightEl = prop('highlight');
    if (highlightEl) {
        const val = highlightEl.getAttribute('w:val');
        if (val && val !== 'none' && Object.prototype.hasOwnProperty.call(HIGHLIGHT_COLORS, val)) {
            style.backgroundColor = HIGHLIGHT_COLORS[val];
        }
    }

    // Shading (w:shd) — fallback background when no highlight applied
    if (!style.backgroundColor) {
        const shdEl = prop('shd');
        if (shdEl) {
            const fill = toCssHex(shdEl.getAttribute('w:fill'));
            if (fill) {
                style.backgroundColor = fill;
            }
        }
    }

    // Vertical alignment: only superscript / subscript are recognised
    const vertAlignEl = prop('vertAlign');
    if (vertAlignEl) {
        const val = vertAlignEl.getAttribute('w:val');
        if (val === 'superscript' || val === 'subscript') {
            style.verticalAlign = val;
        }
    }
    return style;
}
|
|
189
|
+
// ─── CSS / Style Helpers ─────────────────────────────────────────────────────
|
|
190
|
+
/**
 * Build a CSS declaration list (`prop:value;prop:value`) from a run style.
 *
 * Only `color`, `fontSize`, `fontFamily` and `backgroundColor` are emitted,
 * in that order, and only when set.
 *
 * The font family originates from the document and is placed inside a
 * single-quoted CSS string, so characters that could terminate the string,
 * the declaration or a surrounding HTML attribute are removed first.
 *
 * @param {object} style - Run style record.
 * @returns {string} Declarations joined with `;`, or an empty string.
 */
function buildCssStyle(style) {
    const declarations = [];
    if (style.color) {
        declarations.push(`color:${style.color}`);
    }
    if (style.fontSize) {
        declarations.push(`font-size:${style.fontSize}`);
    }
    if (style.fontFamily) {
        // Strip quotes, backslashes, semicolons, angle brackets, braces and control characters.
        const safeFamily = String(style.fontFamily).replace(/['"\\;<>{}\u0000-\u001F]/g, '');
        if (safeFamily) {
            declarations.push(`font-family:'${safeFamily}'`);
        }
    }
    if (style.backgroundColor) {
        declarations.push(`background-color:${style.backgroundColor}`);
    }
    return declarations.join(';');
}
|
|
202
|
+
/**
 * Render a list of CSS declarations as an HTML ` style="…"` attribute.
 *
 * @param {string[]} cssParts - Individual `prop:value` declarations.
 * @returns {string} The attribute with a leading space, or `''` when the list is empty.
 */
function buildStyleAttr(cssParts) {
    if (cssParts.length > 0) {
        const declarations = cssParts.join(';');
        return ` style="${declarations}"`;
    }
    return '';
}
|
|
205
|
+
/**
 * Overlay an explicit run style on top of an inherited one.
 *
 * String-valued properties fall back to the inherited value when the explicit
 * one is falsy. Boolean toggles fall back only when the explicit value is
 * `null`/`undefined`, so an explicit `false` overrides an inherited `true`.
 *
 * @param {object|null|undefined} inherited - Base style; when falsy, `explicit` is returned as-is.
 * @param {object} explicit - Overriding style.
 * @returns {object} A new merged style (or `explicit` itself when there is nothing to inherit).
 */
function mergeRunStyles(inherited, explicit) {
    if (!inherited) {
        return explicit;
    }
    const preferTruthy = (key) => explicit[key] || inherited[key];
    const preferDefined = (key) => explicit[key] ?? inherited[key];
    return {
        color: preferTruthy('color'),
        fontSize: preferTruthy('fontSize'),
        fontFamily: preferTruthy('fontFamily'),
        bold: preferDefined('bold'),
        italic: preferDefined('italic'),
        underline: preferDefined('underline'),
        strikethrough: preferDefined('strikethrough'),
        backgroundColor: preferTruthy('backgroundColor'),
        verticalAlign: preferTruthy('verticalAlign'),
    };
}
|
|
220
|
+
// ─── Paragraph Style Extraction ──────────────────────────────────────────────
|
|
221
|
+
/**
 * Format a twips measurement (1/20 of a point) as a CSS length in points,
 * with one decimal place.
 *
 * @param {number} twips - Length in twips.
 * @returns {string} e.g. `"12.0pt"` for 240 twips.
 */
function twipsToPt(twips) {
    const points = twips / 20;
    const rounded = points.toFixed(1);
    return `${rounded}pt`;
}
|
|
225
|
+
/**
 * Turn a paragraph border element (`w:top` / `w:bottom` / …) into a CSS
 * border shorthand such as `"1.0pt solid #FF00AA"`.
 *
 * Every border kind is rendered as `solid`. The colour defaults to black
 * unless `w:color` is a 6-digit hex value.
 *
 * @param {object|null|undefined} borderEl - Border element, if any.
 * @returns {string|undefined} CSS border value, or `undefined` when there is no
 *   element, the border is `nil`/`none`/unset, or its width is not positive.
 */
function extractBorder(borderEl) {
    if (!borderEl) {
        return undefined;
    }
    const kind = borderEl.getAttribute('w:val');
    if (!kind || kind === 'nil' || kind === 'none') {
        return undefined;
    }
    // w:sz is expressed in eighths of a point.
    const eighths = parseInt(borderEl.getAttribute('w:sz') || '0', 10);
    const rawColor = borderEl.getAttribute('w:color') || '';
    if (!(eighths > 0)) {
        return undefined;
    }
    const widthPt = (eighths / 8).toFixed(1);
    const isHex = rawColor && rawColor !== 'auto' && /^[0-9A-Fa-f]{6}$/.test(rawColor);
    const color = isHex ? `#${rawColor.toUpperCase()}` : '#000000';
    return `${widthPt}pt solid ${color}`;
}
|
|
249
|
+
/**
 * Derive the HTML tag and paragraph-level CSS properties for a paragraph
 * from its `w:pPr` element.
 *
 * Resolution order:
 *  1. Tag: the paragraph style ID is matched against HEADING_PATTERNS; if that
 *     fails, the tag recorded for the style in `stylesMap` is used.
 *  2. Alignment: the style's own alignment (from `stylesMap`) is the default —
 *     for headings too — and an explicit `w:jc` on the paragraph overrides it.
 *  3. Indentation, spacing, shading, borders and the first non-`none` tab leader.
 *
 * @param {object|null|undefined} pPr - The `w:pPr` element, if present.
 * @param {Map<string, object>} [stylesMap] - Style ID → parsed style entry
 *   (`tag`, `paragraph.textAlign`). A missing map is treated as empty.
 * @returns {object} Always has `tag`; may have `textAlign`, `marginLeft`,
 *   `marginRight`, `textIndent`, `marginTop`, `marginBottom`, `backgroundColor`,
 *   `borderTop`/`borderBottom`/`borderLeft`/`borderRight` and `tabLeader`.
 */
function extractParagraphStyle(pPr, stylesMap) {
    const result = { tag: 'p' };
    if (!pPr) {
        return result;
    }
    const child = (name) => getDirectChild(pPr, NS.W, name);
    // Parse a twips attribute, trying each candidate name in order; missing → 0.
    const twipsOf = (el, ...names) => {
        for (const name of names) {
            const raw = el.getAttribute(name);
            if (raw) {
                return parseInt(raw, 10);
            }
        }
        return 0;
    };

    // ── Style → tag mapping and style-level alignment ──
    const pStyleEl = child('pStyle');
    if (pStyleEl) {
        const styleId = pStyleEl.getAttribute('w:val') || '';
        const heading = HEADING_PATTERNS.find((hp) => hp.pattern.test(styleId));
        if (heading) {
            result.tag = heading.tag;
        }
        const mapped = stylesMap?.get(styleId);
        if (mapped) {
            if (!heading && mapped.tag) {
                result.tag = mapped.tag;
            }
            // Style-level alignment is a default for every paragraph using the
            // style, including headings matched by ID; explicit w:jc overrides it below.
            if (mapped.paragraph?.textAlign) {
                result.textAlign = mapped.paragraph.textAlign;
            }
        }
    }

    // ── Alignment (w:jc) ──
    const jcEl = child('jc');
    if (jcEl) {
        switch (jcEl.getAttribute('w:val')) {
            case 'center':
                result.textAlign = 'center';
                break;
            case 'right':
            case 'end':
                result.textAlign = 'right';
                break;
            case 'both':
            case 'distribute':
            case 'thaiDistribute':
            case 'justify':
            case 'mediumKashida':
            case 'lowKashida':
            case 'highKashida':
                result.textAlign = 'justify';
                break;
            case 'left':
            case 'start':
                // Made explicit so it survives through later conversions.
                result.textAlign = 'left';
                break;
            default:
                break;
        }
    }

    // ── Indentation (w:ind) ──
    const indEl = child('ind');
    if (indEl) {
        // w:start / w:end are the newer names; fall back to w:left / w:right.
        const leftTwips = twipsOf(indEl, 'w:start', 'w:left');
        if (leftTwips > 0) {
            result.marginLeft = twipsToPt(leftTwips);
        }
        const rightTwips = twipsOf(indEl, 'w:end', 'w:right');
        if (rightTwips > 0) {
            result.marginRight = twipsToPt(rightTwips);
        }
        const firstLineTwips = twipsOf(indEl, 'w:firstLine');
        const hangingTwips = twipsOf(indEl, 'w:hanging');
        if (firstLineTwips > 0) {
            result.textIndent = twipsToPt(firstLineTwips);
        }
        else if (hangingTwips > 0) {
            // Hanging indent is approximated with a negative text-indent.
            result.textIndent = `-${twipsToPt(hangingTwips)}`;
        }
    }

    // ── Spacing before/after (w:spacing); line/lineRule are deliberately ignored ──
    const spacingEl = child('spacing');
    if (spacingEl) {
        const beforeTwips = twipsOf(spacingEl, 'w:before');
        if (beforeTwips > 0) {
            result.marginTop = twipsToPt(beforeTwips);
        }
        const afterTwips = twipsOf(spacingEl, 'w:after');
        if (afterTwips > 0) {
            result.marginBottom = twipsToPt(afterTwips);
        }
    }

    // ── Shading (background colour) ──
    const shdEl = child('shd');
    if (shdEl) {
        const fill = shdEl.getAttribute('w:fill');
        if (fill && fill !== 'auto' && /^[0-9A-Fa-f]{6}$/.test(fill)) {
            result.backgroundColor = `#${fill.toUpperCase()}`;
        }
    }

    // ── Paragraph borders (w:pBdr) ──
    const pBdrEl = child('pBdr');
    if (pBdrEl) {
        const sides = [
            ['top', 'borderTop'],
            ['bottom', 'borderBottom'],
            ['left', 'borderLeft'],
            ['right', 'borderRight'],
        ];
        for (const [side, key] of sides) {
            const border = extractBorder(getDirectChild(pBdrEl, NS.W, side));
            if (border) {
                result[key] = border;
            }
        }
    }

    // ── Tab leader (w:tabs → w:tab[@w:leader]); first non-'none' leader wins ──
    const tabsEl = child('tabs');
    if (tabsEl) {
        for (const tab of getDirectChildren(tabsEl, NS.W, 'tab')) {
            const leader = tab.getAttribute('w:leader');
            if (leader && leader !== 'none') {
                result.tabLeader = leader; // 'dot', 'hyphen', 'underscore', 'middleDot', 'heavy'
                break;
            }
        }
    }
    return result;
}
|
|
373
|
+
/**
 * Compute the run style that runs in a paragraph start from.
 *
 * Layers, lowest priority first: a copy of `ctx.docDefaultRunStyle`, then the
 * run style of the paragraph's style (or of `Normal` when the paragraph names
 * no style), then the paragraph's own `w:pPr/w:rPr`.
 *
 * @param {object|null|undefined} pPr - The paragraph's `w:pPr` element, if any.
 * @param {object} ctx - Parser context providing `docDefaultRunStyle`, `stylesMap` and `themeFonts`.
 * @returns {object} The resolved default run style.
 */
function getDefaultRunStyle(pPr, ctx) {
    let base = { ...ctx.docDefaultRunStyle };
    const pStyleEl = pPr ? getDirectChild(pPr, NS.W, 'pStyle') : null;
    // No pPr, no pStyle, or an empty style ID all fall back to "Normal".
    const styleId = pStyleEl?.getAttribute('w:val') || 'Normal';
    const styleRun = ctx.stylesMap.get(styleId)?.runStyle;
    if (styleRun) {
        base = mergeRunStyles(base, styleRun);
    }
    const paragraphRPr = pPr ? getDirectChild(pPr, NS.W, 'rPr') : null;
    if (!paragraphRPr) {
        return base;
    }
    return mergeRunStyles(base, extractRunStyles(paragraphRPr, ctx.themeFonts));
}
|
|
393
|
+
// ─── styles.xml Parsing ──────────────────────────────────────────────────────
|
|
394
|
+
/**
 * Parse `styles.xml` into a map of style ID → style entry.
 *
 * Each entry may carry: `tag` (when the style name matches a heading pattern),
 * `paragraph.textAlign` (from the style's `w:pPr/w:jc`), `basedOn` (parent
 * style ID) and `runStyle` (from the style's `w:rPr`). After all styles are
 * collected, `basedOn` chains are resolved so entries include inherited values.
 *
 * Parsing is best-effort: any error is swallowed and whatever was collected
 * up to that point is returned.
 *
 * @param {string} xml - Contents of `styles.xml`.
 * @param {object} [themeFonts] - Theme fonts passed through to run-style extraction.
 * @returns {Map<string, object>} Style ID → entry.
 */
function parseStylesXml(xml, themeFonts) {
    const map = new Map();
    // Translate a w:jc value into a CSS text-align keyword (undefined if unrecognised).
    const alignFor = (jc) => {
        switch (jc) {
            case 'center':
                return 'center';
            case 'right':
            case 'end':
                return 'right';
            case 'both':
            case 'distribute':
            case 'thaiDistribute':
            case 'justify':
            case 'mediumKashida':
            case 'lowKashida':
            case 'highKashida':
                return 'justify';
            case 'left':
            case 'start':
                return 'left';
            default:
                return undefined;
        }
    };
    try {
        const doc = new DOMParser().parseFromString(xml, 'application/xml');
        const styleEls = doc.getElementsByTagNameNS(NS.W, 'style');
        // First pass: collect each style's own properties and its basedOn reference.
        for (let idx = 0; idx < styleEls.length; idx += 1) {
            const styleEl = styleEls[idx];
            const styleId = styleEl.getAttribute('w:styleId');
            if (!styleId) {
                continue;
            }
            const entry = {};

            const nameEl = getDirectChild(styleEl, NS.W, 'name');
            if (nameEl) {
                const name = nameEl.getAttribute('w:val') || '';
                const heading = HEADING_PATTERNS.find(({ pattern }) => pattern.test(name));
                if (heading) {
                    entry.tag = heading.tag;
                }
            }

            // Paragraph-level alignment that applies to all paragraphs using this style.
            const pPrEl = getDirectChild(styleEl, NS.W, 'pPr');
            const jcEl = pPrEl ? getDirectChild(pPrEl, NS.W, 'jc') : null;
            if (jcEl) {
                const textAlign = alignFor(jcEl.getAttribute('w:val'));
                if (textAlign) {
                    entry.paragraph = { textAlign };
                }
            }

            // Parent style reference, kept for the inheritance pass.
            const basedOnEl = getDirectChild(styleEl, NS.W, 'basedOn');
            if (basedOnEl) {
                entry.basedOn = basedOnEl.getAttribute('w:val') || undefined;
            }

            const rPr = getDirectChild(styleEl, NS.W, 'rPr');
            if (rPr) {
                entry.runStyle = extractRunStyles(rPr, themeFonts);
            }
            map.set(styleId, entry);
        }
        // Second pass: fold each parent's resolved values into its children.
        resolveStyleInheritance(map);
    }
    catch {
        // Non-fatal — return whatever we have
    }
    return map;
}
|
|
461
|
+
/**
 * Resolve `basedOn` chains in place for every entry of a styles map.
 *
 * A parent is always resolved before its child. The child then receives:
 * the parent's run style merged underneath its own, the parent's `textAlign`
 * when it has none, and the parent's `tag` when it has none. Cycles are
 * broken by tracking the IDs visited during each top-level walk.
 *
 * @param {Map<string, object>} map - Style ID → entry; entries are mutated.
 * @returns {void}
 */
function resolveStyleInheritance(map) {
    const done = new Set();
    const resolveOne = (id, inProgress) => {
        // Skip finished or unknown styles, and stop on a circular reference.
        if (done.has(id) || !map.has(id) || inProgress.has(id)) {
            return;
        }
        inProgress.add(id);
        const entry = map.get(id);
        const parentId = entry.basedOn;
        if (parentId && map.has(parentId)) {
            resolveOne(parentId, inProgress);
            const parent = map.get(parentId);
            if (parent.runStyle) {
                // Parent's resolved style is the base; this style's own rPr overrides it.
                entry.runStyle = mergeRunStyles(parent.runStyle, entry.runStyle || {});
            }
            if (parent.paragraph) {
                const ownAlign = entry.paragraph?.textAlign;
                entry.paragraph = { textAlign: ownAlign || parent.paragraph.textAlign };
            }
            if (!entry.tag && parent.tag) {
                entry.tag = parent.tag;
            }
        }
        done.add(id);
    };
    for (const id of map.keys()) {
        resolveOne(id, new Set());
    }
}
|
|
499
|
+
// ─── Element Converters ──────────────────────────────────────────────────────
|
|
500
|
+
/**
 * Number of EMU (English Metric Units) per CSS pixel at 96 DPI:
 * 1 inch = 914400 EMU and 1 inch = 96 px, so 1 px = 9525 EMU.
 */
const EMU_PER_PX = 914400 / 96;
|
|
502
|
+
/**
 * Convert a length in a CSS-like unit to pixels, assuming 96 px per inch.
 *
 * @param {number} value - Magnitude of the length.
 * @param {string} unit - One of `px`, `pt`, `in`, `cm`, `mm`.
 * @returns {number} The length in pixels; `px` and unrecognised units return `value` unchanged.
 */
function unitToPx(value, unit) {
    const pxPerUnit = new Map([
        ['pt', 96 / 72], // 1pt = 96/72 px
        ['in', 96], // 1in = 96px
        ['cm', 96 / 2.54], // 1cm = 96/2.54 px
        ['mm', 96 / 25.4], // 1mm = 96/25.4 px
    ]);
    const factor = pxPerUnit.get(unit);
    return factor === undefined ? value : value * factor;
}
|
|
513
|
+
/**
 * Convert a DrawingML `w:drawing` element to an HTML `<img>` tag.
 *
 * The image is located via the first `a:blip` descendant (falling back to any
 * descendant whose local name is `blip`), and its relationship ID is looked
 * up in `ctx.imageMap` to obtain the `src`. Alt text comes from `wp:docPr`
 * (`descr`, then `name`); width/height come from `wp:extent` (EMU → px).
 *
 * @param {object} drawingEl - The drawing element.
 * @param {object} ctx - Parser context providing `imageMap` (relationship ID → image source).
 * @returns {string} The `<img … />` markup, or `''` when no usable image is found
 *   or any error occurs.
 */
function convertDrawingToHtml(drawingEl, ctx) {
    try {
        // Locate the blip: namespaced lookup first, then by local name only
        // (handles namespace prefix issues).
        const nsBlips = drawingEl.getElementsByTagNameNS(NS.A, 'blip');
        let blip;
        if (nsBlips.length > 0) {
            blip = nsBlips[0];
        }
        else {
            const descendants = Array.from(drawingEl.getElementsByTagName('*'));
            blip = descendants.find((el) => el.localName === 'blip') ?? null;
        }
        if (!blip) {
            return '';
        }

        const rId = blip.getAttributeNS(NS.R, 'embed')
            || blip.getAttribute('r:embed')
            || blip.getAttributeNS(NS.R, 'link')
            || blip.getAttribute('r:link');
        if (!rId || !ctx.imageMap.has(rId)) {
            return '';
        }
        const dataUrl = ctx.imageMap.get(rId);

        // Alt text from wp:docPr
        const docPrs = drawingEl.getElementsByTagNameNS(NS.WP, 'docPr');
        const alt = docPrs.length > 0
            ? (docPrs[0].getAttribute('descr') || docPrs[0].getAttribute('name') || '')
            : '';
        const attrs = [`src="${dataUrl}"`, `alt="${escapeHtml(alt)}"`];

        // Dimensions from wp:extent (cx/cy in EMUs)
        const extents = drawingEl.getElementsByTagNameNS(NS.WP, 'extent');
        if (extents.length > 0) {
            const extent = extents[0];
            const widthEmu = parseInt(extent.getAttribute('cx') || '0', 10);
            const heightEmu = parseInt(extent.getAttribute('cy') || '0', 10);
            if (widthEmu > 0) {
                attrs.push(`width="${Math.round(widthEmu / EMU_PER_PX)}"`);
            }
            if (heightEmu > 0) {
                attrs.push(`height="${Math.round(heightEmu / EMU_PER_PX)}"`);
            }
        }
        return `<img ${attrs.join(' ')} />`;
    }
    catch {
        return '';
    }
}
|
|
566
|
+
/**
 * Convert a VML `w:pict` element to an HTML `<img>` tag.
 * Older DOCX files and mc:Fallback blocks use VML instead of DrawingML.
 *
 * The relationship ID is read from `imagedata` descendants (the last one
 * found wins) and resolved through `ctx.imageMap`. Width and height are taken
 * from the first non-empty `style` attribute of a `shape` descendant and
 * converted to pixels.
 *
 * @param {object} pictEl - The pict element.
 * @param {object} ctx - Parser context providing `imageMap` (relationship ID → image source).
 * @returns {string} The `<img … />` markup, or `''` when no usable image is found
 *   or any error occurs.
 */
function convertPictToHtml(pictEl, ctx) {
    try {
        let rId = '';
        let shapeStyle = '';
        // Match on local name only: elements may use the v: prefix or none.
        for (const el of Array.from(pictEl.getElementsByTagName('*'))) {
            const local = el.localName || '';
            if (local === 'imagedata') {
                rId = el.getAttributeNS(NS.R, 'id') || el.getAttribute('r:id') || '';
            }
            else if (local === 'shape' && !shapeStyle) {
                shapeStyle = el.getAttribute('style') || '';
            }
        }
        if (!rId || !ctx.imageMap.has(rId)) {
            return '';
        }
        const attrs = [`src="${ctx.imageMap.get(rId)}"`];
        // Append a pixel dimension attribute when the style declares a positive size.
        const addDimension = (name, match) => {
            if (!match) {
                return;
            }
            const px = unitToPx(parseFloat(match[1]), match[2]);
            if (px > 0) {
                attrs.push(`${name}="${Math.round(px)}"`);
            }
        };
        if (shapeStyle) {
            // Convert to pixels for html-to-docx (approx: 1pt ≈ 1.333px, 1in = 96px, 1cm ≈ 37.8px)
            addDimension('width', shapeStyle.match(/width:\s*([\d.]+)\s*(pt|px|in|cm|mm)/));
            addDimension('height', shapeStyle.match(/height:\s*([\d.]+)\s*(pt|px|in|cm|mm)/));
        }
        return `<img ${attrs.join(' ')} />`;
    }
    catch {
        return '';
    }
}
|
|
611
|
+
/**
 * Convert an `mc:AlternateContent` element found inside a run to HTML.
 *
 * Modern Word wraps drawings in mc:AlternateContent/mc:Choice and supplies an
 * mc:Fallback/w:pict for backward compatibility. Every `Choice` child is
 * examined first (DrawingML preferred); only if none contains a drawing are
 * the `Fallback` children checked for a direct `pict` child (VML).
 *
 * @param {object} acEl - The AlternateContent element.
 * @param {object} ctx - Parser context passed through to the image converters.
 * @returns {string} HTML for the first drawing/pict found, or `''` if there is none.
 */
function convertAlternateContentInRun(acEl, ctx) {
    const elementChildren = (node) => Array.from(node.childNodes).filter((n) => n.nodeType === 1);
    const branches = elementChildren(acEl);

    // Preferred: mc:Choice → w:drawing
    for (const choice of branches.filter((el) => el.localName === 'Choice')) {
        const drawings = choice.getElementsByTagNameNS(NS.W, 'drawing');
        if (drawings.length > 0) {
            return convertDrawingToHtml(drawings[0], ctx);
        }
        // Some documents omit the namespace on the drawing element.
        const bareDrawing = elementChildren(choice).find((el) => el.localName === 'drawing');
        if (bareDrawing) {
            return convertDrawingToHtml(bareDrawing, ctx);
        }
    }

    // Fallback: mc:Fallback → w:pict (VML)
    for (const fallback of branches.filter((el) => el.localName === 'Fallback')) {
        const pict = elementChildren(fallback).find((el) => el.localName === 'pict');
        if (pict) {
            return convertPictToHtml(pict, ctx);
        }
    }
    return '';
}
|
|
654
|
+
/**
 * Render a `w:r` (run) element as HTML.
 *
 * Text-like children (`t`, `br`, `tab`) are concatenated and wrapped in the
 * run's styling markup. Picture-like children (`drawing`, `pict`,
 * `AlternateContent`, `object`) are rendered separately and appended AFTER the
 * styled text, never inside it: `html-to-docx` only detects `<img>` elements
 * that are direct children of `<p>` and silently drops images nested inside a
 * `<span>`.
 *
 * @param runEl - The `w:r` element.
 * @param ctx - Conversion context; `ctx.themeFonts` is read here, the rest is forwarded.
 * @param defaultRunStyle - Inherited run style, merged with the run's own `rPr` styles.
 * @returns {string} HTML for the run; `''` when it has neither text nor images.
 */
function convertRunToHtml(runEl, ctx, defaultRunStyle) {
    const runProps = getDirectChild(runEl, NS.W, 'rPr');
    const ownStyle = runProps ? extractRunStyles(runProps, ctx.themeFonts) : {};
    const style = mergeRunStyles(defaultRunStyle, ownStyle);

    /** Render the first drawing/pict held by a `w:object`; '' when it holds none. */
    const renderObject = (objEl) => {
        const drawing = getDirectChild(objEl, NS.W, 'drawing');
        if (drawing) {
            return convertDrawingToHtml(drawing, ctx);
        }
        const pict = getDirectChild(objEl, NS.W, 'pict');
        if (pict) {
            return convertPictToHtml(pict, ctx);
        }
        // Nothing matched with the namespace: retry on the local name alone.
        for (const node of Array.from(objEl.childNodes)) {
            if (node.nodeType !== 1) {
                continue;
            }
            if (node.localName === 'pict') {
                return convertPictToHtml(node, ctx);
            }
            if (node.localName === 'drawing') {
                return convertDrawingToHtml(node, ctx);
            }
        }
        return '';
    };

    let text = '';
    const images = [];
    for (const node of Array.from(runEl.childNodes)) {
        if (node.nodeType !== 1) {
            continue;
        }
        switch (node.localName) {
            case 't':
                text += escapeHtml(node.textContent || '');
                break;
            case 'br':
                text += '<br>';
                break;
            case 'tab':
                text += '\t';
                break;
            case 'drawing':
                images.push(convertDrawingToHtml(node, ctx));
                break;
            case 'pict':
                images.push(convertPictToHtml(node, ctx));
                break;
            case 'AlternateContent':
                images.push(convertAlternateContentInRun(node, ctx));
                break;
            case 'object':
                images.push(renderObject(node));
                break;
            default:
                break;
        }
    }

    let html = '';
    if (text) {
        const css = buildCssStyle(style);
        html = css ? `<span style="${css}">${text}</span>` : text;
        // Innermost wrapper first; each enabled entry wraps the previous result.
        const wrappers = [
            [style.underline, 'u'],
            [style.strikethrough, 's'],
            [style.italic, 'em'],
            [style.bold, 'strong'],
            [style.verticalAlign === 'superscript', 'sup'],
            [style.verticalAlign === 'subscript', 'sub'],
        ];
        for (const [enabled, tag] of wrappers) {
            if (enabled) {
                html = `<${tag}>${html}</${tag}>`;
            }
        }
    }
    // Images stay outside every styling wrapper (see the note above).
    return html + images.filter(Boolean).join('');
}
|
|
744
|
+
/**
 * Render a `w:hyperlink` element as HTML.
 *
 * The link target is taken from `ctx.linkMap` when the element's relationship
 * id is present there; otherwise a `w:anchor` attribute produces an in-document
 * `#anchor` link. When neither yields a target, the inner content is returned
 * without an `<a>` wrapper.
 *
 * Inline wrappers are unwrapped the same way `convertParagraphInner` does it:
 * `fldSimple`, `smartTag`, `ins` and `moveTo` are descended into, `sdt` is
 * descended through its `sdtContent`, and everything else (including the
 * tracked deletions `del` / `moveFrom`) is skipped. Previously `smartTag`,
 * `ins` and `moveTo` were ignored here, which silently dropped accepted
 * insertions and smart-tagged runs that sit inside a hyperlink.
 *
 * @param hyperlinkEl - The `w:hyperlink` element.
 * @param ctx - Conversion context; `ctx.linkMap` maps relationship ids to targets.
 * @param defaultRunStyle - Inherited run style forwarded to `convertRunToHtml`.
 * @returns {string} `<a href="…">…</a>` when a target was resolved, else the bare inner HTML.
 */
function convertHyperlinkToHtml(hyperlinkEl, ctx, defaultRunStyle) {
    const rId = hyperlinkEl.getAttributeNS(NS.R, 'id') || hyperlinkEl.getAttribute('r:id');
    const anchor = hyperlinkEl.getAttribute('w:anchor');
    let href = '';
    if (rId && ctx.linkMap.has(rId)) {
        href = ctx.linkMap.get(rId);
    }
    else if (anchor) {
        href = `#${anchor}`;
    }
    let innerHtml = '';
    /** Recursively gather inline content from the hyperlink. */
    function gather(container) {
        for (let i = 0; i < container.childNodes.length; i++) {
            const child = container.childNodes[i];
            if (child.nodeType !== 1) {
                continue;
            }
            const el = child;
            switch (el.localName) {
                case 'r':
                    innerHtml += convertRunToHtml(el, ctx, defaultRunStyle);
                    break;
                case 'fldSimple':
                case 'smartTag':
                case 'ins':
                case 'moveTo':
                    // Transparent wrappers: their runs belong to the link text.
                    gather(el);
                    break;
                case 'sdt': {
                    const sdtContent = getDirectChild(el, NS.W, 'sdtContent');
                    if (sdtContent) {
                        gather(sdtContent);
                    }
                    break;
                }
                // del / moveFrom (tracked deletions) and anything else → skipped
                default:
                    break;
            }
        }
    }
    gather(hyperlinkEl);
    return href ? `<a href="${escapeHtml(href)}">${innerHtml}</a>` : innerHtml;
}
|
|
774
|
+
/**
 * Render the inline content of a paragraph as HTML (without the paragraph tag).
 *
 * Handled children:
 *  - `r` and `hyperlink` are converted directly;
 *  - `fldSimple`, `smartTag`, `ins` and `moveTo` are treated as transparent
 *    wrappers whose children are processed as if they sat in the paragraph.
 *    Unwrapping `fldSimple` matters for TOC entries, whose hyperlinks and runs
 *    live inside a field and would otherwise be dropped;
 *  - `sdt` is unwrapped through its `sdtContent` child;
 *  - `bookmarkStart` becomes an empty `<a id="…">` anchor (except `_GoBack`);
 *  - `del` / `moveFrom` (tracked deletions) and every other element are skipped.
 *
 * @param paraEl - The `w:p` element.
 * @param ctx - Conversion context forwarded to the run/hyperlink converters.
 * @returns {string} Concatenated inline HTML, possibly empty.
 */
function convertParagraphInner(paraEl, ctx) {
    const defaultRunStyle = getDefaultRunStyle(getDirectChild(paraEl, NS.W, 'pPr'), ctx);
    // Wrapper elements whose children are rendered in place of the wrapper.
    const transparent = new Set(['fldSimple', 'smartTag', 'ins', 'moveTo']);
    let rendered = '';

    const walk = (container) => {
        for (const node of Array.from(container.childNodes)) {
            if (node.nodeType !== 1) {
                continue;
            }
            const kind = node.localName;
            if (kind === 'r') {
                rendered += convertRunToHtml(node, ctx, defaultRunStyle);
            }
            else if (kind === 'hyperlink') {
                rendered += convertHyperlinkToHtml(node, ctx, defaultRunStyle);
            }
            else if (transparent.has(kind)) {
                walk(node);
            }
            else if (kind === 'sdt') {
                const content = getDirectChild(node, NS.W, 'sdtContent');
                if (content) {
                    walk(content);
                }
            }
            else if (kind === 'bookmarkStart') {
                // Target for internal links such as TOC hyperlinks.
                const bookmark = node.getAttribute('w:name');
                if (bookmark && bookmark !== '_GoBack') {
                    rendered += `<a id="${escapeHtml(bookmark)}"></a>`;
                }
            }
            // del, moveFrom, bookmarkEnd, proofErr, permStart, permEnd, … → ignored
        }
    };

    walk(paraEl);
    return rendered;
}
|
|
844
|
+
/**
 * Render a `w:p` element as a complete HTML block element.
 *
 * The tag name and CSS come from `extractParagraphStyle`. When the style
 * carries a tab leader, every tab character in the inline HTML is replaced by
 * a span holding 40 repetitions of the leader character. A paragraph whose
 * inline HTML is blank after trimming is emitted with a single `<br>`.
 *
 * @param paraEl - The `w:p` element.
 * @param ctx - Conversion context; `ctx.stylesMap` is passed to `extractParagraphStyle`.
 * @returns {string} `<tag style="…">…</tag>` followed by a newline.
 */
function convertParagraphToHtml(paraEl, ctx) {
    const paraStyle = extractParagraphStyle(getDirectChild(paraEl, NS.W, 'pPr'), ctx.stylesMap);
    let body = convertParagraphInner(paraEl, ctx);

    if (paraStyle.tabLeader) {
        const glyph = TAB_LEADER_CHARS[paraStyle.tabLeader] || ' ';
        body = body.replace(/\t/g, `<span style="letter-spacing:2px">${glyph.repeat(40)}</span>`);
    }

    // CSS property → paragraph-style field, in the order the declarations are emitted.
    const cssFields = [
        ['text-align', 'textAlign'],
        ['margin-left', 'marginLeft'],
        ['margin-right', 'marginRight'],
        ['margin-top', 'marginTop'],
        ['margin-bottom', 'marginBottom'],
        ['text-indent', 'textIndent'],
        ['background-color', 'backgroundColor'],
        ['border-top', 'borderTop'],
        ['border-bottom', 'borderBottom'],
        ['border-left', 'borderLeft'],
        ['border-right', 'borderRight'],
    ];
    const declarations = cssFields
        .filter(([, field]) => paraStyle[field])
        .map(([property, field]) => `${property}:${paraStyle[field]}`);

    const styleAttr = buildStyleAttr(declarations);
    const tag = paraStyle.tag;
    const content = body.trim() ? body : '<br>';
    return `<${tag}${styleAttr}>${content}</${tag}>\n`;
}
|
|
883
|
+
/**
 * Render a `w:tbl` element as an HTML `<table>`.
 *
 * Each direct `tr` child becomes a `<tr>` and each direct `tc` child of a row
 * becomes a `<td>`. Inside a cell, paragraphs and nested tables are rendered;
 * an `sdt` child is unwrapped one level (paragraphs and tables directly inside
 * its `sdtContent`). Any other cell content is ignored.
 *
 * @param tblEl - The `w:tbl` element.
 * @param ctx - Conversion context forwarded to the paragraph/table converters.
 * @returns {string} Table markup terminated by a newline.
 */
function convertTableToHtml(tblEl, ctx) {
    /** HTML for a paragraph or table node; '' for anything else. */
    const renderBlock = (node) => {
        if (node.nodeType !== 1) {
            return '';
        }
        if (node.localName === 'p') {
            return convertParagraphToHtml(node, ctx);
        }
        if (node.localName === 'tbl') {
            return convertTableToHtml(node, ctx);
        }
        return '';
    };

    /** HTML for one `tc` element, including its `<td>` wrapper. */
    const renderCell = (cell) => {
        let inner = '';
        for (const node of Array.from(cell.childNodes)) {
            if (node.nodeType === 1 && node.localName === 'sdt') {
                const wrapped = getDirectChild(node, NS.W, 'sdtContent');
                if (wrapped) {
                    for (const nested of Array.from(wrapped.childNodes)) {
                        inner += renderBlock(nested);
                    }
                }
            }
            else {
                inner += renderBlock(node);
            }
        }
        return `<td>${inner}</td>`;
    };

    let markup = '<table border="1" cellpadding="4" cellspacing="0">\n';
    for (const row of getDirectChildren(tblEl, NS.W, 'tr')) {
        let cells = '';
        for (const cell of getDirectChildren(row, NS.W, 'tc')) {
            cells += renderCell(cell);
        }
        markup += `<tr>${cells}</tr>\n`;
    }
    return `${markup}</table>\n`;
}
|
|
923
|
+
// ─── Relationship Parsing ────────────────────────────────────────────────────
|
|
924
|
+
/**
 * Read `word/_rels/document.xml.rels` and index its relationships by id.
 *
 * - Relationships whose `Type` contains `image` or `oleObject` are read from
 *   the archive and stored in `imageMap` as base64 `data:` URLs, but only when
 *   `includeImages` is truthy. Parts that cannot be found or read are skipped.
 * - Relationships whose `Type` contains `hyperlink` are stored in `linkMap`
 *   with their raw `Target`.
 *
 * Targets are resolved against the `word/` folder (or the archive root when
 * they start with `/`), and `.` / `..` segments are collapsed, so a target
 * such as `../media/image1.png` resolves to `media/image1.png` instead of the
 * never-matching `word/../media/image1.png`.
 *
 * @param zip - Loaded archive, passed to `findZipFile`.
 * @param {boolean} includeImages - Whether image parts should be extracted.
 * @returns {Promise<{imageMap: Map<string, string>, linkMap: Map<string, string>}>}
 *   Both maps are empty when the relationships part is missing.
 */
async function loadRelationships(zip, includeImages) {
    const imageMap = new Map();
    const linkMap = new Map();
    const relsFile = findZipFile(zip, 'word/_rels/document.xml.rels');
    if (!relsFile) {
        return { imageMap, linkMap };
    }

    /** Turn a relationship target into a normalized archive path. */
    const resolvePartPath = (target) => {
        const raw = target.startsWith('/') ? target.slice(1) : `word/${target}`;
        const resolved = [];
        for (const segment of raw.split('/')) {
            if (segment === '' || segment === '.') {
                continue;
            }
            if (segment === '..') {
                resolved.pop();
            }
            else {
                resolved.push(segment);
            }
        }
        return resolved.join('/');
    };

    const relsXml = await relsFile.async('string');
    const relsDoc = new DOMParser().parseFromString(relsXml, 'application/xml');
    const rels = relsDoc.getElementsByTagName('Relationship');
    for (let i = 0; i < rels.length; i++) {
        const rel = rels[i];
        const id = rel.getAttribute('Id');
        const target = rel.getAttribute('Target');
        const type = rel.getAttribute('Type') || '';
        if (!id || !target) {
            continue;
        }
        if ((type.includes('image') || type.includes('oleObject')) && includeImages) {
            const imgFile = findZipFile(zip, resolvePartPath(target));
            if (imgFile) {
                try {
                    const imgData = await imgFile.async('base64');
                    // The MIME type is derived from the target's extension; unknown ones fall back to PNG.
                    const ext = target.split('.').pop()?.toLowerCase() || 'png';
                    const mime = IMAGE_MIME_TYPES[ext] || 'image/png';
                    imageMap.set(id, `data:${mime};base64,${imgData}`);
                }
                catch {
                    // Best effort: an unreadable image is left out rather than failing the conversion.
                }
            }
        }
        else if (type.includes('hyperlink')) {
            linkMap.set(id, target);
        }
    }
    return { imageMap, linkMap };
}
|
|
962
|
+
// ─── Numbering / List Parsing ────────────────────────────────────────────────
|
|
963
|
+
/**
 * Build a lookup from concrete numbering id to per-level number formats.
 *
 * `word/numbering.xml` is read in two passes: every `abstractNum` is turned
 * into a `Map<level, numFmt>` (a level without a `numFmt` value is recorded as
 * `'bullet'`), then every `num` that references a known abstract definition
 * is mapped to that definition's level map.
 *
 * Any failure while reading or parsing is swallowed on purpose; whatever was
 * collected up to that point is returned.
 *
 * @param zip - Loaded archive, passed to `findZipFile`.
 * @returns {Promise<Map<string, Map<number, string>>>} `numId → (ilvl → numFmt)`;
 *   empty when the numbering part is missing.
 */
async function parseNumberingXml(zip) {
    const numberingMap = new Map();
    try {
        const part = findZipFile(zip, 'word/numbering.xml');
        if (!part) {
            return numberingMap;
        }
        const xmlDoc = new DOMParser().parseFromString(await part.async('string'), 'application/xml');

        // Pass 1: abstractNumId → (level → numFmt)
        const formatsByAbstractId = new Map();
        for (const abstractEl of Array.from(xmlDoc.getElementsByTagNameNS(NS.W, 'abstractNum'))) {
            const abstractId = abstractEl.getAttribute('w:abstractNumId');
            if (!abstractId) {
                continue;
            }
            const formats = new Map();
            for (const lvl of getDirectChildren(abstractEl, NS.W, 'lvl')) {
                const depth = parseInt(lvl.getAttribute('w:ilvl') || '0', 10);
                const format = getDirectChild(lvl, NS.W, 'numFmt')?.getAttribute('w:val') || 'bullet';
                formats.set(depth, format);
            }
            formatsByAbstractId.set(abstractId, formats);
        }

        // Pass 2: numId → the level map of the abstract definition it points at
        for (const numEl of Array.from(xmlDoc.getElementsByTagNameNS(NS.W, 'num'))) {
            const numId = numEl.getAttribute('w:numId');
            if (!numId) {
                continue;
            }
            const abstractRef = getDirectChild(numEl, NS.W, 'abstractNumId')?.getAttribute('w:val');
            if (abstractRef && formatsByAbstractId.has(abstractRef)) {
                numberingMap.set(numId, formatsByAbstractId.get(abstractRef));
            }
        }
    }
    catch {
        // Non-fatal
    }
    return numberingMap;
}
|
|
1006
|
+
/**
 * Extract the numbering reference of a paragraph from its properties element.
 *
 * @param pPr - The paragraph's `w:pPr` element, or a falsy value when it has none.
 * @returns {{numId: string, level: number} | null} The referenced numbering id
 *   and list level (`0` when no `ilvl` value is present); `null` when there is
 *   no `numPr`, no `numId` value, or the `numId` value is `'0'`.
 */
function getNumInfo(pPr) {
    const numPr = pPr ? getDirectChild(pPr, NS.W, 'numPr') : null;
    if (!numPr) {
        return null;
    }
    const numId = getDirectChild(numPr, NS.W, 'numId')?.getAttribute('w:val');
    if (!numId || numId === '0') {
        return null;
    }
    const rawLevel = getDirectChild(numPr, NS.W, 'ilvl')?.getAttribute('w:val') || '0';
    return { numId, level: parseInt(rawLevel, 10) };
}
|
|
1020
|
+
/**
 * Choose the HTML list tag for a numbering id at a given level.
 *
 * @param numberingMap - `numId → (level → numFmt)` lookup.
 * @param numId - Numbering id of the paragraph.
 * @param level - List level of the paragraph.
 * @returns {'ul' | 'ol'} `'ul'` when the format is `'bullet'` or `'none'`, or
 *   when the id or level is unknown; `'ol'` for every other format.
 */
function getListTag(numberingMap, numId, level) {
    const formats = numberingMap.get(numId);
    // Unknown ids and unknown levels are both treated as bullets.
    const format = (formats && formats.get(level)) || 'bullet';
    return ['bullet', 'none'].includes(format) ? 'ul' : 'ol';
}
|
|
1027
|
+
// ─── Document Defaults Extraction ────────────────────────────────────────────
|
|
1028
|
+
/**
 * Read the major and minor Latin typefaces from `word/theme/theme1.xml`.
 *
 * For each of `majorFont` and `minorFont`, the first matching element in the
 * theme is used and the `typeface` attribute of its direct `latin` child is
 * returned.
 *
 * @param zip - Loaded archive, passed to `findZipFile`.
 * @returns {Promise<{major: string, minor: string}>} Typeface names; each is
 *   `''` when it cannot be determined. Both are `''` when the theme part is
 *   missing or anything throws while reading it.
 */
async function parseThemeFonts(zip) {
    try {
        const part = findZipFile(zip, 'word/theme/theme1.xml');
        if (!part) {
            return { major: '', minor: '' };
        }
        const themeDoc = new DOMParser().parseFromString(await part.async('string'), 'application/xml');

        /** Typeface of the `latin` child of the first `<groupName>` element, or ''. */
        const latinTypeface = (groupName) => {
            const groups = themeDoc.getElementsByTagNameNS(NS.A, groupName);
            if (!(groups.length > 0)) {
                return '';
            }
            const latin = getDirectChild(groups[0], NS.A, 'latin');
            return latin ? latin.getAttribute('typeface') || '' : '';
        };

        const major = latinTypeface('majorFont');
        const minor = latinTypeface('minorFont');
        return { major, minor };
    }
    catch {
        return { major: '', minor: '' };
    }
}
|
|
1055
|
+
/**
 * Extract the document-wide default run style from the styles XML.
 *
 * Follows `docDefaults → rPrDefault → rPr` (first `docDefaults` element only)
 * and hands the resulting `rPr` to `extractRunStyles`.
 *
 * @param {string} stylesXml - Raw XML of the styles part.
 * @param themeFonts - Theme fonts forwarded to `extractRunStyles`.
 * @returns {object} The extracted run style, or `{}` when any link of the
 *   chain is missing or parsing throws.
 */
function parseDocDefaults(stylesXml, themeFonts) {
    try {
        const stylesDoc = new DOMParser().parseFromString(stylesXml, 'application/xml');
        const matches = stylesDoc.getElementsByTagNameNS(NS.W, 'docDefaults');
        const holder = matches.length === 0 ? null : getDirectChild(matches[0], NS.W, 'rPrDefault');
        const runProps = holder ? getDirectChild(holder, NS.W, 'rPr') : null;
        return runProps ? extractRunStyles(runProps, themeFonts) : {};
    }
    catch {
        return {};
    }
}
|
|
1071
|
+
// ─── Context Building ────────────────────────────────────────────────────────
|
|
1072
|
+
/**
 * Gather everything the element converters need from the archive.
 *
 * Theme fonts are read first because style parsing depends on them. The
 * default run style is the document defaults merged with the `Normal` style's
 * run style, and is then guaranteed to carry a font family (theme minor font,
 * else `'Calibri'`) and a font size (`'11pt'`) so runs inheriting from it
 * never lose their font. Numbering and relationships are loaded concurrently.
 *
 * @param zip - Loaded archive.
 * @param {boolean} includeImages - Forwarded to `loadRelationships`.
 * @returns {Promise<{ctx: object, documentDefaults: {font: string, fontSize: number}}>}
 *   `ctx` holds `imageMap`, `linkMap`, `stylesMap`, `themeFonts`,
 *   `docDefaultRunStyle` and `numberingMap`; `documentDefaults.fontSize` is
 *   the `parseFloat` of the default font size.
 */
async function buildConversionContext(zip, includeImages) {
    const themeFonts = await parseThemeFonts(zip);

    const stylesPart = findZipFile(zip, 'word/styles.xml');
    const stylesXml = stylesPart ? await stylesPart.async('string') : null;
    const stylesMap = stylesPart ? parseStylesXml(stylesXml, themeFonts) : new Map();
    const docDefaultsStyle = stylesPart ? parseDocDefaults(stylesXml, themeFonts) : {};

    const docDefaultRunStyle = mergeRunStyles(docDefaultsStyle, stylesMap.get('Normal')?.runStyle || {});
    // Guarantee a font family and size on the default run style.
    if (!docDefaultRunStyle.fontFamily) {
        docDefaultRunStyle.fontFamily = themeFonts.minor || 'Calibri';
    }
    if (!docDefaultRunStyle.fontSize) {
        docDefaultRunStyle.fontSize = '11pt';
    }

    const [numberingMap, relationships] = await Promise.all([
        parseNumberingXml(zip),
        loadRelationships(zip, includeImages),
    ]);

    return {
        ctx: {
            imageMap: relationships.imageMap,
            linkMap: relationships.linkMap,
            stylesMap,
            themeFonts,
            docDefaultRunStyle,
            numberingMap,
        },
        documentDefaults: {
            font: docDefaultRunStyle.fontFamily,
            fontSize: parseFloat(docDefaultRunStyle.fontSize),
        },
    };
}
|
|
1105
|
+
// ─── Body Conversion ─────────────────────────────────────────────────────────
|
|
1106
|
+
/**
 * Render the children of the document body as HTML.
 *
 * Numbered paragraphs are grouped into `<ul>` / `<ol>` lists using a stack of
 * open lists; any non-list paragraph or table closes every open list first.
 * `sdt` elements are unwrapped through their `sdtContent`. For a body-level
 * `AlternateContent`, the children of the first `Choice` are processed, or,
 * when there is no `Choice`, those of the first `Fallback`. Other elements
 * are ignored.
 *
 * @param bodyEl - The `w:body` element (or any container of body-level elements).
 * @param ctx - Conversion context forwarded to the element converters.
 * @returns {string} HTML for the whole body, with every list closed.
 */
function convertBodyChildrenToHtml(bodyEl, ctx) {
    const listStack = [];
    let html = '';
    let currentListNumId = '';

    const elementChildren = (parent) => Array.from(parent.childNodes).filter((node) => node.nodeType === 1);
    const popList = () => `</${listStack.pop().tag}>\n`;

    /** Close every open list nested deeper than `targetLevel`. */
    const closeListsToLevel = (targetLevel) => {
        let closing = '';
        while (listStack.length > 0 && listStack[listStack.length - 1].level > targetLevel) {
            closing += popList();
        }
        return closing;
    };

    /** Close every open list and forget which numbering id was active. */
    const closeAllLists = () => {
        let closing = '';
        while (listStack.length > 0) {
            closing += popList();
        }
        currentListNumId = '';
        return closing;
    };

    /** Process a single body-level element (paragraph, table, sdt, AlternateContent). */
    function processBodyChild(el) {
        switch (el.localName) {
            case 'p': {
                const pPr = getDirectChild(el, NS.W, 'pPr');
                const numInfo = getNumInfo(pPr);
                if (!numInfo) {
                    html += closeAllLists();
                    html += convertParagraphToHtml(el, ctx);
                    break;
                }
                html += convertListParagraph(el, pPr, numInfo, ctx, listStack, currentListNumId, closeAllLists, closeListsToLevel);
                currentListNumId = numInfo.numId;
                break;
            }
            case 'tbl':
                html += closeAllLists();
                html += convertTableToHtml(el, ctx);
                break;
            case 'sdt': {
                // Structured document tag: only its content matters.
                const content = getDirectChild(el, NS.W, 'sdtContent');
                if (content) {
                    for (const inner of elementChildren(content)) {
                        processBodyChild(inner);
                    }
                }
                break;
            }
            case 'AlternateContent': {
                // The first Choice wins; Fallback is used only when no Choice exists.
                const branches = elementChildren(el);
                const chosen = branches.find((branch) => branch.localName === 'Choice')
                    || branches.find((branch) => branch.localName === 'Fallback');
                if (chosen) {
                    for (const inner of elementChildren(chosen)) {
                        processBodyChild(inner);
                    }
                }
                break;
            }
            default:
                break;
        }
    }

    for (const child of elementChildren(bodyEl)) {
        processBodyChild(child);
    }
    html += closeAllLists();
    return html;
}
|
|
1192
|
+
/**
 * Render a numbered/bulleted paragraph as an `<li>`, opening and closing list
 * tags as needed to reach the paragraph's level.
 *
 * `listStack` is mutated in place: it holds one `{ tag, level }` entry per
 * currently open list, innermost last.
 *
 * Steps:
 *  1. A change of numbering id closes every open list.
 *  2. Lists nested deeper than the paragraph's level are closed.
 *  3. If no list is open, or the innermost open list is shallower than the
 *     paragraph's level, a new list is opened. This also covers the case where
 *     step 2 emptied the stack (e.g. a list that starts at level 1 and then
 *     drops to level 0), which previously produced an `<li>` outside any list.
 *  4. Otherwise the innermost list is at the paragraph's level; if its tag
 *     differs (bullet vs. numbered) it is closed and replaced.
 *
 * @param paraEl - The `w:p` element.
 * @param pPr - Its `w:pPr` element, passed to `extractParagraphStyle`.
 * @param {{numId: string, level: number}} numInfo - Numbering reference of the paragraph.
 * @param ctx - Conversion context (`numberingMap` and `stylesMap` are read here).
 * @param {Array<{tag: string, level: number}>} listStack - Open lists, mutated in place.
 * @param {string} currentListNumId - Numbering id of the previous list paragraph, `''` if none.
 * @param {() => string} closeAllLists - Closes every open list and returns the closing tags.
 * @param {(level: number) => string} closeListsToLevel - Closes lists deeper than `level`.
 * @returns {string} Any list open/close tags followed by the `<li>` line.
 */
function convertListParagraph(paraEl, pPr, numInfo, ctx, listStack, currentListNumId, closeAllLists, closeListsToLevel) {
    let out = '';
    const tag = getListTag(ctx.numberingMap, numInfo.numId, numInfo.level);
    const { level } = numInfo;
    const peek = () => (listStack.length > 0 ? listStack[listStack.length - 1] : null);

    if (currentListNumId && currentListNumId !== numInfo.numId) {
        out += closeAllLists();
    }

    const before = peek();
    if (before && level < before.level) {
        out += closeListsToLevel(level);
    }

    // Re-read the stack: closing may have removed entries, possibly all of them.
    const top = peek();
    if (!top || level > top.level) {
        out += `<${tag}>\n`;
        listStack.push({ tag, level });
    }
    else if (top.tag !== tag) {
        // Same level but a different list kind: swap the list.
        listStack.pop();
        out += `</${top.tag}>\n<${tag}>\n`;
        listStack.push({ tag, level });
    }

    const liContent = convertParagraphInner(paraEl, ctx);
    const paraStyle = extractParagraphStyle(pPr, ctx.stylesMap);
    const liCss = [];
    if (paraStyle.textAlign) {
        liCss.push(`text-align:${paraStyle.textAlign}`);
    }
    if (level > 0) {
        liCss.push(`margin-left:${level * 36}pt`);
    }
    out += `<li${buildStyleAttr(liCss)}>${liContent || ' '}</li>\n`;
    return out;
}
|
|
1224
|
+
// ─── Image Extraction ────────────────────────────────────────────────────────
|
|
1225
|
+
/**
 * Convert the id → data-URL image map into a list of image descriptors.
 *
 * Entries whose value is not a non-empty `data:<mime>;base64,<payload>` URL
 * are left out.
 *
 * @param imageMap - Map from relationship id to base64 data URL.
 * @returns {Array<{id: string, data: string, mimeType: string, originalSize: number}>}
 *   One descriptor per usable entry, in map order. `data` is the base64
 *   payload and `originalSize` its decoded length in bytes.
 */
function buildImageArray(imageMap) {
    const dataUrlPattern = /^data:([^;]+);base64,(.+)$/;
    const images = [];
    for (const [id, dataUrl] of imageMap.entries()) {
        const parts = dataUrl.match(dataUrlPattern);
        if (!parts) {
            continue;
        }
        const [, mimeType, data] = parts;
        images.push({
            id,
            data,
            mimeType,
            originalSize: Buffer.from(data, 'base64').length,
        });
    }
    return images;
}
|
|
1240
|
+
// ─── Main Entry Point ────────────────────────────────────────────────────────
|
|
1241
|
+
/**
 * Convert a DOCX buffer to styled HTML by reading the DOCX XML parts directly.
 *
 * Going through the raw XML instead of mammoth.js keeps the inline styling
 * (colours, fonts, sizes, alignment, highlights, bold/italic/underline) as
 * well as images, hyperlinks and tables.
 *
 * @param buffer - DOCX file contents, handed to `JSZip.loadAsync`.
 * @param {boolean} [includeImages=true] - Whether embedded images are extracted.
 * @returns {Promise<{html: string, images: Array, documentDefaults: object}>}
 *   `html` is `''` and `images` is empty when the document has no body element.
 * @throws {Error} When the archive has no `word/document.xml`.
 */
export async function convertDocxToStyledHtml(buffer, includeImages = true) {
    // NOTE(review): `require` inside an ES module presumably comes from a
    // createRequire() call elsewhere in this file — confirm.
    const JSZip = require('jszip');
    const zip = await JSZip.loadAsync(buffer);

    const documentPart = findZipFile(zip, 'word/document.xml');
    if (!documentPart) {
        throw new Error('Invalid DOCX: missing word/document.xml');
    }
    const documentXml = await documentPart.async('string');
    const documentDom = new DOMParser().parseFromString(documentXml, 'application/xml');

    const { ctx, documentDefaults } = await buildConversionContext(zip, includeImages);

    const bodyEl = documentDom.getElementsByTagNameNS(NS.W, 'body')[0];
    if (bodyEl) {
        const html = convertBodyChildrenToHtml(bodyEl, ctx);
        return { html, images: buildImageArray(ctx.imageMap), documentDefaults };
    }
    return { html: '', images: [], documentDefaults };
}
|