@cj-tech-master/excelts 9.6.1 → 10.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -3
- package/README_zh.md +18 -3
- package/dist/browser/modules/excel/cell.d.ts +4 -0
- package/dist/browser/modules/excel/note.js +5 -1
- package/dist/browser/modules/excel/row.js +35 -2
- package/dist/browser/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
- package/dist/browser/modules/excel/stream/workbook-writer.browser.js +22 -2
- package/dist/browser/modules/excel/types.d.ts +81 -0
- package/dist/browser/modules/excel/utils/drawing-utils.d.ts +8 -0
- package/dist/browser/modules/excel/utils/drawing-utils.js +19 -2
- package/dist/browser/modules/excel/workbook.browser.d.ts +16 -0
- package/dist/browser/modules/excel/workbook.browser.js +32 -2
- package/dist/browser/modules/excel/worksheet.d.ts +31 -1
- package/dist/browser/modules/excel/worksheet.js +83 -0
- package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
- package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
- package/dist/browser/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
- package/dist/browser/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
- package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
- package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
- package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
- package/dist/browser/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
- package/dist/browser/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
- package/dist/browser/modules/pdf/builder/document-builder.js +22 -49
- package/dist/browser/modules/pdf/builder/pdf-editor.js +1 -1
- package/dist/browser/modules/pdf/core/pdf-stream.d.ts +28 -1
- package/dist/browser/modules/pdf/core/pdf-stream.js +38 -2
- package/dist/browser/modules/pdf/font/font-manager.d.ts +26 -0
- package/dist/browser/modules/pdf/font/font-manager.js +35 -18
- package/dist/browser/modules/pdf/render/page-renderer.d.ts +51 -3
- package/dist/browser/modules/pdf/render/page-renderer.js +111 -18
- package/dist/browser/modules/word/advanced/field-engine.js +45 -20
- package/dist/browser/modules/word/advanced/glossary.d.ts +10 -36
- package/dist/browser/modules/word/advanced/glossary.js +8 -9
- package/dist/browser/modules/word/advanced/math-convert.js +94 -12
- package/dist/browser/modules/word/advanced/ole-objects.d.ts +28 -0
- package/dist/browser/modules/word/advanced/ole-objects.js +122 -19
- package/dist/browser/modules/word/advanced/style-map.js +31 -10
- package/dist/browser/modules/word/builder/run-builders.d.ts +7 -1
- package/dist/browser/modules/word/builder/run-builders.js +7 -1
- package/dist/browser/modules/word/constants.d.ts +4 -0
- package/dist/browser/modules/word/constants.js +5 -1
- package/dist/browser/modules/word/convert/docx-to-semantic.d.ts +2 -1
- package/dist/browser/modules/word/convert/docx-to-semantic.js +135 -1
- package/dist/browser/modules/word/convert/html/html-import.d.ts +32 -1
- package/dist/browser/modules/word/convert/html/html-import.js +167 -14
- package/dist/browser/modules/word/convert/html/html.d.ts +2 -2
- package/dist/browser/modules/word/convert/html/html.js +1 -1
- package/dist/browser/modules/word/convert/markdown/markdown-import.d.ts +48 -18
- package/dist/browser/modules/word/convert/markdown/markdown-import.js +279 -69
- package/dist/browser/modules/word/convert/markdown/markdown.d.ts +1 -1
- package/dist/browser/modules/word/convert/odt/odt.js +407 -56
- package/dist/browser/modules/word/html.d.ts +2 -2
- package/dist/browser/modules/word/html.js +1 -1
- package/dist/browser/modules/word/index.base.d.ts +3 -3
- package/dist/browser/modules/word/index.base.js +1 -1
- package/dist/browser/modules/word/layout/layout-full.js +326 -19
- package/dist/browser/modules/word/layout/render-page.js +35 -8
- package/dist/browser/modules/word/markdown.d.ts +1 -1
- package/dist/browser/modules/word/query/compat.d.ts +10 -2
- package/dist/browser/modules/word/query/compat.js +29 -21
- package/dist/browser/modules/word/reader/docx-reader.js +105 -2
- package/dist/browser/modules/word/reader/math-parser.js +8 -2
- package/dist/browser/modules/word/security/cfb-reader.js +5 -5
- package/dist/browser/modules/word/types.d.ts +96 -1
- package/dist/browser/modules/word/writer/docx-packager.js +108 -2
- package/dist/browser/modules/word/writer/glossary-writer.d.ts +28 -0
- package/dist/browser/modules/word/writer/glossary-writer.js +121 -0
- package/dist/browser/modules/word/writer/header-footer-writer.js +105 -20
- package/dist/browser/modules/word/writer/math-writer.js +7 -2
- package/dist/browser/utils/font-metrics.d.ts +8 -0
- package/dist/browser/utils/font-metrics.js +43 -0
- package/dist/browser/utils/theme-colors.js +4 -1
- package/dist/cjs/modules/excel/note.js +5 -1
- package/dist/cjs/modules/excel/row.js +35 -2
- package/dist/cjs/modules/excel/stream/workbook-writer.browser.js +22 -2
- package/dist/cjs/modules/excel/utils/drawing-utils.js +19 -2
- package/dist/cjs/modules/excel/workbook.browser.js +31 -1
- package/dist/cjs/modules/excel/worksheet.js +83 -0
- package/dist/cjs/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
- package/dist/cjs/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
- package/dist/cjs/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
- package/dist/cjs/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
- package/dist/cjs/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
- package/dist/cjs/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
- package/dist/cjs/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
- package/dist/cjs/modules/excel/xlsx/xform/drawing/shape-xform.js +112 -0
- package/dist/cjs/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
- package/dist/cjs/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
- package/dist/cjs/modules/pdf/builder/document-builder.js +21 -48
- package/dist/cjs/modules/pdf/builder/pdf-editor.js +1 -1
- package/dist/cjs/modules/pdf/core/pdf-stream.js +38 -2
- package/dist/cjs/modules/pdf/font/font-manager.js +35 -18
- package/dist/cjs/modules/pdf/render/page-renderer.js +112 -18
- package/dist/cjs/modules/word/advanced/field-engine.js +45 -20
- package/dist/cjs/modules/word/advanced/glossary.js +8 -9
- package/dist/cjs/modules/word/advanced/math-convert.js +94 -12
- package/dist/cjs/modules/word/advanced/ole-objects.js +123 -19
- package/dist/cjs/modules/word/advanced/style-map.js +31 -10
- package/dist/cjs/modules/word/builder/run-builders.js +7 -1
- package/dist/cjs/modules/word/constants.js +5 -1
- package/dist/cjs/modules/word/convert/docx-to-semantic.js +135 -1
- package/dist/cjs/modules/word/convert/html/html-import.js +168 -14
- package/dist/cjs/modules/word/convert/html/html.js +2 -1
- package/dist/cjs/modules/word/convert/markdown/markdown-import.js +279 -69
- package/dist/cjs/modules/word/convert/odt/odt.js +407 -56
- package/dist/cjs/modules/word/html.js +2 -1
- package/dist/cjs/modules/word/index.base.js +4 -3
- package/dist/cjs/modules/word/layout/layout-full.js +325 -18
- package/dist/cjs/modules/word/layout/render-page.js +35 -8
- package/dist/cjs/modules/word/query/compat.js +29 -21
- package/dist/cjs/modules/word/reader/docx-reader.js +104 -1
- package/dist/cjs/modules/word/reader/math-parser.js +8 -2
- package/dist/cjs/modules/word/security/cfb-reader.js +5 -5
- package/dist/cjs/modules/word/writer/docx-packager.js +108 -2
- package/dist/cjs/modules/word/writer/glossary-writer.js +124 -0
- package/dist/cjs/modules/word/writer/header-footer-writer.js +105 -20
- package/dist/cjs/modules/word/writer/math-writer.js +7 -2
- package/dist/cjs/utils/font-metrics.js +44 -0
- package/dist/cjs/utils/theme-colors.js +4 -1
- package/dist/esm/modules/excel/note.js +5 -1
- package/dist/esm/modules/excel/row.js +35 -2
- package/dist/esm/modules/excel/stream/workbook-writer.browser.js +22 -2
- package/dist/esm/modules/excel/utils/drawing-utils.js +19 -2
- package/dist/esm/modules/excel/workbook.browser.js +32 -2
- package/dist/esm/modules/excel/worksheet.js +83 -0
- package/dist/esm/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
- package/dist/esm/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
- package/dist/esm/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
- package/dist/esm/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
- package/dist/esm/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
- package/dist/esm/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
- package/dist/esm/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
- package/dist/esm/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
- package/dist/esm/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
- package/dist/esm/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
- package/dist/esm/modules/pdf/builder/document-builder.js +22 -49
- package/dist/esm/modules/pdf/builder/pdf-editor.js +1 -1
- package/dist/esm/modules/pdf/core/pdf-stream.js +38 -2
- package/dist/esm/modules/pdf/font/font-manager.js +35 -18
- package/dist/esm/modules/pdf/render/page-renderer.js +111 -18
- package/dist/esm/modules/word/advanced/field-engine.js +45 -20
- package/dist/esm/modules/word/advanced/glossary.js +8 -9
- package/dist/esm/modules/word/advanced/math-convert.js +94 -12
- package/dist/esm/modules/word/advanced/ole-objects.js +122 -19
- package/dist/esm/modules/word/advanced/style-map.js +31 -10
- package/dist/esm/modules/word/builder/run-builders.js +7 -1
- package/dist/esm/modules/word/constants.js +5 -1
- package/dist/esm/modules/word/convert/docx-to-semantic.js +135 -1
- package/dist/esm/modules/word/convert/html/html-import.js +167 -14
- package/dist/esm/modules/word/convert/html/html.js +1 -1
- package/dist/esm/modules/word/convert/markdown/markdown-import.js +279 -69
- package/dist/esm/modules/word/convert/odt/odt.js +407 -56
- package/dist/esm/modules/word/html.js +1 -1
- package/dist/esm/modules/word/index.base.js +1 -1
- package/dist/esm/modules/word/layout/layout-full.js +326 -19
- package/dist/esm/modules/word/layout/render-page.js +35 -8
- package/dist/esm/modules/word/query/compat.js +29 -21
- package/dist/esm/modules/word/reader/docx-reader.js +105 -2
- package/dist/esm/modules/word/reader/math-parser.js +8 -2
- package/dist/esm/modules/word/security/cfb-reader.js +5 -5
- package/dist/esm/modules/word/writer/docx-packager.js +108 -2
- package/dist/esm/modules/word/writer/glossary-writer.js +121 -0
- package/dist/esm/modules/word/writer/header-footer-writer.js +105 -20
- package/dist/esm/modules/word/writer/math-writer.js +7 -2
- package/dist/esm/utils/font-metrics.js +43 -0
- package/dist/esm/utils/theme-colors.js +4 -1
- package/dist/iife/excelts.iife.js +496 -59
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +39 -39
- package/dist/types/modules/excel/cell.d.ts +4 -0
- package/dist/types/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
- package/dist/types/modules/excel/types.d.ts +81 -0
- package/dist/types/modules/excel/utils/drawing-utils.d.ts +8 -0
- package/dist/types/modules/excel/workbook.browser.d.ts +16 -0
- package/dist/types/modules/excel/worksheet.d.ts +31 -1
- package/dist/types/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
- package/dist/types/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
- package/dist/types/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
- package/dist/types/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
- package/dist/types/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
- package/dist/types/modules/pdf/core/pdf-stream.d.ts +28 -1
- package/dist/types/modules/pdf/font/font-manager.d.ts +26 -0
- package/dist/types/modules/pdf/render/page-renderer.d.ts +51 -3
- package/dist/types/modules/word/advanced/glossary.d.ts +10 -36
- package/dist/types/modules/word/advanced/ole-objects.d.ts +28 -0
- package/dist/types/modules/word/builder/run-builders.d.ts +7 -1
- package/dist/types/modules/word/constants.d.ts +4 -0
- package/dist/types/modules/word/convert/docx-to-semantic.d.ts +2 -1
- package/dist/types/modules/word/convert/html/html-import.d.ts +32 -1
- package/dist/types/modules/word/convert/html/html.d.ts +2 -2
- package/dist/types/modules/word/convert/markdown/markdown-import.d.ts +48 -18
- package/dist/types/modules/word/convert/markdown/markdown.d.ts +1 -1
- package/dist/types/modules/word/html.d.ts +2 -2
- package/dist/types/modules/word/index.base.d.ts +3 -3
- package/dist/types/modules/word/markdown.d.ts +1 -1
- package/dist/types/modules/word/query/compat.d.ts +10 -2
- package/dist/types/modules/word/types.d.ts +96 -1
- package/dist/types/modules/word/writer/glossary-writer.d.ts +28 -0
- package/dist/types/utils/font-metrics.d.ts +8 -0
- package/package.json +3 -1
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
* - Hyperlink extraction
|
|
12
12
|
* - Image registration into ConversionContext
|
|
13
13
|
* - Table structure with merge (colSpan/rowSpan)
|
|
14
|
-
* - List/numbering detection
|
|
14
|
+
* - List/numbering detection: consecutive numbered paragraphs are aggregated
|
|
15
|
+
* into ordered/unordered `list` blocks with nested sub-lists by level
|
|
15
16
|
* - Footnote/endnote reference and content
|
|
16
17
|
* - Math content (text fallback)
|
|
17
18
|
*/
|
|
@@ -93,6 +94,26 @@ function convertBodyContent(body, doc, ctx, imageMap) {
|
|
|
93
94
|
const item = body[bodyIndex];
|
|
94
95
|
switch (item.type) {
|
|
95
96
|
case "paragraph":
|
|
97
|
+
// A run of consecutive list-item paragraphs (each carrying a
|
|
98
|
+
// numbering reference, and not a heading) is aggregated into a single
|
|
99
|
+
// semantic `list` block with nested sub-lists driven by the numbering
|
|
100
|
+
// level. This is what turns Word numbering into real <ul>/<ol> in
|
|
101
|
+
// HTML and `-`/`1.` markers in Markdown when downstream renderers
|
|
102
|
+
// consume the IR.
|
|
103
|
+
if (isListItemParagraph(item)) {
|
|
104
|
+
let end = bodyIndex;
|
|
105
|
+
while (end < body.length) {
|
|
106
|
+
const next = body[end];
|
|
107
|
+
if (next.type !== "paragraph" || !isListItemParagraph(next)) {
|
|
108
|
+
break;
|
|
109
|
+
}
|
|
110
|
+
end++;
|
|
111
|
+
}
|
|
112
|
+
const listParas = body.slice(bodyIndex, end);
|
|
113
|
+
blocks.push(...buildListBlocks(listParas, doc, ctx, imageMap));
|
|
114
|
+
bodyIndex = end - 1; // loop's ++ advances past the consumed run
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
96
117
|
blocks.push(convertParagraph(item, doc, ctx, imageMap));
|
|
97
118
|
break;
|
|
98
119
|
case "table":
|
|
@@ -240,6 +261,119 @@ function convertBodyContent(body, doc, ctx, imageMap) {
|
|
|
240
261
|
return blocks;
|
|
241
262
|
}
|
|
242
263
|
// =============================================================================
|
|
264
|
+
// Internal: List Aggregation
|
|
265
|
+
// =============================================================================
|
|
266
|
+
/**
 * Whether a body item should be rendered as a list item.
 *
 * An item qualifies when it is a paragraph that carries a numbering reference
 * and is not detected as a heading (a numbered heading stays a heading).
 *
 * A `null` numbering reference is treated exactly like a missing one. The
 * list builder stops at the first paragraph whose numbering is falsy, while
 * the body loop still skips the whole run it collected; admitting such a
 * paragraph here would therefore drop it, and everything after it in the
 * same run, from the output.
 *
 * @param item - Body content item; only `type` and `properties.numbering`
 *   are read directly here.
 * @returns `true` when the item should be aggregated into a `list` block.
 */
function isListItemParagraph(item) {
    if (item.type !== "paragraph") {
        return false;
    }
    // `== null` deliberately matches both `undefined` and `null`.
    if (item.properties?.numbering == null) {
        return false;
    }
    return detectHeadingLevel(item) === null;
}
|
|
277
|
+
/**
 * Resolve a numbering reference to its number format string (e.g. "decimal",
 * "bullet").
 *
 * The lookup goes numbering instance (by `numId`) → abstract numbering (by
 * the instance's `abstractNumId`) → level definition (by `level`). Any link
 * that cannot be resolved — including an abstract numbering that carries no
 * `levels` array — yields the default "bullet" instead of throwing.
 *
 * @param doc - Document model; `numberingInstances` and `abstractNumberings`
 *   are read and both may be absent.
 * @param numId - Numbering instance id referenced by the paragraph.
 * @param level - Numbering level of the paragraph.
 * @returns The level's `format`, or "bullet" when it cannot be resolved.
 */
function getNumberingFormat(doc, numId, level) {
    const instance = doc.numberingInstances?.find(n => n.numId === numId);
    if (!instance) {
        return "bullet";
    }
    const abstractNum = doc.abstractNumberings?.find(a => a.abstractNumId === instance.abstractNumId);
    // `levels` may be missing on a partially populated definition; treat that
    // the same as "level not found" rather than raising a TypeError.
    const levelDef = abstractNum?.levels?.find(l => l.level === level);
    return levelDef?.format ?? "bullet";
}
|
|
295
|
+
/**
 * Classify a number format as ordered or unordered.
 *
 * Only "bullet" and "none" count as unordered; every other format string
 * denotes an ordered list.
 */
function isOrderedFormat(format) {
    const unordered = format === "bullet" || format === "none";
    return !unordered;
}
|
|
299
|
+
/**
 * Turn a contiguous run of list-item paragraphs into semantic `list` blocks.
 *
 * This is the entry point for list aggregation: it starts the level builder
 * at the first paragraph (index 0) and at numbering level 0, and returns only
 * the produced blocks, discarding the "next index" bookkeeping.
 */
function buildListBlocks(paras, doc, ctx, imageMap) {
    const firstIndex = 0;
    const topLevel = 0;
    return buildListLevel(paras, firstIndex, topLevel, doc, ctx, imageMap).blocks;
}
|
|
310
|
+
/**
 * Consume paragraphs starting at `start` that belong to `level` (or deeper),
 * emitting sibling `list` blocks for this level.
 *
 * Deeper-level paragraphs are handled by recursion and folded into items of
 * this level as `subList`s. Every nested list the recursion returns is kept:
 * the first one goes onto the most recent item when that item has no
 * `subList` yet; any further nested list (for example after an
 * ordered/unordered switch inside the nested run, or a second nested run
 * under the same parent) is hosted by a synthesised empty item so that no
 * content is dropped or overwritten.
 *
 * @param paras - Contiguous run of list-item paragraphs.
 * @param start - Index in `paras` at which to begin consuming.
 * @param level - Numbering level this call is responsible for.
 * @param doc - Document model, used to resolve numbering formats.
 * @param ctx - Conversion context, forwarded to the paragraph converter.
 * @param imageMap - Image lookup, forwarded to the paragraph converter.
 * @returns `{ blocks, next }`: the produced blocks and the index of the first
 *   paragraph that no longer belongs to this level.
 */
function buildListLevel(paras, start, level, doc, ctx, imageMap) {
    const blocks = [];
    let i = start;
    let currentOrdered = null;
    let items = [];
    // Emit the items collected so far as one list block and start a new one.
    const flush = () => {
        if (items.length > 0 && currentOrdered !== null) {
            blocks.push({ type: "list", ordered: currentOrdered, items });
            items = [];
        }
    };
    // Attach every nested list to an item of this level without losing any.
    const attachSubLists = (subLists) => {
        for (const subList of subLists) {
            const last = items[items.length - 1];
            if (last !== undefined && last.subList === undefined) {
                items[items.length - 1] = { ...last, subList };
                continue;
            }
            // Either there is no parent item at all, or the parent already
            // hosts a nested list: synthesise an empty item to carry this one.
            if (currentOrdered === null) {
                currentOrdered = subList.ordered;
            }
            items.push({ children: [], subList });
        }
    };
    while (i < paras.length) {
        const para = paras[i];
        const num = para.properties?.numbering;
        // Defensive: callers only pass list-item paragraphs, but guard anyway.
        if (!num) {
            break;
        }
        if (num.level < level) {
            // Belongs to a shallower list — let the caller handle it.
            break;
        }
        if (num.level > level) {
            // The recursive call starts on a paragraph at exactly `num.level`,
            // so it always consumes at least one paragraph and `next > i`.
            const { blocks: subBlocks, next } = buildListLevel(paras, i, num.level, doc, ctx, imageMap);
            attachSubLists(subBlocks);
            i = next;
            continue;
        }
        // num.level === level
        const format = getNumberingFormat(doc, num.numId, num.level);
        const ordered = isOrderedFormat(format);
        if (currentOrdered === null) {
            currentOrdered = ordered;
        }
        else if (ordered !== currentOrdered) {
            // Ordered/unordered switch at the same level → start a new sibling list.
            flush();
            currentOrdered = ordered;
        }
        const children = convertParagraphChildren(para.children, doc, ctx, imageMap);
        items.push({ children });
        i++;
    }
    flush();
    return { blocks, next: i };
}
|
|
376
|
+
// =============================================================================
|
|
243
377
|
// Internal: Paragraph Conversion
|
|
244
378
|
// =============================================================================
|
|
245
379
|
function convertParagraph(para, doc, ctx, imageMap) {
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
* const buffer = await toBuffer(Document.build(h));
|
|
21
21
|
* ```
|
|
22
22
|
*/
|
|
23
|
+
import { base64ToUint8Array } from "../../../../utils/utils.js";
|
|
23
24
|
import { sanitizeUrl } from "../../core/internal-utils.js";
|
|
24
25
|
import { EMU_PER_PX } from "../../units.js";
|
|
25
26
|
/**
|
|
@@ -44,10 +45,7 @@ export function htmlToDocxBody(html, options) {
|
|
|
44
45
|
const tokens = tokenize(html);
|
|
45
46
|
// Extract <style> rules and merge with user-provided classStyles
|
|
46
47
|
const extractedStyles = extractStyleRules(tokens);
|
|
47
|
-
const classStyles = {
|
|
48
|
-
...extractedStyles,
|
|
49
|
-
...(options?.classStyles ?? {})
|
|
50
|
-
};
|
|
48
|
+
const classStyles = mergeClassStyles(extractedStyles, options?.classStyles ?? {});
|
|
51
49
|
// Seed the inline context with the caller-supplied defaults so plain text
|
|
52
50
|
// runs actually carry the requested font/size. Without this the options
|
|
53
51
|
// were effectively ignored.
|
|
@@ -61,6 +59,40 @@ export function htmlToDocxBody(html, options) {
|
|
|
61
59
|
parseBlocks(tokens, 0, blocks, initialCtx, classStyles);
|
|
62
60
|
return blocks;
|
|
63
61
|
}
|
|
62
|
+
/**
 * Convert an HTML string into DOCX body content together with the images it
 * embeds.
 *
 * In contrast to {@link htmlToDocxBody}, the parse context created here
 * carries an image sink (`imageSink`). Image runs that can be decoded from
 * base64 `data:` URLs register their bytes in that sink and reference the
 * rId assigned there. Merge the returned `images` into your document model so
 * the pictures are embedded rather than left as placeholders.
 *
 * @example
 * ```ts
 * const { body, images } = htmlToDocx(html);
 * const doc = Document.create();
 * for (const item of body) Document.addContent(doc, item);
 * const built = Document.build(doc);
 * const final = { ...built, images: [...(built.images ?? []), ...images] };
 * const bytes = await toBuffer(final);
 * ```
 *
 * @param html - HTML source to convert.
 * @param options - Optional settings; `classStyles`, `defaultFont` and
 *   `defaultFontSize` are read.
 * @returns `{ body, images }`: the parsed body content and the collected
 *   image definitions.
 */
export function htmlToDocx(html, options) {
    const tokens = tokenize(html);
    // <style> rules from the markup come first; caller-supplied class styles
    // are merged on top of them.
    const styleRules = extractStyleRules(tokens);
    const classStyles = mergeClassStyles(styleRules, options?.classStyles ?? {});
    const images = [];
    const body = [];
    // Seed the inline context with the image sink and any caller defaults.
    const rootCtx = { imageSink: images };
    const font = options?.defaultFont;
    if (font) {
        rootCtx.fontFamily = font;
    }
    const fontSize = options?.defaultFontSize;
    if (fontSize !== undefined) {
        rootCtx.fontSize = fontSize;
    }
    parseBlocks(tokens, 0, body, rootCtx, classStyles);
    return { body, images };
}
|
|
64
96
|
function tokenize(html) {
|
|
65
97
|
const tokens = [];
|
|
66
98
|
// Strip HTML comments, doctype declarations and SGML processing
|
|
@@ -473,6 +505,22 @@ function extractStyleRules(tokens) {
|
|
|
473
505
|
}
|
|
474
506
|
return result;
|
|
475
507
|
}
|
|
508
|
+
/**
 * Merge two class→style maps.
 *
 * For classes present in both maps the declarations are concatenated
 * (extracted `<style>` rules first, caller-supplied overrides last), so the
 * later source wins per CSS cascade while properties declared only by the
 * other source are preserved. A plain `{ ...a, ...b }` would discard the
 * extracted rule entirely whenever the caller supplies the same class name.
 *
 * Only own properties count as "already present": a class that happens to be
 * named like an `Object.prototype` member (`toString`, `constructor`, …) must
 * not be concatenated with the inherited function.
 *
 * @param extracted - Class→declaration map taken from `<style>` rules.
 * @param overrides - Caller-supplied class→declaration map.
 * @returns A new merged map; neither input is mutated.
 */
function mergeClassStyles(extracted, overrides) {
    const merged = { ...extracted };
    for (const [name, style] of Object.entries(overrides)) {
        const existing = Object.prototype.hasOwnProperty.call(merged, name)
            ? merged[name]
            : undefined;
        merged[name] = existing ? `${existing}; ${style}` : style;
    }
    return merged;
}
|
|
476
524
|
/**
|
|
477
525
|
* Parse HTML-style attributes from the inside of a start tag, e.g.
|
|
478
526
|
* `class="x" id='y' disabled href=foo`.
|
|
@@ -1133,6 +1181,17 @@ function parseBlocks(tokens, start, blocks, parentCtx, classStyles) {
|
|
|
1133
1181
|
return i + 1; // consumed the close tag
|
|
1134
1182
|
}
|
|
1135
1183
|
if (tok.type === "text") {
|
|
1184
|
+
// In block context, text nodes that are pure inter-element whitespace
|
|
1185
|
+
// (the newlines/indentation between block tags in pretty-printed HTML)
|
|
1186
|
+
// carry no content and must be ignored — otherwise every gap between
|
|
1187
|
+
// <p>/<table>/<div> tags would emit a spurious empty paragraph (and
|
|
1188
|
+
// the contained newline would be rendered as a <w:br/> soft break).
|
|
1189
|
+
// Whitespace that sits between inline runs is preserved by the inline
|
|
1190
|
+
// parser, which handles it separately.
|
|
1191
|
+
if (tok.value.trim() === "") {
|
|
1192
|
+
i++;
|
|
1193
|
+
continue;
|
|
1194
|
+
}
|
|
1136
1195
|
if (!pendingInline) {
|
|
1137
1196
|
pendingInline = { runs: [], ctx: parentCtx };
|
|
1138
1197
|
}
|
|
@@ -1461,7 +1520,7 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
|
|
|
1461
1520
|
runs.push({ content: [{ type: "break" }] });
|
|
1462
1521
|
}
|
|
1463
1522
|
else if (tag === "img") {
|
|
1464
|
-
const imgContent = buildImageContent(tok.attrs);
|
|
1523
|
+
const imgContent = buildImageContent(tok.attrs, ctx);
|
|
1465
1524
|
if (imgContent) {
|
|
1466
1525
|
runs.push({ content: [imgContent] });
|
|
1467
1526
|
}
|
|
@@ -1534,13 +1593,15 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
|
|
|
1534
1593
|
i++;
|
|
1535
1594
|
}
|
|
1536
1595
|
else if (t.type === "close") {
|
|
1596
|
+
// Mismatched close tag — close the hyperlink here but do NOT
|
|
1597
|
+
// consume the token; let the caller handle the block boundary.
|
|
1537
1598
|
const hyperlink = {
|
|
1538
1599
|
type: "hyperlink",
|
|
1539
1600
|
url: safeHref ?? "",
|
|
1540
1601
|
children: innerRuns
|
|
1541
1602
|
};
|
|
1542
1603
|
runs.push(hyperlink);
|
|
1543
|
-
return i
|
|
1604
|
+
return i;
|
|
1544
1605
|
}
|
|
1545
1606
|
else {
|
|
1546
1607
|
const childRuns = [];
|
|
@@ -1581,7 +1642,12 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
|
|
|
1581
1642
|
i++;
|
|
1582
1643
|
}
|
|
1583
1644
|
else if (t.type === "close") {
|
|
1584
|
-
|
|
1645
|
+
// Mismatched close tag (e.g. </p> while inside an unclosed <strong>).
|
|
1646
|
+
// Do NOT consume it — return the current index so the caller can
|
|
1647
|
+
// handle it. Consuming a block-level close here would swallow the
|
|
1648
|
+
// parent paragraph boundary and pull all following block content
|
|
1649
|
+
// into this run, breaking page breaks, tables, etc.
|
|
1650
|
+
return i;
|
|
1585
1651
|
}
|
|
1586
1652
|
else {
|
|
1587
1653
|
i = parseInlineTag(tokens, i, runs, newCtx, classStyles);
|
|
@@ -1660,6 +1726,15 @@ function parseListItem(tokens, start, blocks, ctx, ordered, level, classStyles)
|
|
|
1660
1726
|
}
|
|
1661
1727
|
// Text content
|
|
1662
1728
|
if (tok.type === "text") {
|
|
1729
|
+
// Skip structural whitespace: the indentation/newlines that sit between
|
|
1730
|
+
// a nested <ul>/<ol> and the closing </li> (or at the very start of the
|
|
1731
|
+
// item) are not real content. Emitting them as runs would otherwise
|
|
1732
|
+
// produce a spurious empty list-item paragraph. Whitespace *between*
|
|
1733
|
+
// real inline content is preserved because `children` is non-empty then.
|
|
1734
|
+
if (tok.value.trim() === "" && children.length === 0) {
|
|
1735
|
+
i++;
|
|
1736
|
+
continue;
|
|
1737
|
+
}
|
|
1663
1738
|
children.push(makeRun(tok.value, ctx));
|
|
1664
1739
|
i++;
|
|
1665
1740
|
continue;
|
|
@@ -2064,7 +2139,7 @@ function mapCssBorderStyle(cssStyle) {
|
|
|
2064
2139
|
// Image content builder
|
|
2065
2140
|
// =============================================================================
|
|
2066
2141
|
/** Build InlineImageContent from img attributes or return undefined if not applicable. */
|
|
2067
|
-
function buildImageContent(attrs) {
|
|
2142
|
+
function buildImageContent(attrs, ctx) {
|
|
2068
2143
|
const src = attrs["src"] || "";
|
|
2069
2144
|
const alt = attrs["alt"] || "";
|
|
2070
2145
|
// Parse width/height from attributes first, then fall back to style
|
|
@@ -2083,11 +2158,36 @@ function buildImageContent(attrs) {
|
|
|
2083
2158
|
// Convert pixels to EMU
|
|
2084
2159
|
const widthEmu = (width || 100) * EMU_PER_PX;
|
|
2085
2160
|
const heightEmu = (height || 100) * EMU_PER_PX;
|
|
2086
|
-
//
|
|
2087
|
-
//
|
|
2088
|
-
//
|
|
2089
|
-
|
|
2090
|
-
|
|
2161
|
+
// base64 data: URLs can be decoded and embedded as a real media file when
|
|
2162
|
+
// an image sink is provided (htmlToDocx path). The decoded bytes are
|
|
2163
|
+
// registered as an ImageDef and the run references the assigned rId.
|
|
2164
|
+
if (src.startsWith("data:") && ctx?.imageSink) {
|
|
2165
|
+
const decoded = decodeDataUrlImage(src);
|
|
2166
|
+
if (decoded) {
|
|
2167
|
+
const sink = ctx.imageSink;
|
|
2168
|
+
const index = sink.length;
|
|
2169
|
+
const rId = `htmlImg${index}`;
|
|
2170
|
+
const ext = decoded.mediaType === "jpeg" ? "jpg" : decoded.mediaType;
|
|
2171
|
+
sink.push({
|
|
2172
|
+
data: decoded.data,
|
|
2173
|
+
mediaType: decoded.mediaType,
|
|
2174
|
+
fileName: `image_html_${index}.${ext}`,
|
|
2175
|
+
rId
|
|
2176
|
+
});
|
|
2177
|
+
return {
|
|
2178
|
+
type: "image",
|
|
2179
|
+
rId,
|
|
2180
|
+
width: widthEmu,
|
|
2181
|
+
height: heightEmu,
|
|
2182
|
+
altText: alt || undefined,
|
|
2183
|
+
name: alt || `image${index}`
|
|
2184
|
+
};
|
|
2185
|
+
}
|
|
2186
|
+
}
|
|
2187
|
+
// No sink (htmlToDocxBody only returns BodyContent[] and cannot register
|
|
2188
|
+
// media) or an unsupported/remote source: emit a placeholder with an empty
|
|
2189
|
+
// rId. The renderer treats an empty rId as a placeholder; the original src
|
|
2190
|
+
// is surfaced in the alt text so callers can post-process if needed.
|
|
2091
2191
|
if (src.startsWith("data:") || src.startsWith("http://") || src.startsWith("https://")) {
|
|
2092
2192
|
return {
|
|
2093
2193
|
type: "image",
|
|
@@ -2100,6 +2200,54 @@ function buildImageContent(attrs) {
|
|
|
2100
2200
|
}
|
|
2101
2201
|
return undefined;
|
|
2102
2202
|
}
|
|
2203
|
+
/**
 * Decode a base64 `data:image/...` URL into bytes plus a media type.
 *
 * The header (everything between `data:` and the first comma) is split on
 * `;`. The first part must be an `image/<subtype>` media type and the last
 * part must be the `base64` marker. Media-type parameters in between (for
 * example `data:image/svg+xml;charset=utf-8;base64,...`) are accepted and
 * ignored. Whitespace around the header parts and inside the payload is
 * tolerated.
 *
 * @param src - Candidate data URL.
 * @returns `{ data, mediaType }`, or `undefined` when the URL is not a base64
 *   image data URL, the subtype is unsupported, the payload is empty, or
 *   decoding throws.
 */
function decodeDataUrlImage(src) {
    if (!/^data:/i.test(src)) {
        return undefined;
    }
    const comma = src.indexOf(",");
    if (comma === -1) {
        return undefined;
    }
    // "data:" is five characters long; the header ends at the first comma.
    const [type, ...params] = src
        .slice(5, comma)
        .split(";")
        .map(part => part.trim());
    // Only base64 payloads are supported, and the marker is always last.
    if (params.length === 0 || params[params.length - 1].toLowerCase() !== "base64") {
        return undefined;
    }
    const typeMatch = /^image\/([a-z0-9.+-]+)$/i.exec(type);
    if (!typeMatch) {
        return undefined;
    }
    const mediaType = normalizeImageMediaType(typeMatch[1].toLowerCase());
    if (!mediaType) {
        return undefined;
    }
    const b64 = src.slice(comma + 1).replace(/\s+/g, "");
    try {
        const data = base64ToUint8Array(b64);
        if (data.length === 0) {
            return undefined;
        }
        return { data, mediaType };
    }
    catch {
        // Deliberate best-effort: a payload that fails to decode is reported
        // as "not decodable" rather than aborting the whole conversion.
        return undefined;
    }
}
|
|
2227
|
+
/**
 * Map a data-URL image subtype to a supported image media type.
 *
 * Each supported media type is listed with the subtype spellings that resolve
 * to it; any subtype not listed yields `undefined`. Matching is exact, so
 * callers are expected to pass an already lower-cased subtype.
 */
function normalizeImageMediaType(subtype) {
    const spellings = [
        ["png", ["png"]],
        ["jpeg", ["jpeg", "jpg"]],
        ["gif", ["gif"]],
        ["bmp", ["bmp"]],
        ["tiff", ["tiff", "tif"]],
        ["svg", ["svg+xml", "svg"]],
        ["webp", ["webp"]]
    ];
    for (const [mediaType, accepted] of spellings) {
        if (accepted.includes(subtype)) {
            return mediaType;
        }
    }
    return undefined;
}
|
|
2103
2251
|
/** Parse an image dimension from HTML attribute value (number or "Npx"). */
|
|
2104
2252
|
function parseImageDimension(value) {
|
|
2105
2253
|
if (!value) {
|
|
@@ -2228,6 +2376,11 @@ function resolveEffectiveStyle(attrs, classStyles) {
|
|
|
2228
2376
|
// Run builder
|
|
2229
2377
|
// =============================================================================
|
|
2230
2378
|
function makeRun(text, ctx) {
|
|
2379
|
+
// HTML whitespace handling: outside <pre>/<code>, runs of whitespace
|
|
2380
|
+
// (including the newlines/indentation from source-code line wrapping)
|
|
2381
|
+
// collapse to a single space. Inside <pre>/<code> whitespace is
|
|
2382
|
+
// significant and preserved verbatim.
|
|
2383
|
+
const value = ctx.code ? text : text.replace(/\s+/g, " ");
|
|
2231
2384
|
const props = {};
|
|
2232
2385
|
if (ctx.bold) {
|
|
2233
2386
|
props.bold = true;
|
|
@@ -2264,7 +2417,7 @@ function makeRun(text, ctx) {
|
|
|
2264
2417
|
}
|
|
2265
2418
|
const run = {
|
|
2266
2419
|
...(Object.keys(props).length > 0 ? { properties: props } : {}),
|
|
2267
|
-
content: [{ type: "text", text }]
|
|
2420
|
+
content: [{ type: "text", text: value }]
|
|
2268
2421
|
};
|
|
2269
2422
|
return run;
|
|
2270
2423
|
}
|
|
@@ -12,4 +12,4 @@
|
|
|
12
12
|
// HTML → render (DocxDocument → HTML output)
|
|
13
13
|
export { renderToHtml } from "./html-renderer.js";
|
|
14
14
|
// HTML → DOCX import (HTML string → BodyContent[])
|
|
15
|
-
export { htmlToDocxBody } from "./html-import.js";
|
|
15
|
+
export { htmlToDocxBody, htmlToDocx } from "./html-import.js";
|