@cj-tech-master/excelts 9.6.1 → 10.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/README.md +18 -3
  2. package/README_zh.md +18 -3
  3. package/dist/browser/modules/excel/cell.d.ts +4 -0
  4. package/dist/browser/modules/excel/note.js +5 -1
  5. package/dist/browser/modules/excel/row.js +35 -2
  6. package/dist/browser/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
  7. package/dist/browser/modules/excel/stream/workbook-writer.browser.js +22 -2
  8. package/dist/browser/modules/excel/types.d.ts +81 -0
  9. package/dist/browser/modules/excel/utils/drawing-utils.d.ts +8 -0
  10. package/dist/browser/modules/excel/utils/drawing-utils.js +19 -2
  11. package/dist/browser/modules/excel/workbook.browser.d.ts +16 -0
  12. package/dist/browser/modules/excel/workbook.browser.js +32 -2
  13. package/dist/browser/modules/excel/worksheet.d.ts +31 -1
  14. package/dist/browser/modules/excel/worksheet.js +83 -0
  15. package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
  16. package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  17. package/dist/browser/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  18. package/dist/browser/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  19. package/dist/browser/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  20. package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
  21. package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  22. package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
  23. package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  24. package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
  25. package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  26. package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
  27. package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
  28. package/dist/browser/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  29. package/dist/browser/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  30. package/dist/browser/modules/pdf/builder/document-builder.js +22 -49
  31. package/dist/browser/modules/pdf/builder/pdf-editor.js +1 -1
  32. package/dist/browser/modules/pdf/core/pdf-stream.d.ts +28 -1
  33. package/dist/browser/modules/pdf/core/pdf-stream.js +38 -2
  34. package/dist/browser/modules/pdf/font/font-manager.d.ts +26 -0
  35. package/dist/browser/modules/pdf/font/font-manager.js +35 -18
  36. package/dist/browser/modules/pdf/render/page-renderer.d.ts +51 -3
  37. package/dist/browser/modules/pdf/render/page-renderer.js +111 -18
  38. package/dist/browser/modules/word/advanced/field-engine.js +45 -20
  39. package/dist/browser/modules/word/advanced/glossary.d.ts +10 -36
  40. package/dist/browser/modules/word/advanced/glossary.js +8 -9
  41. package/dist/browser/modules/word/advanced/math-convert.js +94 -12
  42. package/dist/browser/modules/word/advanced/ole-objects.d.ts +28 -0
  43. package/dist/browser/modules/word/advanced/ole-objects.js +122 -19
  44. package/dist/browser/modules/word/advanced/style-map.js +31 -10
  45. package/dist/browser/modules/word/builder/run-builders.d.ts +7 -1
  46. package/dist/browser/modules/word/builder/run-builders.js +7 -1
  47. package/dist/browser/modules/word/constants.d.ts +4 -0
  48. package/dist/browser/modules/word/constants.js +5 -1
  49. package/dist/browser/modules/word/convert/docx-to-semantic.d.ts +2 -1
  50. package/dist/browser/modules/word/convert/docx-to-semantic.js +135 -1
  51. package/dist/browser/modules/word/convert/html/html-import.d.ts +32 -1
  52. package/dist/browser/modules/word/convert/html/html-import.js +167 -14
  53. package/dist/browser/modules/word/convert/html/html.d.ts +2 -2
  54. package/dist/browser/modules/word/convert/html/html.js +1 -1
  55. package/dist/browser/modules/word/convert/markdown/markdown-import.d.ts +48 -18
  56. package/dist/browser/modules/word/convert/markdown/markdown-import.js +279 -69
  57. package/dist/browser/modules/word/convert/markdown/markdown.d.ts +1 -1
  58. package/dist/browser/modules/word/convert/odt/odt.js +407 -56
  59. package/dist/browser/modules/word/html.d.ts +2 -2
  60. package/dist/browser/modules/word/html.js +1 -1
  61. package/dist/browser/modules/word/index.base.d.ts +3 -3
  62. package/dist/browser/modules/word/index.base.js +1 -1
  63. package/dist/browser/modules/word/layout/layout-full.js +326 -19
  64. package/dist/browser/modules/word/layout/render-page.js +35 -8
  65. package/dist/browser/modules/word/markdown.d.ts +1 -1
  66. package/dist/browser/modules/word/query/compat.d.ts +10 -2
  67. package/dist/browser/modules/word/query/compat.js +29 -21
  68. package/dist/browser/modules/word/reader/docx-reader.js +105 -2
  69. package/dist/browser/modules/word/reader/math-parser.js +8 -2
  70. package/dist/browser/modules/word/security/cfb-reader.js +5 -5
  71. package/dist/browser/modules/word/types.d.ts +96 -1
  72. package/dist/browser/modules/word/writer/docx-packager.js +108 -2
  73. package/dist/browser/modules/word/writer/glossary-writer.d.ts +28 -0
  74. package/dist/browser/modules/word/writer/glossary-writer.js +121 -0
  75. package/dist/browser/modules/word/writer/header-footer-writer.js +105 -20
  76. package/dist/browser/modules/word/writer/math-writer.js +7 -2
  77. package/dist/browser/utils/font-metrics.d.ts +8 -0
  78. package/dist/browser/utils/font-metrics.js +43 -0
  79. package/dist/browser/utils/theme-colors.js +4 -1
  80. package/dist/cjs/modules/excel/note.js +5 -1
  81. package/dist/cjs/modules/excel/row.js +35 -2
  82. package/dist/cjs/modules/excel/stream/workbook-writer.browser.js +22 -2
  83. package/dist/cjs/modules/excel/utils/drawing-utils.js +19 -2
  84. package/dist/cjs/modules/excel/workbook.browser.js +31 -1
  85. package/dist/cjs/modules/excel/worksheet.js +83 -0
  86. package/dist/cjs/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  87. package/dist/cjs/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  88. package/dist/cjs/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  89. package/dist/cjs/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  90. package/dist/cjs/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  91. package/dist/cjs/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  92. package/dist/cjs/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  93. package/dist/cjs/modules/excel/xlsx/xform/drawing/shape-xform.js +112 -0
  94. package/dist/cjs/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  95. package/dist/cjs/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  96. package/dist/cjs/modules/pdf/builder/document-builder.js +21 -48
  97. package/dist/cjs/modules/pdf/builder/pdf-editor.js +1 -1
  98. package/dist/cjs/modules/pdf/core/pdf-stream.js +38 -2
  99. package/dist/cjs/modules/pdf/font/font-manager.js +35 -18
  100. package/dist/cjs/modules/pdf/render/page-renderer.js +112 -18
  101. package/dist/cjs/modules/word/advanced/field-engine.js +45 -20
  102. package/dist/cjs/modules/word/advanced/glossary.js +8 -9
  103. package/dist/cjs/modules/word/advanced/math-convert.js +94 -12
  104. package/dist/cjs/modules/word/advanced/ole-objects.js +123 -19
  105. package/dist/cjs/modules/word/advanced/style-map.js +31 -10
  106. package/dist/cjs/modules/word/builder/run-builders.js +7 -1
  107. package/dist/cjs/modules/word/constants.js +5 -1
  108. package/dist/cjs/modules/word/convert/docx-to-semantic.js +135 -1
  109. package/dist/cjs/modules/word/convert/html/html-import.js +168 -14
  110. package/dist/cjs/modules/word/convert/html/html.js +2 -1
  111. package/dist/cjs/modules/word/convert/markdown/markdown-import.js +279 -69
  112. package/dist/cjs/modules/word/convert/odt/odt.js +407 -56
  113. package/dist/cjs/modules/word/html.js +2 -1
  114. package/dist/cjs/modules/word/index.base.js +4 -3
  115. package/dist/cjs/modules/word/layout/layout-full.js +325 -18
  116. package/dist/cjs/modules/word/layout/render-page.js +35 -8
  117. package/dist/cjs/modules/word/query/compat.js +29 -21
  118. package/dist/cjs/modules/word/reader/docx-reader.js +104 -1
  119. package/dist/cjs/modules/word/reader/math-parser.js +8 -2
  120. package/dist/cjs/modules/word/security/cfb-reader.js +5 -5
  121. package/dist/cjs/modules/word/writer/docx-packager.js +108 -2
  122. package/dist/cjs/modules/word/writer/glossary-writer.js +124 -0
  123. package/dist/cjs/modules/word/writer/header-footer-writer.js +105 -20
  124. package/dist/cjs/modules/word/writer/math-writer.js +7 -2
  125. package/dist/cjs/utils/font-metrics.js +44 -0
  126. package/dist/cjs/utils/theme-colors.js +4 -1
  127. package/dist/esm/modules/excel/note.js +5 -1
  128. package/dist/esm/modules/excel/row.js +35 -2
  129. package/dist/esm/modules/excel/stream/workbook-writer.browser.js +22 -2
  130. package/dist/esm/modules/excel/utils/drawing-utils.js +19 -2
  131. package/dist/esm/modules/excel/workbook.browser.js +32 -2
  132. package/dist/esm/modules/excel/worksheet.js +83 -0
  133. package/dist/esm/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  134. package/dist/esm/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  135. package/dist/esm/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  136. package/dist/esm/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  137. package/dist/esm/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  138. package/dist/esm/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  139. package/dist/esm/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  140. package/dist/esm/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
  141. package/dist/esm/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  142. package/dist/esm/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  143. package/dist/esm/modules/pdf/builder/document-builder.js +22 -49
  144. package/dist/esm/modules/pdf/builder/pdf-editor.js +1 -1
  145. package/dist/esm/modules/pdf/core/pdf-stream.js +38 -2
  146. package/dist/esm/modules/pdf/font/font-manager.js +35 -18
  147. package/dist/esm/modules/pdf/render/page-renderer.js +111 -18
  148. package/dist/esm/modules/word/advanced/field-engine.js +45 -20
  149. package/dist/esm/modules/word/advanced/glossary.js +8 -9
  150. package/dist/esm/modules/word/advanced/math-convert.js +94 -12
  151. package/dist/esm/modules/word/advanced/ole-objects.js +122 -19
  152. package/dist/esm/modules/word/advanced/style-map.js +31 -10
  153. package/dist/esm/modules/word/builder/run-builders.js +7 -1
  154. package/dist/esm/modules/word/constants.js +5 -1
  155. package/dist/esm/modules/word/convert/docx-to-semantic.js +135 -1
  156. package/dist/esm/modules/word/convert/html/html-import.js +167 -14
  157. package/dist/esm/modules/word/convert/html/html.js +1 -1
  158. package/dist/esm/modules/word/convert/markdown/markdown-import.js +279 -69
  159. package/dist/esm/modules/word/convert/odt/odt.js +407 -56
  160. package/dist/esm/modules/word/html.js +1 -1
  161. package/dist/esm/modules/word/index.base.js +1 -1
  162. package/dist/esm/modules/word/layout/layout-full.js +326 -19
  163. package/dist/esm/modules/word/layout/render-page.js +35 -8
  164. package/dist/esm/modules/word/query/compat.js +29 -21
  165. package/dist/esm/modules/word/reader/docx-reader.js +105 -2
  166. package/dist/esm/modules/word/reader/math-parser.js +8 -2
  167. package/dist/esm/modules/word/security/cfb-reader.js +5 -5
  168. package/dist/esm/modules/word/writer/docx-packager.js +108 -2
  169. package/dist/esm/modules/word/writer/glossary-writer.js +121 -0
  170. package/dist/esm/modules/word/writer/header-footer-writer.js +105 -20
  171. package/dist/esm/modules/word/writer/math-writer.js +7 -2
  172. package/dist/esm/utils/font-metrics.js +43 -0
  173. package/dist/esm/utils/theme-colors.js +4 -1
  174. package/dist/iife/excelts.iife.js +496 -59
  175. package/dist/iife/excelts.iife.js.map +1 -1
  176. package/dist/iife/excelts.iife.min.js +39 -39
  177. package/dist/types/modules/excel/cell.d.ts +4 -0
  178. package/dist/types/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
  179. package/dist/types/modules/excel/types.d.ts +81 -0
  180. package/dist/types/modules/excel/utils/drawing-utils.d.ts +8 -0
  181. package/dist/types/modules/excel/workbook.browser.d.ts +16 -0
  182. package/dist/types/modules/excel/worksheet.d.ts +31 -1
  183. package/dist/types/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
  184. package/dist/types/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
  185. package/dist/types/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
  186. package/dist/types/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
  187. package/dist/types/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
  188. package/dist/types/modules/pdf/core/pdf-stream.d.ts +28 -1
  189. package/dist/types/modules/pdf/font/font-manager.d.ts +26 -0
  190. package/dist/types/modules/pdf/render/page-renderer.d.ts +51 -3
  191. package/dist/types/modules/word/advanced/glossary.d.ts +10 -36
  192. package/dist/types/modules/word/advanced/ole-objects.d.ts +28 -0
  193. package/dist/types/modules/word/builder/run-builders.d.ts +7 -1
  194. package/dist/types/modules/word/constants.d.ts +4 -0
  195. package/dist/types/modules/word/convert/docx-to-semantic.d.ts +2 -1
  196. package/dist/types/modules/word/convert/html/html-import.d.ts +32 -1
  197. package/dist/types/modules/word/convert/html/html.d.ts +2 -2
  198. package/dist/types/modules/word/convert/markdown/markdown-import.d.ts +48 -18
  199. package/dist/types/modules/word/convert/markdown/markdown.d.ts +1 -1
  200. package/dist/types/modules/word/html.d.ts +2 -2
  201. package/dist/types/modules/word/index.base.d.ts +3 -3
  202. package/dist/types/modules/word/markdown.d.ts +1 -1
  203. package/dist/types/modules/word/query/compat.d.ts +10 -2
  204. package/dist/types/modules/word/types.d.ts +96 -1
  205. package/dist/types/modules/word/writer/glossary-writer.d.ts +28 -0
  206. package/dist/types/utils/font-metrics.d.ts +8 -0
  207. package/package.json +3 -1
@@ -11,7 +11,8 @@
11
11
  * - Hyperlink extraction
12
12
  * - Image registration into ConversionContext
13
13
  * - Table structure with merge (colSpan/rowSpan)
14
- * - List/numbering detection (basic)
14
+ * - List/numbering detection: consecutive numbered paragraphs are aggregated
15
+ * into ordered/unordered `list` blocks with nested sub-lists by level
15
16
  * - Footnote/endnote reference and content
16
17
  * - Math content (text fallback)
17
18
  */
@@ -93,6 +94,26 @@ function convertBodyContent(body, doc, ctx, imageMap) {
93
94
  const item = body[bodyIndex];
94
95
  switch (item.type) {
95
96
  case "paragraph":
97
+ // A run of consecutive list-item paragraphs (each carrying a
98
+ // numbering reference, and not a heading) is aggregated into a single
99
+ // semantic `list` block with nested sub-lists driven by the numbering
100
+ // level. This is what turns Word numbering into real <ul>/<ol> in
101
+ // HTML and `-`/`1.` markers in Markdown when downstream renderers
102
+ // consume the IR.
103
+ if (isListItemParagraph(item)) {
104
+ let end = bodyIndex;
105
+ while (end < body.length) {
106
+ const next = body[end];
107
+ if (next.type !== "paragraph" || !isListItemParagraph(next)) {
108
+ break;
109
+ }
110
+ end++;
111
+ }
112
+ const listParas = body.slice(bodyIndex, end);
113
+ blocks.push(...buildListBlocks(listParas, doc, ctx, imageMap));
114
+ bodyIndex = end - 1; // loop's ++ advances past the consumed run
115
+ break;
116
+ }
96
117
  blocks.push(convertParagraph(item, doc, ctx, imageMap));
97
118
  break;
98
119
  case "table":
@@ -240,6 +261,119 @@ function convertBodyContent(body, doc, ctx, imageMap) {
240
261
  return blocks;
241
262
  }
242
263
  // =============================================================================
264
+ // Internal: List Aggregation
265
+ // =============================================================================
266
+ /**
267
+ * Whether a body paragraph should render as a list item: it carries a
268
+ * numbering reference and is not itself a heading (a numbered heading stays a
269
+ * heading, mirroring the markdown/html renderers).
270
+ */
271
+ function isListItemParagraph(item) {
272
+ if (item.type !== "paragraph") {
273
+ return false;
274
+ }
275
+ return item.properties?.numbering !== undefined && detectHeadingLevel(item) === null;
276
+ }
277
+ /**
278
+ * Resolve a numbering reference to its number format string (e.g. "decimal",
279
+ * "bullet"). Mirrors the lookup in the markdown/html renderers so the three
280
+ * surfaces classify ordered vs. unordered lists identically. Defaults to
281
+ * "bullet" when the numbering definition can't be resolved.
282
+ */
283
+ function getNumberingFormat(doc, numId, level) {
284
+ const instance = doc.numberingInstances?.find(n => n.numId === numId);
285
+ if (!instance) {
286
+ return "bullet";
287
+ }
288
+ const abstractNum = doc.abstractNumberings?.find(a => a.abstractNumId === instance.abstractNumId);
289
+ if (!abstractNum) {
290
+ return "bullet";
291
+ }
292
+ const levelDef = abstractNum.levels.find(l => l.level === level);
293
+ return levelDef?.format ?? "bullet";
294
+ }
295
+ /** A number format other than "bullet"/"none" denotes an ordered list. */
296
+ function isOrderedFormat(format) {
297
+ return format !== "bullet" && format !== "none";
298
+ }
299
+ /**
300
+ * Build one or more semantic `list` blocks from a contiguous run of list-item
301
+ * paragraphs. Paragraphs are nested by their numbering `level`; a deeper level
302
+ * becomes a `subList` of the preceding shallower item. Adjacent items that
303
+ * switch between ordered and unordered at the same level start a new sibling
304
+ * list so the ordered/unordered distinction is preserved.
305
+ */
306
+ function buildListBlocks(paras, doc, ctx, imageMap) {
307
+ const { blocks } = buildListLevel(paras, 0, 0, doc, ctx, imageMap);
308
+ return blocks;
309
+ }
310
+ /**
311
+ * Consume paragraphs starting at `start` that belong to `level` (or deeper),
312
+ * emitting sibling lists for this level. Deeper-level paragraphs are folded
313
+ * into the current item's `subList` via recursion. Returns the produced blocks
314
+ * and the index of the first paragraph that no longer belongs to this level.
315
+ */
316
+ function buildListLevel(paras, start, level, doc, ctx, imageMap) {
317
+ const blocks = [];
318
+ let i = start;
319
+ let currentOrdered = null;
320
+ let items = [];
321
+ const flush = () => {
322
+ if (items.length > 0 && currentOrdered !== null) {
323
+ blocks.push({ type: "list", ordered: currentOrdered, items });
324
+ items = [];
325
+ }
326
+ };
327
+ while (i < paras.length) {
328
+ const para = paras[i];
329
+ const num = para.properties?.numbering;
330
+ // Defensive: callers only pass list-item paragraphs, but guard anyway.
331
+ if (!num) {
332
+ break;
333
+ }
334
+ if (num.level < level) {
335
+ // Belongs to a shallower list — let the caller handle it.
336
+ break;
337
+ }
338
+ if (num.level > level) {
339
+ // Deeper item with no shallower parent at this position: descend and
340
+ // attach the nested list to the most recent item, or synthesise an
341
+ // empty item to host it when there is no parent.
342
+ const { blocks: subBlocks, next } = buildListLevel(paras, i, num.level, doc, ctx, imageMap);
343
+ const subList = subBlocks[0];
344
+ if (items.length > 0) {
345
+ const last = items[items.length - 1];
346
+ items[items.length - 1] = { ...last, subList };
347
+ }
348
+ else if (subList) {
349
+ // Promote the deeper list to this level when there is no parent item.
350
+ if (currentOrdered === null && subList.type === "list") {
351
+ currentOrdered = subList.ordered;
352
+ }
353
+ items.push({ children: [], subList });
354
+ }
355
+ i = next;
356
+ continue;
357
+ }
358
+ // num.level === level
359
+ const format = getNumberingFormat(doc, num.numId, num.level);
360
+ const ordered = isOrderedFormat(format);
361
+ if (currentOrdered === null) {
362
+ currentOrdered = ordered;
363
+ }
364
+ else if (ordered !== currentOrdered) {
365
+ // Ordered/unordered switch at the same level → start a new sibling list.
366
+ flush();
367
+ currentOrdered = ordered;
368
+ }
369
+ const children = convertParagraphChildren(para.children, doc, ctx, imageMap);
370
+ items.push({ children });
371
+ i++;
372
+ }
373
+ flush();
374
+ return { blocks, next: i };
375
+ }
376
+ // =============================================================================
243
377
  // Internal: Paragraph Conversion
244
378
  // =============================================================================
245
379
  function convertParagraph(para, doc, ctx, imageMap) {
@@ -20,6 +20,7 @@
20
20
  * const buffer = await toBuffer(Document.build(h));
21
21
  * ```
22
22
  */
23
+ import { base64ToUint8Array } from "../../../../utils/utils.js";
23
24
  import { sanitizeUrl } from "../../core/internal-utils.js";
24
25
  import { EMU_PER_PX } from "../../units.js";
25
26
  /**
@@ -44,10 +45,7 @@ export function htmlToDocxBody(html, options) {
44
45
  const tokens = tokenize(html);
45
46
  // Extract <style> rules and merge with user-provided classStyles
46
47
  const extractedStyles = extractStyleRules(tokens);
47
- const classStyles = {
48
- ...extractedStyles,
49
- ...(options?.classStyles ?? {})
50
- };
48
+ const classStyles = mergeClassStyles(extractedStyles, options?.classStyles ?? {});
51
49
  // Seed the inline context with the caller-supplied defaults so plain text
52
50
  // runs actually carry the requested font/size. Without this the options
53
51
  // were effectively ignored.
@@ -61,6 +59,40 @@ export function htmlToDocxBody(html, options) {
61
59
  parseBlocks(tokens, 0, blocks, initialCtx, classStyles);
62
60
  return blocks;
63
61
  }
62
+ /**
63
+ * Convert an HTML string into DOCX body content **and** embedded images.
64
+ *
65
+ * Unlike {@link htmlToDocxBody}, this decodes base64 `data:` image URLs into
66
+ * real {@link ImageDef}s and assigns each a unique rId that the emitted image
67
+ * runs reference. Merge the returned `images` into your document model so the
68
+ * pictures are embedded rather than dropped as placeholders.
69
+ *
70
+ * @example
71
+ * ```ts
72
+ * const { body, images } = htmlToDocx(html);
73
+ * const doc = Document.create();
74
+ * for (const item of body) Document.addContent(doc, item);
75
+ * const built = Document.build(doc);
76
+ * const final = { ...built, images: [...(built.images ?? []), ...images] };
77
+ * const bytes = await toBuffer(final);
78
+ * ```
79
+ */
80
+ export function htmlToDocx(html, options) {
81
+ const blocks = [];
82
+ const tokens = tokenize(html);
83
+ const extractedStyles = extractStyleRules(tokens);
84
+ const classStyles = mergeClassStyles(extractedStyles, options?.classStyles ?? {});
85
+ const images = [];
86
+ const initialCtx = { imageSink: images };
87
+ if (options?.defaultFont) {
88
+ initialCtx.fontFamily = options.defaultFont;
89
+ }
90
+ if (options?.defaultFontSize !== undefined) {
91
+ initialCtx.fontSize = options.defaultFontSize;
92
+ }
93
+ parseBlocks(tokens, 0, blocks, initialCtx, classStyles);
94
+ return { body: blocks, images };
95
+ }
64
96
  function tokenize(html) {
65
97
  const tokens = [];
66
98
  // Strip HTML comments, doctype declarations and SGML processing
@@ -473,6 +505,22 @@ function extractStyleRules(tokens) {
473
505
  }
474
506
  return result;
475
507
  }
508
+ /**
509
+ * Merge two class→style maps. For classes present in both, the declarations
510
+ * are concatenated (extracted `<style>` rules first, caller-supplied overrides
511
+ * last) so the later source wins per CSS cascade while still preserving
512
+ * properties only declared by the other source. A plain `{ ...a, ...b }`
513
+ * would discard the extracted rule entirely whenever the caller supplies the
514
+ * same class name, silently dropping e.g. `font-style`/`color` from `<style>`.
515
+ */
516
+ function mergeClassStyles(extracted, overrides) {
517
+ const merged = { ...extracted };
518
+ for (const [name, style] of Object.entries(overrides)) {
519
+ const existing = merged[name];
520
+ merged[name] = existing ? `${existing}; ${style}` : style;
521
+ }
522
+ return merged;
523
+ }
476
524
  /**
477
525
  * Parse HTML-style attributes from the inside of a start tag, e.g.
478
526
  * `class="x" id='y' disabled href=foo`.
@@ -1133,6 +1181,17 @@ function parseBlocks(tokens, start, blocks, parentCtx, classStyles) {
1133
1181
  return i + 1; // consumed the close tag
1134
1182
  }
1135
1183
  if (tok.type === "text") {
1184
+ // In block context, text nodes that are pure inter-element whitespace
1185
+ // (the newlines/indentation between block tags in pretty-printed HTML)
1186
+ // carry no content and must be ignored — otherwise every gap between
1187
+ // <p>/<table>/<div> tags would emit a spurious empty paragraph (and
1188
+ // the contained newline would be rendered as a <w:br/> soft break).
1189
+ // Whitespace that sits between inline runs is preserved by the inline
1190
+ // parser, which handles it separately.
1191
+ if (tok.value.trim() === "") {
1192
+ i++;
1193
+ continue;
1194
+ }
1136
1195
  if (!pendingInline) {
1137
1196
  pendingInline = { runs: [], ctx: parentCtx };
1138
1197
  }
@@ -1461,7 +1520,7 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1461
1520
  runs.push({ content: [{ type: "break" }] });
1462
1521
  }
1463
1522
  else if (tag === "img") {
1464
- const imgContent = buildImageContent(tok.attrs);
1523
+ const imgContent = buildImageContent(tok.attrs, ctx);
1465
1524
  if (imgContent) {
1466
1525
  runs.push({ content: [imgContent] });
1467
1526
  }
@@ -1534,13 +1593,15 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1534
1593
  i++;
1535
1594
  }
1536
1595
  else if (t.type === "close") {
1596
+ // Mismatched close tag — close the hyperlink here but do NOT
1597
+ // consume the token; let the caller handle the block boundary.
1537
1598
  const hyperlink = {
1538
1599
  type: "hyperlink",
1539
1600
  url: safeHref ?? "",
1540
1601
  children: innerRuns
1541
1602
  };
1542
1603
  runs.push(hyperlink);
1543
- return i + 1;
1604
+ return i;
1544
1605
  }
1545
1606
  else {
1546
1607
  const childRuns = [];
@@ -1581,7 +1642,12 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1581
1642
  i++;
1582
1643
  }
1583
1644
  else if (t.type === "close") {
1584
- return i + 1;
1645
+ // Mismatched close tag (e.g. </p> while inside an unclosed <strong>).
1646
+ // Do NOT consume it — return the current index so the caller can
1647
+ // handle it. Consuming a block-level close here would swallow the
1648
+ // parent paragraph boundary and pull all following block content
1649
+ // into this run, breaking page breaks, tables, etc.
1650
+ return i;
1585
1651
  }
1586
1652
  else {
1587
1653
  i = parseInlineTag(tokens, i, runs, newCtx, classStyles);
@@ -1660,6 +1726,15 @@ function parseListItem(tokens, start, blocks, ctx, ordered, level, classStyles)
1660
1726
  }
1661
1727
  // Text content
1662
1728
  if (tok.type === "text") {
1729
+ // Skip structural whitespace: the indentation/newlines that sit between
1730
+ // a nested <ul>/<ol> and the closing </li> (or at the very start of the
1731
+ // item) are not real content. Emitting them as runs would otherwise
1732
+ // produce a spurious empty list-item paragraph. Whitespace *between*
1733
+ // real inline content is preserved because `children` is non-empty then.
1734
+ if (tok.value.trim() === "" && children.length === 0) {
1735
+ i++;
1736
+ continue;
1737
+ }
1663
1738
  children.push(makeRun(tok.value, ctx));
1664
1739
  i++;
1665
1740
  continue;
@@ -2064,7 +2139,7 @@ function mapCssBorderStyle(cssStyle) {
2064
2139
  // Image content builder
2065
2140
  // =============================================================================
2066
2141
  /** Build InlineImageContent from img attributes or return undefined if not applicable. */
2067
- function buildImageContent(attrs) {
2142
+ function buildImageContent(attrs, ctx) {
2068
2143
  const src = attrs["src"] || "";
2069
2144
  const alt = attrs["alt"] || "";
2070
2145
  // Parse width/height from attributes first, then fall back to style
@@ -2083,11 +2158,36 @@ function buildImageContent(attrs) {
2083
2158
  // Convert pixels to EMU
2084
2159
  const widthEmu = (width || 100) * EMU_PER_PX;
2085
2160
  const heightEmu = (height || 100) * EMU_PER_PX;
2086
- // Both data: and http(s) URLs become placeholders. The DOCX writer needs
2087
- // a real ImageDef registered in `doc.images` plus a corresponding
2088
- // relationship; htmlToDocxBody returns BodyContent[] only and cannot do
2089
- // that registration. We surface the original src in the alt text so the
2090
- // user can post-process if they need real embedded images.
2161
+ // base64 data: URLs can be decoded and embedded as a real media file when
2162
+ // an image sink is provided (htmlToDocx path). The decoded bytes are
2163
+ // registered as an ImageDef and the run references the assigned rId.
2164
+ if (src.startsWith("data:") && ctx?.imageSink) {
2165
+ const decoded = decodeDataUrlImage(src);
2166
+ if (decoded) {
2167
+ const sink = ctx.imageSink;
2168
+ const index = sink.length;
2169
+ const rId = `htmlImg${index}`;
2170
+ const ext = decoded.mediaType === "jpeg" ? "jpg" : decoded.mediaType;
2171
+ sink.push({
2172
+ data: decoded.data,
2173
+ mediaType: decoded.mediaType,
2174
+ fileName: `image_html_${index}.${ext}`,
2175
+ rId
2176
+ });
2177
+ return {
2178
+ type: "image",
2179
+ rId,
2180
+ width: widthEmu,
2181
+ height: heightEmu,
2182
+ altText: alt || undefined,
2183
+ name: alt || `image${index}`
2184
+ };
2185
+ }
2186
+ }
2187
+ // No sink (htmlToDocxBody only returns BodyContent[] and cannot register
2188
+ // media) or an unsupported/remote source: emit a placeholder with an empty
2189
+ // rId. The renderer treats an empty rId as a placeholder; the original src
2190
+ // is surfaced in the alt text so callers can post-process if needed.
2091
2191
  if (src.startsWith("data:") || src.startsWith("http://") || src.startsWith("https://")) {
2092
2192
  return {
2093
2193
  type: "image",
@@ -2100,6 +2200,54 @@ function buildImageContent(attrs) {
2100
2200
  }
2101
2201
  return undefined;
2102
2202
  }
2203
+ /** Decode a `data:image/...;base64,...` URL into bytes + media type. */
2204
+ function decodeDataUrlImage(src) {
2205
+ // data:image/png;base64,XXXX
2206
+ const match = /^data:image\/([a-z0-9.+-]+)\s*;\s*base64\s*,(.*)$/is.exec(src);
2207
+ if (!match) {
2208
+ return undefined;
2209
+ }
2210
+ const rawType = match[1].toLowerCase();
2211
+ const b64 = match[2].replace(/\s+/g, "");
2212
+ const mediaType = normalizeImageMediaType(rawType);
2213
+ if (!mediaType) {
2214
+ return undefined;
2215
+ }
2216
+ try {
2217
+ const data = base64ToUint8Array(b64);
2218
+ if (data.length === 0) {
2219
+ return undefined;
2220
+ }
2221
+ return { data, mediaType };
2222
+ }
2223
+ catch {
2224
+ return undefined;
2225
+ }
2226
+ }
2227
+ /** Map a data-URL image subtype to a supported ImageMediaType. */
2228
+ function normalizeImageMediaType(subtype) {
2229
+ switch (subtype) {
2230
+ case "png":
2231
+ return "png";
2232
+ case "jpeg":
2233
+ case "jpg":
2234
+ return "jpeg";
2235
+ case "gif":
2236
+ return "gif";
2237
+ case "bmp":
2238
+ return "bmp";
2239
+ case "tiff":
2240
+ case "tif":
2241
+ return "tiff";
2242
+ case "svg+xml":
2243
+ case "svg":
2244
+ return "svg";
2245
+ case "webp":
2246
+ return "webp";
2247
+ default:
2248
+ return undefined;
2249
+ }
2250
+ }
2103
2251
  /** Parse an image dimension from HTML attribute value (number or "Npx"). */
2104
2252
  function parseImageDimension(value) {
2105
2253
  if (!value) {
@@ -2228,6 +2376,11 @@ function resolveEffectiveStyle(attrs, classStyles) {
2228
2376
  // Run builder
2229
2377
  // =============================================================================
2230
2378
  function makeRun(text, ctx) {
2379
+ // HTML whitespace handling: outside <pre>/<code>, runs of whitespace
2380
+ // (including the newlines/indentation from source-code line wrapping)
2381
+ // collapse to a single space. Inside <pre>/<code> whitespace is
2382
+ // significant and preserved verbatim.
2383
+ const value = ctx.code ? text : text.replace(/\s+/g, " ");
2231
2384
  const props = {};
2232
2385
  if (ctx.bold) {
2233
2386
  props.bold = true;
@@ -2264,7 +2417,7 @@ function makeRun(text, ctx) {
2264
2417
  }
2265
2418
  const run = {
2266
2419
  ...(Object.keys(props).length > 0 ? { properties: props } : {}),
2267
- content: [{ type: "text", text }]
2420
+ content: [{ type: "text", text: value }]
2268
2421
  };
2269
2422
  return run;
2270
2423
  }
@@ -12,4 +12,4 @@
12
12
  // HTML → render (DocxDocument → HTML output)
13
13
  export { renderToHtml } from "./html-renderer.js";
14
14
  // HTML → DOCX import (HTML string → BodyContent[])
15
- export { htmlToDocxBody } from "./html-import.js";
15
+ export { htmlToDocxBody, htmlToDocx } from "./html-import.js";