@cj-tech-master/excelts 9.6.1 → 10.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/README.md +18 -3
  2. package/README_zh.md +18 -3
  3. package/dist/browser/modules/excel/cell.d.ts +4 -0
  4. package/dist/browser/modules/excel/note.js +5 -1
  5. package/dist/browser/modules/excel/row.js +35 -2
  6. package/dist/browser/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
  7. package/dist/browser/modules/excel/stream/workbook-writer.browser.js +22 -2
  8. package/dist/browser/modules/excel/types.d.ts +81 -0
  9. package/dist/browser/modules/excel/utils/drawing-utils.d.ts +8 -0
  10. package/dist/browser/modules/excel/utils/drawing-utils.js +19 -2
  11. package/dist/browser/modules/excel/workbook.browser.d.ts +16 -0
  12. package/dist/browser/modules/excel/workbook.browser.js +32 -2
  13. package/dist/browser/modules/excel/worksheet.d.ts +31 -1
  14. package/dist/browser/modules/excel/worksheet.js +83 -0
  15. package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
  16. package/dist/browser/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  17. package/dist/browser/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  18. package/dist/browser/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  19. package/dist/browser/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  20. package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
  21. package/dist/browser/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  22. package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
  23. package/dist/browser/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  24. package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
  25. package/dist/browser/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  26. package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
  27. package/dist/browser/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
  28. package/dist/browser/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  29. package/dist/browser/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  30. package/dist/browser/modules/pdf/builder/document-builder.js +22 -49
  31. package/dist/browser/modules/pdf/builder/pdf-editor.js +1 -1
  32. package/dist/browser/modules/pdf/core/pdf-stream.d.ts +28 -1
  33. package/dist/browser/modules/pdf/core/pdf-stream.js +38 -2
  34. package/dist/browser/modules/pdf/font/font-manager.d.ts +26 -0
  35. package/dist/browser/modules/pdf/font/font-manager.js +35 -18
  36. package/dist/browser/modules/pdf/render/page-renderer.d.ts +51 -3
  37. package/dist/browser/modules/pdf/render/page-renderer.js +111 -18
  38. package/dist/browser/modules/word/advanced/field-engine.js +45 -20
  39. package/dist/browser/modules/word/advanced/glossary.d.ts +10 -36
  40. package/dist/browser/modules/word/advanced/glossary.js +8 -9
  41. package/dist/browser/modules/word/advanced/math-convert.js +94 -12
  42. package/dist/browser/modules/word/advanced/ole-objects.d.ts +28 -0
  43. package/dist/browser/modules/word/advanced/ole-objects.js +122 -19
  44. package/dist/browser/modules/word/advanced/style-map.js +31 -10
  45. package/dist/browser/modules/word/builder/run-builders.d.ts +7 -1
  46. package/dist/browser/modules/word/builder/run-builders.js +7 -1
  47. package/dist/browser/modules/word/constants.d.ts +4 -0
  48. package/dist/browser/modules/word/constants.js +5 -1
  49. package/dist/browser/modules/word/convert/docx-to-semantic.d.ts +2 -1
  50. package/dist/browser/modules/word/convert/docx-to-semantic.js +135 -1
  51. package/dist/browser/modules/word/convert/html/html-import.d.ts +32 -1
  52. package/dist/browser/modules/word/convert/html/html-import.js +167 -14
  53. package/dist/browser/modules/word/convert/html/html.d.ts +2 -2
  54. package/dist/browser/modules/word/convert/html/html.js +1 -1
  55. package/dist/browser/modules/word/convert/markdown/markdown-import.d.ts +48 -18
  56. package/dist/browser/modules/word/convert/markdown/markdown-import.js +279 -69
  57. package/dist/browser/modules/word/convert/markdown/markdown.d.ts +1 -1
  58. package/dist/browser/modules/word/convert/odt/odt.js +407 -56
  59. package/dist/browser/modules/word/html.d.ts +2 -2
  60. package/dist/browser/modules/word/html.js +1 -1
  61. package/dist/browser/modules/word/index.base.d.ts +3 -3
  62. package/dist/browser/modules/word/index.base.js +1 -1
  63. package/dist/browser/modules/word/layout/layout-full.js +326 -19
  64. package/dist/browser/modules/word/layout/render-page.js +35 -8
  65. package/dist/browser/modules/word/markdown.d.ts +1 -1
  66. package/dist/browser/modules/word/query/compat.d.ts +10 -2
  67. package/dist/browser/modules/word/query/compat.js +29 -21
  68. package/dist/browser/modules/word/reader/docx-reader.js +105 -2
  69. package/dist/browser/modules/word/reader/math-parser.js +8 -2
  70. package/dist/browser/modules/word/security/cfb-reader.js +5 -5
  71. package/dist/browser/modules/word/types.d.ts +96 -1
  72. package/dist/browser/modules/word/writer/docx-packager.js +108 -2
  73. package/dist/browser/modules/word/writer/glossary-writer.d.ts +28 -0
  74. package/dist/browser/modules/word/writer/glossary-writer.js +121 -0
  75. package/dist/browser/modules/word/writer/header-footer-writer.js +105 -20
  76. package/dist/browser/modules/word/writer/math-writer.js +7 -2
  77. package/dist/browser/utils/font-metrics.d.ts +8 -0
  78. package/dist/browser/utils/font-metrics.js +43 -0
  79. package/dist/browser/utils/theme-colors.js +4 -1
  80. package/dist/cjs/modules/excel/note.js +5 -1
  81. package/dist/cjs/modules/excel/row.js +35 -2
  82. package/dist/cjs/modules/excel/stream/workbook-writer.browser.js +22 -2
  83. package/dist/cjs/modules/excel/utils/drawing-utils.js +19 -2
  84. package/dist/cjs/modules/excel/workbook.browser.js +31 -1
  85. package/dist/cjs/modules/excel/worksheet.js +83 -0
  86. package/dist/cjs/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  87. package/dist/cjs/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  88. package/dist/cjs/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  89. package/dist/cjs/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  90. package/dist/cjs/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  91. package/dist/cjs/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  92. package/dist/cjs/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  93. package/dist/cjs/modules/excel/xlsx/xform/drawing/shape-xform.js +112 -0
  94. package/dist/cjs/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  95. package/dist/cjs/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  96. package/dist/cjs/modules/pdf/builder/document-builder.js +21 -48
  97. package/dist/cjs/modules/pdf/builder/pdf-editor.js +1 -1
  98. package/dist/cjs/modules/pdf/core/pdf-stream.js +38 -2
  99. package/dist/cjs/modules/pdf/font/font-manager.js +35 -18
  100. package/dist/cjs/modules/pdf/render/page-renderer.js +112 -18
  101. package/dist/cjs/modules/word/advanced/field-engine.js +45 -20
  102. package/dist/cjs/modules/word/advanced/glossary.js +8 -9
  103. package/dist/cjs/modules/word/advanced/math-convert.js +94 -12
  104. package/dist/cjs/modules/word/advanced/ole-objects.js +123 -19
  105. package/dist/cjs/modules/word/advanced/style-map.js +31 -10
  106. package/dist/cjs/modules/word/builder/run-builders.js +7 -1
  107. package/dist/cjs/modules/word/constants.js +5 -1
  108. package/dist/cjs/modules/word/convert/docx-to-semantic.js +135 -1
  109. package/dist/cjs/modules/word/convert/html/html-import.js +168 -14
  110. package/dist/cjs/modules/word/convert/html/html.js +2 -1
  111. package/dist/cjs/modules/word/convert/markdown/markdown-import.js +279 -69
  112. package/dist/cjs/modules/word/convert/odt/odt.js +407 -56
  113. package/dist/cjs/modules/word/html.js +2 -1
  114. package/dist/cjs/modules/word/index.base.js +4 -3
  115. package/dist/cjs/modules/word/layout/layout-full.js +325 -18
  116. package/dist/cjs/modules/word/layout/render-page.js +35 -8
  117. package/dist/cjs/modules/word/query/compat.js +29 -21
  118. package/dist/cjs/modules/word/reader/docx-reader.js +104 -1
  119. package/dist/cjs/modules/word/reader/math-parser.js +8 -2
  120. package/dist/cjs/modules/word/security/cfb-reader.js +5 -5
  121. package/dist/cjs/modules/word/writer/docx-packager.js +108 -2
  122. package/dist/cjs/modules/word/writer/glossary-writer.js +124 -0
  123. package/dist/cjs/modules/word/writer/header-footer-writer.js +105 -20
  124. package/dist/cjs/modules/word/writer/math-writer.js +7 -2
  125. package/dist/cjs/utils/font-metrics.js +44 -0
  126. package/dist/cjs/utils/theme-colors.js +4 -1
  127. package/dist/esm/modules/excel/note.js +5 -1
  128. package/dist/esm/modules/excel/row.js +35 -2
  129. package/dist/esm/modules/excel/stream/workbook-writer.browser.js +22 -2
  130. package/dist/esm/modules/excel/utils/drawing-utils.js +19 -2
  131. package/dist/esm/modules/excel/workbook.browser.js +32 -2
  132. package/dist/esm/modules/excel/worksheet.js +83 -0
  133. package/dist/esm/modules/excel/xlsx/xform/comment/vml-shape-xform.js +42 -8
  134. package/dist/esm/modules/excel/xlsx/xform/core/content-types-xform.js +3 -1
  135. package/dist/esm/modules/excel/xlsx/xform/drawing/absolute-anchor-xform.js +5 -0
  136. package/dist/esm/modules/excel/xlsx/xform/drawing/base-cell-anchor-xform.js +18 -1
  137. package/dist/esm/modules/excel/xlsx/xform/drawing/blip-xform.js +38 -11
  138. package/dist/esm/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.js +5 -0
  139. package/dist/esm/modules/excel/xlsx/xform/drawing/pic-xform.js +2 -1
  140. package/dist/esm/modules/excel/xlsx/xform/drawing/shape-xform.js +109 -0
  141. package/dist/esm/modules/excel/xlsx/xform/drawing/two-cell-anchor-xform.js +10 -1
  142. package/dist/esm/modules/excel/xlsx/xform/sheet/worksheet-xform.js +64 -1
  143. package/dist/esm/modules/pdf/builder/document-builder.js +22 -49
  144. package/dist/esm/modules/pdf/builder/pdf-editor.js +1 -1
  145. package/dist/esm/modules/pdf/core/pdf-stream.js +38 -2
  146. package/dist/esm/modules/pdf/font/font-manager.js +35 -18
  147. package/dist/esm/modules/pdf/render/page-renderer.js +111 -18
  148. package/dist/esm/modules/word/advanced/field-engine.js +45 -20
  149. package/dist/esm/modules/word/advanced/glossary.js +8 -9
  150. package/dist/esm/modules/word/advanced/math-convert.js +94 -12
  151. package/dist/esm/modules/word/advanced/ole-objects.js +122 -19
  152. package/dist/esm/modules/word/advanced/style-map.js +31 -10
  153. package/dist/esm/modules/word/builder/run-builders.js +7 -1
  154. package/dist/esm/modules/word/constants.js +5 -1
  155. package/dist/esm/modules/word/convert/docx-to-semantic.js +135 -1
  156. package/dist/esm/modules/word/convert/html/html-import.js +167 -14
  157. package/dist/esm/modules/word/convert/html/html.js +1 -1
  158. package/dist/esm/modules/word/convert/markdown/markdown-import.js +279 -69
  159. package/dist/esm/modules/word/convert/odt/odt.js +407 -56
  160. package/dist/esm/modules/word/html.js +1 -1
  161. package/dist/esm/modules/word/index.base.js +1 -1
  162. package/dist/esm/modules/word/layout/layout-full.js +326 -19
  163. package/dist/esm/modules/word/layout/render-page.js +35 -8
  164. package/dist/esm/modules/word/query/compat.js +29 -21
  165. package/dist/esm/modules/word/reader/docx-reader.js +105 -2
  166. package/dist/esm/modules/word/reader/math-parser.js +8 -2
  167. package/dist/esm/modules/word/security/cfb-reader.js +5 -5
  168. package/dist/esm/modules/word/writer/docx-packager.js +108 -2
  169. package/dist/esm/modules/word/writer/glossary-writer.js +121 -0
  170. package/dist/esm/modules/word/writer/header-footer-writer.js +105 -20
  171. package/dist/esm/modules/word/writer/math-writer.js +7 -2
  172. package/dist/esm/utils/font-metrics.js +43 -0
  173. package/dist/esm/utils/theme-colors.js +4 -1
  174. package/dist/iife/excelts.iife.js +496 -59
  175. package/dist/iife/excelts.iife.js.map +1 -1
  176. package/dist/iife/excelts.iife.min.js +39 -39
  177. package/dist/types/modules/excel/cell.d.ts +4 -0
  178. package/dist/types/modules/excel/stream/workbook-writer.browser.d.ts +8 -1
  179. package/dist/types/modules/excel/types.d.ts +81 -0
  180. package/dist/types/modules/excel/utils/drawing-utils.d.ts +8 -0
  181. package/dist/types/modules/excel/workbook.browser.d.ts +16 -0
  182. package/dist/types/modules/excel/worksheet.d.ts +31 -1
  183. package/dist/types/modules/excel/xlsx/xform/comment/vml-shape-xform.d.ts +7 -0
  184. package/dist/types/modules/excel/xlsx/xform/drawing/blip-xform.d.ts +6 -0
  185. package/dist/types/modules/excel/xlsx/xform/drawing/one-cell-anchor-xform.d.ts +1 -0
  186. package/dist/types/modules/excel/xlsx/xform/drawing/pic-xform.d.ts +2 -0
  187. package/dist/types/modules/excel/xlsx/xform/drawing/shape-xform.d.ts +47 -0
  188. package/dist/types/modules/pdf/core/pdf-stream.d.ts +28 -1
  189. package/dist/types/modules/pdf/font/font-manager.d.ts +26 -0
  190. package/dist/types/modules/pdf/render/page-renderer.d.ts +51 -3
  191. package/dist/types/modules/word/advanced/glossary.d.ts +10 -36
  192. package/dist/types/modules/word/advanced/ole-objects.d.ts +28 -0
  193. package/dist/types/modules/word/builder/run-builders.d.ts +7 -1
  194. package/dist/types/modules/word/constants.d.ts +4 -0
  195. package/dist/types/modules/word/convert/docx-to-semantic.d.ts +2 -1
  196. package/dist/types/modules/word/convert/html/html-import.d.ts +32 -1
  197. package/dist/types/modules/word/convert/html/html.d.ts +2 -2
  198. package/dist/types/modules/word/convert/markdown/markdown-import.d.ts +48 -18
  199. package/dist/types/modules/word/convert/markdown/markdown.d.ts +1 -1
  200. package/dist/types/modules/word/html.d.ts +2 -2
  201. package/dist/types/modules/word/index.base.d.ts +3 -3
  202. package/dist/types/modules/word/markdown.d.ts +1 -1
  203. package/dist/types/modules/word/query/compat.d.ts +10 -2
  204. package/dist/types/modules/word/types.d.ts +96 -1
  205. package/dist/types/modules/word/writer/glossary-writer.d.ts +28 -0
  206. package/dist/types/utils/font-metrics.d.ts +8 -0
  207. package/package.json +3 -1
@@ -146,14 +146,8 @@ exports.DEFAULT_STYLE_MAP = {
146
146
  * ```
147
147
  */
148
148
  function parseStyleMap(dsl, options) {
149
- const rules = [];
150
- // Include defaults if requested
151
- if (options?.includeDefaults !== false && options?.base) {
152
- rules.push(...options.base.rules);
153
- }
154
- else if (options?.includeDefaults !== false && !options?.base) {
155
- rules.push(...exports.DEFAULT_STYLE_MAP.rules);
156
- }
149
+ // User-defined rules from the DSL.
150
+ const userRules = [];
157
151
  const lines = dsl
158
152
  .split("\n")
159
153
  .map(l => l.trim())
@@ -161,10 +155,37 @@ function parseStyleMap(dsl, options) {
161
155
  for (const line of lines) {
162
156
  const rule = parseRule(line);
163
157
  if (rule) {
164
- rules.push(rule);
158
+ userRules.push(rule);
165
159
  }
166
160
  }
167
- // Sort by priority (highest first)
161
+ // Default / base rules requested via `includeDefaults`.
162
+ const defaultRules = [];
163
+ if (options?.includeDefaults !== false && options?.base) {
164
+ defaultRules.push(...options.base.rules);
165
+ }
166
+ else if (options?.includeDefaults !== false && !options?.base) {
167
+ defaultRules.push(...exports.DEFAULT_STYLE_MAP.rules);
168
+ }
169
+ // An explicit DSL rule should always win over a default rule for the same
170
+ // element — that is the whole point of providing one. Default rules,
171
+ // however, carry their own priorities (e.g. "Heading 1" => h1 has priority
172
+ // 10) that can exceed the fixed priority `parseRule` assigns to user rules.
173
+ // To guarantee user intent wins while preserving the relative priority
174
+ // ordering *within* each group, lift every user rule above the highest
175
+ // default priority. When there are no defaults this offset is 0 and user
176
+ // priorities are untouched.
177
+ const maxDefaultPriority = defaultRules.reduce((m, r) => Math.max(m, r.priority ?? 0), 0);
178
+ const userOffset = defaultRules.length > 0 ? maxDefaultPriority + 1 : 0;
179
+ const liftedUserRules = userOffset === 0
180
+ ? userRules
181
+ : userRules.map(r => ({ ...r, priority: (r.priority ?? 0) + userOffset }));
182
+ // User rules come first so that, after a stable sort, an explicit DSL rule
183
+ // also wins over any default rule that happens to share its (lifted)
184
+ // priority. The sort below only reorders by priority; equal priorities
185
+ // preserve this user-before-default ordering.
186
+ const rules = [...liftedUserRules, ...defaultRules];
187
+ // Sort by priority (highest first). Array.prototype.sort is stable, so
188
+ // rules of equal priority keep their relative order (user rules first).
168
189
  rules.sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0));
169
190
  return { rules };
170
191
  }
@@ -525,7 +525,13 @@ function mathSubSuperScript(base, subScript, superScript) {
525
525
  function mathPreSubSuperScript(base, preSubScript, preSuperScript) {
526
526
  return { type: "mathPreSubSuperScript", base, preSubScript, preSuperScript };
527
527
  }
528
- /** Create a math phantom (invisible expression that takes up space). */
528
+ /**
529
+ * Create a math phantom (an expression that takes up space).
530
+ *
531
+ * Note: in OOXML the phantom base is *shown* by default. To make the classic
532
+ * "occupies space but invisible" phantom pass `{ show: false }`; passing only
533
+ * `transparent: true` is not sufficient to hide the base in Word.
534
+ */
529
535
  function mathPhantom(content, options) {
530
536
  return { type: "mathPhantom", content, ...options };
531
537
  }
@@ -272,7 +272,11 @@ exports.ContentType = {
272
272
  ChartEx: "application/vnd.ms-office.chartEx+xml",
273
273
  Xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
274
274
  CustomXml: "application/xml",
275
- VbaProject: "application/vnd.ms-office.vbaProject"
275
+ VbaProject: "application/vnd.ms-office.vbaProject",
276
+ /** Glossary (Building Blocks) document part. */
277
+ Glossary: "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
278
+ /** OLE embedded object binary. */
279
+ OleObject: "application/vnd.openxmlformats-officedocument.oleObject"
276
280
  };
277
281
  /** Map from image file extension to content type. */
278
282
  exports.IMAGE_CONTENT_TYPES = {
@@ -12,7 +12,8 @@
12
12
  * - Hyperlink extraction
13
13
  * - Image registration into ConversionContext
14
14
  * - Table structure with merge (colSpan/rowSpan)
15
- * - List/numbering detection (basic)
15
+ * - List/numbering detection: consecutive numbered paragraphs are aggregated
16
+ * into ordered/unordered `list` blocks with nested sub-lists by level
16
17
  * - Footnote/endnote reference and content
17
18
  * - Math content (text fallback)
18
19
  */
@@ -96,6 +97,26 @@ function convertBodyContent(body, doc, ctx, imageMap) {
96
97
  const item = body[bodyIndex];
97
98
  switch (item.type) {
98
99
  case "paragraph":
100
+ // A run of consecutive list-item paragraphs (each carrying a
101
+ // numbering reference, and not a heading) is aggregated into a single
102
+ // semantic `list` block with nested sub-lists driven by the numbering
103
+ // level. This is what turns Word numbering into real <ul>/<ol> in
104
+ // HTML and `-`/`1.` markers in Markdown when downstream renderers
105
+ // consume the IR.
106
+ if (isListItemParagraph(item)) {
107
+ let end = bodyIndex;
108
+ while (end < body.length) {
109
+ const next = body[end];
110
+ if (next.type !== "paragraph" || !isListItemParagraph(next)) {
111
+ break;
112
+ }
113
+ end++;
114
+ }
115
+ const listParas = body.slice(bodyIndex, end);
116
+ blocks.push(...buildListBlocks(listParas, doc, ctx, imageMap));
117
+ bodyIndex = end - 1; // loop's ++ advances past the consumed run
118
+ break;
119
+ }
99
120
  blocks.push(convertParagraph(item, doc, ctx, imageMap));
100
121
  break;
101
122
  case "table":
@@ -243,6 +264,119 @@ function convertBodyContent(body, doc, ctx, imageMap) {
243
264
  return blocks;
244
265
  }
245
266
  // =============================================================================
267
+ // Internal: List Aggregation
268
+ // =============================================================================
269
+ /**
270
+ * Whether a body paragraph should render as a list item: it carries a
271
+ * numbering reference and is not itself a heading (a numbered heading stays a
272
+ * heading, mirroring the markdown/html renderers).
273
+ */
274
+ function isListItemParagraph(item) {
275
+ if (item.type !== "paragraph") {
276
+ return false;
277
+ }
278
+ return item.properties?.numbering !== undefined && detectHeadingLevel(item) === null;
279
+ }
280
+ /**
281
+ * Resolve a numbering reference to its number format string (e.g. "decimal",
282
+ * "bullet"). Mirrors the lookup in the markdown/html renderers so the three
283
+ * surfaces classify ordered vs. unordered lists identically. Defaults to
284
+ * "bullet" when the numbering definition can't be resolved.
285
+ */
286
+ function getNumberingFormat(doc, numId, level) {
287
+ const instance = doc.numberingInstances?.find(n => n.numId === numId);
288
+ if (!instance) {
289
+ return "bullet";
290
+ }
291
+ const abstractNum = doc.abstractNumberings?.find(a => a.abstractNumId === instance.abstractNumId);
292
+ if (!abstractNum) {
293
+ return "bullet";
294
+ }
295
+ const levelDef = abstractNum.levels.find(l => l.level === level);
296
+ return levelDef?.format ?? "bullet";
297
+ }
298
+ /** A number format other than "bullet"/"none" denotes an ordered list. */
299
+ function isOrderedFormat(format) {
300
+ return format !== "bullet" && format !== "none";
301
+ }
302
+ /**
303
+ * Build one or more semantic `list` blocks from a contiguous run of list-item
304
+ * paragraphs. Paragraphs are nested by their numbering `level`; a deeper level
305
+ * becomes a `subList` of the preceding shallower item. Adjacent items that
306
+ * switch between ordered and unordered at the same level start a new sibling
307
+ * list so the ordered/unordered distinction is preserved.
308
+ */
309
+ function buildListBlocks(paras, doc, ctx, imageMap) {
310
+ const { blocks } = buildListLevel(paras, 0, 0, doc, ctx, imageMap);
311
+ return blocks;
312
+ }
313
+ /**
314
+ * Consume paragraphs starting at `start` that belong to `level` (or deeper),
315
+ * emitting sibling lists for this level. Deeper-level paragraphs are folded
316
+ * into the current item's `subList` via recursion. Returns the produced blocks
317
+ * and the index of the first paragraph that no longer belongs to this level.
318
+ */
319
+ function buildListLevel(paras, start, level, doc, ctx, imageMap) {
320
+ const blocks = [];
321
+ let i = start;
322
+ let currentOrdered = null;
323
+ let items = [];
324
+ const flush = () => {
325
+ if (items.length > 0 && currentOrdered !== null) {
326
+ blocks.push({ type: "list", ordered: currentOrdered, items });
327
+ items = [];
328
+ }
329
+ };
330
+ while (i < paras.length) {
331
+ const para = paras[i];
332
+ const num = para.properties?.numbering;
333
+ // Defensive: callers only pass list-item paragraphs, but guard anyway.
334
+ if (!num) {
335
+ break;
336
+ }
337
+ if (num.level < level) {
338
+ // Belongs to a shallower list — let the caller handle it.
339
+ break;
340
+ }
341
+ if (num.level > level) {
342
+ // Deeper item with no shallower parent at this position: descend and
343
+ // attach the nested list to the most recent item, or synthesise an
344
+ // empty item to host it when there is no parent.
345
+ const { blocks: subBlocks, next } = buildListLevel(paras, i, num.level, doc, ctx, imageMap);
346
+ const subList = subBlocks[0];
347
+ if (items.length > 0) {
348
+ const last = items[items.length - 1];
349
+ items[items.length - 1] = { ...last, subList };
350
+ }
351
+ else if (subList) {
352
+ // Promote the deeper list to this level when there is no parent item.
353
+ if (currentOrdered === null && subList.type === "list") {
354
+ currentOrdered = subList.ordered;
355
+ }
356
+ items.push({ children: [], subList });
357
+ }
358
+ i = next;
359
+ continue;
360
+ }
361
+ // num.level === level
362
+ const format = getNumberingFormat(doc, num.numId, num.level);
363
+ const ordered = isOrderedFormat(format);
364
+ if (currentOrdered === null) {
365
+ currentOrdered = ordered;
366
+ }
367
+ else if (ordered !== currentOrdered) {
368
+ // Ordered/unordered switch at the same level → start a new sibling list.
369
+ flush();
370
+ currentOrdered = ordered;
371
+ }
372
+ const children = convertParagraphChildren(para.children, doc, ctx, imageMap);
373
+ items.push({ children });
374
+ i++;
375
+ }
376
+ flush();
377
+ return { blocks, next: i };
378
+ }
379
+ // =============================================================================
246
380
  // Internal: Paragraph Conversion
247
381
  // =============================================================================
248
382
  function convertParagraph(para, doc, ctx, imageMap) {
@@ -23,6 +23,8 @@
23
23
  */
24
24
  Object.defineProperty(exports, "__esModule", { value: true });
25
25
  exports.htmlToDocxBody = htmlToDocxBody;
26
+ exports.htmlToDocx = htmlToDocx;
27
+ const utils_1 = require("../../../../utils/utils.js");
26
28
  const internal_utils_1 = require("../../core/internal-utils");
27
29
  const units_1 = require("../../units");
28
30
  /**
@@ -47,10 +49,7 @@ function htmlToDocxBody(html, options) {
47
49
  const tokens = tokenize(html);
48
50
  // Extract <style> rules and merge with user-provided classStyles
49
51
  const extractedStyles = extractStyleRules(tokens);
50
- const classStyles = {
51
- ...extractedStyles,
52
- ...(options?.classStyles ?? {})
53
- };
52
+ const classStyles = mergeClassStyles(extractedStyles, options?.classStyles ?? {});
54
53
  // Seed the inline context with the caller-supplied defaults so plain text
55
54
  // runs actually carry the requested font/size. Without this the options
56
55
  // were effectively ignored.
@@ -64,6 +63,40 @@ function htmlToDocxBody(html, options) {
64
63
  parseBlocks(tokens, 0, blocks, initialCtx, classStyles);
65
64
  return blocks;
66
65
  }
66
+ /**
67
+ * Convert an HTML string into DOCX body content **and** embedded images.
68
+ *
69
+ * Unlike {@link htmlToDocxBody}, this decodes base64 `data:` image URLs into
70
+ * real {@link ImageDef}s and assigns each a unique rId that the emitted image
71
+ * runs reference. Merge the returned `images` into your document model so the
72
+ * pictures are embedded rather than dropped as placeholders.
73
+ *
74
+ * @example
75
+ * ```ts
76
+ * const { body, images } = htmlToDocx(html);
77
+ * const doc = Document.create();
78
+ * for (const item of body) Document.addContent(doc, item);
79
+ * const built = Document.build(doc);
80
+ * const final = { ...built, images: [...(built.images ?? []), ...images] };
81
+ * const bytes = await toBuffer(final);
82
+ * ```
83
+ */
84
+ function htmlToDocx(html, options) {
85
+ const blocks = [];
86
+ const tokens = tokenize(html);
87
+ const extractedStyles = extractStyleRules(tokens);
88
+ const classStyles = mergeClassStyles(extractedStyles, options?.classStyles ?? {});
89
+ const images = [];
90
+ const initialCtx = { imageSink: images };
91
+ if (options?.defaultFont) {
92
+ initialCtx.fontFamily = options.defaultFont;
93
+ }
94
+ if (options?.defaultFontSize !== undefined) {
95
+ initialCtx.fontSize = options.defaultFontSize;
96
+ }
97
+ parseBlocks(tokens, 0, blocks, initialCtx, classStyles);
98
+ return { body: blocks, images };
99
+ }
67
100
  function tokenize(html) {
68
101
  const tokens = [];
69
102
  // Strip HTML comments, doctype declarations and SGML processing
@@ -476,6 +509,22 @@ function extractStyleRules(tokens) {
476
509
  }
477
510
  return result;
478
511
  }
512
+ /**
513
+ * Merge two class→style maps. For classes present in both, the declarations
514
+ * are concatenated (extracted `<style>` rules first, caller-supplied overrides
515
+ * last) so the later source wins per CSS cascade while still preserving
516
+ * properties only declared by the other source. A plain `{ ...a, ...b }`
517
+ * would discard the extracted rule entirely whenever the caller supplies the
518
+ * same class name, silently dropping e.g. `font-style`/`color` from `<style>`.
519
+ */
520
+ function mergeClassStyles(extracted, overrides) {
521
+ const merged = { ...extracted };
522
+ for (const [name, style] of Object.entries(overrides)) {
523
+ const existing = merged[name];
524
+ merged[name] = existing ? `${existing}; ${style}` : style;
525
+ }
526
+ return merged;
527
+ }
479
528
  /**
480
529
  * Parse HTML-style attributes from the inside of a start tag, e.g.
481
530
  * `class="x" id='y' disabled href=foo`.
@@ -1136,6 +1185,17 @@ function parseBlocks(tokens, start, blocks, parentCtx, classStyles) {
1136
1185
  return i + 1; // consumed the close tag
1137
1186
  }
1138
1187
  if (tok.type === "text") {
1188
+ // In block context, text nodes that are pure inter-element whitespace
1189
+ // (the newlines/indentation between block tags in pretty-printed HTML)
1190
+ // carry no content and must be ignored — otherwise every gap between
1191
+ // <p>/<table>/<div> tags would emit a spurious empty paragraph (and
1192
+ // the contained newline would be rendered as a <w:br/> soft break).
1193
+ // Whitespace that sits between inline runs is preserved by the inline
1194
+ // parser, which handles it separately.
1195
+ if (tok.value.trim() === "") {
1196
+ i++;
1197
+ continue;
1198
+ }
1139
1199
  if (!pendingInline) {
1140
1200
  pendingInline = { runs: [], ctx: parentCtx };
1141
1201
  }
@@ -1464,7 +1524,7 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1464
1524
  runs.push({ content: [{ type: "break" }] });
1465
1525
  }
1466
1526
  else if (tag === "img") {
1467
- const imgContent = buildImageContent(tok.attrs);
1527
+ const imgContent = buildImageContent(tok.attrs, ctx);
1468
1528
  if (imgContent) {
1469
1529
  runs.push({ content: [imgContent] });
1470
1530
  }
@@ -1537,13 +1597,15 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1537
1597
  i++;
1538
1598
  }
1539
1599
  else if (t.type === "close") {
1600
+ // Mismatched close tag — close the hyperlink here but do NOT
1601
+ // consume the token; let the caller handle the block boundary.
1540
1602
  const hyperlink = {
1541
1603
  type: "hyperlink",
1542
1604
  url: safeHref ?? "",
1543
1605
  children: innerRuns
1544
1606
  };
1545
1607
  runs.push(hyperlink);
1546
- return i + 1;
1608
+ return i;
1547
1609
  }
1548
1610
  else {
1549
1611
  const childRuns = [];
@@ -1584,7 +1646,12 @@ function parseInlineTag(tokens, idx, runs, ctx, classStyles) {
1584
1646
  i++;
1585
1647
  }
1586
1648
  else if (t.type === "close") {
1587
- return i + 1;
1649
+ // Mismatched close tag (e.g. </p> while inside an unclosed <strong>).
1650
+ // Do NOT consume it — return the current index so the caller can
1651
+ // handle it. Consuming a block-level close here would swallow the
1652
+ // parent paragraph boundary and pull all following block content
1653
+ // into this run, breaking page breaks, tables, etc.
1654
+ return i;
1588
1655
  }
1589
1656
  else {
1590
1657
  i = parseInlineTag(tokens, i, runs, newCtx, classStyles);
@@ -1663,6 +1730,15 @@ function parseListItem(tokens, start, blocks, ctx, ordered, level, classStyles)
1663
1730
  }
1664
1731
  // Text content
1665
1732
  if (tok.type === "text") {
1733
+ // Skip structural whitespace: the indentation/newlines that sit between
1734
+ // a nested <ul>/<ol> and the closing </li> (or at the very start of the
1735
+ // item) are not real content. Emitting them as runs would otherwise
1736
+ // produce a spurious empty list-item paragraph. Whitespace *between*
1737
+ // real inline content is preserved because `children` is non-empty then.
1738
+ if (tok.value.trim() === "" && children.length === 0) {
1739
+ i++;
1740
+ continue;
1741
+ }
1666
1742
  children.push(makeRun(tok.value, ctx));
1667
1743
  i++;
1668
1744
  continue;
@@ -2067,7 +2143,7 @@ function mapCssBorderStyle(cssStyle) {
2067
2143
  // Image content builder
2068
2144
  // =============================================================================
2069
2145
  /** Build InlineImageContent from img attributes or return undefined if not applicable. */
2070
- function buildImageContent(attrs) {
2146
+ function buildImageContent(attrs, ctx) {
2071
2147
  const src = attrs["src"] || "";
2072
2148
  const alt = attrs["alt"] || "";
2073
2149
  // Parse width/height from attributes first, then fall back to style
@@ -2086,11 +2162,36 @@ function buildImageContent(attrs) {
2086
2162
  // Convert pixels to EMU
2087
2163
  const widthEmu = (width || 100) * units_1.EMU_PER_PX;
2088
2164
  const heightEmu = (height || 100) * units_1.EMU_PER_PX;
2089
- // Both data: and http(s) URLs become placeholders. The DOCX writer needs
2090
- // a real ImageDef registered in `doc.images` plus a corresponding
2091
- // relationship; htmlToDocxBody returns BodyContent[] only and cannot do
2092
- // that registration. We surface the original src in the alt text so the
2093
- // user can post-process if they need real embedded images.
2165
+ // base64 data: URLs can be decoded and embedded as a real media file when
2166
+ // an image sink is provided (htmlToDocx path). The decoded bytes are
2167
+ // registered as an ImageDef and the run references the assigned rId.
2168
+ if (src.startsWith("data:") && ctx?.imageSink) {
2169
+ const decoded = decodeDataUrlImage(src);
2170
+ if (decoded) {
2171
+ const sink = ctx.imageSink;
2172
+ const index = sink.length;
2173
+ const rId = `htmlImg${index}`;
2174
+ const ext = decoded.mediaType === "jpeg" ? "jpg" : decoded.mediaType;
2175
+ sink.push({
2176
+ data: decoded.data,
2177
+ mediaType: decoded.mediaType,
2178
+ fileName: `image_html_${index}.${ext}`,
2179
+ rId
2180
+ });
2181
+ return {
2182
+ type: "image",
2183
+ rId,
2184
+ width: widthEmu,
2185
+ height: heightEmu,
2186
+ altText: alt || undefined,
2187
+ name: alt || `image${index}`
2188
+ };
2189
+ }
2190
+ }
2191
+ // No sink (htmlToDocxBody only returns BodyContent[] and cannot register
2192
+ // media) or an unsupported/remote source: emit a placeholder with an empty
2193
+ // rId. The renderer treats an empty rId as a placeholder; the original src
2194
+ // is surfaced in the alt text so callers can post-process if needed.
2094
2195
  if (src.startsWith("data:") || src.startsWith("http://") || src.startsWith("https://")) {
2095
2196
  return {
2096
2197
  type: "image",
@@ -2103,6 +2204,54 @@ function buildImageContent(attrs) {
2103
2204
  }
2104
2205
  return undefined;
2105
2206
  }
2207
/**
 * Decode a base64 `data:image/...` URL into raw bytes plus a media type.
 *
 * Accepted shape (RFC 2397): `data:image/<subtype>[;param]*;base64,<payload>`.
 * Optional media-type parameters between the subtype and the `base64` marker
 * (e.g. `;charset=utf-8`) are tolerated and ignored. Matching is
 * case-insensitive and whitespace inside the payload is stripped before
 * decoding, so line-wrapped base64 is accepted.
 *
 * @param {string} src - Candidate data URL (typically an `<img src>` value).
 * @returns {{ data: Uint8Array, mediaType: string } | undefined} The decoded
 *   bytes and the normalized media type, or `undefined` when `src` is not a
 *   base64 image data URL, its subtype is not recognised by
 *   `normalizeImageMediaType`, the payload decodes to zero bytes, or the
 *   decoder throws.
 */
function decodeDataUrlImage(src) {
    // Group 1: image subtype. Group 2: base64 payload (may span lines, hence
    // the `s` flag). Each optional parameter is `;` followed by a run that
    // contains neither `;` nor `,`, which keeps the match unambiguous and the
    // lazy quantifier makes the first `;base64,` the payload boundary.
    const match = /^data:image\/([a-z0-9.+-]+)\s*(?:;[^;,]*)*?;\s*base64\s*,(.*)$/is.exec(src);
    if (!match) {
        return undefined;
    }
    const rawType = match[1].toLowerCase();
    const b64 = match[2].replace(/\s+/g, "");
    const mediaType = normalizeImageMediaType(rawType);
    if (!mediaType) {
        return undefined;
    }
    try {
        const data = (0, utils_1.base64ToUint8Array)(b64);
        if (data.length === 0) {
            return undefined;
        }
        return { data, mediaType };
    }
    catch {
        // Deliberate best-effort: a payload the decoder rejects is reported as
        // "not decodable" so the caller can fall back to a placeholder image.
        return undefined;
    }
}
2231
/**
 * Translate the subtype portion of a `data:image/<subtype>` URL into the
 * canonical media-type name used for embedded images.
 *
 * Comparison is exact (strict equality), so the caller is expected to pass
 * the subtype already lower-cased.
 *
 * @param {string} subtype - Image subtype, e.g. `"png"`, `"jpg"`, `"svg+xml"`.
 * @returns {string | undefined} One of `"png"`, `"jpeg"`, `"gif"`, `"bmp"`,
 *   `"tiff"`, `"svg"`, `"webp"`, or `undefined` for any other value.
 */
function normalizeImageMediaType(subtype) {
    // Subtypes whose canonical name is the subtype itself.
    if (subtype === "png" || subtype === "gif" || subtype === "bmp" || subtype === "webp") {
        return subtype;
    }
    // Subtypes that have a short and a long spelling.
    if (subtype === "jpeg" || subtype === "jpg") {
        return "jpeg";
    }
    if (subtype === "tiff" || subtype === "tif") {
        return "tiff";
    }
    if (subtype === "svg+xml" || subtype === "svg") {
        return "svg";
    }
    return undefined;
}
2106
2255
  /** Parse an image dimension from HTML attribute value (number or "Npx"). */
2107
2256
  function parseImageDimension(value) {
2108
2257
  if (!value) {
@@ -2231,6 +2380,11 @@ function resolveEffectiveStyle(attrs, classStyles) {
2231
2380
  // Run builder
2232
2381
  // =============================================================================
2233
2382
  function makeRun(text, ctx) {
2383
+ // HTML whitespace handling: outside <pre>/<code>, runs of whitespace
2384
+ // (including the newlines/indentation from source-code line wrapping)
2385
+ // collapse to a single space. Inside <pre>/<code> whitespace is
2386
+ // significant and preserved verbatim.
2387
+ const value = ctx.code ? text : text.replace(/\s+/g, " ");
2234
2388
  const props = {};
2235
2389
  if (ctx.bold) {
2236
2390
  props.bold = true;
@@ -2267,7 +2421,7 @@ function makeRun(text, ctx) {
2267
2421
  }
2268
2422
  const run = {
2269
2423
  ...(Object.keys(props).length > 0 ? { properties: props } : {}),
2270
- content: [{ type: "text", text }]
2424
+ content: [{ type: "text", text: value }]
2271
2425
  };
2272
2426
  return run;
2273
2427
  }
@@ -11,10 +11,11 @@
11
11
  * ```
12
12
  */
13
13
  Object.defineProperty(exports, "__esModule", { value: true });
14
- exports.htmlToDocxBody = exports.renderToHtml = void 0;
14
+ exports.htmlToDocx = exports.htmlToDocxBody = exports.renderToHtml = void 0;
15
15
  // HTML → render (DocxDocument → HTML output)
16
16
  var html_renderer_1 = require("./html-renderer");
17
17
  Object.defineProperty(exports, "renderToHtml", { enumerable: true, get: function () { return html_renderer_1.renderToHtml; } });
18
18
  // HTML → DOCX import (HTML string → BodyContent[])
19
19
  var html_import_1 = require("./html-import");
20
20
  Object.defineProperty(exports, "htmlToDocxBody", { enumerable: true, get: function () { return html_import_1.htmlToDocxBody; } });
21
+ Object.defineProperty(exports, "htmlToDocx", { enumerable: true, get: function () { return html_import_1.htmlToDocx; } });