dom-docx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. package/API.md +533 -0
  2. package/LICENSE +21 -0
  3. package/README.md +236 -0
  4. package/dist/browser.d.ts +34 -0
  5. package/dist/browser.d.ts.map +1 -0
  6. package/dist/browser.js +35 -0
  7. package/dist/browser.js.map +1 -0
  8. package/dist/cli.d.ts +3 -0
  9. package/dist/cli.d.ts.map +1 -0
  10. package/dist/cli.js +118 -0
  11. package/dist/cli.js.map +1 -0
  12. package/dist/converter/bordered-block.d.ts +54 -0
  13. package/dist/converter/bordered-block.d.ts.map +1 -0
  14. package/dist/converter/bordered-block.js +124 -0
  15. package/dist/converter/bordered-block.js.map +1 -0
  16. package/dist/converter/build-docx.d.ts +46 -0
  17. package/dist/converter/build-docx.d.ts.map +1 -0
  18. package/dist/converter/build-docx.js +161 -0
  19. package/dist/converter/build-docx.js.map +1 -0
  20. package/dist/converter/computed-style-snapshot.browser.js +73 -0
  21. package/dist/converter/computed-style-snapshot.d.ts +10 -0
  22. package/dist/converter/computed-style-snapshot.d.ts.map +1 -0
  23. package/dist/converter/computed-style-snapshot.js +78 -0
  24. package/dist/converter/computed-style-snapshot.js.map +1 -0
  25. package/dist/converter/constants.d.ts +51 -0
  26. package/dist/converter/constants.d.ts.map +1 -0
  27. package/dist/converter/constants.js +163 -0
  28. package/dist/converter/constants.js.map +1 -0
  29. package/dist/converter/css.d.ts +112 -0
  30. package/dist/converter/css.d.ts.map +1 -0
  31. package/dist/converter/css.js +621 -0
  32. package/dist/converter/css.js.map +1 -0
  33. package/dist/converter/flex.d.ts +59 -0
  34. package/dist/converter/flex.d.ts.map +1 -0
  35. package/dist/converter/flex.js +252 -0
  36. package/dist/converter/flex.js.map +1 -0
  37. package/dist/converter/image.d.ts +38 -0
  38. package/dist/converter/image.d.ts.map +1 -0
  39. package/dist/converter/image.js +159 -0
  40. package/dist/converter/image.js.map +1 -0
  41. package/dist/converter/inline.d.ts +18 -0
  42. package/dist/converter/inline.d.ts.map +1 -0
  43. package/dist/converter/inline.js +213 -0
  44. package/dist/converter/inline.js.map +1 -0
  45. package/dist/converter/ooxml-patch.d.ts +23 -0
  46. package/dist/converter/ooxml-patch.d.ts.map +1 -0
  47. package/dist/converter/ooxml-patch.js +54 -0
  48. package/dist/converter/ooxml-patch.js.map +1 -0
  49. package/dist/converter/style-path.d.ts +4 -0
  50. package/dist/converter/style-path.d.ts.map +1 -0
  51. package/dist/converter/style-path.js +17 -0
  52. package/dist/converter/style-path.js.map +1 -0
  53. package/dist/converter/style-resolver-node.d.ts +7 -0
  54. package/dist/converter/style-resolver-node.d.ts.map +1 -0
  55. package/dist/converter/style-resolver-node.js +26 -0
  56. package/dist/converter/style-resolver-node.js.map +1 -0
  57. package/dist/converter/style-resolver.d.ts +24 -0
  58. package/dist/converter/style-resolver.d.ts.map +1 -0
  59. package/dist/converter/style-resolver.js +122 -0
  60. package/dist/converter/style-resolver.js.map +1 -0
  61. package/dist/converter/svg.d.ts +11 -0
  62. package/dist/converter/svg.d.ts.map +1 -0
  63. package/dist/converter/svg.js +116 -0
  64. package/dist/converter/svg.js.map +1 -0
  65. package/dist/converter/table.d.ts +8 -0
  66. package/dist/converter/table.d.ts.map +1 -0
  67. package/dist/converter/table.js +745 -0
  68. package/dist/converter/table.js.map +1 -0
  69. package/dist/converter/text-metrics.d.ts +17 -0
  70. package/dist/converter/text-metrics.d.ts.map +1 -0
  71. package/dist/converter/text-metrics.js +51 -0
  72. package/dist/converter/text-metrics.js.map +1 -0
  73. package/dist/converter/types.d.ts +82 -0
  74. package/dist/converter/types.d.ts.map +1 -0
  75. package/dist/converter/types.js +9 -0
  76. package/dist/converter/types.js.map +1 -0
  77. package/dist/converter/visitor.d.ts +11 -0
  78. package/dist/converter/visitor.d.ts.map +1 -0
  79. package/dist/converter/visitor.js +910 -0
  80. package/dist/converter/visitor.js.map +1 -0
  81. package/dist/converter.d.ts +28 -0
  82. package/dist/converter.d.ts.map +1 -0
  83. package/dist/converter.js +44 -0
  84. package/dist/converter.js.map +1 -0
  85. package/dist/html-wrap.d.ts +3 -0
  86. package/dist/html-wrap.d.ts.map +1 -0
  87. package/dist/html-wrap.js +26 -0
  88. package/dist/html-wrap.js.map +1 -0
  89. package/dist/index.d.ts +17 -0
  90. package/dist/index.d.ts.map +1 -0
  91. package/dist/index.js +16 -0
  92. package/dist/index.js.map +1 -0
  93. package/examples/README.md +39 -0
  94. package/examples/balance-sheet/compare_side_by_side.png +0 -0
  95. package/examples/balance-sheet/input.html +41 -0
  96. package/examples/balance-sheet/output.docx +0 -0
  97. package/examples/balance-sheet/preview.png +0 -0
  98. package/examples/invoice/compare_side_by_side.png +0 -0
  99. package/examples/invoice/input.html +88 -0
  100. package/examples/invoice/logo.png +0 -0
  101. package/examples/invoice/output.docx +0 -0
  102. package/examples/invoice/preview.png +0 -0
  103. package/examples/javascript-essay/compare_side_by_side.png +0 -0
  104. package/examples/javascript-essay/input.html +39 -0
  105. package/examples/javascript-essay/output.docx +0 -0
  106. package/examples/javascript-essay/preview.png +0 -0
  107. package/examples/product-launch-brief/compare_side_by_side.png +0 -0
  108. package/examples/product-launch-brief/input.html +120 -0
  109. package/examples/product-launch-brief/output.docx +0 -0
  110. package/examples/product-launch-brief/preview.png +0 -0
  111. package/examples/quarterly-financials/compare_side_by_side.png +0 -0
  112. package/examples/quarterly-financials/input.html +27 -0
  113. package/examples/quarterly-financials/output.docx +0 -0
  114. package/examples/quarterly-financials/preview.png +0 -0
  115. package/examples/react-dashboard/compare_side_by_side.png +0 -0
  116. package/examples/react-dashboard/input.html +1 -0
  117. package/examples/react-dashboard/output.docx +0 -0
  118. package/examples/react-dashboard/preview.html +107 -0
  119. package/examples/react-dashboard/preview.png +0 -0
  120. package/examples/regional-sales-dashboard/compare_side_by_side.png +0 -0
  121. package/examples/regional-sales-dashboard/input.html +129 -0
  122. package/examples/regional-sales-dashboard/output.docx +0 -0
  123. package/examples/regional-sales-dashboard/preview.png +0 -0
  124. package/examples/sales-contract/compare_side_by_side.png +0 -0
  125. package/examples/sales-contract/input.html +68 -0
  126. package/examples/sales-contract/output.docx +0 -0
  127. package/examples/sales-contract/preview.png +0 -0
  128. package/examples/sprint-retrospective/compare_side_by_side.png +0 -0
  129. package/examples/sprint-retrospective/input.html +51 -0
  130. package/examples/sprint-retrospective/output.docx +0 -0
  131. package/examples/sprint-retrospective/preview.png +0 -0
  132. package/package.json +108 -0
package/API.md ADDED
@@ -0,0 +1,533 @@
1
+ # dom-docx API reference
2
+
3
+ dom-docx converts **semantic HTML fragments** into **native Word OOXML** (paragraphs, runs, lists, tables, images)—not raster snapshots or layout hacks.
4
+
5
+ For HTML authoring guidance (what converts well), see [AGENTS.md](./AGENTS.md). For validation scoring and test commands, see [SCORING.md](./docs/SCORING.md) and [README.md](./README.md).
6
+
7
+ ---
8
+
9
+ ## Quick start
10
+
11
+ ```bash
12
+ npm install dom-docx
13
+ ```
14
+
15
+ ```typescript
16
+ import { writeFile } from "node:fs/promises";
17
+ import { convertHtmlToDocx } from "dom-docx";
18
+
19
+ const html = `
20
+ <h1 style="color:#1a1a2e">Quarterly Report</h1>
21
+ <p>Revenue grew <strong>12%</strong> year over year.</p>
22
+ <ul>
23
+ <li>North America</li>
24
+ <li>EMEA</li>
25
+ </ul>
26
+ `;
27
+
28
+ const docx = await convertHtmlToDocx(html);
29
+ await writeFile("output.docx", docx);
30
+ ```
31
+
32
+ Pass a **body fragment only**—no `<!DOCTYPE>`, `<html>`, or `<body>` wrapper. Defaults: US Letter, 1″ margins, Arial 10.5 pt (14 px) body text — all configurable via [options](#options-convertoptions).
33
+
34
+ The default install is pure JS (`docx`, `cheerio`, `fflate`) — **no browser, no Playwright, no LibreOffice**.
35
+
36
+ ## Two entry points
37
+
38
+ | | `dom-docx` (Node) | `dom-docx/browser` |
39
+ |--|-------------------|--------------------|
40
+ | Returns | `Promise<Buffer>` | `Promise<Blob>` (or `Promise<Uint8Array>` via `convertHtmlToDocxUint8Array`) |
41
+ | `styleSource: "inline"` (default) | Pure JS, no browser | Pure JS, no live DOM required |
42
+ | `styleSource: "computed"` | Headless Chromium via **Playwright** (optional peer dep) | Native `getComputedStyle` on the **live page** — Playwright never involved |
43
+ | Typical use | Server-side batch conversion, agents with inline HTML | In-app "Export to Word" from rendered React/Vue/etc. |
44
+
45
+ ---
46
+
47
+ ## `convertHtmlToDocx(html, options?)` — Node
48
+
49
+ Primary entry point. Resolves styles, converts HTML, returns a **`Promise<Buffer>`** containing a valid `.docx` file.
50
+
51
+ ```typescript
52
+ function convertHtmlToDocx(
53
+ html: string,
54
+ options?: ConvertOptions,
55
+ ): Promise<Buffer>;
56
+ ```
57
+
58
+ | Parameter | Type | Description |
59
+ |-----------|------|-------------|
60
+ | `html` | `string` | Body fragment (trimmed and wrapped in `<body>…</body>` internally). |
61
+ | `options` | `ConvertOptions` | Optional. See [Options](#options-convertoptions). Defaults to inline style resolution. |
62
+
63
+ **Behavior by `styleSource`:**
64
+
65
+ | `styleSource` | Playwright needed? | What happens |
66
+ |---------------|--------------------|--------------|
67
+ | `"inline"` (default) | No | Parses `style=""` attributes only. Fast (~15–30 ms typical). |
68
+ | `"computed"` | **Yes** | Renders the fragment in headless Chromium, snapshots `getComputedStyle` for every element, then converts. Install once: `npm install playwright && npx playwright install chromium`. Slower (~100–500 ms+ depending on launch/reuse). |
69
+
70
+ When `styleSource` is `"computed"` and neither `page` nor `browser` is provided, the function launches Chromium, snapshots styles, converts, and **closes the browser** in a `finally` block. Pass `browser` or `page` to avoid the per-call launch cost (see [Usage patterns](#usage-patterns)).
71
+
72
+ ---
73
+
74
+ ## `convertHtmlToDocx(html, options?)` — browser (`dom-docx/browser`)
75
+
76
+ Client-side entry — **no Playwright, no Node `Buffer`**. Use this when the HTML is already rendered in the user's browser. Computed styles come from the **live DOM** via native `getComputedStyle`.
77
+
78
+ ```typescript
79
+ import { convertHtmlToDocx } from "dom-docx/browser";
80
+
81
+ const blob = await convertHtmlToDocx(htmlFragment, { styleSource: "computed" });
82
+ // hand the Blob to a download (e.g. saveAs(blob, "output.docx"))
83
+ ```
84
+
85
+ **Script tag** — the prebuilt IIFE (`dist/browser/dom-docx.browser.js`, built with `npm run build:browser`) exposes `window.domDocx`:
86
+
87
+ ```html
88
+ <script src="dom-docx.browser.js"></script>
89
+ <script type="module">
90
+ const blob = await domDocx.convertHtmlToDocx(htmlFragment);
91
+ </script>
92
+ ```
93
+
94
+ ```typescript
95
+ interface BrowserConvertOptions extends DocumentConfig {
96
+ styleSource?: "inline" | "computed"; // default "inline"
97
+ document?: Document; // computed only; defaults to the host page's document
98
+ imageResolver?: ImageResolver;
99
+ }
100
+ ```
101
+
102
+ | `styleSource` | Live DOM required? | Behavior |
103
+ |---------------|--------------------|----------|
104
+ | `"inline"` (default) | No | Parses `style=""` only — works on a string fragment with no rendered page. |
105
+ | `"computed"` | **Yes** | Batch-reads native `getComputedStyle` from `document.body` (or `options.document`). The page must already render the same fragment — the converter does not inject HTML for you. |
106
+
107
+ All [document options](#options-convertoptions) (`pageSize`, `margins`, `metadata`, …) work here too. `browser` / `page` are Node-only.
108
+
109
+ | Export | Returns | Notes |
110
+ |--------|---------|-------|
111
+ | `convertHtmlToDocx(html, options?)` | `Promise<Blob>` | Primary browser API |
112
+ | `convertHtmlToDocxUint8Array(html, options?)` | `Promise<Uint8Array>` | Same bytes, no Blob wrapper |
113
+ | `buildDocxBlob` / `buildDocxUint8Array` | | Lower-level, bring your own `StyleResolver` |
114
+ | `snapshotComputedStylesFromDocument(doc?)` | `ComputedStyleSnapshot[]` | Style snapshots from a live `document.body` |
115
+
116
+ ---
117
+
118
+ ## Options (`ConvertOptions`)
119
+
120
+ ```typescript
121
+ interface ConvertOptions extends DocumentConfig {
122
+ styleSource?: "inline" | "computed"; // default "inline"
123
+ browser?: Browser; // Node computed only (Playwright)
124
+ page?: Page; // Node computed only (Playwright)
125
+ imageResolver?: ImageResolver;
126
+ }
127
+
128
+ interface DocumentConfig {
129
+ pageSize?: "letter" | "a4" | { width: number; height: number }; // custom in inches
130
+ orientation?: "portrait" | "landscape";
131
+ margins?: { top?: number; right?: number; bottom?: number; left?: number }; // inches
132
+ defaultFont?: { family?: string; sizePt?: number };
133
+ metadata?: {
134
+ title?: string;
135
+ subject?: string;
136
+ creator?: string;
137
+ keywords?: string[];
138
+ description?: string;
139
+ };
140
+ headerHtml?: string; // HTML fragment rendered as the page header
141
+ footerHtml?: string; // HTML fragment rendered as the page footer
142
+ pageNumber?: boolean; // centered "Page N" field appended to the footer
143
+ lang?: string; // spell-check locale, e.g. "en-US", "ar-SA"
144
+ direction?: "ltr" | "rtl";
145
+ }
146
+ ```
147
+
148
+ | Option | Type | Default | Description |
149
+ |--------|------|---------|-------------|
150
+ | `styleSource` | `"inline" \| "computed"` | `"inline"` | Which style resolution path to use. |
151
+ | `pageSize` | `"letter" \| "a4" \| {width,height}` | `"letter"` | Page size. Custom `{width, height}` in **inches**. |
152
+ | `orientation` | `"portrait" \| "landscape"` | `"portrait"` | Landscape swaps the page dimensions. |
153
+ | `margins` | `{top,right,bottom,left}` | `1` each | Page margins in **inches**; each side defaults to 1″. |
154
+ | `defaultFont` | `{family?, sizePt?}` | Arial, 10.5 pt | Default body font family and size (points). Applies to text with no explicit CSS font. |
155
+ | `metadata` | `{title,subject,creator,keywords[],description}` | — | Core document properties → `docProps/core.xml`. `keywords` is joined with `, `. |
156
+ | `headerHtml` / `footerHtml` | `string` | — | HTML fragment rendered as the page header / footer (its own inline-styled fragment). |
157
+ | `pageNumber` | `boolean` | `false` | Appends a centered `Page N` field to the footer (creates one if `footerHtml` is absent). |
158
+ | `lang` | `string` | — | Document spell-check locale (`w:lang`), e.g. `"en-US"`. |
159
+ | `direction` | `"ltr" \| "rtl"` | `"ltr"` | `"rtl"` sets right-to-left runs (`w:rtl`) — e.g. Arabic/Hebrew. |
160
+ | `imageResolver` | `ImageResolver` | — | Resolve non-`data:` `<img src>`. See [Images](#images--the-resolver-hook). |
161
+ | `browser` | Playwright `Browser` | — | **Node computed only.** Reuse an already-launched browser across many conversions. |
162
+ | `page` | Playwright `Page` | — | **Node computed only.** Snapshot styles from a page you already rendered. For in-browser apps use `dom-docx/browser` instead. |
163
+
164
+ ```typescript
165
+ const docx = await convertHtmlToDocx(html, {
166
+ pageSize: "a4",
167
+ orientation: "landscape",
168
+ margins: { top: 0.75, bottom: 0.75 }, // inches; left/right default to 1
169
+ defaultFont: { family: "Georgia", sizePt: 11 },
170
+ metadata: { title: "Q3 Report", creator: "Finance", keywords: ["revenue", "q3"] },
171
+ headerHtml: "<p style='font-size:12px;color:#666'>Confidential</p>",
172
+ pageNumber: true,
173
+ });
174
+ ```
175
+
176
+ **Resolution order for `styleSource: "computed"` (Node):**
177
+
178
+ 1. If `options.page` is set → snapshot that page (no new page)
179
+ 2. Else if `options.browser` is set → new page per call, browser kept open
180
+ 3. Else → launch Chromium, convert, close browser
181
+
182
+ ---
183
+
184
+ ## Images & the resolver hook
185
+
186
+ `<img>` embeds as a native DOCX `ImageRun` (not a link). Display size comes from the
187
+ `width`/`height` attributes, falling back to the image's intrinsic size (png/jpg/gif/bmp
188
+ headers are decoded), aspect-preserved if only one dimension is given.
189
+
190
+ **By default the library never makes a network or filesystem request.** Only inline
191
+ `data:` URLs (base64 png/jpg/gif/bmp) embed automatically. Any other `src` — `http(s):`,
192
+ `file:`, relative — is **not fetched**; it falls back to the `alt` text. This keeps
193
+ conversion deterministic and preserves a zero-egress guarantee (nothing leaves your
194
+ process based on input HTML — important for SSRF-safety and PHI/on-prem use).
195
+
196
+ To enable remote or local images, pass an **`imageResolver`**. You own the fetch and its
197
+ security policy; the library only orchestrates placement.
198
+
199
+ ```typescript
200
+ type ResolvedImage = {
201
+ data: Uint8Array | ArrayBuffer; // raw image bytes (not base64)
202
+ type: "png" | "jpg" | "gif" | "bmp";
203
+ width?: number; // used only if the <img> omits width/height
204
+ height?: number;
205
+ };
206
+
207
+ type ImageResolver = (
208
+ src: string,
209
+ ) => Promise<ResolvedImage | null> | ResolvedImage | null;
210
+ ```
211
+
212
+ Behavior:
213
+
214
+ - Called once per `<img>` whose `src` is **not** already a `data:` URL, before conversion.
215
+ - Return `ResolvedImage` to embed, or `null` to skip (image falls back to alt text).
216
+ - A resolver that **throws** for one image is caught per-image (that image falls back);
217
+ the conversion never aborts.
218
+ - Multiple images resolve concurrently (`Promise.all`).
219
+
220
+ ```typescript
221
+ const docx = await convertHtmlToDocx(html, {
222
+ imageResolver: async (src) => {
223
+ // YOUR policy: allowlist hosts, block private IPs/SSRF, add auth, cap size…
224
+ const url = new URL(src);
225
+ if (url.hostname !== "cdn.example.com") return null;
226
+ const res = await fetch(src);
227
+ if (!res.ok) return null;
228
+ return { data: new Uint8Array(await res.arrayBuffer()), type: "png" };
229
+ },
230
+ });
231
+ ```
232
+
233
+ > The library ships no default network resolver on purpose — making a request must be an
234
+ > explicit, visible line of caller code, never an implicit side effect of conversion.
235
+
236
+ ---
237
+
238
+ ## Usage patterns
239
+
240
+ ### Default — inline styles
241
+
242
+ ```typescript
243
+ const docx = await convertHtmlToDocx(html);
244
+ ```
245
+
246
+ Best for agent-generated HTML with explicit inline styles. No Playwright required on Node or in the browser.
247
+
248
+ ### Stylesheet / class-based HTML (Node — Playwright required)
249
+
250
+ ```typescript
251
+ const html = `
252
+ <style>
253
+ .hero { background: #eaeaea; padding: 10px 16px; }
254
+ .hero h1 { color: #1a1a2e; margin: 0; }
255
+ </style>
256
+ <div class="hero">
257
+ <h1>Title</h1>
258
+ </div>
259
+ `;
260
+
261
+ const docx = await convertHtmlToDocx(html, { styleSource: "computed" });
262
+ ```
263
+
264
+ On **Node**, the computed path renders the fragment in headless Chromium. External `<link rel="stylesheet">` works if the URL loads during `setContent` (`waitUntil: "networkidle"`).
265
+
266
+ In a **browser app**, use `dom-docx/browser` with `styleSource: "computed"` instead — render the HTML in the page, then convert; no Playwright.
267
+
268
+ ### Reuse a browser in a loop
269
+
270
+ ```typescript
271
+ import { chromium } from "playwright";
272
+ import { convertHtmlToDocx } from "dom-docx";
273
+
274
+ const browser = await chromium.launch();
275
+ try {
276
+ for (const html of fragments) {
277
+ const docx = await convertHtmlToDocx(html, {
278
+ styleSource: "computed",
279
+ browser,
280
+ });
281
+ // ...
282
+ }
283
+ } finally {
284
+ await browser.close();
285
+ }
286
+ ```
287
+
288
+ ### Snapshot from an existing page
289
+
290
+ ```typescript
291
+ // page already has your HTML rendered (Playwright)
292
+ const docx = await convertHtmlToDocx(html, {
293
+ styleSource: "computed",
294
+ page,
295
+ });
296
+ ```
297
+
298
+ Styles come from the **same DOM** as a reference screenshot—no second render.
299
+
300
+ ---
301
+
302
+ ## Supported HTML & CSS
303
+
304
+ ### Elements
305
+
306
+ `h1`–`h6`, `p`, `div`, `section`, `ul`, `ol`, `li`, `table`, `thead`, `tbody`, `tfoot`, `tr`, `td`, `th`, `blockquote`, `hr`, `figure`, `figcaption`, `img`, `svg` (low-complexity), `strong`, `b`, `em`, `i`, `u`, `a`, `span`, `code`, `br`, `pre` (limited).
307
+
308
+ Element attributes: table `border` / `cellpadding` / `cellspacing` / `colspan`; `href` on links; `src` / `width` / `height` / `alt` on images; list `type`. `<img>` embeds `data:` URLs by default; other `src` schemes require an [`imageResolver`](#images--the-resolver-hook). Other attributes are mostly ignored.
309
+
310
+ Unsupported tags are treated as generic block containers or skipped.
311
+
312
+ ### Inline CSS properties
313
+
314
+ Parsed from `style=""` (and from computed snapshots on the computed path):
315
+
316
+ | Property | Notes |
317
+ |----------|-------|
318
+ | `color` | Hex, `rgb()`, `rgba()` (alpha 0 → ignored) |
319
+ | `background`, `background-color` | Hex / rgb; `transparent` ignored |
320
+ | `text-align` | `left`, `center`, `right`, `justify` |
321
+ | `font-size` | `px`, `pt`, `em` |
322
+ | `font-weight`, `font-style` | Including `bold`, `600`, `italic` |
323
+ | `list-style-type` | decimal, lower/upper-alpha, lower/upper-roman, disc, circle, square |
324
+ | `margin`, `margin-*` | `px`, `pt`, `em` |
325
+ | `padding`, `padding-*` | `px`, `pt`, `em` |
326
+ | `border`, `border-*` | Width + color shorthand |
327
+ | `display` | `block`, `inline-block`, `flex` |
328
+ | `flex-direction` | `row`, `column` |
329
+ | `gap`, `row-gap`, `column-gap` | px |
330
+
331
+ All other CSS properties are silently ignored.
332
+
333
+ ### What converts well / poorly
334
+
335
+ See [AGENTS.md](./AGENTS.md) for the full tier list. In short:
336
+
337
+ - **Excellent:** headings, paragraphs, lists, simple tables, inline formatting, short span highlights, blockquotes, `<hr>`.
338
+ - **Good:** shaded div banners, flex rows (≤4 items), table row/cell backgrounds, bordered boxes, `data:` images.
339
+ - **Avoid:** complex SVG, CSS grid/float/absolute layout, external stylesheets (inline path), forms, deep layout div nesting.
340
+
341
+ ### Input contract
342
+
343
+ - **Fragment only** — content that would go inside `<body>`, not a full document.
344
+ - **Inline CSS preferred** for the default path — `style="..."` on elements.
345
+ - **Stylesheets** (`<style>` blocks, classes) — require `styleSource: "computed"`.
346
+ - **No JavaScript** — conversion is static HTML → OOXML.
347
+
348
+ ---
349
+
350
+ ## Lower-level API
351
+
352
+ ### `buildDocxBuffer(html, styleResolver, imageResolver?, documentConfig?)`
353
+
354
+ Use when you already have a **`StyleResolver`** (or want to convert many fragments with one resolver / one browser session).
355
+
356
+ ```typescript
357
+ import { buildDocxBuffer, INLINE_STYLE_RESOLVER } from "dom-docx";
358
+
359
+ const docx = await buildDocxBuffer(html, INLINE_STYLE_RESOLVER);
360
+ ```
361
+
362
+ | Parameter | Type | Description |
363
+ |-----------|------|-------------|
364
+ | `html` | `string` | Same body fragment as `convertHtmlToDocx`. |
365
+ | `styleResolver` | `StyleResolver` | Supplies `getCss()` for every element during the visit. |
366
+ | `imageResolver` | `ImageResolver` | Optional. Same hook as `ConvertOptions.imageResolver` — see [Images & the resolver hook](#images--the-resolver-hook). |
367
+ | `documentConfig` | `DocumentConfig` | Optional. Page/font/metadata options. |
368
+
369
+ Platform-neutral variants: **`buildDocxUint8Array`** (same signature, returns `Uint8Array`) and **`buildDocxBlob`** (returns `Blob`) — exported from both entry points.
370
+
371
+ Use these for benchmark loops that reuse one resolver, tests that inject a mock resolver, or pipelines that snapshot styles once and convert multiple times.
372
+
373
+ ### `StyleResolver`
374
+
375
+ For every element, the converter asks a `StyleResolver` for a normalized `ParsedCss` object (color, margins, borders, flex, …). The visitor, table builder, and inline collector all call `styleResolver.getCss(element)` — they never read `style=""` directly. Implement the interface to inject test doubles or alternate style sources.
376
+
377
+ ```typescript
378
+ interface StyleResolver {
379
+ readonly source: "inline" | "computed";
380
+ getCss(element: Element): ParsedCss; // cheerio/domhandler Element; {} if no styles apply
381
+ }
382
+ ```
383
+
384
+ Built-in resolvers and helpers (exported from `dom-docx`):
385
+
386
+ | Export | Description |
387
+ |--------|-------------|
388
+ | `INLINE_STYLE_RESOLVER` | Singleton inline resolver — parses each element's `style=""` attribute. No stylesheets, no class selectors. |
389
+ | `ComputedStyleResolver` | Computed resolver built from a snapshot array; looks up elements by stable DOM path. Construct via `ComputedStyleResolver.fromSnapshots(snapshots)`. |
390
+ | `createComputedStyleResolver(html, browser)` | Node: renders the fragment in a new Playwright page, snapshots styles, closes the page. |
391
+ | `computedStyleResolverFromPage(page)` | Node: snapshot an existing Playwright page (same DOM as a reference screenshot). |
392
+ | `snapshotComputedStyles(page)` | Node: raw `{ path, styles }[]` snapshots from a Playwright page. |
393
+ | `snapshotComputedStylesFromDocument(doc?)` | Browser (`dom-docx/browser`): same snapshots from a live `document`. |
394
+
395
+ **Computed style properties captured** (via `getComputedStyle`): `color`, `backgroundColor`, `display`, `flexDirection`, `gap` / `columnGap` / `rowGap`, `textAlign`, `fontSize`, `fontWeight`, `fontStyle`, margin and padding sides, per-side border width/color. UA defaults on headings are partially stripped when the element has no inline override, so the computed path stays aligned with the inline path for bare `<h1>`–`<h6>`.
396
+
397
+ ---
398
+
399
+ ## How the engine works
400
+
401
+ Conversion is a three-stage pipeline: **style resolution → HTML visitor → OOXML pack + patch**.
402
+
403
+ ```
404
+ HTML fragment
405
+
406
+
407
+ ┌─────────────────────────────────────────────────────────────┐
408
+ │ 1. Style resolution (StyleResolver) │
409
+ │ inline: parse style="" on each element │
410
+ │ computed (Node): getComputedStyle via Playwright/Chromium │
411
+ │ computed (browser): getComputedStyle on live document.body │
412
+ └─────────────────────────────────────────────────────────────┘
413
+
414
+
415
+ ┌─────────────────────────────────────────────────────────────┐
416
+ │ 2. Visitor (cheerio → docx objects) │
417
+ │ Walk body children; map blocks to Paragraph / Table │
418
+ │ Inline nodes → TextRun, Hyperlink, list numbering │
419
+ │ Flex divs → borderless tables; shaded divs → table wrap │
420
+ └─────────────────────────────────────────────────────────────┘
421
+
422
+
423
+ ┌─────────────────────────────────────────────────────────────┐
424
+ │ 3. Pack + post-process │
425
+ │ docx Packer → unzip → patch document.xml / numbering.xml │
426
+ │ → re-zip → Buffer (Node) or Blob (browser) │
427
+ └─────────────────────────────────────────────────────────────┘
428
+ ```
429
+
430
+ ### Visitor mapping
431
+
432
+ | HTML | OOXML |
433
+ |------|-------|
434
+ | `h1`–`h6` | Heading paragraphs (Word heading levels) |
435
+ | `p`, text flow | `Paragraph` + `TextRun` |
436
+ | `ul` / `ol` / `li` | Native numbering; honors `list-style-type` and the `type` attr |
437
+ | `table` / `tr` / `td` / `th` | `Table` with borders, shading, colspan |
438
+ | `div` with `display:flex` | Borderless flex table (row or column) |
439
+ | `div` with background/border | Shaded or bordered 1×1 table wrapper when needed |
440
+ | `blockquote` | Indented paragraph with left border |
441
+ | `a` | Hyperlink runs |
442
+ | `img` | Embedded `ImageRun` (`data:` URLs, or via `imageResolver`) |
443
+ | `svg` (low-complexity) | Native DOCX blocks — `<rect>` bands + `<text>` (bar/funnel charts) |
444
+ | `strong`, `em`, `span`, `code`, `br` | Inline runs / breaks |
445
+
446
+ ### OOXML post-processing
447
+
448
+ After `docx` packs the document, the engine unzips the buffer and patches XML the library cannot express cleanly:
449
+
450
+ - **`numbering.xml`** — LibreOffice needs list tab stops as `w:val="num"` (not `"left"`) and drops tentative numbering flags.
451
+ - **`document.xml`** — Shaded paragraphs with exact line spacing get vertical text alignment so PDF export centers padding correctly.
452
+
453
+ These patches are applied automatically; callers receive a finished `.docx`.
454
+
455
+ ---
456
+
457
+ ## Document defaults
458
+
459
+ Omitting all options produces:
460
+
461
+ | Setting | Value |
462
+ |---------|-------|
463
+ | Page size | US Letter (8.5″ × 11″), portrait |
464
+ | Margins | 1″ all sides |
465
+ | Body font | Arial 10.5 pt (= 14 px at 96 dpi) |
466
+ | Line height | 1.4 |
467
+ | Text color | `#111` on white |
468
+
469
+ These match the harness wrapper (`wrapHtml()` in `src/html-wrap.ts`) that the computed path and the visual validator render against, so default output aligns byte-for-byte with the validated baseline. Override any of them via [`ConvertOptions`](#options-convertoptions).
470
+
471
+ ---
472
+
473
+ ## Dependencies & environment
474
+
475
+ | Requirement | Node (`dom-docx`) | Browser (`dom-docx/browser`) |
476
+ |-------------|-------------------|------------------------------|
477
+ | **Node.js ≥ 20** | Required | N/A (runs in the user's browser) |
478
+ | **cheerio, docx, fflate** | Installed as dependencies | Bundled in the IIFE |
479
+ | **playwright** | **Only** for `styleSource: "computed"` (optional peer dependency, installed separately) | **Not used** |
480
+ | **Live DOM** | Not required (computed uses Playwright) | Required only for `styleSource: "computed"` |
481
+
482
+ LibreOffice (`soffice`) is **not** required for conversion — only for the visual validation loop in `npm run test:suite`.
483
+
484
+ For the Node computed path, install Playwright plus Chromium yourself, once:
485
+
486
+ ```bash
487
+ npm install playwright
488
+ npx playwright install chromium
489
+ ```
490
+
491
+ Contributors cloning this repo can run `npm run setup` instead (the same commands, via a package script).
492
+
493
+ **Playwright is not a dependency of the browser bundle.** It appears only in the Node computed path (and in this repo's test harness, which uses Playwright to *test* the browser bundle — dev tooling, not a runtime requirement for client apps).
494
+
495
+ ---
496
+
497
+ ## Limitations
498
+
499
+ - **Computed path cost (Node)** — launching Chromium per call is expensive; pass `browser` or `page` in hot loops. In the browser bundle, computed styles are free aside from normal layout.
500
+ - **Stylesheet fidelity** — the computed path improves cascade support, but some patterns (e.g. complex themed sections) still score lower than inline-authored equivalents.
501
+ - **Layout CSS** — CSS grid, floats, and absolute positioning have no OOXML equivalent and are ignored; flex support covers simple row/column cases.
502
+ - **Images** — png/jpg/gif/bmp only; SVG `<img>` sources are not rasterized (inline low-complexity `<svg>` elements convert natively).
503
+ - **No JavaScript execution** — the fragment is converted statically; anything rendered client-side must be in the HTML string (or live DOM for the browser computed path).
504
+
505
+ ---
506
+
507
+ ## Validation commands (repo development)
508
+
509
+ These exercise the API and write artifacts under `output/`:
510
+
511
+ | Command | What it runs |
512
+ |---------|----------------|
513
+ | `npm run test:suite` | Full 35-case visual + XML regression suite (needs Chromium + LibreOffice) |
514
+ | `npm run test:suite:priority` | 10-case fast subset |
515
+ | `npm run test:inline-guard` | Asserts inline path OOXML equivalence (normalized XML) |
516
+ | `npm run test:config` | `ConvertOptions` OOXML checks |
517
+ | `npm run test:benchmark` | OSS html-to-docx / TurboDocx comparison |
518
+ | `npm run build:browser` | esbuild → `dist/browser/dom-docx.browser.js` |
519
+ | `npm run typecheck` | TypeScript compile check |
520
+
521
+ Browser bundle parity, style-source, and CSS-cascade guards: see [CONTRIBUTING.md](./CONTRIBUTING.md#maintainer-only-harness-commands).
522
+
523
+ ---
524
+
525
+ ## Related documentation
526
+
527
+ | Doc | Contents |
528
+ |-----|----------|
529
+ | [AGENTS.md](./AGENTS.md) | How to write HTML that converts well |
530
+ | [README.md](./README.md) | Install, quick start, API overview |
531
+ | [SCORING.md](./docs/SCORING.md) | Validation methodology and engine score |
532
+ | [TEST-SCORES.md](./docs/TEST-SCORES.md) | Latest suite metrics |
533
+ | [BENCHMARK.md](./docs/BENCHMARK.md) | Inline vs computed and CSS cascade benchmarks |
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Blair Googer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.