@docen/import-docx 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,290 +1,290 @@
1
- # @docen/import-docx
2
-
3
- ![npm version](https://img.shields.io/npm/v/@docen/import-docx)
4
- ![npm downloads](https://img.shields.io/npm/dw/@docen/import-docx)
5
- ![npm license](https://img.shields.io/npm/l/@docen/import-docx)
6
-
7
- > Import Microsoft Word DOCX files to TipTap/ProseMirror content.
8
-
9
- ## Features
10
-
11
- - 📝 **Rich Text Parsing** - Accurate parsing of headings, paragraphs, and blockquotes with formatting
12
- - 🖼️ **Image Extraction** - Automatic image extraction with base64 conversion and cropping support
13
- - 📊 **Table Support** - Complete table structure with colspan/rowspan detection algorithm
14
- - ✅ **Lists & Tasks** - Bullet lists, numbered lists with start number extraction, and task lists with checkbox detection
15
- - 🎨 **Text Formatting** - Bold, italic, underline, strikethrough, subscript, superscript, and highlights
16
- - 🎯 **Text Styles** - Comprehensive style support including colors, backgrounds, fonts, sizes, and line heights
17
- - 🔗 **Links** - Hyperlink extraction with href preservation
18
- - 💻 **Code Blocks** - Code block detection with language attribute extraction
19
- - 🌐 **Cross-Platform** - Works in both browser and Node.js environments
20
- - ✂️ **Image Cropping** - Automatic cropping of images based on DOCX crop metadata
21
- - 🧠 **Smart Parsing** - DOCX XML parsing with proper element grouping and structure reconstruction
22
- - ⚡ **Fast Processing** - Uses fflate for ultra-fast ZIP decompression
23
-
24
- ## Installation
25
-
26
- ```bash
27
- # Install with npm
28
- $ npm install @docen/import-docx
29
-
30
- # Install with yarn
31
- $ yarn add @docen/import-docx
32
-
33
- # Install with pnpm
34
- $ pnpm add @docen/import-docx
35
- ```
36
-
37
- ## Quick Start
38
-
39
- ```typescript
40
- import { parseDOCX } from "@docen/import-docx";
41
- import { readFileSync } from "node:fs";
42
-
43
- // Read DOCX file
44
- const buffer = readFileSync("document.docx");
45
-
46
- // Parse DOCX to TipTap JSON
47
- const content = await parseDOCX(buffer);
48
-
49
- // Use in TipTap editor
50
- editor.commands.setContent(content);
51
- ```
52
-
53
- ## API Reference
54
-
55
- ### `parseDOCX(input, options?)`
56
-
57
- Parses a DOCX file and converts it to TipTap/ProseMirror JSON content.
58
-
59
- **Parameters:**
60
-
61
- - `input: Buffer | ArrayBuffer | Uint8Array` - DOCX file data
62
- - `options?: DocxImportOptions` - Optional import configuration
63
-
64
- **Returns:** `Promise<JSONContent>` - TipTap/ProseMirror document content with images embedded
65
-
66
- **Options:**
67
-
68
- ```typescript
69
- interface DocxImportOptions {
70
- /** Custom image converter (default: embed as base64) */
71
- convertImage?: (image: DocxImageInfo) => Promise<DocxImageResult>;
72
-
73
- /** Whether to ignore empty paragraphs (default: false).
74
- * Empty paragraphs are those without text content or images.
75
- * Paragraphs containing only whitespace or images are not considered empty. */
76
- ignoreEmptyParagraphs?: boolean;
77
-
78
- /**
79
- * Dynamic import function for @napi-rs/canvas
80
- * Required for image cropping in Node.js environment, ignored in browser
81
- *
82
- * @example
83
- * import { parseDOCX } from '@docen/import-docx';
84
- * const content = await parseDOCX(buffer, {
85
- * canvasImport: () => import('@napi-rs/canvas')
86
- * });
87
- */
88
- canvasImport?: () => Promise<typeof import("@napi-rs/canvas")>;
89
-
90
- /**
91
- * Enable or disable image cropping during import
92
- * When true, images with crop information in DOCX will be cropped
93
- * When false (default), crop information is ignored and full image is used
94
- *
95
- * @default false
96
- */
97
- enableImageCrop?: boolean;
98
- }
99
- ```
100
-
101
- **Default Image Converter:**
102
-
103
- The package exports `defaultImageConverter` which embeds images as base64 data URLs:
104
-
105
- ```typescript
106
- import { defaultImageConverter } from "@docen/import-docx";
107
-
108
- // Use in custom converter
109
- await parseDOCX(buffer, {
110
- convertImage: async (image) => {
111
- if (shouldUploadToCDN) {
112
- return uploadToCDN(image.data);
113
- }
114
- return defaultImageConverter(image);
115
- },
116
- });
117
- ```
118
-
119
- ## Supported Content Types
120
-
121
- ### Text Formatting
122
-
123
- - **Bold**, _Italic_, <u>Underline</u>, ~~Strikethrough~~
124
- - ^Superscript^ and ~Subscript~
125
- - Text highlights
126
- - Text colors and background colors
127
- - Font families and sizes
128
- - Line heights
129
-
130
- ### Block Elements
131
-
132
- - **Headings** (H1-H6) with proper level detection
133
- - **Paragraphs** with text alignment (left, right, center, justify)
134
- - **Blockquotes** (Detected by indentation + left border formatting)
135
- - **Horizontal Rules** (Detected as page breaks in DOCX)
136
- - **Code Blocks** with language attribute support
137
-
138
- ### Lists
139
-
140
- - **Bullet Lists** with proper nesting and structure
141
- - **Numbered Lists** with custom start number extraction
142
- - **Task Lists** with checked/unchecked state detection (☐/☑ symbols)
143
-
144
- ### Tables
145
-
146
- - Complete table structure parsing
147
- - **Table Cells** with colspan detection using grid-based algorithm
148
- - **Table Cells** with rowspan detection using vMerge tracking
149
- - Cell alignment and formatting preservation
150
- - Merged cell handling (both horizontal and vertical)
151
-
152
- ### Media & Embeds
153
-
154
- - **Images** with automatic base64 conversion
155
- - **Grouped Images** (DOCX image groups) support
156
- - **Links** (hyperlinks) with href extraction
157
-
158
- ## Parsing Algorithm
159
-
160
- ### Document Structure
161
-
162
- The parser follows a structured workflow:
163
-
164
- 1. **Extract Relationships** - Parse `_rels/document.xml.rels` for hyperlinks and images
165
- 2. **Parse Numbering** - Extract list definitions from `numbering.xml` (abstractNum → numFmt)
166
- 3. **Process Document Body** - Iterate through document.xml elements:
167
- - Detect content types (tables, lists, paragraphs, code blocks, etc.)
168
- - Group consecutive elements into proper containers
169
- - Convert XML nodes to TipTap JSON nodes
170
-
171
- ### Table Processing
172
-
173
- Tables use specialized algorithms:
174
-
175
- - **Colspan Detection** - Grid-based algorithm tracks cell positions and detects horizontal merges
176
- - **Rowspan Detection** - Vertical merge (vMerge) tracking across rows with proper cell skipping
177
- - **Cell Content** - Recursive parsing of nested paragraphs and formatting
178
- - **Hyperlink Support** - Proper handling of links within table cells
179
-
180
- ### List Processing
181
-
182
- Lists utilize the DOCX numbering system:
183
-
184
- - **Numbering ID Mapping** - Maps abstractNum to formatting (bullet vs decimal)
185
- - **Start Value Extraction** - Extracts and preserves start numbers for ordered lists
186
- - **Nesting Preservation** - Maintains proper list hierarchy
187
- - **Consecutive Grouping** - Groups consecutive list items into list containers
188
-
189
- ## Examples
190
-
191
- ### Basic Usage
192
-
193
- ```typescript
194
- import { parseDOCX } from "@docen/import-docx";
195
-
196
- const buffer = readFileSync("example.docx");
197
- const { content } = await parseDOCX(buffer);
198
-
199
- console.log(JSON.stringify(content, null, 2));
200
- ```
201
-
202
- ### Use with TipTap Editor
203
-
204
- ```typescript
205
- import { Editor } from "@tiptap/core";
206
- import { parseDOCX } from "@docen/import-docx";
207
-
208
- const editor = new Editor({
209
- extensions: [...],
210
- content: "",
211
- });
212
-
213
- // Import DOCX file
214
- async function importDocx(file: File) {
215
- const buffer = await file.arrayBuffer();
216
- const content = await parseDOCX(buffer);
217
- editor.commands.setContent(content);
218
- }
219
- ```
220
-
221
- ### Node.js Environment with Image Cropping
222
-
223
- To enable image cropping in Node.js environment, you need to provide `@napi-rs/canvas`:
224
-
225
- ```typescript
226
- import { parseDOCX } from "@docen/import-docx";
227
- import { readFileSync } from "node:fs";
228
-
229
- // Install @napi-rs/canvas first: pnpm add @napi-rs/canvas
230
- const buffer = readFileSync("document.docx");
231
-
232
- const content = await parseDOCX(buffer, {
233
- canvasImport: () => import("@napi-rs/canvas"),
234
- enableImageCrop: true, // Enable cropping (default is false)
235
- });
236
- ```
237
-
238
- **Note:** By default, image cropping is disabled. Images are imported in full size, ignoring crop information in DOCX.
239
-
240
- ### Disable Image Cropping
241
-
242
- If you want to explicitly ignore crop information in DOCX and use full images (this is the default behavior):
243
-
244
- ```typescript
245
- const content = await parseDOCX(buffer, {
246
- enableImageCrop: false,
247
- });
248
- ```
249
-
250
- ## Known Limitations
251
-
252
- ### Blockquote Detection
253
-
254
- DOCX does not have a semantic blockquote structure. Blockquotes are detected by:
255
-
256
- - Left indentation ≥ 720 twips (0.5 inch)
257
- - Presence of left border (single line)
258
-
259
- This detection method may produce false positives for documents with custom indentation similar to blockquotes.
260
-
261
- ### Code Marks
262
-
263
- The `code` mark is NOT automatically detected from monospace fonts (Consolas, Courier New, etc.). This is intentional to avoid false positives. Code marks should be explicitly added in the source document or through editor UI.
264
-
265
- ### Color Format
266
-
267
- All colors are imported as hex values (e.g., "#FF0000", "#008000"). Color names from the original document are not preserved.
268
-
269
- ### Image Limitations
270
-
271
- - Only embedded images are supported (external image links are not fetched)
272
- - Image dimensions and title are extracted from DOCX metadata
273
- - **Image Cropping**: By default, images are imported in full size (crop information is ignored)
274
- - To enable cropping, set `enableImageCrop: true` in options
275
- - In browser environments, cropping works natively with Canvas API
276
- - In Node.js, you must also provide `canvasImport` option with dynamic import of `@napi-rs/canvas`
277
- - If `@napi-rs/canvas` is not available in Node.js, images will be imported without cropping (graceful degradation)
278
- - Some DOCX image features (like advanced positioning or text wrapping) have limited support
279
-
280
- ### Table Cell Types
281
-
282
- DOCX format does not distinguish between header and body cells at a semantic level. All cells are imported as `tableCell` type for consistency. This is a DOCX format limitation.
283
-
284
- ## Contributing
285
-
286
- Contributions are welcome! Please read our [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/) and submit pull requests to the [main repository](https://github.com/DemoMacro/docen).
287
-
288
- ## License
289
-
290
- - [MIT](LICENSE) &copy; [Demo Macro](https://imst.xyz/)
1
+ # @docen/import-docx
2
+
3
+ ![npm version](https://img.shields.io/npm/v/@docen/import-docx)
4
+ ![npm downloads](https://img.shields.io/npm/dw/@docen/import-docx)
5
+ ![npm license](https://img.shields.io/npm/l/@docen/import-docx)
6
+
7
+ > Import Microsoft Word DOCX files to TipTap/ProseMirror content.
8
+
9
+ ## Features
10
+
11
+ - 📝 **Rich Text Parsing** - Accurate parsing of headings, paragraphs, and blockquotes with formatting
12
+ - 🖼️ **Image Extraction** - Automatic image extraction with base64 conversion and cropping support
13
+ - 📊 **Table Support** - Complete table structure with colspan/rowspan detection algorithm
14
+ - ✅ **Lists & Tasks** - Bullet lists, numbered lists with start number extraction, and task lists with checkbox detection
15
+ - 🎨 **Text Formatting** - Bold, italic, underline, strikethrough, subscript, superscript, and highlights
16
+ - 🎯 **Text Styles** - Comprehensive style support including colors, backgrounds, fonts, sizes, and line heights
17
+ - 🔗 **Links** - Hyperlink extraction with href preservation
18
+ - 💻 **Code Blocks** - Code block detection with language attribute extraction
19
+ - 🌐 **Cross-Platform** - Works in both browser and Node.js environments
20
+ - ✂️ **Image Cropping** - Optional cropping of images based on DOCX crop metadata (opt-in via `enableImageCrop`, disabled by default)
21
+ - 🧠 **Smart Parsing** - DOCX XML parsing with proper element grouping and structure reconstruction
22
+ - ⚡ **Fast Processing** - Uses fflate for ultra-fast ZIP decompression
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ # Install with npm
28
+ $ npm install @docen/import-docx
29
+
30
+ # Install with yarn
31
+ $ yarn add @docen/import-docx
32
+
33
+ # Install with pnpm
34
+ $ pnpm add @docen/import-docx
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```typescript
40
+ import { parseDOCX } from "@docen/import-docx";
41
+ import { readFileSync } from "node:fs";
42
+
43
+ // Read DOCX file
44
+ const buffer = readFileSync("document.docx");
45
+
46
+ // Parse DOCX to TipTap JSON
47
+ const content = await parseDOCX(buffer);
48
+
49
+ // Use in TipTap editor
50
+ editor.commands.setContent(content);
51
+ ```
52
+
53
+ ## API Reference
54
+
55
+ ### `parseDOCX(input, options?)`
56
+
57
+ Parses a DOCX file and converts it to TipTap/ProseMirror JSON content.
58
+
59
+ **Parameters:**
60
+
61
+ - `input: Buffer | ArrayBuffer | Uint8Array` - DOCX file data
62
+ - `options?: DocxImportOptions` - Optional import configuration
63
+
64
+ **Returns:** `Promise<JSONContent>` - TipTap/ProseMirror document content with images embedded
65
+
66
+ **Options:**
67
+
68
+ ```typescript
69
+ interface DocxImportOptions {
70
+ /** Custom image converter (default: embed as base64) */
71
+ convertImage?: (image: DocxImageInfo) => Promise<DocxImageResult>;
72
+
73
+ /** Whether to ignore empty paragraphs (default: false).
74
+ * Empty paragraphs are those without text content or images.
75
+ * Paragraphs containing only whitespace or images are not considered empty. */
76
+ ignoreEmptyParagraphs?: boolean;
77
+
78
+ /**
79
+ * Dynamic import function for @napi-rs/canvas
80
+ * Required for image cropping in Node.js environment, ignored in browser
81
+ *
82
+ * @example
83
+ * import { parseDOCX } from '@docen/import-docx';
84
+ * const content = await parseDOCX(buffer, {
85
+ * canvasImport: () => import('@napi-rs/canvas')
86
+ * });
87
+ */
88
+ canvasImport?: () => Promise<typeof import("@napi-rs/canvas")>;
89
+
90
+ /**
91
+ * Enable or disable image cropping during import
92
+ * When true, images with crop information in DOCX will be cropped
93
+ * When false (default), crop information is ignored and full image is used
94
+ *
95
+ * @default false
96
+ */
97
+ enableImageCrop?: boolean;
98
+ }
99
+ ```
100
+
101
+ **Default Image Converter:**
102
+
103
+ The package exports `defaultImageConverter` which embeds images as base64 data URLs:
104
+
105
+ ```typescript
106
+ import { defaultImageConverter } from "@docen/import-docx";
107
+
108
+ // Use in custom converter
109
+ await parseDOCX(buffer, {
110
+ convertImage: async (image) => {
111
+ if (shouldUploadToCDN) {
112
+ return uploadToCDN(image.data);
113
+ }
114
+ return defaultImageConverter(image);
115
+ },
116
+ });
117
+ ```
118
+
119
+ ## Supported Content Types
120
+
121
+ ### Text Formatting
122
+
123
+ - **Bold**, _Italic_, <u>Underline</u>, ~~Strikethrough~~
124
+ - ^Superscript^ and ~Subscript~
125
+ - Text highlights
126
+ - Text colors and background colors
127
+ - Font families and sizes
128
+ - Line heights
129
+
130
+ ### Block Elements
131
+
132
+ - **Headings** (H1-H6) with proper level detection
133
+ - **Paragraphs** with text alignment (left, right, center, justify)
134
+ - **Blockquotes** (Detected by indentation + left border formatting)
135
+ - **Horizontal Rules** (Detected as page breaks in DOCX)
136
+ - **Code Blocks** with language attribute support
137
+
138
+ ### Lists
139
+
140
+ - **Bullet Lists** with proper nesting and structure
141
+ - **Numbered Lists** with custom start number extraction
142
+ - **Task Lists** with checked/unchecked state detection (☐/☑ symbols)
143
+
144
+ ### Tables
145
+
146
+ - Complete table structure parsing
147
+ - **Table Cells** with colspan detection using grid-based algorithm
148
+ - **Table Cells** with rowspan detection using vMerge tracking
149
+ - Cell alignment and formatting preservation
150
+ - Merged cell handling (both horizontal and vertical)
151
+
152
+ ### Media & Embeds
153
+
154
+ - **Images** with automatic base64 conversion
155
+ - **Grouped Images** (DOCX image groups) support
156
+ - **Links** (hyperlinks) with href extraction
157
+
158
+ ## Parsing Algorithm
159
+
160
+ ### Document Structure
161
+
162
+ The parser follows a structured workflow:
163
+
164
+ 1. **Extract Relationships** - Parse `_rels/document.xml.rels` for hyperlinks and images
165
+ 2. **Parse Numbering** - Extract list definitions from `numbering.xml` (abstractNum → numFmt)
166
+ 3. **Process Document Body** - Iterate through document.xml elements:
167
+ - Detect content types (tables, lists, paragraphs, code blocks, etc.)
168
+ - Group consecutive elements into proper containers
169
+ - Convert XML nodes to TipTap JSON nodes
170
+
171
+ ### Table Processing
172
+
173
+ Tables use specialized algorithms:
174
+
175
+ - **Colspan Detection** - Grid-based algorithm tracks cell positions and detects horizontal merges
176
+ - **Rowspan Detection** - Vertical merge (vMerge) tracking across rows with proper cell skipping
177
+ - **Cell Content** - Recursive parsing of nested paragraphs and formatting
178
+ - **Hyperlink Support** - Proper handling of links within table cells
179
+
180
+ ### List Processing
181
+
182
+ Lists utilize the DOCX numbering system:
183
+
184
+ - **Numbering ID Mapping** - Maps abstractNum to formatting (bullet vs decimal)
185
+ - **Start Value Extraction** - Extracts and preserves start numbers for ordered lists
186
+ - **Nesting Preservation** - Maintains proper list hierarchy
187
+ - **Consecutive Grouping** - Groups consecutive list items into list containers
188
+
189
+ ## Examples
190
+
191
+ ### Basic Usage
192
+
193
+ ```typescript
194
+ import { parseDOCX } from "@docen/import-docx";
195
+
196
+ const buffer = readFileSync("example.docx");
197
+ const content = await parseDOCX(buffer);
198
+
199
+ console.log(JSON.stringify(content, null, 2));
200
+ ```
201
+
202
+ ### Use with TipTap Editor
203
+
204
+ ```typescript
205
+ import { Editor } from "@tiptap/core";
206
+ import { parseDOCX } from "@docen/import-docx";
207
+
208
+ const editor = new Editor({
209
+ extensions: [...],
210
+ content: "",
211
+ });
212
+
213
+ // Import DOCX file
214
+ async function importDocx(file: File) {
215
+ const buffer = await file.arrayBuffer();
216
+ const content = await parseDOCX(buffer);
217
+ editor.commands.setContent(content);
218
+ }
219
+ ```
220
+
221
+ ### Node.js Environment with Image Cropping
222
+
223
+ To enable image cropping in a Node.js environment, set `enableImageCrop: true` and provide `@napi-rs/canvas` through the `canvasImport` option:
224
+
225
+ ```typescript
226
+ import { parseDOCX } from "@docen/import-docx";
227
+ import { readFileSync } from "node:fs";
228
+
229
+ // Install @napi-rs/canvas first: pnpm add @napi-rs/canvas
230
+ const buffer = readFileSync("document.docx");
231
+
232
+ const content = await parseDOCX(buffer, {
233
+ canvasImport: () => import("@napi-rs/canvas"),
234
+ enableImageCrop: true, // Enable cropping (default is false)
235
+ });
236
+ ```
237
+
238
+ **Note:** By default, image cropping is disabled. Images are imported in full size, ignoring crop information in DOCX.
239
+
240
+ ### Disable Image Cropping
241
+
242
+ If you want to explicitly ignore crop information in DOCX and use full images (this is the default behavior):
243
+
244
+ ```typescript
245
+ const content = await parseDOCX(buffer, {
246
+ enableImageCrop: false,
247
+ });
248
+ ```
249
+
250
+ ## Known Limitations
251
+
252
+ ### Blockquote Detection
253
+
254
+ DOCX does not have a semantic blockquote structure. Blockquotes are detected by:
255
+
256
+ - Left indentation ≥ 720 twips (0.5 inch)
257
+ - Presence of left border (single line)
258
+
259
+ This detection method may produce false positives for documents with custom indentation similar to blockquotes.
260
+
261
+ ### Code Marks
262
+
263
+ The `code` mark is NOT automatically detected from monospace fonts (Consolas, Courier New, etc.). This is intentional to avoid false positives. Code marks should be added explicitly in the source document or through the editor UI.
264
+
265
+ ### Color Format
266
+
267
+ All colors are imported as hex values (e.g., "#FF0000", "#008000"). Color names from the original document are not preserved.
268
+
269
+ ### Image Limitations
270
+
271
+ - Only embedded images are supported (external image links are not fetched)
272
+ - Image dimensions and title are extracted from DOCX metadata
273
+ - **Image Cropping**: By default, images are imported in full size (crop information is ignored)
274
+ - To enable cropping, set `enableImageCrop: true` in options
275
+ - In browser environments, cropping works natively with Canvas API
276
+ - In Node.js, you must also provide `canvasImport` option with dynamic import of `@napi-rs/canvas`
277
+ - If `@napi-rs/canvas` is not available in Node.js, images will be imported without cropping (graceful degradation)
278
+ - Some DOCX image features (like advanced positioning or text wrapping) have limited support
279
+
280
+ ### Table Cell Types
281
+
282
+ DOCX format does not distinguish between header and body cells at a semantic level. All cells are imported as `tableCell` type for consistency. This is a DOCX format limitation.
283
+
284
+ ## Contributing
285
+
286
+ Contributions are welcome! Please read our [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/) and submit pull requests to the [main repository](https://github.com/DemoMacro/docen).
287
+
288
+ ## License
289
+
290
+ - [MIT](LICENSE) &copy; [Demo Macro](https://imst.xyz/)
package/dist/index.d.mts CHANGED
@@ -3672,7 +3672,7 @@ point into textblock nodes. It can be empty (a regular cursor
3672
3672
  position).
3673
3673
  */
3674
3674
  //#endregion
3675
- //#region ../../node_modules/.pnpm/@tiptap+core@3.20.1_@tiptap+pm@3.20.1/node_modules/@tiptap/core/dist/index.d.ts
3675
+ //#region ../../node_modules/.pnpm/@tiptap+core@3.20.0_@tiptap+pm@3.20.0/node_modules/@tiptap/core/dist/index.d.ts
3676
3676
  type StringKeyOf<T> = Extract<keyof T, string>;
3677
3677
  type CallbackType<T extends Record<string, any>, EventName extends StringKeyOf<T>> = T[EventName] extends any[] ? T[EventName] : [T[EventName]];
3678
3678
  type CallbackFunction<T extends Record<string, any>, EventName extends StringKeyOf<T>> = (...props: CallbackType<T, EventName>) => any;
@@ -6643,45 +6643,6 @@ interface DocxImportOptions {
6643
6643
  ignoreEmptyParagraphs?: boolean;
6644
6644
  }
6645
6645
  //#endregion
6646
- //#region src/parsers/styles.d.ts
6647
- /**
6648
- * Character format information from a style definition
6649
- */
6650
- interface CharFormat {
6651
- color?: string;
6652
- bold?: boolean;
6653
- italic?: boolean;
6654
- fontSize?: number;
6655
- fontFamily?: string;
6656
- underline?: boolean;
6657
- strike?: boolean;
6658
- }
6659
- /**
6660
- * Style information from styles.xml
6661
- */
6662
- interface StyleInfo {
6663
- styleId: string;
6664
- name?: string;
6665
- outlineLvl?: number;
6666
- charFormat?: CharFormat;
6667
- }
6668
- type StyleMap = Map<string, StyleInfo>;
6669
- //#endregion
6670
- //#region src/parser.d.ts
6671
- /**
6672
- * Parsing context containing all global resources from DOCX file
6673
- */
6674
- interface ParseContext extends DocxImportOptions {
6675
- hyperlinks: Map<string, string>;
6676
- images: Map<string, ImageInfo>;
6677
- listTypeMap: ListTypeMap;
6678
- styleMap: StyleMap;
6679
- }
6680
- /**
6681
- * Main entry point: Parse DOCX file and convert to TipTap JSON
6682
- */
6683
- declare function parseDOCX(input: DataType, options?: DocxImportOptions): Promise<JSONContent>;
6684
- //#endregion
6685
6646
  //#region ../../node_modules/.pnpm/@types+unist@3.0.3/node_modules/@types/unist/index.d.ts
6686
6647
  // ## Interfaces
6687
6648
  /**
@@ -7016,6 +6977,86 @@ interface Text extends Literal {
7016
6977
  */
7017
6978
  interface TextData extends Data {}
7018
6979
  //#endregion
6980
+ //#region ../extensions/dist/types.d.mts
6981
+ //#endregion
6982
+ //#region src/types.d.ts
6983
+ /**
6984
+ * Border definition (compatible with docx.js BorderOptions)
6985
+ * Used by paragraphs, table cells, and blockquotes
6986
+ */
6987
+ interface Border {
6988
+ /** Border color (hex; values parsed from styles.xml carry a leading "#", e.g., "#FF0000", and "auto" is omitted) */
6989
+ color?: string;
6990
+ /** Border size (eighth-points, 1/8 pt) */
6991
+ size?: number;
6992
+ /** Border style */
6993
+ style?: "single" | "dashed" | "dotted" | "double" | "dotDash" | "dotDotDash" | "none";
6994
+ /** Space between border and content (points) */
6995
+ space?: number;
6996
+ }
6997
+ /**
6998
+ * Shading definition (compatible with docx.js ShadingOptions)
6999
+ * Used for paragraph and table cell background colors
7000
+ */
7001
+ interface Shading {
7002
+ /** Fill color (hex; parsed values are normalized to a leading "#", e.g., "#FF0000") */
7003
+ fill?: string;
7004
+ /** Pattern color (hex without #) */
7005
+ color?: string;
7006
+ /** Shading pattern type (e.g., "clear", "percent-10") */
7007
+ type?: string;
7008
+ }
7009
+ //#endregion
7010
+ //#region src/parsers/styles.d.ts
7011
+ /**
7012
+ * Character format information from a style definition
7013
+ */
7014
+ interface CharFormat {
7015
+ color?: string;
7016
+ bold?: boolean;
7017
+ italic?: boolean;
7018
+ fontSize?: number;
7019
+ fontFamily?: string;
7020
+ underline?: boolean;
7021
+ strike?: boolean;
7022
+ }
7023
+ /**
7024
+ * Paragraph format information from a style definition
7025
+ */
7026
+ interface ParagraphFormat {
7027
+ shading?: Shading;
7028
+ borderTop?: Border;
7029
+ borderBottom?: Border;
7030
+ borderLeft?: Border;
7031
+ borderRight?: Border;
7032
+ }
7033
+ /**
7034
+ * Style information from styles.xml
7035
+ */
7036
+ interface StyleInfo {
7037
+ styleId: string;
7038
+ name?: string;
7039
+ outlineLvl?: number;
7040
+ charFormat?: CharFormat;
7041
+ paragraphFormat?: ParagraphFormat;
7042
+ }
7043
+ type StyleMap = Map<string, StyleInfo>;
7044
+ //#endregion
7045
+ //#region src/parser.d.ts
7046
+ /**
7047
+ * Parsing context containing all global resources from DOCX file
7048
+ */
7049
+ interface ParseContext extends DocxImportOptions {
7050
+ hyperlinks: Map<string, string>;
7051
+ images: Map<string, ImageInfo>;
7052
+ listTypeMap: ListTypeMap;
7053
+ styleMap: StyleMap;
7054
+ }
7055
+ /**
7056
+ * Main entry point: Parse DOCX file and convert to TipTap JSON
7057
+ */
7058
+ declare function parseDOCX(input: DataType, options?: DocxImportOptions): Promise<JSONContent>;
7059
+ //#endregion
7019
7060
  //#region src/converters/paragraph.d.ts
7020
7061
  /**
7021
7062
  * Convert DOCX paragraph node to TipTap paragraph
package/dist/index.mjs CHANGED
@@ -854,6 +854,65 @@ function parseNumberingXml(files) {
854
854
  //#endregion
855
855
  //#region src/parsers/styles.ts
856
856
  /**
857
+ * Parse a single border element
858
+ */
859
+ function parseBorder(borderNode) {
860
+ if (!borderNode) return null;
861
+ const val = borderNode.attributes["w:val"];
862
+ const size = borderNode.attributes["w:sz"];
863
+ const color = borderNode.attributes["w:color"];
864
+ const space = borderNode.attributes["w:space"];
865
+ const styleMap = {
866
+ single: "single",
867
+ dashed: "dashed",
868
+ dotted: "dotted",
869
+ double: "double",
870
+ dotDash: "dotDash",
871
+ dotDotDash: "dotDotDash",
872
+ none: "none",
873
+ nil: "none"
874
+ };
875
+ const border = {};
876
+ if (color && color !== "auto") border.color = `#${color}`;
877
+ if (size) border.size = parseInt(size);
878
+ if (val && styleMap[val]) border.style = styleMap[val];
879
+ if (space) border.space = parseInt(space);
880
+ return Object.keys(border).length > 0 ? border : null;
881
+ }
882
+ /**
883
+ * Parse paragraph borders from the w:pBorders or w:pBdr child of a w:pPr element
884
+ */
885
+ function parseBorders(pPr) {
886
+ if (!pPr) return null;
887
+ const borderElement = findChild(pPr, "w:pBorders") || findChild(pPr, "w:pBdr");
888
+ if (!borderElement) return null;
889
+ const borders = {};
890
+ const topBorder = parseBorder(findChild(borderElement, "w:top"));
891
+ if (topBorder) borders.borderTop = topBorder;
892
+ const bottomBorder = parseBorder(findChild(borderElement, "w:bottom"));
893
+ if (bottomBorder) borders.borderBottom = bottomBorder;
894
+ const leftBorder = parseBorder(findChild(borderElement, "w:left"));
895
+ if (leftBorder) borders.borderLeft = leftBorder;
896
+ const rightBorder = parseBorder(findChild(borderElement, "w:right"));
897
+ if (rightBorder) borders.borderRight = rightBorder;
898
+ return Object.keys(borders).length > 0 ? borders : null;
899
+ }
900
+ /**
901
+ * Parse shading from w:shd element
902
+ */
903
+ function parseShading(pPr) {
904
+ if (!pPr) return null;
905
+ const shd = findChild(pPr, "w:shd");
906
+ if (!shd) return null;
907
+ const shading = {};
908
+ if (shd.attributes["w:fill"]) {
909
+ const fill = shd.attributes["w:fill"];
910
+ shading.fill = fill.startsWith("#") ? fill : `#${fill}`;
911
+ }
912
+ if (shd.attributes["w:val"]) shading.type = shd.attributes["w:val"];
913
+ return Object.keys(shading).length > 0 ? shading : null;
914
+ }
915
+ /**
857
916
  * Parse styles.xml to build style map
858
917
  * Extracts outlineLvl from paragraph styles to identify headings
859
918
  * Extracts character format (color, bold, etc.) from style definitions
@@ -875,6 +934,14 @@ function parseStylesXml(files) {
875
934
  if (pPr) {
876
935
  const outlineLvl = findChild(pPr, "w:outlineLvl");
877
936
  if (outlineLvl?.attributes["w:val"] !== void 0) styleInfo.outlineLvl = parseInt(outlineLvl.attributes["w:val"], 10);
937
+ const borders = parseBorders(pPr);
938
+ const shading = parseShading(pPr);
939
+ if (borders || shading) {
940
+ const paragraphFormat = {};
941
+ if (borders) Object.assign(paragraphFormat, borders);
942
+ if (shading) paragraphFormat.shading = shading;
943
+ if (Object.keys(paragraphFormat).length > 0) styleInfo.paragraphFormat = paragraphFormat;
944
+ }
878
945
  }
879
946
  const rPr = findChild(style, "w:rPr");
880
947
  if (rPr) {
@@ -884,10 +951,26 @@ function parseStylesXml(files) {
884
951
  const colorVal = color.attributes["w:val"];
885
952
  charFormat.color = colorVal.startsWith("#") ? colorVal : `#${colorVal}`;
886
953
  }
887
- if (findChild(rPr, "w:b")) charFormat.bold = true;
888
- if (findChild(rPr, "w:i")) charFormat.italic = true;
889
- if (findChild(rPr, "w:u")) charFormat.underline = true;
890
- if (findChild(rPr, "w:strike")) charFormat.strike = true;
954
+ const bold = findChild(rPr, "w:b");
955
+ if (bold) {
956
+ const val = bold.attributes["w:val"];
957
+ if (val !== "0" && val !== "false") charFormat.bold = true;
958
+ }
959
+ const italic = findChild(rPr, "w:i");
960
+ if (italic) {
961
+ const val = italic.attributes["w:val"];
962
+ if (val !== "0" && val !== "false") charFormat.italic = true;
963
+ }
964
+ const underline = findChild(rPr, "w:u");
965
+ if (underline) {
966
+ const val = underline.attributes["w:val"];
967
+ if (val !== "none" && val !== "false" && val !== "0") charFormat.underline = true;
968
+ }
969
+ const strike = findChild(rPr, "w:strike");
970
+ if (strike) {
971
+ const val = strike.attributes["w:val"];
972
+ if (val !== "0" && val !== "false") charFormat.strike = true;
973
+ }
891
974
  const sz = findChild(rPr, "w:sz");
892
975
  if (sz?.attributes["w:val"]) {
893
976
  const sizeVal = sz.attributes["w:val"];
@@ -902,6 +985,48 @@ function parseStylesXml(files) {
902
985
  }
903
986
  return styleMap;
904
987
  }
988
/**
 * Extract all paragraph style attributes from a paragraph element.
 *
 * Style-based paragraph formatting (shading and the four borders taken from
 * `styleInfo.paragraphFormat`) is applied first; direct paragraph properties
 * found under `w:pPr` are applied afterwards and override the style values.
 *
 * Style-based formatting is honoured even when the paragraph carries no
 * `w:pPr` of its own, so inherited shading/borders are not silently dropped.
 *
 * @param node - DOCX paragraph element to inspect
 * @param styleInfo - optional style entry; only `paragraphFormat` is read
 * @returns an object holding the collected attributes, or `null` when neither
 *   the style nor the paragraph contributed anything
 */
function extractParagraphStyles(node, styleInfo) {
  const result = {};

  // Style-level formatting goes in first so direct properties can override it.
  const styleFormat = styleInfo?.paragraphFormat;
  if (styleFormat) {
    for (const key of ["shading", "borderTop", "borderBottom", "borderLeft", "borderRight"]) {
      if (styleFormat[key]) result[key] = styleFormat[key];
    }
  }

  const pPr = findChild(node, "w:pPr");
  if (pPr) {
    // Values from parseTwipAttr are parsed as base-10 integers before conversion.
    const toCss = (raw) => convertTwipToCssString(parseInt(raw, 10));

    const ind = findChild(pPr, "w:ind");
    if (ind) {
      const left = parseTwipAttr(ind.attributes, "w:left");
      if (left) result.indentLeft = toCss(left);
      const right = parseTwipAttr(ind.attributes, "w:right");
      if (right) result.indentRight = toCss(right);
      const firstLine = parseTwipAttr(ind.attributes, "w:firstLine");
      if (firstLine) {
        result.indentFirstLine = toCss(firstLine);
      } else {
        // No explicit first-line indent: derive it from the hanging indent as
        // (left indent, or 0 when absent) minus the hanging amount.
        const hanging = parseTwipAttr(ind.attributes, "w:hanging");
        if (hanging) {
          const leftTwips = left ? parseInt(left, 10) : 0;
          result.indentFirstLine = convertTwipToCssString(leftTwips - parseInt(hanging, 10));
        }
      }
    }

    const spacing = findChild(pPr, "w:spacing");
    if (spacing) {
      const before = parseTwipAttr(spacing.attributes, "w:before");
      if (before) result.spacingBefore = toCss(before);
      const after = parseTwipAttr(spacing.attributes, "w:after");
      if (after) result.spacingAfter = toCss(after);
    }

    // Direct shading/borders replace whatever the style supplied above.
    const shading = parseShading(pPr);
    if (shading) result.shading = shading;
    const borders = parseBorders(pPr);
    if (borders) Object.assign(result, borders);
  }

  return Object.keys(result).length > 0 ? result : null;
}
905
1030
  //#endregion
906
1031
  //#region src/converters/text.ts
907
1032
  /**
@@ -992,13 +1117,27 @@ function extractMarks(run, styleInfo) {
992
1117
  if (styleInfo?.charFormat) mergedFormat = { ...styleInfo.charFormat };
993
1118
  if (rPr) {
994
1119
  const boldEl = findChild(rPr, "w:b");
995
- if (boldEl) if (boldEl.attributes["w:val"] === "false") mergedFormat.bold = false;
996
- else mergedFormat.bold = true;
1120
+ if (boldEl) {
1121
+ const val = boldEl.attributes["w:val"];
1122
+ if (val === "0" || val === "false") mergedFormat.bold = false;
1123
+ else mergedFormat.bold = true;
1124
+ }
997
1125
  const italicEl = findChild(rPr, "w:i");
998
- if (italicEl) if (italicEl.attributes["w:val"] === "false") mergedFormat.italic = false;
999
- else mergedFormat.italic = true;
1000
- if (findChild(rPr, "w:u")) mergedFormat.underline = true;
1001
- if (findChild(rPr, "w:strike")) mergedFormat.strike = true;
1126
+ if (italicEl) {
1127
+ const val = italicEl.attributes["w:val"];
1128
+ if (val === "0" || val === "false") mergedFormat.italic = false;
1129
+ else mergedFormat.italic = true;
1130
+ }
1131
+ const underlineEl = findChild(rPr, "w:u");
1132
+ if (underlineEl) {
1133
+ const val = underlineEl.attributes["w:val"];
1134
+ if (val !== "none" && val !== "false" && val !== "0") mergedFormat.underline = true;
1135
+ }
1136
+ const strikeEl = findChild(rPr, "w:strike");
1137
+ if (strikeEl) {
1138
+ const val = strikeEl.attributes["w:val"];
1139
+ if (val !== "0" && val !== "false") mergedFormat.strike = true;
1140
+ }
1002
1141
  const colorEl = findChild(rPr, "w:color");
1003
1142
  if (colorEl?.attributes["w:val"] && colorEl.attributes["w:val"] !== "auto") {
1004
1143
  const colorVal = colorEl.attributes["w:val"];
@@ -1061,35 +1200,6 @@ function extractAlignment(paragraph) {
1061
1200
  //#endregion
1062
1201
  //#region src/converters/paragraph.ts
1063
1202
  /**
1064
- * Extract paragraph style attributes from DOCX paragraph properties
1065
- */
1066
- function extractParagraphStyles(node) {
1067
- const pPr = findChild(node, "w:pPr");
1068
- if (!pPr) return null;
1069
- const result = {};
1070
- const ind = findChild(pPr, "w:ind");
1071
- if (ind) {
1072
- const left = parseTwipAttr(ind.attributes, "w:left");
1073
- if (left) result.indentLeft = convertTwipToCssString(parseInt(left, 10));
1074
- const right = parseTwipAttr(ind.attributes, "w:right");
1075
- if (right) result.indentRight = convertTwipToCssString(parseInt(right, 10));
1076
- const firstLine = parseTwipAttr(ind.attributes, "w:firstLine");
1077
- if (firstLine) result.indentFirstLine = convertTwipToCssString(parseInt(firstLine, 10));
1078
- else {
1079
- const hanging = parseTwipAttr(ind.attributes, "w:hanging");
1080
- if (hanging) result.indentFirstLine = convertTwipToCssString((left ? parseInt(left, 10) : 0) - parseInt(hanging, 10));
1081
- }
1082
- }
1083
- const spacing = findChild(pPr, "w:spacing");
1084
- if (spacing) {
1085
- const before = parseTwipAttr(spacing.attributes, "w:before");
1086
- if (before) result.spacingBefore = convertTwipToCssString(parseInt(before, 10));
1087
- const after = parseTwipAttr(spacing.attributes, "w:after");
1088
- if (after) result.spacingAfter = convertTwipToCssString(parseInt(after, 10));
1089
- }
1090
- return Object.keys(result).length ? result : null;
1091
- }
1092
- /**
1093
1203
  * Convert DOCX paragraph node to TipTap paragraph
1094
1204
  */
1095
1205
  async function convertParagraph(node, params) {
@@ -1109,7 +1219,7 @@ async function convertParagraph(node, params) {
1109
1219
  });
1110
1220
  const attrs = {
1111
1221
  ...extractAlignment(node),
1112
- ...extractParagraphStyles(node)
1222
+ ...extractParagraphStyles(node, styleInfo)
1113
1223
  };
1114
1224
  if (checkForPageBreak(node)) {
1115
1225
  const filteredRuns = runs.filter((run) => run.type !== "hardBreak");
@@ -1159,7 +1269,7 @@ async function convertHeading(node, params, styleInfo, level) {
1159
1269
  type: "heading",
1160
1270
  attrs: {
1161
1271
  level,
1162
- ...extractParagraphStyles(node)
1272
+ ...extractParagraphStyles(node, styleInfo)
1163
1273
  },
1164
1274
  content: await extractRuns(node, {
1165
1275
  context: params.context,
@@ -1170,31 +1280,6 @@ async function convertHeading(node, params, styleInfo, level) {
1170
1280
  //#endregion
1171
1281
  //#region src/parsers/table.ts
1172
1282
  /**
1173
- * Parse a single border element
1174
- */
1175
- function parseBorder(borderNode) {
1176
- if (!borderNode) return null;
1177
- const val = borderNode.attributes["w:val"];
1178
- const size = borderNode.attributes["w:sz"];
1179
- const color = borderNode.attributes["w:color"];
1180
- const styleMap = {
1181
- single: "solid",
1182
- dashed: "dashed",
1183
- dotted: "dotted",
1184
- double: "double",
1185
- none: "none",
1186
- nil: "none"
1187
- };
1188
- const border = {};
1189
- if (color && color !== "auto") border.color = `#${color}`;
1190
- if (size) {
1191
- const eighthPoints = parseInt(size);
1192
- if (!isNaN(eighthPoints)) border.width = Math.round(eighthPoints / 6);
1193
- }
1194
- if (val && styleMap[val]) border.style = styleMap[val];
1195
- return Object.keys(border).length > 0 ? border : null;
1196
- }
1197
- /**
1198
1283
  * Get table properties (cell margins)
1199
1284
  */
1200
1285
  function parseTableProperties(tableNode) {
@@ -1640,7 +1725,8 @@ async function convertList(startElement, siblings, startIndex, params, processed
1640
1725
  const listInfo = getListInfo(startElement);
1641
1726
  if (!listInfo) return await convertParagraph(startElement, params);
1642
1727
  const listTypeInfo = params.context.listTypeMap.get(listInfo.numId);
1643
- const listType = listTypeInfo?.type || "bullet";
1728
+ if (!listTypeInfo) return await convertParagraph(startElement, params);
1729
+ const listType = listTypeInfo.type;
1644
1730
  const items = [];
1645
1731
  let i = startIndex;
1646
1732
  while (i < siblings.length) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@docen/import-docx",
3
- "version": "0.0.12",
3
+ "version": "0.0.13",
4
4
  "description": "A powerful TipTap/ProseMirror extension that imports Microsoft Word DOCX files to editor content",
5
5
  "keywords": [
6
6
  "converter",
@@ -51,10 +51,10 @@
51
51
  "xast-util-from-xml": "4.0.0"
52
52
  },
53
53
  "devDependencies": {
54
- "@tiptap/core": "3.20.1",
54
+ "@tiptap/core": "3.20.0",
55
55
  "@types/xast": "2.0.4",
56
- "@docen/extensions": "0.0.12",
57
- "@docen/utils": "0.0.12"
56
+ "@docen/extensions": "0.0.13",
57
+ "@docen/utils": "0.0.13"
58
58
  },
59
59
  "peerDependencies": {
60
60
  "@napi-rs/canvas": "^0.1.88"