@docen/import-docx 0.0.11 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +290 -290
- package/dist/index.d.mts +81 -40
- package/dist/index.mjs +153 -67
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -1,290 +1,290 @@
|
|
|
1
|
-
# @docen/import-docx
|
|
2
|
-
|
|
3
|
-

|
|
4
|
-

|
|
5
|
-

|
|
6
|
-
|
|
7
|
-
> Import Microsoft Word DOCX files to TipTap/ProseMirror content.
|
|
8
|
-
|
|
9
|
-
## Features
|
|
10
|
-
|
|
11
|
-
- 📝 **Rich Text Parsing** - Accurate parsing of headings, paragraphs, and blockquotes with formatting
|
|
12
|
-
- 🖼️ **Image Extraction** - Automatic image extraction with base64 conversion and cropping support
|
|
13
|
-
- 📊 **Table Support** - Complete table structure with colspan/rowspan detection algorithm
|
|
14
|
-
- ✅ **Lists & Tasks** - Bullet lists, numbered lists with start number extraction, and task lists with checkbox detection
|
|
15
|
-
- 🎨 **Text Formatting** - Bold, italic, underline, strikethrough, subscript, superscript, and highlights
|
|
16
|
-
- 🎯 **Text Styles** - Comprehensive style support including colors, backgrounds, fonts, sizes, and line heights
|
|
17
|
-
- 🔗 **Links** - Hyperlink extraction with href preservation
|
|
18
|
-
- 💻 **Code Blocks** - Code block detection with language attribute extraction
|
|
19
|
-
- 🌐 **Cross-Platform** - Works in both browser and Node.js environments
|
|
20
|
-
- ✂️ **Image Cropping** - Automatic cropping of images based on DOCX crop metadata
|
|
21
|
-
- 🧠 **Smart Parsing** - DOCX XML parsing with proper element grouping and structure reconstruction
|
|
22
|
-
- ⚡ **Fast Processing** - Uses fflate for ultra-fast ZIP decompression
|
|
23
|
-
|
|
24
|
-
## Installation
|
|
25
|
-
|
|
26
|
-
```bash
|
|
27
|
-
# Install with npm
|
|
28
|
-
$ npm install @docen/import-docx
|
|
29
|
-
|
|
30
|
-
# Install with yarn
|
|
31
|
-
$ yarn add @docen/import-docx
|
|
32
|
-
|
|
33
|
-
# Install with pnpm
|
|
34
|
-
$ pnpm add @docen/import-docx
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
## Quick Start
|
|
38
|
-
|
|
39
|
-
```typescript
|
|
40
|
-
import { parseDOCX } from "@docen/import-docx";
|
|
41
|
-
import { readFileSync } from "node:fs";
|
|
42
|
-
|
|
43
|
-
// Read DOCX file
|
|
44
|
-
const buffer = readFileSync("document.docx");
|
|
45
|
-
|
|
46
|
-
// Parse DOCX to TipTap JSON
|
|
47
|
-
const content = await parseDOCX(buffer);
|
|
48
|
-
|
|
49
|
-
// Use in TipTap editor
|
|
50
|
-
editor.commands.setContent(content);
|
|
51
|
-
```
|
|
52
|
-
|
|
53
|
-
## API Reference
|
|
54
|
-
|
|
55
|
-
### `parseDOCX(input, options?)`
|
|
56
|
-
|
|
57
|
-
Parses a DOCX file and converts it to TipTap/ProseMirror JSON content.
|
|
58
|
-
|
|
59
|
-
**Parameters:**
|
|
60
|
-
|
|
61
|
-
- `input: Buffer | ArrayBuffer | Uint8Array` - DOCX file data
|
|
62
|
-
- `options?: DocxImportOptions` - Optional import configuration
|
|
63
|
-
|
|
64
|
-
**Returns:** `Promise<JSONContent>` - TipTap/ProseMirror document content with images embedded
|
|
65
|
-
|
|
66
|
-
**Options:**
|
|
67
|
-
|
|
68
|
-
```typescript
|
|
69
|
-
interface DocxImportOptions {
|
|
70
|
-
/** Custom image converter (default: embed as base64) */
|
|
71
|
-
convertImage?: (image: DocxImageInfo) => Promise<DocxImageResult>;
|
|
72
|
-
|
|
73
|
-
/** Whether to ignore empty paragraphs (default: false).
|
|
74
|
-
* Empty paragraphs are those without text content or images.
|
|
75
|
-
* Paragraphs containing only whitespace or images are not considered empty. */
|
|
76
|
-
ignoreEmptyParagraphs?: boolean;
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Dynamic import function for @napi-rs/canvas
|
|
80
|
-
* Required for image cropping in Node.js environment, ignored in browser
|
|
81
|
-
*
|
|
82
|
-
* @example
|
|
83
|
-
* import { parseDOCX } from '@docen/import-docx';
|
|
84
|
-
* const content = await parseDOCX(buffer, {
|
|
85
|
-
* canvasImport: () => import('@napi-rs/canvas')
|
|
86
|
-
* });
|
|
87
|
-
*/
|
|
88
|
-
canvasImport?: () => Promise<typeof import("@napi-rs/canvas")>;
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Enable or disable image cropping during import
|
|
92
|
-
* When true, images with crop information in DOCX will be cropped
|
|
93
|
-
* When false (default), crop information is ignored and full image is used
|
|
94
|
-
*
|
|
95
|
-
* @default false
|
|
96
|
-
*/
|
|
97
|
-
enableImageCrop?: boolean;
|
|
98
|
-
}
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
**Default Image Converter:**
|
|
102
|
-
|
|
103
|
-
The package exports `defaultImageConverter` which embeds images as base64 data URLs:
|
|
104
|
-
|
|
105
|
-
```typescript
|
|
106
|
-
import { defaultImageConverter } from "@docen/import-docx";
|
|
107
|
-
|
|
108
|
-
// Use in custom converter
|
|
109
|
-
await parseDOCX(buffer, {
|
|
110
|
-
convertImage: async (image) => {
|
|
111
|
-
if (shouldUploadToCDN) {
|
|
112
|
-
return uploadToCDN(image.data);
|
|
113
|
-
}
|
|
114
|
-
return defaultImageConverter(image);
|
|
115
|
-
},
|
|
116
|
-
});
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
## Supported Content Types
|
|
120
|
-
|
|
121
|
-
### Text Formatting
|
|
122
|
-
|
|
123
|
-
- **Bold**, _Italic_, <u>Underline</u>, ~~Strikethrough~~
|
|
124
|
-
- ^Superscript^ and ~Subscript~
|
|
125
|
-
- Text highlights
|
|
126
|
-
- Text colors and background colors
|
|
127
|
-
- Font families and sizes
|
|
128
|
-
- Line heights
|
|
129
|
-
|
|
130
|
-
### Block Elements
|
|
131
|
-
|
|
132
|
-
- **Headings** (H1-H6) with proper level detection
|
|
133
|
-
- **Paragraphs** with text alignment (left, right, center, justify)
|
|
134
|
-
- **Blockquotes** (Detected by indentation + left border formatting)
|
|
135
|
-
- **Horizontal Rules** (Detected as page breaks in DOCX)
|
|
136
|
-
- **Code Blocks** with language attribute support
|
|
137
|
-
|
|
138
|
-
### Lists
|
|
139
|
-
|
|
140
|
-
- **Bullet Lists** with proper nesting and structure
|
|
141
|
-
- **Numbered Lists** with custom start number extraction
|
|
142
|
-
- **Task Lists** with checked/unchecked state detection (☐/☑ symbols)
|
|
143
|
-
|
|
144
|
-
### Tables
|
|
145
|
-
|
|
146
|
-
- Complete table structure parsing
|
|
147
|
-
- **Table Cells** with colspan detection using grid-based algorithm
|
|
148
|
-
- **Table Cells** with rowspan detection using vMerge tracking
|
|
149
|
-
- Cell alignment and formatting preservation
|
|
150
|
-
- Merged cell handling (both horizontal and vertical)
|
|
151
|
-
|
|
152
|
-
### Media & Embeds
|
|
153
|
-
|
|
154
|
-
- **Images** with automatic base64 conversion
|
|
155
|
-
- **Grouped Images** (DOCX image groups) support
|
|
156
|
-
- **Links** (hyperlinks) with href extraction
|
|
157
|
-
|
|
158
|
-
## Parsing Algorithm
|
|
159
|
-
|
|
160
|
-
### Document Structure
|
|
161
|
-
|
|
162
|
-
The parser follows a structured workflow:
|
|
163
|
-
|
|
164
|
-
1. **Extract Relationships** - Parse `_rels/document.xml.rels` for hyperlinks and images
|
|
165
|
-
2. **Parse Numbering** - Extract list definitions from `numbering.xml` (abstractNum → numFmt)
|
|
166
|
-
3. **Process Document Body** - Iterate through document.xml elements:
|
|
167
|
-
- Detect content types (tables, lists, paragraphs, code blocks, etc.)
|
|
168
|
-
- Group consecutive elements into proper containers
|
|
169
|
-
- Convert XML nodes to TipTap JSON nodes
|
|
170
|
-
|
|
171
|
-
### Table Processing
|
|
172
|
-
|
|
173
|
-
Tables use specialized algorithms:
|
|
174
|
-
|
|
175
|
-
- **Colspan Detection** - Grid-based algorithm tracks cell positions and detects horizontal merges
|
|
176
|
-
- **Rowspan Detection** - Vertical merge (vMerge) tracking across rows with proper cell skipping
|
|
177
|
-
- **Cell Content** - Recursive parsing of nested paragraphs and formatting
|
|
178
|
-
- **Hyperlink Support** - Proper handling of links within table cells
|
|
179
|
-
|
|
180
|
-
### List Processing
|
|
181
|
-
|
|
182
|
-
Lists utilize the DOCX numbering system:
|
|
183
|
-
|
|
184
|
-
- **Numbering ID Mapping** - Maps abstractNum to formatting (bullet vs decimal)
|
|
185
|
-
- **Start Value Extraction** - Extracts and preserves start numbers for ordered lists
|
|
186
|
-
- **Nesting Preservation** - Maintains proper list hierarchy
|
|
187
|
-
- **Consecutive Grouping** - Groups consecutive list items into list containers
|
|
188
|
-
|
|
189
|
-
## Examples
|
|
190
|
-
|
|
191
|
-
### Basic Usage
|
|
192
|
-
|
|
193
|
-
```typescript
|
|
194
|
-
import { parseDOCX } from "@docen/import-docx";
|
|
195
|
-
|
|
196
|
-
const buffer = readFileSync("example.docx");
|
|
197
|
-
const content = await parseDOCX(buffer);
|
|
198
|
-
|
|
199
|
-
console.log(JSON.stringify(content, null, 2));
|
|
200
|
-
```
|
|
201
|
-
|
|
202
|
-
### Use with TipTap Editor
|
|
203
|
-
|
|
204
|
-
```typescript
|
|
205
|
-
import { Editor } from "@tiptap/core";
|
|
206
|
-
import { parseDOCX } from "@docen/import-docx";
|
|
207
|
-
|
|
208
|
-
const editor = new Editor({
|
|
209
|
-
extensions: [...],
|
|
210
|
-
content: "",
|
|
211
|
-
});
|
|
212
|
-
|
|
213
|
-
// Import DOCX file
|
|
214
|
-
async function importDocx(file: File) {
|
|
215
|
-
const buffer = await file.arrayBuffer();
|
|
216
|
-
const content = await parseDOCX(buffer);
|
|
217
|
-
editor.commands.setContent(content);
|
|
218
|
-
}
|
|
219
|
-
```
|
|
220
|
-
|
|
221
|
-
### Node.js Environment with Image Cropping
|
|
222
|
-
|
|
223
|
-
To enable image cropping in Node.js environment, you need to provide `@napi-rs/canvas`:
|
|
224
|
-
|
|
225
|
-
```typescript
|
|
226
|
-
import { parseDOCX } from "@docen/import-docx";
|
|
227
|
-
import { readFileSync } from "node:fs";
|
|
228
|
-
|
|
229
|
-
// Install @napi-rs/canvas first: pnpm add @napi-rs/canvas
|
|
230
|
-
const buffer = readFileSync("document.docx");
|
|
231
|
-
|
|
232
|
-
const content = await parseDOCX(buffer, {
|
|
233
|
-
canvasImport: () => import("@napi-rs/canvas"),
|
|
234
|
-
enableImageCrop: true, // Enable cropping (default is false)
|
|
235
|
-
});
|
|
236
|
-
```
|
|
237
|
-
|
|
238
|
-
**Note:** By default, image cropping is disabled. Images are imported in full size, ignoring crop information in DOCX.
|
|
239
|
-
|
|
240
|
-
### Disable Image Cropping
|
|
241
|
-
|
|
242
|
-
If you want to explicitly ignore crop information in DOCX and use full images (this is the default behavior):
|
|
243
|
-
|
|
244
|
-
```typescript
|
|
245
|
-
const content = await parseDOCX(buffer, {
|
|
246
|
-
enableImageCrop: false,
|
|
247
|
-
});
|
|
248
|
-
```
|
|
249
|
-
|
|
250
|
-
## Known Limitations
|
|
251
|
-
|
|
252
|
-
### Blockquote Detection
|
|
253
|
-
|
|
254
|
-
DOCX does not have a semantic blockquote structure. Blockquotes are detected by:
|
|
255
|
-
|
|
256
|
-
- Left indentation ≥ 720 twips (0.5 inch)
|
|
257
|
-
- Presence of left border (single line)
|
|
258
|
-
|
|
259
|
-
This detection method may produce false positives for documents with custom indentation similar to blockquotes.
|
|
260
|
-
|
|
261
|
-
### Code Marks
|
|
262
|
-
|
|
263
|
-
The `code` mark is NOT automatically detected from monospace fonts (Consolas, Courier New, etc.). This is intentional to avoid false positives. Code marks should be explicitly added in the source document or through editor UI.
|
|
264
|
-
|
|
265
|
-
### Color Format
|
|
266
|
-
|
|
267
|
-
All colors are imported as hex values (e.g., "#FF0000", "#008000"). Color names from the original document are not preserved.
|
|
268
|
-
|
|
269
|
-
### Image Limitations
|
|
270
|
-
|
|
271
|
-
- Only embedded images are supported (external image links are not fetched)
|
|
272
|
-
- Image dimensions and title are extracted from DOCX metadata
|
|
273
|
-
- **Image Cropping**: By default, images are imported in full size (crop information is ignored)
|
|
274
|
-
- To enable cropping, set `enableImageCrop: true` in options
|
|
275
|
-
- In browser environments, cropping works natively with Canvas API
|
|
276
|
-
- In Node.js, you must also provide `canvasImport` option with dynamic import of `@napi-rs/canvas`
|
|
277
|
-
- If `@napi-rs/canvas` is not available in Node.js, images will be imported without cropping (graceful degradation)
|
|
278
|
-
- Some DOCX image features (like advanced positioning or text wrapping) have limited support
|
|
279
|
-
|
|
280
|
-
### Table Cell Types
|
|
281
|
-
|
|
282
|
-
DOCX format does not distinguish between header and body cells at a semantic level. All cells are imported as `tableCell` type for consistency. This is a DOCX format limitation.
|
|
283
|
-
|
|
284
|
-
## Contributing
|
|
285
|
-
|
|
286
|
-
Contributions are welcome! Please read our [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/) and submit pull requests to the [main repository](https://github.com/DemoMacro/docen).
|
|
287
|
-
|
|
288
|
-
## License
|
|
289
|
-
|
|
290
|
-
- [MIT](LICENSE) © [Demo Macro](https://imst.xyz/)
|
|
1
|
+
# @docen/import-docx
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+
|
|
7
|
+
> Import Microsoft Word DOCX files to TipTap/ProseMirror content.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- 📝 **Rich Text Parsing** - Accurate parsing of headings, paragraphs, and blockquotes with formatting
|
|
12
|
+
- 🖼️ **Image Extraction** - Automatic image extraction with base64 conversion and cropping support
|
|
13
|
+
- 📊 **Table Support** - Complete table structure with colspan/rowspan detection algorithm
|
|
14
|
+
- ✅ **Lists & Tasks** - Bullet lists, numbered lists with start number extraction, and task lists with checkbox detection
|
|
15
|
+
- 🎨 **Text Formatting** - Bold, italic, underline, strikethrough, subscript, superscript, and highlights
|
|
16
|
+
- 🎯 **Text Styles** - Comprehensive style support including colors, backgrounds, fonts, sizes, and line heights
|
|
17
|
+
- 🔗 **Links** - Hyperlink extraction with href preservation
|
|
18
|
+
- 💻 **Code Blocks** - Code block detection with language attribute extraction
|
|
19
|
+
- 🌐 **Cross-Platform** - Works in both browser and Node.js environments
|
|
20
|
+
- ✂️ **Image Cropping** - Automatic cropping of images based on DOCX crop metadata
|
|
21
|
+
- 🧠 **Smart Parsing** - DOCX XML parsing with proper element grouping and structure reconstruction
|
|
22
|
+
- ⚡ **Fast Processing** - Uses fflate for ultra-fast ZIP decompression
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Install with npm
|
|
28
|
+
$ npm install @docen/import-docx
|
|
29
|
+
|
|
30
|
+
# Install with yarn
|
|
31
|
+
$ yarn add @docen/import-docx
|
|
32
|
+
|
|
33
|
+
# Install with pnpm
|
|
34
|
+
$ pnpm add @docen/import-docx
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
import { parseDOCX } from "@docen/import-docx";
|
|
41
|
+
import { readFileSync } from "node:fs";
|
|
42
|
+
|
|
43
|
+
// Read DOCX file
|
|
44
|
+
const buffer = readFileSync("document.docx");
|
|
45
|
+
|
|
46
|
+
// Parse DOCX to TipTap JSON
|
|
47
|
+
const content = await parseDOCX(buffer);
|
|
48
|
+
|
|
49
|
+
// Use in TipTap editor
|
|
50
|
+
editor.commands.setContent(content);
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## API Reference
|
|
54
|
+
|
|
55
|
+
### `parseDOCX(input, options?)`
|
|
56
|
+
|
|
57
|
+
Parses a DOCX file and converts it to TipTap/ProseMirror JSON content.
|
|
58
|
+
|
|
59
|
+
**Parameters:**
|
|
60
|
+
|
|
61
|
+
- `input: Buffer | ArrayBuffer | Uint8Array` - DOCX file data
|
|
62
|
+
- `options?: DocxImportOptions` - Optional import configuration
|
|
63
|
+
|
|
64
|
+
**Returns:** `Promise<JSONContent>` - TipTap/ProseMirror document content with images embedded
|
|
65
|
+
|
|
66
|
+
**Options:**
|
|
67
|
+
|
|
68
|
+
```typescript
|
|
69
|
+
interface DocxImportOptions {
|
|
70
|
+
/** Custom image converter (default: embed as base64) */
|
|
71
|
+
convertImage?: (image: DocxImageInfo) => Promise<DocxImageResult>;
|
|
72
|
+
|
|
73
|
+
/** Whether to ignore empty paragraphs (default: false).
|
|
74
|
+
* Empty paragraphs are those without text content or images.
|
|
75
|
+
* Paragraphs containing only whitespace or images are not considered empty. */
|
|
76
|
+
ignoreEmptyParagraphs?: boolean;
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Dynamic import function for @napi-rs/canvas
|
|
80
|
+
* Required for image cropping in Node.js environment, ignored in browser
|
|
81
|
+
*
|
|
82
|
+
* @example
|
|
83
|
+
* import { parseDOCX } from '@docen/import-docx';
|
|
84
|
+
* const content = await parseDOCX(buffer, {
|
|
85
|
+
* canvasImport: () => import('@napi-rs/canvas')
|
|
86
|
+
* });
|
|
87
|
+
*/
|
|
88
|
+
canvasImport?: () => Promise<typeof import("@napi-rs/canvas")>;
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Enable or disable image cropping during import
|
|
92
|
+
* When true, images with crop information in DOCX will be cropped
|
|
93
|
+
* When false (default), crop information is ignored and full image is used
|
|
94
|
+
*
|
|
95
|
+
* @default false
|
|
96
|
+
*/
|
|
97
|
+
enableImageCrop?: boolean;
|
|
98
|
+
}
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Default Image Converter:**
|
|
102
|
+
|
|
103
|
+
The package exports `defaultImageConverter` which embeds images as base64 data URLs:
|
|
104
|
+
|
|
105
|
+
```typescript
|
|
106
|
+
import { defaultImageConverter } from "@docen/import-docx";
|
|
107
|
+
|
|
108
|
+
// Use in custom converter
|
|
109
|
+
await parseDOCX(buffer, {
|
|
110
|
+
convertImage: async (image) => {
|
|
111
|
+
if (shouldUploadToCDN) {
|
|
112
|
+
return uploadToCDN(image.data);
|
|
113
|
+
}
|
|
114
|
+
return defaultImageConverter(image);
|
|
115
|
+
},
|
|
116
|
+
});
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Supported Content Types
|
|
120
|
+
|
|
121
|
+
### Text Formatting
|
|
122
|
+
|
|
123
|
+
- **Bold**, _Italic_, <u>Underline</u>, ~~Strikethrough~~
|
|
124
|
+
- ^Superscript^ and ~Subscript~
|
|
125
|
+
- Text highlights
|
|
126
|
+
- Text colors and background colors
|
|
127
|
+
- Font families and sizes
|
|
128
|
+
- Line heights
|
|
129
|
+
|
|
130
|
+
### Block Elements
|
|
131
|
+
|
|
132
|
+
- **Headings** (H1-H6) with proper level detection
|
|
133
|
+
- **Paragraphs** with text alignment (left, right, center, justify)
|
|
134
|
+
- **Blockquotes** (Detected by indentation + left border formatting)
|
|
135
|
+
- **Horizontal Rules** (Detected as page breaks in DOCX)
|
|
136
|
+
- **Code Blocks** with language attribute support
|
|
137
|
+
|
|
138
|
+
### Lists
|
|
139
|
+
|
|
140
|
+
- **Bullet Lists** with proper nesting and structure
|
|
141
|
+
- **Numbered Lists** with custom start number extraction
|
|
142
|
+
- **Task Lists** with checked/unchecked state detection (☐/☑ symbols)
|
|
143
|
+
|
|
144
|
+
### Tables
|
|
145
|
+
|
|
146
|
+
- Complete table structure parsing
|
|
147
|
+
- **Table Cells** with colspan detection using grid-based algorithm
|
|
148
|
+
- **Table Cells** with rowspan detection using vMerge tracking
|
|
149
|
+
- Cell alignment and formatting preservation
|
|
150
|
+
- Merged cell handling (both horizontal and vertical)
|
|
151
|
+
|
|
152
|
+
### Media & Embeds
|
|
153
|
+
|
|
154
|
+
- **Images** with automatic base64 conversion
|
|
155
|
+
- **Grouped Images** (DOCX image groups) support
|
|
156
|
+
- **Links** (hyperlinks) with href extraction
|
|
157
|
+
|
|
158
|
+
## Parsing Algorithm
|
|
159
|
+
|
|
160
|
+
### Document Structure
|
|
161
|
+
|
|
162
|
+
The parser follows a structured workflow:
|
|
163
|
+
|
|
164
|
+
1. **Extract Relationships** - Parse `_rels/document.xml.rels` for hyperlinks and images
|
|
165
|
+
2. **Parse Numbering** - Extract list definitions from `numbering.xml` (abstractNum → numFmt)
|
|
166
|
+
3. **Process Document Body** - Iterate through document.xml elements:
|
|
167
|
+
- Detect content types (tables, lists, paragraphs, code blocks, etc.)
|
|
168
|
+
- Group consecutive elements into proper containers
|
|
169
|
+
- Convert XML nodes to TipTap JSON nodes
|
|
170
|
+
|
|
171
|
+
### Table Processing
|
|
172
|
+
|
|
173
|
+
Tables use specialized algorithms:
|
|
174
|
+
|
|
175
|
+
- **Colspan Detection** - Grid-based algorithm tracks cell positions and detects horizontal merges
|
|
176
|
+
- **Rowspan Detection** - Vertical merge (vMerge) tracking across rows with proper cell skipping
|
|
177
|
+
- **Cell Content** - Recursive parsing of nested paragraphs and formatting
|
|
178
|
+
- **Hyperlink Support** - Proper handling of links within table cells
|
|
179
|
+
|
|
180
|
+
### List Processing
|
|
181
|
+
|
|
182
|
+
Lists utilize the DOCX numbering system:
|
|
183
|
+
|
|
184
|
+
- **Numbering ID Mapping** - Maps abstractNum to formatting (bullet vs decimal)
|
|
185
|
+
- **Start Value Extraction** - Extracts and preserves start numbers for ordered lists
|
|
186
|
+
- **Nesting Preservation** - Maintains proper list hierarchy
|
|
187
|
+
- **Consecutive Grouping** - Groups consecutive list items into list containers
|
|
188
|
+
|
|
189
|
+
## Examples
|
|
190
|
+
|
|
191
|
+
### Basic Usage
|
|
192
|
+
|
|
193
|
+
```typescript
|
|
194
|
+
import { parseDOCX } from "@docen/import-docx";
|
|
195
|
+
|
|
196
|
+
const buffer = readFileSync("example.docx");
|
|
197
|
+
const content = await parseDOCX(buffer);
|
|
198
|
+
|
|
199
|
+
console.log(JSON.stringify(content, null, 2));
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Use with TipTap Editor
|
|
203
|
+
|
|
204
|
+
```typescript
|
|
205
|
+
import { Editor } from "@tiptap/core";
|
|
206
|
+
import { parseDOCX } from "@docen/import-docx";
|
|
207
|
+
|
|
208
|
+
const editor = new Editor({
|
|
209
|
+
extensions: [...],
|
|
210
|
+
content: "",
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
// Import DOCX file
|
|
214
|
+
async function importDocx(file: File) {
|
|
215
|
+
const buffer = await file.arrayBuffer();
|
|
216
|
+
const content = await parseDOCX(buffer);
|
|
217
|
+
editor.commands.setContent(content);
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### Node.js Environment with Image Cropping
|
|
222
|
+
|
|
223
|
+
To enable image cropping in Node.js environment, you need to provide `@napi-rs/canvas`:
|
|
224
|
+
|
|
225
|
+
```typescript
|
|
226
|
+
import { parseDOCX } from "@docen/import-docx";
|
|
227
|
+
import { readFileSync } from "node:fs";
|
|
228
|
+
|
|
229
|
+
// Install @napi-rs/canvas first: pnpm add @napi-rs/canvas
|
|
230
|
+
const buffer = readFileSync("document.docx");
|
|
231
|
+
|
|
232
|
+
const content = await parseDOCX(buffer, {
|
|
233
|
+
canvasImport: () => import("@napi-rs/canvas"),
|
|
234
|
+
enableImageCrop: true, // Enable cropping (default is false)
|
|
235
|
+
});
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**Note:** By default, image cropping is disabled. Images are imported in full size, ignoring crop information in DOCX.
|
|
239
|
+
|
|
240
|
+
### Disable Image Cropping
|
|
241
|
+
|
|
242
|
+
If you want to explicitly ignore crop information in DOCX and use full images (this is the default behavior):
|
|
243
|
+
|
|
244
|
+
```typescript
|
|
245
|
+
const content = await parseDOCX(buffer, {
|
|
246
|
+
enableImageCrop: false,
|
|
247
|
+
});
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Known Limitations
|
|
251
|
+
|
|
252
|
+
### Blockquote Detection
|
|
253
|
+
|
|
254
|
+
DOCX does not have a semantic blockquote structure. Blockquotes are detected by:
|
|
255
|
+
|
|
256
|
+
- Left indentation ≥ 720 twips (0.5 inch)
|
|
257
|
+
- Presence of left border (single line)
|
|
258
|
+
|
|
259
|
+
This detection method may produce false positives for documents with custom indentation similar to blockquotes.
|
|
260
|
+
|
|
261
|
+
### Code Marks
|
|
262
|
+
|
|
263
|
+
The `code` mark is NOT automatically detected from monospace fonts (Consolas, Courier New, etc.). This is intentional to avoid false positives. Code marks should be explicitly added in the source document or through editor UI.
|
|
264
|
+
|
|
265
|
+
### Color Format
|
|
266
|
+
|
|
267
|
+
All colors are imported as hex values (e.g., "#FF0000", "#008000"). Color names from the original document are not preserved.
|
|
268
|
+
|
|
269
|
+
### Image Limitations
|
|
270
|
+
|
|
271
|
+
- Only embedded images are supported (external image links are not fetched)
|
|
272
|
+
- Image dimensions and title are extracted from DOCX metadata
|
|
273
|
+
- **Image Cropping**: By default, images are imported in full size (crop information is ignored)
|
|
274
|
+
- To enable cropping, set `enableImageCrop: true` in options
|
|
275
|
+
- In browser environments, cropping works natively with Canvas API
|
|
276
|
+
- In Node.js, you must also provide `canvasImport` option with dynamic import of `@napi-rs/canvas`
|
|
277
|
+
- If `@napi-rs/canvas` is not available in Node.js, images will be imported without cropping (graceful degradation)
|
|
278
|
+
- Some DOCX image features (like advanced positioning or text wrapping) have limited support
|
|
279
|
+
|
|
280
|
+
### Table Cell Types
|
|
281
|
+
|
|
282
|
+
DOCX format does not distinguish between header and body cells at a semantic level. All cells are imported as `tableCell` type for consistency. This is a DOCX format limitation.
|
|
283
|
+
|
|
284
|
+
## Contributing
|
|
285
|
+
|
|
286
|
+
Contributions are welcome! Please read our [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/) and submit pull requests to the [main repository](https://github.com/DemoMacro/docen).
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
- [MIT](LICENSE) © [Demo Macro](https://imst.xyz/)
|
package/dist/index.d.mts
CHANGED
|
@@ -3672,7 +3672,7 @@ point into textblock nodes. It can be empty (a regular cursor
|
|
|
3672
3672
|
position).
|
|
3673
3673
|
*/
|
|
3674
3674
|
//#endregion
|
|
3675
|
-
//#region ../../node_modules/.pnpm/@tiptap+core@3.20.
|
|
3675
|
+
//#region ../../node_modules/.pnpm/@tiptap+core@3.20.0_@tiptap+pm@3.20.0/node_modules/@tiptap/core/dist/index.d.ts
|
|
3676
3676
|
type StringKeyOf<T> = Extract<keyof T, string>;
|
|
3677
3677
|
type CallbackType<T extends Record<string, any>, EventName extends StringKeyOf<T>> = T[EventName] extends any[] ? T[EventName] : [T[EventName]];
|
|
3678
3678
|
type CallbackFunction<T extends Record<string, any>, EventName extends StringKeyOf<T>> = (...props: CallbackType<T, EventName>) => any;
|
|
@@ -6643,45 +6643,6 @@ interface DocxImportOptions {
|
|
|
6643
6643
|
ignoreEmptyParagraphs?: boolean;
|
|
6644
6644
|
}
|
|
6645
6645
|
//#endregion
|
|
6646
|
-
//#region src/parsers/styles.d.ts
|
|
6647
|
-
/**
|
|
6648
|
-
* Character format information from a style definition
|
|
6649
|
-
*/
|
|
6650
|
-
interface CharFormat {
|
|
6651
|
-
color?: string;
|
|
6652
|
-
bold?: boolean;
|
|
6653
|
-
italic?: boolean;
|
|
6654
|
-
fontSize?: number;
|
|
6655
|
-
fontFamily?: string;
|
|
6656
|
-
underline?: boolean;
|
|
6657
|
-
strike?: boolean;
|
|
6658
|
-
}
|
|
6659
|
-
/**
|
|
6660
|
-
* Style information from styles.xml
|
|
6661
|
-
*/
|
|
6662
|
-
interface StyleInfo {
|
|
6663
|
-
styleId: string;
|
|
6664
|
-
name?: string;
|
|
6665
|
-
outlineLvl?: number;
|
|
6666
|
-
charFormat?: CharFormat;
|
|
6667
|
-
}
|
|
6668
|
-
type StyleMap = Map<string, StyleInfo>;
|
|
6669
|
-
//#endregion
|
|
6670
|
-
//#region src/parser.d.ts
|
|
6671
|
-
/**
|
|
6672
|
-
* Parsing context containing all global resources from DOCX file
|
|
6673
|
-
*/
|
|
6674
|
-
interface ParseContext extends DocxImportOptions {
|
|
6675
|
-
hyperlinks: Map<string, string>;
|
|
6676
|
-
images: Map<string, ImageInfo>;
|
|
6677
|
-
listTypeMap: ListTypeMap;
|
|
6678
|
-
styleMap: StyleMap;
|
|
6679
|
-
}
|
|
6680
|
-
/**
|
|
6681
|
-
* Main entry point: Parse DOCX file and convert to TipTap JSON
|
|
6682
|
-
*/
|
|
6683
|
-
declare function parseDOCX(input: DataType, options?: DocxImportOptions): Promise<JSONContent>;
|
|
6684
|
-
//#endregion
|
|
6685
6646
|
//#region ../../node_modules/.pnpm/@types+unist@3.0.3/node_modules/@types/unist/index.d.ts
|
|
6686
6647
|
// ## Interfaces
|
|
6687
6648
|
/**
|
|
@@ -7016,6 +6977,86 @@ interface Text extends Literal {
|
|
|
7016
6977
|
*/
|
|
7017
6978
|
interface TextData extends Data {}
|
|
7018
6979
|
//#endregion
|
|
6980
|
+
//#region ../extensions/dist/types.d.mts
|
|
6981
|
+
//#endregion
|
|
6982
|
+
//#region src/types.d.ts
|
|
6983
|
+
/**
|
|
6984
|
+
* Border definition (compatible with docx.js BorderOptions)
|
|
6985
|
+
* Used by paragraphs, table cells, and blockquotes
|
|
6986
|
+
*/
|
|
6987
|
+
interface Border {
|
|
6988
|
+
/** Border color (hex without #, e.g., "FF0000" or "auto") */
|
|
6989
|
+
color?: string;
|
|
6990
|
+
/** Border size (eighth-points, 1/8 pt) */
|
|
6991
|
+
size?: number;
|
|
6992
|
+
/** Border style */
|
|
6993
|
+
style?: "single" | "dashed" | "dotted" | "double" | "dotDash" | "dotDotDash" | "none";
|
|
6994
|
+
/** Space between border and content (points) */
|
|
6995
|
+
space?: number;
|
|
6996
|
+
}
|
|
6997
|
+
/**
|
|
6998
|
+
* Shading definition (compatible with docx.js ShadingOptions)
|
|
6999
|
+
* Used for paragraph and table cell background colors
|
|
7000
|
+
*/
|
|
7001
|
+
interface Shading {
|
|
7002
|
+
/** Fill color (hex without #, e.g., "FF0000") */
|
|
7003
|
+
fill?: string;
|
|
7004
|
+
/** Pattern color (hex without #) */
|
|
7005
|
+
color?: string;
|
|
7006
|
+
/** Shading pattern type (e.g., "clear", "percent-10") */
|
|
7007
|
+
type?: string;
|
|
7008
|
+
}
|
|
7009
|
+
//#endregion
|
|
7010
|
+
//#region src/parsers/styles.d.ts
|
|
7011
|
+
/**
|
|
7012
|
+
* Character format information from a style definition
|
|
7013
|
+
*/
|
|
7014
|
+
interface CharFormat {
|
|
7015
|
+
color?: string;
|
|
7016
|
+
bold?: boolean;
|
|
7017
|
+
italic?: boolean;
|
|
7018
|
+
fontSize?: number;
|
|
7019
|
+
fontFamily?: string;
|
|
7020
|
+
underline?: boolean;
|
|
7021
|
+
strike?: boolean;
|
|
7022
|
+
}
|
|
7023
|
+
/**
|
|
7024
|
+
* Paragraph format information from a style definition
|
|
7025
|
+
*/
|
|
7026
|
+
interface ParagraphFormat {
|
|
7027
|
+
shading?: Shading;
|
|
7028
|
+
borderTop?: Border;
|
|
7029
|
+
borderBottom?: Border;
|
|
7030
|
+
borderLeft?: Border;
|
|
7031
|
+
borderRight?: Border;
|
|
7032
|
+
}
|
|
7033
|
+
/**
|
|
7034
|
+
* Style information from styles.xml
|
|
7035
|
+
*/
|
|
7036
|
+
interface StyleInfo {
|
|
7037
|
+
styleId: string;
|
|
7038
|
+
name?: string;
|
|
7039
|
+
outlineLvl?: number;
|
|
7040
|
+
charFormat?: CharFormat;
|
|
7041
|
+
paragraphFormat?: ParagraphFormat;
|
|
7042
|
+
}
|
|
7043
|
+
type StyleMap = Map<string, StyleInfo>;
|
|
7044
|
+
//#endregion
|
|
7045
|
+
//#region src/parser.d.ts
|
|
7046
|
+
/**
|
|
7047
|
+
* Parsing context containing all global resources from DOCX file
|
|
7048
|
+
*/
|
|
7049
|
+
interface ParseContext extends DocxImportOptions {
|
|
7050
|
+
hyperlinks: Map<string, string>;
|
|
7051
|
+
images: Map<string, ImageInfo>;
|
|
7052
|
+
listTypeMap: ListTypeMap;
|
|
7053
|
+
styleMap: StyleMap;
|
|
7054
|
+
}
|
|
7055
|
+
/**
|
|
7056
|
+
* Main entry point: Parse DOCX file and convert to TipTap JSON
|
|
7057
|
+
*/
|
|
7058
|
+
declare function parseDOCX(input: DataType, options?: DocxImportOptions): Promise<JSONContent>;
|
|
7059
|
+
//#endregion
|
|
7019
7060
|
//#region src/converters/paragraph.d.ts
|
|
7020
7061
|
/**
|
|
7021
7062
|
* Convert DOCX paragraph node to TipTap paragraph
|
package/dist/index.mjs
CHANGED
|
@@ -854,6 +854,65 @@ function parseNumberingXml(files) {
|
|
|
854
854
|
//#endregion
|
|
855
855
|
//#region src/parsers/styles.ts
|
|
856
856
|
/**
 * Parse a single border element (e.g. `w:top`, `w:left`) into a border object.
 *
 * Reads the `w:val`, `w:sz`, `w:color` and `w:space` attributes of the node:
 * - `w:color` becomes `color`, prefixed with "#"; the value "auto" is dropped.
 * - `w:sz` and `w:space` become `size` and `space`, parsed as base-10 integers;
 *   values that are not numeric are ignored instead of being stored as NaN.
 * - `w:val` becomes `style` when it is one of the known border styles
 *   ("nil" is normalised to "none"); unknown values are ignored.
 *
 * NOTE(review): the `Border` type documentation describes `color` as
 * "hex without #", while this function emits a "#"-prefixed value — confirm
 * which form consumers expect.
 *
 * @param borderNode XML element node exposing an `attributes` map; may be null or undefined.
 * @returns An object holding any of `color`, `size`, `style`, `space`, or `null`
 *   when the node is missing or none of its attributes is usable.
 */
function parseBorder(borderNode) {
  if (!borderNode) return null;
  // Tolerate element nodes that carry no attribute map at all.
  const attrs = borderNode.attributes || {};
  const val = attrs["w:val"];
  const size = attrs["w:sz"];
  const color = attrs["w:color"];
  const space = attrs["w:space"];
  const styleByVal = {
    single: "single",
    dashed: "dashed",
    dotted: "dotted",
    double: "double",
    dotDash: "dotDash",
    dotDotDash: "dotDotDash",
    none: "none",
    nil: "none"
  };
  const entries = [];
  if (color && color !== "auto") entries.push(["color", `#${color}`]);
  if (size) {
    // Explicit radix: the attribute is a decimal number, never hex/octal.
    const parsedSize = parseInt(size, 10);
    if (!Number.isNaN(parsedSize)) entries.push(["size", parsedSize]);
  }
  // Own-property check so values such as "constructor" or "toString" do not
  // resolve to members inherited from Object.prototype.
  if (val && Object.prototype.hasOwnProperty.call(styleByVal, val)) {
    entries.push(["style", styleByVal[val]]);
  }
  if (space) {
    const parsedSpace = parseInt(space, 10);
    if (!Number.isNaN(parsedSpace)) entries.push(["space", parsedSpace]);
  }
  return entries.length > 0 ? Object.fromEntries(entries) : null;
}
|
|
882
|
+
/**
 * Collect the paragraph borders declared under a paragraph-properties element.
 *
 * Looks for a `w:pBorders` child first and falls back to `w:pBdr`, then parses that
 * element's `w:top`, `w:bottom`, `w:left` and `w:right` children with `parseBorder`.
 *
 * @param pPr - The paragraph properties element; may be nullish.
 * @returns An object with any of `borderTop`, `borderBottom`, `borderLeft` and
 *          `borderRight`, or `null` when no side produced a border.
 */
function parseBorders(pPr) {
  if (!pPr) return null;
  const container = findChild(pPr, "w:pBorders") || findChild(pPr, "w:pBdr");
  if (!container) return null;
  // Child element name paired with the result key it fills, in output order.
  const sides = [
    ["w:top", "borderTop"],
    ["w:bottom", "borderBottom"],
    ["w:left", "borderLeft"],
    ["w:right", "borderRight"]
  ];
  const collected = {};
  for (const [tagName, key] of sides) {
    const parsed = parseBorder(findChild(container, tagName));
    if (parsed) collected[key] = parsed;
  }
  return Object.keys(collected).length > 0 ? collected : null;
}
|
|
900
|
+
/**
 * Parse shading from the `w:shd` child of a paragraph-properties element.
 *
 * - `fill`: `w:fill`, normalised to a `#`-prefixed string. The keyword `"auto"` is
 *   skipped, matching how `"auto"` colors are treated elsewhere in this file;
 *   prefixing it would produce the invalid color `"#auto"`.
 * - `type`: `w:val`, copied verbatim.
 *
 * @param pPr - The paragraph properties element; may be nullish.
 * @returns An object with `fill` and/or `type`, or `null` when there is no `w:shd`
 *          element or it carries nothing usable.
 */
function parseShading(pPr) {
  if (!pPr) return null;
  const shd = findChild(pPr, "w:shd");
  if (!shd) return null;
  const fill = shd.attributes["w:fill"];
  const pattern = shd.attributes["w:val"];
  const shading = {};
  if (fill && fill !== "auto") shading.fill = fill.startsWith("#") ? fill : `#${fill}`;
  if (pattern) shading.type = pattern;
  return Object.keys(shading).length > 0 ? shading : null;
}
|
|
915
|
+
/**
|
|
857
916
|
* Parse styles.xml to build style map
|
|
858
917
|
* Extracts outlineLvl from paragraph styles to identify headings
|
|
859
918
|
* Extracts character format (color, bold, etc.) from style definitions
|
|
@@ -875,6 +934,14 @@ function parseStylesXml(files) {
|
|
|
875
934
|
if (pPr) {
|
|
876
935
|
const outlineLvl = findChild(pPr, "w:outlineLvl");
|
|
877
936
|
if (outlineLvl?.attributes["w:val"] !== void 0) styleInfo.outlineLvl = parseInt(outlineLvl.attributes["w:val"], 10);
|
|
937
|
+
const borders = parseBorders(pPr);
|
|
938
|
+
const shading = parseShading(pPr);
|
|
939
|
+
if (borders || shading) {
|
|
940
|
+
const paragraphFormat = {};
|
|
941
|
+
if (borders) Object.assign(paragraphFormat, borders);
|
|
942
|
+
if (shading) paragraphFormat.shading = shading;
|
|
943
|
+
if (Object.keys(paragraphFormat).length > 0) styleInfo.paragraphFormat = paragraphFormat;
|
|
944
|
+
}
|
|
878
945
|
}
|
|
879
946
|
const rPr = findChild(style, "w:rPr");
|
|
880
947
|
if (rPr) {
|
|
@@ -884,10 +951,26 @@ function parseStylesXml(files) {
|
|
|
884
951
|
const colorVal = color.attributes["w:val"];
|
|
885
952
|
charFormat.color = colorVal.startsWith("#") ? colorVal : `#${colorVal}`;
|
|
886
953
|
}
|
|
887
|
-
|
|
888
|
-
if (
|
|
889
|
-
|
|
890
|
-
|
|
954
|
+
const bold = findChild(rPr, "w:b");
|
|
955
|
+
if (bold) {
|
|
956
|
+
const val = bold.attributes["w:val"];
|
|
957
|
+
if (val !== "0" && val !== "false") charFormat.bold = true;
|
|
958
|
+
}
|
|
959
|
+
const italic = findChild(rPr, "w:i");
|
|
960
|
+
if (italic) {
|
|
961
|
+
const val = italic.attributes["w:val"];
|
|
962
|
+
if (val !== "0" && val !== "false") charFormat.italic = true;
|
|
963
|
+
}
|
|
964
|
+
const underline = findChild(rPr, "w:u");
|
|
965
|
+
if (underline) {
|
|
966
|
+
const val = underline.attributes["w:val"];
|
|
967
|
+
if (val !== "none" && val !== "false" && val !== "0") charFormat.underline = true;
|
|
968
|
+
}
|
|
969
|
+
const strike = findChild(rPr, "w:strike");
|
|
970
|
+
if (strike) {
|
|
971
|
+
const val = strike.attributes["w:val"];
|
|
972
|
+
if (val !== "0" && val !== "false") charFormat.strike = true;
|
|
973
|
+
}
|
|
891
974
|
const sz = findChild(rPr, "w:sz");
|
|
892
975
|
if (sz?.attributes["w:val"]) {
|
|
893
976
|
const sizeVal = sz.attributes["w:val"];
|
|
@@ -902,6 +985,48 @@ function parseStylesXml(files) {
|
|
|
902
985
|
}
|
|
903
986
|
return styleMap;
|
|
904
987
|
}
|
|
988
|
+
/**
 * Build the paragraph style attributes for a paragraph element.
 *
 * Values from the style definition (`styleInfo.paragraphFormat`) are applied first;
 * shading and borders declared directly on the paragraph are applied afterwards and
 * therefore win. Indentation and spacing are read from the paragraph itself only.
 *
 * @param node - The paragraph element.
 * @param styleInfo - Style definition that applies to the paragraph, if any.
 * @returns The collected attributes, or `null` when the paragraph has no `w:pPr`
 *          child or nothing was collected.
 */
function extractParagraphStyles(node, styleInfo) {
  const pPr = findChild(node, "w:pPr");
  if (!pPr) return null;
  const attrs = {};
  // Parses a twip attribute string and converts it to a CSS length string.
  const toCss = (twips) => convertTwipToCssString(parseInt(twips, 10));

  // 1. Style-based shading and borders.
  const inherited = styleInfo?.paragraphFormat;
  if (inherited) {
    for (const key of ["shading", "borderTop", "borderBottom", "borderLeft", "borderRight"]) {
      if (inherited[key]) attrs[key] = inherited[key];
    }
  }

  // 2. Indentation.
  const indent = findChild(pPr, "w:ind");
  if (indent) {
    const indentAttrs = indent.attributes;
    const left = parseTwipAttr(indentAttrs, "w:left");
    if (left) attrs.indentLeft = toCss(left);
    const right = parseTwipAttr(indentAttrs, "w:right");
    if (right) attrs.indentRight = toCss(right);
    const firstLine = parseTwipAttr(indentAttrs, "w:firstLine");
    if (firstLine) {
      attrs.indentFirstLine = toCss(firstLine);
    } else {
      // Without w:firstLine, a w:hanging value yields (left, or 0) minus hanging.
      const hanging = parseTwipAttr(indentAttrs, "w:hanging");
      if (hanging) {
        const leftTwips = left ? parseInt(left, 10) : 0;
        attrs.indentFirstLine = convertTwipToCssString(leftTwips - parseInt(hanging, 10));
      }
    }
  }

  // 3. Spacing before and after the paragraph.
  const spacing = findChild(pPr, "w:spacing");
  if (spacing) {
    const spacingAttrs = spacing.attributes;
    const before = parseTwipAttr(spacingAttrs, "w:before");
    if (before) attrs.spacingBefore = toCss(before);
    const after = parseTwipAttr(spacingAttrs, "w:after");
    if (after) attrs.spacingAfter = toCss(after);
  }

  // 4. Direct shading and borders override the style-based values set in step 1.
  const directShading = parseShading(pPr);
  if (directShading) attrs.shading = directShading;
  const directBorders = parseBorders(pPr);
  if (directBorders) Object.assign(attrs, directBorders);

  return Object.keys(attrs).length > 0 ? attrs : null;
}
|
|
905
1030
|
//#endregion
|
|
906
1031
|
//#region src/converters/text.ts
|
|
907
1032
|
/**
|
|
@@ -992,13 +1117,27 @@ function extractMarks(run, styleInfo) {
|
|
|
992
1117
|
if (styleInfo?.charFormat) mergedFormat = { ...styleInfo.charFormat };
|
|
993
1118
|
if (rPr) {
|
|
994
1119
|
const boldEl = findChild(rPr, "w:b");
|
|
995
|
-
if (boldEl)
|
|
996
|
-
|
|
1120
|
+
if (boldEl) {
|
|
1121
|
+
const val = boldEl.attributes["w:val"];
|
|
1122
|
+
if (val === "0" || val === "false") mergedFormat.bold = false;
|
|
1123
|
+
else mergedFormat.bold = true;
|
|
1124
|
+
}
|
|
997
1125
|
const italicEl = findChild(rPr, "w:i");
|
|
998
|
-
if (italicEl)
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1126
|
+
if (italicEl) {
|
|
1127
|
+
const val = italicEl.attributes["w:val"];
|
|
1128
|
+
if (val === "0" || val === "false") mergedFormat.italic = false;
|
|
1129
|
+
else mergedFormat.italic = true;
|
|
1130
|
+
}
|
|
1131
|
+
const underlineEl = findChild(rPr, "w:u");
|
|
1132
|
+
if (underlineEl) {
|
|
1133
|
+
const val = underlineEl.attributes["w:val"];
|
|
1134
|
+
if (val !== "none" && val !== "false" && val !== "0") mergedFormat.underline = true;
|
|
1135
|
+
}
|
|
1136
|
+
const strikeEl = findChild(rPr, "w:strike");
|
|
1137
|
+
if (strikeEl) {
|
|
1138
|
+
const val = strikeEl.attributes["w:val"];
|
|
1139
|
+
if (val !== "0" && val !== "false") mergedFormat.strike = true;
|
|
1140
|
+
}
|
|
1002
1141
|
const colorEl = findChild(rPr, "w:color");
|
|
1003
1142
|
if (colorEl?.attributes["w:val"] && colorEl.attributes["w:val"] !== "auto") {
|
|
1004
1143
|
const colorVal = colorEl.attributes["w:val"];
|
|
@@ -1061,35 +1200,6 @@ function extractAlignment(paragraph) {
|
|
|
1061
1200
|
//#endregion
|
|
1062
1201
|
//#region src/converters/paragraph.ts
|
|
1063
1202
|
/**
|
|
1064
|
-
* Extract paragraph style attributes from DOCX paragraph properties
|
|
1065
|
-
*/
|
|
1066
|
-
function extractParagraphStyles(node) {
|
|
1067
|
-
const pPr = findChild(node, "w:pPr");
|
|
1068
|
-
if (!pPr) return null;
|
|
1069
|
-
const result = {};
|
|
1070
|
-
const ind = findChild(pPr, "w:ind");
|
|
1071
|
-
if (ind) {
|
|
1072
|
-
const left = parseTwipAttr(ind.attributes, "w:left");
|
|
1073
|
-
if (left) result.indentLeft = convertTwipToCssString(parseInt(left, 10));
|
|
1074
|
-
const right = parseTwipAttr(ind.attributes, "w:right");
|
|
1075
|
-
if (right) result.indentRight = convertTwipToCssString(parseInt(right, 10));
|
|
1076
|
-
const firstLine = parseTwipAttr(ind.attributes, "w:firstLine");
|
|
1077
|
-
if (firstLine) result.indentFirstLine = convertTwipToCssString(parseInt(firstLine, 10));
|
|
1078
|
-
else {
|
|
1079
|
-
const hanging = parseTwipAttr(ind.attributes, "w:hanging");
|
|
1080
|
-
if (hanging) result.indentFirstLine = convertTwipToCssString((left ? parseInt(left, 10) : 0) - parseInt(hanging, 10));
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
const spacing = findChild(pPr, "w:spacing");
|
|
1084
|
-
if (spacing) {
|
|
1085
|
-
const before = parseTwipAttr(spacing.attributes, "w:before");
|
|
1086
|
-
if (before) result.spacingBefore = convertTwipToCssString(parseInt(before, 10));
|
|
1087
|
-
const after = parseTwipAttr(spacing.attributes, "w:after");
|
|
1088
|
-
if (after) result.spacingAfter = convertTwipToCssString(parseInt(after, 10));
|
|
1089
|
-
}
|
|
1090
|
-
return Object.keys(result).length ? result : null;
|
|
1091
|
-
}
|
|
1092
|
-
/**
|
|
1093
1203
|
* Convert DOCX paragraph node to TipTap paragraph
|
|
1094
1204
|
*/
|
|
1095
1205
|
async function convertParagraph(node, params) {
|
|
@@ -1109,7 +1219,7 @@ async function convertParagraph(node, params) {
|
|
|
1109
1219
|
});
|
|
1110
1220
|
const attrs = {
|
|
1111
1221
|
...extractAlignment(node),
|
|
1112
|
-
...extractParagraphStyles(node)
|
|
1222
|
+
...extractParagraphStyles(node, styleInfo)
|
|
1113
1223
|
};
|
|
1114
1224
|
if (checkForPageBreak(node)) {
|
|
1115
1225
|
const filteredRuns = runs.filter((run) => run.type !== "hardBreak");
|
|
@@ -1159,7 +1269,7 @@ async function convertHeading(node, params, styleInfo, level) {
|
|
|
1159
1269
|
type: "heading",
|
|
1160
1270
|
attrs: {
|
|
1161
1271
|
level,
|
|
1162
|
-
...extractParagraphStyles(node)
|
|
1272
|
+
...extractParagraphStyles(node, styleInfo)
|
|
1163
1273
|
},
|
|
1164
1274
|
content: await extractRuns(node, {
|
|
1165
1275
|
context: params.context,
|
|
@@ -1170,31 +1280,6 @@ async function convertHeading(node, params, styleInfo, level) {
|
|
|
1170
1280
|
//#endregion
|
|
1171
1281
|
//#region src/parsers/table.ts
|
|
1172
1282
|
/**
|
|
1173
|
-
* Parse a single border element
|
|
1174
|
-
*/
|
|
1175
|
-
function parseBorder(borderNode) {
|
|
1176
|
-
if (!borderNode) return null;
|
|
1177
|
-
const val = borderNode.attributes["w:val"];
|
|
1178
|
-
const size = borderNode.attributes["w:sz"];
|
|
1179
|
-
const color = borderNode.attributes["w:color"];
|
|
1180
|
-
const styleMap = {
|
|
1181
|
-
single: "solid",
|
|
1182
|
-
dashed: "dashed",
|
|
1183
|
-
dotted: "dotted",
|
|
1184
|
-
double: "double",
|
|
1185
|
-
none: "none",
|
|
1186
|
-
nil: "none"
|
|
1187
|
-
};
|
|
1188
|
-
const border = {};
|
|
1189
|
-
if (color && color !== "auto") border.color = `#${color}`;
|
|
1190
|
-
if (size) {
|
|
1191
|
-
const eighthPoints = parseInt(size);
|
|
1192
|
-
if (!isNaN(eighthPoints)) border.width = Math.round(eighthPoints / 6);
|
|
1193
|
-
}
|
|
1194
|
-
if (val && styleMap[val]) border.style = styleMap[val];
|
|
1195
|
-
return Object.keys(border).length > 0 ? border : null;
|
|
1196
|
-
}
|
|
1197
|
-
/**
|
|
1198
1283
|
* Get table properties (cell margins)
|
|
1199
1284
|
*/
|
|
1200
1285
|
function parseTableProperties(tableNode) {
|
|
@@ -1640,7 +1725,8 @@ async function convertList(startElement, siblings, startIndex, params, processed
|
|
|
1640
1725
|
const listInfo = getListInfo(startElement);
|
|
1641
1726
|
if (!listInfo) return await convertParagraph(startElement, params);
|
|
1642
1727
|
const listTypeInfo = params.context.listTypeMap.get(listInfo.numId);
|
|
1643
|
-
|
|
1728
|
+
if (!listTypeInfo) return await convertParagraph(startElement, params);
|
|
1729
|
+
const listType = listTypeInfo.type;
|
|
1644
1730
|
const items = [];
|
|
1645
1731
|
let i = startIndex;
|
|
1646
1732
|
while (i < siblings.length) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@docen/import-docx",
|
|
3
|
-
"version": "0.0.11",
|
|
3
|
+
"version": "0.0.13",
|
|
4
4
|
"description": "A powerful TipTap/ProseMirror extension that imports Microsoft Word DOCX files to editor content",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"converter",
|
|
@@ -51,10 +51,10 @@
|
|
|
51
51
|
"xast-util-from-xml": "4.0.0"
|
|
52
52
|
},
|
|
53
53
|
"devDependencies": {
|
|
54
|
-
"@tiptap/core": "3.20.
|
|
54
|
+
"@tiptap/core": "3.20.0",
|
|
55
55
|
"@types/xast": "2.0.4",
|
|
56
|
-
"@docen/
|
|
57
|
-
"@docen/
|
|
56
|
+
"@docen/extensions": "0.0.13",
|
|
57
|
+
"@docen/utils": "0.0.13"
|
|
58
58
|
},
|
|
59
59
|
"peerDependencies": {
|
|
60
60
|
"@napi-rs/canvas": "^0.1.88"
|