file2md 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +293 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +153 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers/docx-parser.d.ts +20 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +237 -0
- package/dist/parsers/docx-parser.js.map +1 -0
- package/dist/parsers/pdf-parser.d.ts +8 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +98 -0
- package/dist/parsers/pdf-parser.js.map +1 -0
- package/dist/parsers/pptx-parser.d.ts +21 -0
- package/dist/parsers/pptx-parser.d.ts.map +1 -0
- package/dist/parsers/pptx-parser.js +264 -0
- package/dist/parsers/pptx-parser.js.map +1 -0
- package/dist/parsers/xlsx-parser.d.ts +19 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +267 -0
- package/dist/parsers/xlsx-parser.js.map +1 -0
- package/dist/types/errors.d.ts +52 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +76 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/interfaces.d.ts +228 -0
- package/dist/types/interfaces.d.ts.map +1 -0
- package/dist/types/interfaces.js +10 -0
- package/dist/types/interfaces.js.map +1 -0
- package/dist/utils/chart-extractor.d.ts +44 -0
- package/dist/utils/chart-extractor.d.ts.map +1 -0
- package/dist/utils/chart-extractor.js +258 -0
- package/dist/utils/chart-extractor.js.map +1 -0
- package/dist/utils/image-extractor.d.ts +50 -0
- package/dist/utils/image-extractor.d.ts.map +1 -0
- package/dist/utils/image-extractor.js +136 -0
- package/dist/utils/image-extractor.js.map +1 -0
- package/dist/utils/layout-parser.d.ts +55 -0
- package/dist/utils/layout-parser.d.ts.map +1 -0
- package/dist/utils/layout-parser.js +244 -0
- package/dist/utils/layout-parser.js.map +1 -0
- package/dist/utils/pdf-extractor.d.ts +46 -0
- package/dist/utils/pdf-extractor.d.ts.map +1 -0
- package/dist/utils/pdf-extractor.js +235 -0
- package/dist/utils/pdf-extractor.js.map +1 -0
- package/package.json +70 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 file2md contributors
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
@@ -0,0 +1,293 @@
|
|
1
|
+
# file2md
|
2
|
+
|
3
|
+
[![npm version](https://badge.fury.io/js/file2md.svg)](https://badge.fury.io/js/file2md)
|
4
|
+
[![TypeScript](https://img.shields.io/badge/TypeScript-Ready-blue.svg)](https://www.typescriptlang.org/)
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
6
|
+
|
7
|
+
A modern TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX) into Markdown with **advanced layout preservation**, **image extraction**, and **chart conversion**.
|
8
|
+
|
9
|
+
## ✨ Features
|
10
|
+
|
11
|
+
- 🔄 **Multiple Format Support**: PDF, DOCX, XLSX, PPTX
|
12
|
+
- 🎨 **Layout Preservation**: Maintains document structure, tables, and formatting
|
13
|
+
- 🖼️ **Image Extraction**: Automatically extracts and references images
|
14
|
+
- 📊 **Chart Conversion**: Converts charts to Markdown tables
|
15
|
+
- 📝 **List & Table Support**: Proper nested lists and complex tables
|
16
|
+
- 🔒 **Type Safety**: Full TypeScript support with comprehensive types
|
17
|
+
- ⚡ **Modern ESM**: ES2022 modules with CommonJS compatibility
|
18
|
+
- 🚀 **Zero Config**: Works out of the box
|
19
|
+
|
20
|
+
## 📦 Installation
|
21
|
+
|
22
|
+
```bash
|
23
|
+
npm install file2md
|
24
|
+
```
|
25
|
+
|
26
|
+
## 🚀 Quick Start
|
27
|
+
|
28
|
+
### TypeScript / ES Modules
|
29
|
+
|
30
|
+
```typescript
|
31
|
+
import { convert } from 'file2md';
|
32
|
+
|
33
|
+
// Convert from file path
|
34
|
+
const pdfResult = await convert('./document.pdf');
|
35
|
+
console.log(pdfResult.markdown);
|
36
|
+
|
37
|
+
// Convert with options
|
38
|
+
const result = await convert('./presentation.pptx', {
|
39
|
+
imageDir: 'extracted-images',
|
40
|
+
preserveLayout: true,
|
41
|
+
extractCharts: true
|
42
|
+
});
|
43
|
+
|
44
|
+
console.log(`✅ Converted successfully!`);
|
45
|
+
console.log(`📄 Markdown length: ${result.markdown.length}`);
|
46
|
+
console.log(`🖼️ Images extracted: ${result.images.length}`);
|
47
|
+
console.log(`📊 Charts found: ${result.charts.length}`);
|
48
|
+
```
|
49
|
+
|
50
|
+
### CommonJS
|
51
|
+
|
52
|
+
```javascript
|
53
|
+
const { convert } = require('file2md');
|
54
|
+
|
55
|
+
convert('./document.docx')
|
56
|
+
.then((result) => console.log(result.markdown));
|
57
|
+
```
|
58
|
+
|
59
|
+
### From Buffer
|
60
|
+
|
61
|
+
```typescript
|
62
|
+
import { convert } from 'file2md';
|
63
|
+
import { readFile } from 'fs/promises';
|
64
|
+
|
65
|
+
const buffer = await readFile('./document.xlsx');
|
66
|
+
const result = await convert(buffer, {
|
67
|
+
imageDir: 'spreadsheet-images'
|
68
|
+
});
|
69
|
+
```
|
70
|
+
|
71
|
+
## 📋 API Reference
|
72
|
+
|
73
|
+
### `convert(input, options?)`
|
74
|
+
|
75
|
+
**Parameters:**
|
76
|
+
- `input: string | Buffer` - File path or buffer containing document data
|
77
|
+
- `options?: ConvertOptions` - Conversion options
|
78
|
+
|
79
|
+
**Returns:** `Promise<ConversionResult>`
|
80
|
+
|
81
|
+
### Options
|
82
|
+
|
83
|
+
```typescript
|
84
|
+
interface ConvertOptions {
|
85
|
+
imageDir?: string; // Directory for extracted images (default: 'images')
|
86
|
+
preserveLayout?: boolean; // Maintain document layout (default: true)
|
87
|
+
extractCharts?: boolean; // Convert charts to tables (default: true)
|
88
|
+
extractImages?: boolean; // Extract embedded images (default: true)
|
89
|
+
maxPages?: number; // Max pages for PDFs (default: unlimited)
|
90
|
+
}
|
91
|
+
```
|
92
|
+
|
93
|
+
### Result
|
94
|
+
|
95
|
+
```typescript
|
96
|
+
interface ConversionResult {
|
97
|
+
markdown: string; // Generated Markdown content
|
98
|
+
images: ImageData[]; // Extracted image information
|
99
|
+
charts: ChartData[]; // Extracted chart data
|
100
|
+
metadata: DocumentMetadata; // Document metadata
|
101
|
+
}
|
102
|
+
```
|
103
|
+
|
104
|
+
## 🎯 Format-Specific Features
|
105
|
+
|
106
|
+
### 📄 PDF
|
107
|
+
- ✅ **Text extraction** with layout enhancement
|
108
|
+
- ✅ **Table detection** and formatting
|
109
|
+
- ✅ **List recognition** (bullets, numbers)
|
110
|
+
- ✅ **Heading detection** (ALL CAPS, colons)
|
111
|
+
- ✅ **Page-to-image fallback** for complex layouts
|
112
|
+
|
113
|
+
### 📝 DOCX
|
114
|
+
- ✅ **Heading hierarchy** (H1-H6)
|
115
|
+
- ✅ **Text formatting** (bold, italic)
|
116
|
+
- ✅ **Complex tables** with merged cells
|
117
|
+
- ✅ **Nested lists** with proper indentation
|
118
|
+
- ✅ **Embedded images** and charts
|
119
|
+
- ✅ **Cell styling** (alignment, colors)
|
120
|
+
|
121
|
+
### 📊 XLSX
|
122
|
+
- ✅ **Multiple worksheets** as separate sections
|
123
|
+
- ✅ **Cell formatting** (bold, colors, alignment)
|
124
|
+
- ✅ **Data type preservation**
|
125
|
+
- ✅ **Chart extraction** to data tables
|
126
|
+
- ✅ **Conditional formatting** notes
|
127
|
+
|
128
|
+
### 🎬 PPTX
|
129
|
+
- ✅ **Slide-by-slide** organization
|
130
|
+
- ✅ **Text positioning** and layout
|
131
|
+
- ✅ **Image placement** per slide
|
132
|
+
- ✅ **Table extraction** from slides
|
133
|
+
- ✅ **Multi-column layouts**
|
134
|
+
|
135
|
+
## 🖼️ Image Handling
|
136
|
+
|
137
|
+
Images are automatically extracted and saved to the specified directory:
|
138
|
+
|
139
|
+
```typescript
|
140
|
+
const result = await convert('./presentation.pptx', {
|
141
|
+
imageDir: 'my-images'
|
142
|
+
});
|
143
|
+
|
144
|
+
// Result structure:
|
145
|
+
// my-images/
|
146
|
+
// ├── image_1.png
|
147
|
+
// ├── image_2.jpg
|
148
|
+
// └── chart_1.png
|
149
|
+
|
150
|
+
// Markdown will contain:
|
151
|
+
// ![image_1](my-images/image_1.png)
|
152
|
+
```
|
153
|
+
|
154
|
+
## 📊 Chart Conversion
|
155
|
+
|
156
|
+
Charts are converted to Markdown tables:
|
157
|
+
|
158
|
+
```markdown
|
159
|
+
#### Chart 1: Sales Data
|
160
|
+
|
161
|
+
| Category | Q1 | Q2 | Q3 | Q4 |
|
162
|
+
| --- | --- | --- | --- | --- |
|
163
|
+
| Revenue | 100 | 150 | 200 | 250 |
|
164
|
+
| Profit | 20 | 30 | 45 | 60 |
|
165
|
+
```
|
166
|
+
|
167
|
+
## 🛡️ Error Handling
|
168
|
+
|
169
|
+
```typescript
|
170
|
+
import {
|
171
|
+
convert,
|
172
|
+
UnsupportedFormatError,
|
173
|
+
FileNotFoundError,
|
174
|
+
ParseError
|
175
|
+
} from 'file2md';
|
176
|
+
|
177
|
+
try {
|
178
|
+
const result = await convert('./document.pdf');
|
179
|
+
} catch (error) {
|
180
|
+
if (error instanceof UnsupportedFormatError) {
|
181
|
+
console.error('Unsupported file format');
|
182
|
+
} else if (error instanceof FileNotFoundError) {
|
183
|
+
console.error('File not found');
|
184
|
+
} else if (error instanceof ParseError) {
|
185
|
+
console.error('Failed to parse document:', error.message);
|
186
|
+
}
|
187
|
+
}
|
188
|
+
```
|
189
|
+
|
190
|
+
## 🧪 Advanced Usage
|
191
|
+
|
192
|
+
### Custom Error Handling
|
193
|
+
|
194
|
+
```typescript
|
195
|
+
import { convert, ConversionError } from 'file2md';
|
196
|
+
|
197
|
+
try {
|
198
|
+
const result = await convert('./complex-document.docx');
|
199
|
+
} catch (error) {
|
200
|
+
if (error instanceof ConversionError) {
|
201
|
+
console.error(`Conversion failed [${error.code}]:`, error.message);
|
202
|
+
if (error.originalError) {
|
203
|
+
console.error('Original error:', error.originalError);
|
204
|
+
}
|
205
|
+
}
|
206
|
+
}
|
207
|
+
```
|
208
|
+
|
209
|
+
### Batch Processing
|
210
|
+
|
211
|
+
```typescript
|
212
|
+
import { convert } from 'file2md';
|
213
|
+
import { readdir } from 'fs/promises';
|
214
|
+
|
215
|
+
async function convertFolder(folderPath: string) {
|
216
|
+
const files = await readdir(folderPath);
|
217
|
+
const results = [];
|
218
|
+
|
219
|
+
for (const file of files) {
|
220
|
+
if (file.match(/\.(pdf|docx|xlsx|pptx)$/i)) {
|
221
|
+
try {
|
222
|
+
const result = await convert(`${folderPath}/${file}`);
|
223
|
+
results.push({ file, success: true, result });
|
224
|
+
} catch (error) {
|
225
|
+
results.push({ file, success: false, error });
|
226
|
+
}
|
227
|
+
}
|
228
|
+
}
|
229
|
+
|
230
|
+
return results;
|
231
|
+
}
|
232
|
+
```
|
233
|
+
|
234
|
+
## 🏗️ Development
|
235
|
+
|
236
|
+
### Build from Source
|
237
|
+
|
238
|
+
```bash
|
239
|
+
git clone https://github.com/yourusername/file2md.git
|
240
|
+
cd file2md
|
241
|
+
npm install
|
242
|
+
npm run build
|
243
|
+
```
|
244
|
+
|
245
|
+
### Testing
|
246
|
+
|
247
|
+
```bash
|
248
|
+
npm test # Run tests
|
249
|
+
npm run test:watch # Watch mode
|
250
|
+
npm run test:coverage # Coverage report
|
251
|
+
```
|
252
|
+
|
253
|
+
### Linting
|
254
|
+
|
255
|
+
```bash
|
256
|
+
npm run lint # Check code style
|
257
|
+
npm run lint:fix # Fix issues
|
258
|
+
```
|
259
|
+
|
260
|
+
## 🤝 Contributing
|
261
|
+
|
262
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
263
|
+
|
264
|
+
1. Fork the repository
|
265
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
266
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
267
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
268
|
+
5. Open a Pull Request
|
269
|
+
|
270
|
+
## 📄 License
|
271
|
+
|
272
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
273
|
+
|
274
|
+
## 🔗 Links
|
275
|
+
|
276
|
+
- [npm package](https://www.npmjs.com/package/file2md)
|
277
|
+
- [GitHub repository](https://github.com/yourusername/file2md)
|
278
|
+
- [Issues & Bug Reports](https://github.com/yourusername/file2md/issues)
|
279
|
+
|
280
|
+
## 📊 Supported Formats
|
281
|
+
|
282
|
+
| Format | Extension | Layout | Images | Charts | Tables | Lists |
|
283
|
+
|--------|-----------|---------|---------|---------|---------|--------|
|
284
|
+
| PDF | `.pdf` | ✅ | ✅* | ❌ | ✅ | ✅ |
|
285
|
+
| Word | `.docx` | ✅ | ✅ | ✅ | ✅ | ✅ |
|
286
|
+
| Excel | `.xlsx` | ✅ | ❌ | ✅ | ✅ | ❌ |
|
287
|
+
| PowerPoint | `.pptx` | ✅ | ✅ | ✅ | ✅ | ❌ |
|
288
|
+
|
289
|
+
*PDF images via page-to-image conversion
|
290
|
+
|
291
|
+
---
|
292
|
+
|
293
|
+
**Made with ❤️ and TypeScript**
|
package/dist/index.d.ts
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
import type { ConvertInput, ConvertOptions, ConversionResult } from './types/index.js';
|
2
|
+
/**
|
3
|
+
* Convert a document (PDF, DOCX, XLSX, PPTX) to Markdown format
|
4
|
+
*
|
5
|
+
* @param input - File path (string) or Buffer containing the document data
|
6
|
+
* @param options - Conversion options
|
7
|
+
* @returns Promise resolving to conversion result with markdown and metadata
|
8
|
+
*
|
9
|
+
* @throws {FileNotFoundError} When file path doesn't exist
|
10
|
+
* @throws {UnsupportedFormatError} When file format is not supported
|
11
|
+
* @throws {InvalidFileError} When file is corrupted or invalid
|
12
|
+
* @throws {ParseError} When document parsing fails
|
13
|
+
*
|
14
|
+
* @example
|
15
|
+
* ```typescript
|
16
|
+
* // Convert from file path
|
17
|
+
* const result = await convert('./document.pdf');
|
18
|
+
* console.log(result.markdown);
|
19
|
+
*
|
20
|
+
* // Convert from buffer with options
|
21
|
+
* const buffer = await fs.readFile('./document.docx');
|
22
|
+
* const result = await convert(buffer, {
|
23
|
+
* imageDir: 'extracted-images',
|
24
|
+
* preserveLayout: true
|
25
|
+
* });
|
26
|
+
* ```
|
27
|
+
*/
|
28
|
+
export declare function convert(input: ConvertInput, options?: ConvertOptions): Promise<ConversionResult>;
|
29
|
+
export type * from './types/index.js';
|
30
|
+
export { ImageExtractor } from './utils/image-extractor.js';
|
31
|
+
export { ChartExtractor } from './utils/chart-extractor.js';
|
32
|
+
export { LayoutParser } from './utils/layout-parser.js';
|
33
|
+
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EACV,YAAY,EACZ,cAAc,EACd,gBAAgB,EAGjB,MAAM,kBAAkB,CAAC;AAS1B;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,EAAE,OAAO,GAAE,cAAmB,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAmI1G;AAGD,mBAAmB,kBAAkB,CAAC;AAGtC,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC"}
|
package/dist/index.js
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
import { promises as fs } from 'node:fs';
|
2
|
+
import { Buffer } from 'node:buffer';
|
3
|
+
const fileType = require('file-type');
|
4
|
+
import { ImageExtractor } from './utils/image-extractor.js';
|
5
|
+
import { ChartExtractor } from './utils/chart-extractor.js';
|
6
|
+
import { parsePdf } from './parsers/pdf-parser.js';
|
7
|
+
import { parseDocx } from './parsers/docx-parser.js';
|
8
|
+
import { parseXlsx } from './parsers/xlsx-parser.js';
|
9
|
+
import { parsePptx } from './parsers/pptx-parser.js';
|
10
|
+
import { FileNotFoundError, UnsupportedFormatError, InvalidFileError, SUPPORTED_MIME_TYPES } from './types/index.js';
|
11
|
+
/**
|
12
|
+
* Convert a document (PDF, DOCX, XLSX, PPTX) to Markdown format
|
13
|
+
*
|
14
|
+
* @param input - File path (string) or Buffer containing the document data
|
15
|
+
* @param options - Conversion options
|
16
|
+
* @returns Promise resolving to conversion result with markdown and metadata
|
17
|
+
*
|
18
|
+
* @throws {FileNotFoundError} When file path doesn't exist
|
19
|
+
* @throws {UnsupportedFormatError} When file format is not supported
|
20
|
+
* @throws {InvalidFileError} When file is corrupted or invalid
|
21
|
+
* @throws {ParseError} When document parsing fails
|
22
|
+
*
|
23
|
+
* @example
|
24
|
+
* ```typescript
|
25
|
+
* // Convert from file path
|
26
|
+
* const result = await convert('./document.pdf');
|
27
|
+
* console.log(result.markdown);
|
28
|
+
*
|
29
|
+
* // Convert from buffer with options
|
30
|
+
* const buffer = await fs.readFile('./document.docx');
|
31
|
+
* const result = await convert(buffer, {
|
32
|
+
* imageDir: 'extracted-images',
|
33
|
+
* preserveLayout: true
|
34
|
+
* });
|
35
|
+
* ```
|
36
|
+
*/
|
37
|
+
export async function convert(input, options = {}) {
|
38
|
+
const startTime = Date.now();
|
39
|
+
try {
|
40
|
+
let buffer;
|
41
|
+
// Handle input type
|
42
|
+
if (typeof input === 'string') {
|
43
|
+
try {
|
44
|
+
buffer = await fs.readFile(input);
|
45
|
+
}
|
46
|
+
catch (error) {
|
47
|
+
if (error?.code === 'ENOENT') {
|
48
|
+
throw new FileNotFoundError(input);
|
49
|
+
}
|
50
|
+
throw new InvalidFileError(`Failed to read file: ${input}`, error);
|
51
|
+
}
|
52
|
+
}
|
53
|
+
else if (Buffer.isBuffer(input)) {
|
54
|
+
buffer = input;
|
55
|
+
}
|
56
|
+
else {
|
57
|
+
throw new InvalidFileError('Input must be a file path (string) or Buffer');
|
58
|
+
}
|
59
|
+
// Detect file type
|
60
|
+
const detectedType = await fileType.fromBuffer(buffer);
|
61
|
+
if (!detectedType) {
|
62
|
+
throw new UnsupportedFormatError('unknown');
|
63
|
+
}
|
64
|
+
// Validate supported format
|
65
|
+
const supportedMimeTypes = Object.values(SUPPORTED_MIME_TYPES);
|
66
|
+
if (!supportedMimeTypes.includes(detectedType.mime)) {
|
67
|
+
throw new UnsupportedFormatError(detectedType.mime);
|
68
|
+
}
|
69
|
+
// Setup extractors
|
70
|
+
const { imageDir = 'images', preserveLayout = true, extractCharts = true, extractImages = true, maxPages } = options;
|
71
|
+
const imageExtractor = new ImageExtractor(imageDir);
|
72
|
+
const chartExtractor = new ChartExtractor(imageExtractor);
|
73
|
+
// Parse document based on type
|
74
|
+
let markdown;
|
75
|
+
let images = [];
|
76
|
+
let charts = [];
|
77
|
+
let pageCount = 1;
|
78
|
+
let additionalMetadata = {};
|
79
|
+
switch (detectedType.mime) {
|
80
|
+
case SUPPORTED_MIME_TYPES.PDF: {
|
81
|
+
const result = await parsePdf(buffer, imageExtractor, { maxPages, preserveLayout });
|
82
|
+
markdown = result.markdown;
|
83
|
+
images = result.images || [];
|
84
|
+
pageCount = result.pageCount || 1;
|
85
|
+
additionalMetadata = result.metadata || {};
|
86
|
+
break;
|
87
|
+
}
|
88
|
+
case SUPPORTED_MIME_TYPES.DOCX: {
|
89
|
+
const result = await parseDocx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractImages, extractCharts });
|
90
|
+
markdown = result.markdown;
|
91
|
+
images = result.images || [];
|
92
|
+
charts = result.charts || [];
|
93
|
+
additionalMetadata = result.metadata || {};
|
94
|
+
break;
|
95
|
+
}
|
96
|
+
case SUPPORTED_MIME_TYPES.XLSX: {
|
97
|
+
const result = await parseXlsx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractCharts });
|
98
|
+
markdown = result.markdown;
|
99
|
+
charts = result.charts || [];
|
100
|
+
pageCount = result.sheetCount || 1;
|
101
|
+
additionalMetadata = result.metadata || {};
|
102
|
+
break;
|
103
|
+
}
|
104
|
+
case SUPPORTED_MIME_TYPES.PPTX: {
|
105
|
+
const result = await parsePptx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractImages, extractCharts });
|
106
|
+
markdown = result.markdown;
|
107
|
+
images = result.images || [];
|
108
|
+
charts = result.charts || [];
|
109
|
+
pageCount = result.slideCount || 1;
|
110
|
+
additionalMetadata = result.metadata || {};
|
111
|
+
break;
|
112
|
+
}
|
113
|
+
default: {
|
114
|
+
// This should never happen due to earlier validation, but TypeScript requires it
|
115
|
+
const exhaustiveCheck = detectedType.mime;
|
116
|
+
throw new UnsupportedFormatError(exhaustiveCheck);
|
117
|
+
}
|
118
|
+
}
|
119
|
+
const endTime = Date.now();
|
120
|
+
// Build metadata
|
121
|
+
const metadata = {
|
122
|
+
fileType: detectedType.ext.toUpperCase(),
|
123
|
+
mimeType: detectedType.mime,
|
124
|
+
pageCount,
|
125
|
+
imageCount: images.length,
|
126
|
+
chartCount: charts.length,
|
127
|
+
processingTime: endTime - startTime,
|
128
|
+
additional: additionalMetadata
|
129
|
+
};
|
130
|
+
return {
|
131
|
+
markdown,
|
132
|
+
images,
|
133
|
+
charts,
|
134
|
+
metadata
|
135
|
+
};
|
136
|
+
}
|
137
|
+
catch (error) {
|
138
|
+
// Re-throw known errors
|
139
|
+
if (error instanceof FileNotFoundError ||
|
140
|
+
error instanceof UnsupportedFormatError ||
|
141
|
+
error instanceof InvalidFileError) {
|
142
|
+
throw error;
|
143
|
+
}
|
144
|
+
// Wrap unknown errors
|
145
|
+
const message = error instanceof Error ? error.message : 'Unknown conversion error';
|
146
|
+
throw new InvalidFileError(`Conversion failed: ${message}`, error);
|
147
|
+
}
|
148
|
+
}
|
149
|
+
// Export utility classes for advanced usage
|
150
|
+
export { ImageExtractor } from './utils/image-extractor.js';
|
151
|
+
export { ChartExtractor } from './utils/chart-extractor.js';
|
152
|
+
export { LayoutParser } from './utils/layout-parser.js';
|
153
|
+
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,MAAM,QAAQ,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;AAEtC,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAUrD,OAAO,EACL,iBAAiB,EACjB,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACrB,MAAM,kBAAkB,CAAC;AAE1B;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB,EAAE,UAA0B,EAAE;IAC7E,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,IAAI,CAAC;QACH,IAAI,MAAc,CAAC;QAEnB,oBAAoB;QACpB,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;YACpC,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,IAAK,KAAa,EAAE,IAAI,KAAK,QAAQ,EAAE,CAAC;oBACtC,MAAM,IAAI,iBAAiB,CAAC,KAAK,CAAC,CAAC;gBACrC,CAAC;gBACD,MAAM,IAAI,gBAAgB,CAAC,wBAAwB,KAAK,EAAE,EAAE,KAAc,CAAC,CAAC;YAC9E,CAAC;QACH,CAAC;aAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,gBAAgB,CAAC,8CAA8C,CAAC,CAAC;QAC7E,CAAC;QAED,mBAAmB;QACnB,MAAM,YAAY,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAEvD,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,MAAM,IAAI,sBAAsB,CAAC,SAAS,CAAC,CAAC;QAC9C,CAAC;QAED,4BAA4B;QAC5B,MAAM,kBAAkB,GAAG,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;QAC/D,IAAI,CAAC,kBAAkB,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAyB,CAAC,EAAE,CAAC;YACzE,MAAM,IAAI,sBAAsB,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QACtD,CAAC;QAED,mBAAmB;QACnB,MAAM,EACJ,QAAQ,GAAG,QAAQ,EACnB,cAAc,GAAG,IAAI,EACrB,aAAa,GAAG,IAAI,EACpB,aAAa,GAAG,IAAI,EACpB,QAAQ,EACT,GAAG,OAAO,CAAC;QAEZ,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,CAAC,CAAC;QACpD,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,cAAc,CAAC,CAAC;QAE1D,+BAA+B;QAC/B,IAAI,QAAgB,CAAC;QACrB,IAAI,MAAM,GAAyD,EAAE,CAAC;QACtE,IAAI,MAAM,GAAyD,EAAE,CAAC;QACtE,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAAI,
kBAAkB,GAA4B,EAAE,CAAC;QAErD,QAAQ,YAAY,CAAC,IAAyB,EAAE,CAAC;YAC/C,KAAK,oBAAoB,CAAC,GAAG,CAAC,CAAC,CAAC;gBAC9B,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,MAAM,EAAE,cAAc,EAAE,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;gBACpF,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,CAAC,CAAC;gBAClC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC,CAAC;gBACzH,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,CAAC,CAAC;gBAC1G,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACnC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC,CAAC;gBACzH,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACnC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,OAAO,CAAC,CAAC,CAAC;gBACR,iFAAiF;gBACjF,MAAM,eAAe,GAAU,YAAY,CAAC,IAAa,CAAC;gBAC1D,MAAM,IAAI,sBAAsB,CAAC,eAAe,CAAC,CAAC;YACpD,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE3B,iBAAiB;QACjB,MAAM,QAAQ,GAAqB;YACjC,QAAQ,EAAE,YAAY,CAAC,GAAG,CAAC,WAAW,EAAE;YACxC,QAAQ,EAAE,YAAY,CAAC,IAAI;YAC3B,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,MAAM;YACzB,UAAU,EAAE,MAAM,CAAC,MAAM;YACzB,cAAc,EAAE,OAAO,GAAG,SAAS;YACnC,UAAU,EAAE,kBAAkB;SAC/B,CAAC;QAEF,OAAO;YACL,QAAQ;YACR,MAAM;YACN,MAAM;YACN,QAAQ;SACT,CAAC
;IAEJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,wBAAwB;QACxB,IAAI,KAAK,YAAY,iBAAiB;YAClC,KAAK,YAAY,sBAAsB;YACvC,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,sBAAsB;QACtB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC;QACpF,MAAM,IAAI,gBAAgB,CAAC,sBAAsB,OAAO,EAAE,EAAE,KAAc,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC;AAKD,4CAA4C;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC"}
|
package/dist/parsers/docx-parser.d.ts
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
import type { Buffer } from 'node:buffer';
|
2
|
+
import type { ImageExtractor } from '../utils/image-extractor.js';
|
3
|
+
import type { ChartExtractor } from '../utils/chart-extractor.js';
|
4
|
+
import type { ImageData, ChartData } from '../types/interfaces.js';
|
5
|
+
export interface DocxParseOptions {
|
6
|
+
readonly preserveLayout?: boolean;
|
7
|
+
readonly extractImages?: boolean;
|
8
|
+
readonly extractCharts?: boolean;
|
9
|
+
}
|
10
|
+
export interface DocxParseResult {
|
11
|
+
readonly markdown: string;
|
12
|
+
readonly images: readonly ImageData[];
|
13
|
+
readonly charts: readonly ChartData[];
|
14
|
+
readonly metadata: Record<string, unknown>;
|
15
|
+
}
|
16
|
+
/**
|
17
|
+
* Parse DOCX buffer and convert to markdown with layout preservation
|
18
|
+
*/
|
19
|
+
export declare function parseDocx(buffer: Buffer, imageExtractor: ImageExtractor, chartExtractor: ChartExtractor, options?: DocxParseOptions): Promise<DocxParseResult>;
|
20
|
+
//# sourceMappingURL=docx-parser.d.ts.map
|
package/dist/parsers/docx-parser.d.ts.map
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"docx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAKV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AA4BD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CA6D1B"}
|