file2md 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/LICENSE +21 -0
  2. package/README.md +293 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +153 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/parsers/docx-parser.d.ts +20 -0
  8. package/dist/parsers/docx-parser.d.ts.map +1 -0
  9. package/dist/parsers/docx-parser.js +237 -0
  10. package/dist/parsers/docx-parser.js.map +1 -0
  11. package/dist/parsers/pdf-parser.d.ts +8 -0
  12. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  13. package/dist/parsers/pdf-parser.js +98 -0
  14. package/dist/parsers/pdf-parser.js.map +1 -0
  15. package/dist/parsers/pptx-parser.d.ts +21 -0
  16. package/dist/parsers/pptx-parser.d.ts.map +1 -0
  17. package/dist/parsers/pptx-parser.js +264 -0
  18. package/dist/parsers/pptx-parser.js.map +1 -0
  19. package/dist/parsers/xlsx-parser.d.ts +19 -0
  20. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  21. package/dist/parsers/xlsx-parser.js +267 -0
  22. package/dist/parsers/xlsx-parser.js.map +1 -0
  23. package/dist/types/errors.d.ts +52 -0
  24. package/dist/types/errors.d.ts.map +1 -0
  25. package/dist/types/errors.js +76 -0
  26. package/dist/types/errors.js.map +1 -0
  27. package/dist/types/index.d.ts +5 -0
  28. package/dist/types/index.d.ts.map +1 -0
  29. package/dist/types/index.js +5 -0
  30. package/dist/types/index.js.map +1 -0
  31. package/dist/types/interfaces.d.ts +228 -0
  32. package/dist/types/interfaces.d.ts.map +1 -0
  33. package/dist/types/interfaces.js +10 -0
  34. package/dist/types/interfaces.js.map +1 -0
  35. package/dist/utils/chart-extractor.d.ts +44 -0
  36. package/dist/utils/chart-extractor.d.ts.map +1 -0
  37. package/dist/utils/chart-extractor.js +258 -0
  38. package/dist/utils/chart-extractor.js.map +1 -0
  39. package/dist/utils/image-extractor.d.ts +50 -0
  40. package/dist/utils/image-extractor.d.ts.map +1 -0
  41. package/dist/utils/image-extractor.js +136 -0
  42. package/dist/utils/image-extractor.js.map +1 -0
  43. package/dist/utils/layout-parser.d.ts +55 -0
  44. package/dist/utils/layout-parser.d.ts.map +1 -0
  45. package/dist/utils/layout-parser.js +244 -0
  46. package/dist/utils/layout-parser.js.map +1 -0
  47. package/dist/utils/pdf-extractor.d.ts +46 -0
  48. package/dist/utils/pdf-extractor.d.ts.map +1 -0
  49. package/dist/utils/pdf-extractor.js +235 -0
  50. package/dist/utils/pdf-extractor.js.map +1 -0
  51. package/package.json +70 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 file2md contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,293 @@
1
+ # file2md
2
+
3
+ [![npm version](https://badge.fury.io/js/file2md.svg)](https://badge.fury.io/js/file2md)
4
+ [![TypeScript](https://img.shields.io/badge/TypeScript-Ready-blue.svg)](https://www.typescriptlang.org/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ A modern TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX) into Markdown with **advanced layout preservation**, **image extraction**, and **chart conversion**.
8
+
9
+ ## ✨ Features
10
+
11
+ - 🔄 **Multiple Format Support**: PDF, DOCX, XLSX, PPTX
12
+ - 🎨 **Layout Preservation**: Maintains document structure, tables, and formatting
13
+ - 🖼️ **Image Extraction**: Automatically extracts and references images
14
+ - 📊 **Chart Conversion**: Converts charts to Markdown tables
15
+ - 📝 **List & Table Support**: Proper nested lists and complex tables
16
+ - 🔒 **Type Safety**: Full TypeScript support with comprehensive types
17
+ - ⚡ **Modern ESM**: ES2022 modules with CommonJS compatibility
18
+ - 🚀 **Zero Config**: Works out of the box
19
+
20
+ ## 📦 Installation
21
+
22
+ ```bash
23
+ npm install file2md
24
+ ```
25
+
26
+ ## 🚀 Quick Start
27
+
28
+ ### TypeScript / ES Modules
29
+
30
+ ```typescript
31
+ import { convert } from 'file2md';
32
+
33
+ // Convert from file path
34
+ const pdfResult = await convert('./document.pdf');
35
+ console.log(pdfResult.markdown);
36
+
37
+ // Convert with options
38
+ const result = await convert('./presentation.pptx', {
39
+ imageDir: 'extracted-images',
40
+ preserveLayout: true,
41
+ extractCharts: true
42
+ });
43
+
44
+ console.log(`✅ Converted successfully!`);
45
+ console.log(`📄 Markdown length: ${result.markdown.length}`);
46
+ console.log(`🖼️ Images extracted: ${result.images.length}`);
47
+ console.log(`📊 Charts found: ${result.charts.length}`);
48
+ ```
49
+
50
+ ### CommonJS
51
+
52
+ ```javascript
53
+ const { convert } = require('file2md');
54
+
55
+ convert('./document.docx')
56
+ .then((result) => console.log(result.markdown));
57
+ ```
58
+
59
+ ### From Buffer
60
+
61
+ ```typescript
62
+ import { convert } from 'file2md';
63
+ import { readFile } from 'fs/promises';
64
+
65
+ const buffer = await readFile('./document.xlsx');
66
+ const result = await convert(buffer, {
67
+ imageDir: 'spreadsheet-images'
68
+ });
69
+ ```
70
+
71
+ ## 📋 API Reference
72
+
73
+ ### `convert(input, options?)`
74
+
75
+ **Parameters:**
76
+ - `input: string | Buffer` - File path or buffer containing document data
77
+ - `options?: ConvertOptions` - Conversion options
78
+
79
+ **Returns:** `Promise<ConversionResult>`
80
+
81
+ ### Options
82
+
83
+ ```typescript
84
+ interface ConvertOptions {
85
+ imageDir?: string; // Directory for extracted images (default: 'images')
86
+ preserveLayout?: boolean; // Maintain document layout (default: true)
87
+ extractCharts?: boolean; // Convert charts to tables (default: true)
88
+ extractImages?: boolean; // Extract embedded images (default: true)
89
+ maxPages?: number; // Max pages for PDFs (default: unlimited)
90
+ }
91
+ ```
92
+
93
+ ### Result
94
+
95
+ ```typescript
96
+ interface ConversionResult {
97
+ markdown: string; // Generated Markdown content
98
+ images: ImageData[]; // Extracted image information
99
+ charts: ChartData[]; // Extracted chart data
100
+ metadata: DocumentMetadata; // Document metadata
101
+ }
102
+ ```
103
+
104
+ ## 🎯 Format-Specific Features
105
+
106
+ ### 📄 PDF
107
+ - ✅ **Text extraction** with layout enhancement
108
+ - ✅ **Table detection** and formatting
109
+ - ✅ **List recognition** (bullets, numbers)
110
+ - ✅ **Heading detection** (ALL CAPS, colons)
111
+ - ✅ **Page-to-image fallback** for complex layouts
112
+
113
+ ### 📝 DOCX
114
+ - ✅ **Heading hierarchy** (H1-H6)
115
+ - ✅ **Text formatting** (bold, italic)
116
+ - ✅ **Complex tables** with merged cells
117
+ - ✅ **Nested lists** with proper indentation
118
+ - ✅ **Embedded images** and charts
119
+ - ✅ **Cell styling** (alignment, colors)
120
+
121
+ ### 📊 XLSX
122
+ - ✅ **Multiple worksheets** as separate sections
123
+ - ✅ **Cell formatting** (bold, colors, alignment)
124
+ - ✅ **Data type preservation**
125
+ - ✅ **Chart extraction** to data tables
126
+ - ✅ **Conditional formatting** notes
127
+
128
+ ### 🎬 PPTX
129
+ - ✅ **Slide-by-slide** organization
130
+ - ✅ **Text positioning** and layout
131
+ - ✅ **Image placement** per slide
132
+ - ✅ **Table extraction** from slides
133
+ - ✅ **Multi-column layouts**
134
+
135
+ ## 🖼️ Image Handling
136
+
137
+ Images are automatically extracted and saved to the specified directory:
138
+
139
+ ```typescript
140
+ const result = await convert('./presentation.pptx', {
141
+ imageDir: 'my-images'
142
+ });
143
+
144
+ // Result structure:
145
+ // my-images/
146
+ // ├── image_1.png
147
+ // ├── image_2.jpg
148
+ // └── chart_1.png
149
+
150
+ // Markdown will contain:
151
+ // ![Slide 1 Image](my-images/image_1.png)
152
+ ```
153
+
154
+ ## 📊 Chart Conversion
155
+
156
+ Charts are converted to Markdown tables:
157
+
158
+ ```markdown
159
+ #### Chart 1: Sales Data
160
+
161
+ | Category | Q1 | Q2 | Q3 | Q4 |
162
+ | --- | --- | --- | --- | --- |
163
+ | Revenue | 100 | 150 | 200 | 250 |
164
+ | Profit | 20 | 30 | 45 | 60 |
165
+ ```
166
+
167
+ ## 🛡️ Error Handling
168
+
169
+ ```typescript
170
+ import {
171
+ convert,
172
+ UnsupportedFormatError,
173
+ FileNotFoundError,
174
+ ParseError
175
+ } from 'file2md';
176
+
177
+ try {
178
+ const result = await convert('./document.pdf');
179
+ } catch (error) {
180
+ if (error instanceof UnsupportedFormatError) {
181
+ console.error('Unsupported file format');
182
+ } else if (error instanceof FileNotFoundError) {
183
+ console.error('File not found');
184
+ } else if (error instanceof ParseError) {
185
+ console.error('Failed to parse document:', error.message);
186
+ }
187
+ }
188
+ ```
189
+
190
+ ## 🧪 Advanced Usage
191
+
192
+ ### Custom Error Handling
193
+
194
+ ```typescript
195
+ import { convert, ConversionError } from 'file2md';
196
+
197
+ try {
198
+ const result = await convert('./complex-document.docx');
199
+ } catch (error) {
200
+ if (error instanceof ConversionError) {
201
+ console.error(`Conversion failed [${error.code}]:`, error.message);
202
+ if (error.originalError) {
203
+ console.error('Original error:', error.originalError);
204
+ }
205
+ }
206
+ }
207
+ ```
208
+
209
+ ### Batch Processing
210
+
211
+ ```typescript
212
+ import { convert } from 'file2md';
213
+ import { readdir } from 'fs/promises';
214
+
215
+ async function convertFolder(folderPath: string) {
216
+ const files = await readdir(folderPath);
217
+ const results = [];
218
+
219
+ for (const file of files) {
220
+ if (file.match(/\.(pdf|docx|xlsx|pptx)$/i)) {
221
+ try {
222
+ const result = await convert(`${folderPath}/${file}`);
223
+ results.push({ file, success: true, result });
224
+ } catch (error) {
225
+ results.push({ file, success: false, error });
226
+ }
227
+ }
228
+ }
229
+
230
+ return results;
231
+ }
232
+ ```
233
+
234
+ ## 🏗️ Development
235
+
236
+ ### Build from Source
237
+
238
+ ```bash
239
+ git clone https://github.com/yourusername/file2md.git
240
+ cd file2md
241
+ npm install
242
+ npm run build
243
+ ```
244
+
245
+ ### Testing
246
+
247
+ ```bash
248
+ npm test # Run tests
249
+ npm run test:watch # Watch mode
250
+ npm run test:coverage # Coverage report
251
+ ```
252
+
253
+ ### Linting
254
+
255
+ ```bash
256
+ npm run lint # Check code style
257
+ npm run lint:fix # Fix issues
258
+ ```
259
+
260
+ ## 🤝 Contributing
261
+
262
+ Contributions are welcome! Please feel free to submit a Pull Request.
263
+
264
+ 1. Fork the repository
265
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
266
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
267
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
268
+ 5. Open a Pull Request
269
+
270
+ ## 📄 License
271
+
272
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
273
+
274
+ ## 🔗 Links
275
+
276
+ - [npm package](https://www.npmjs.com/package/file2md)
277
+ - [GitHub repository](https://github.com/yourusername/file2md)
278
+ - [Issues & Bug Reports](https://github.com/yourusername/file2md/issues)
279
+
280
+ ## 📊 Supported Formats
281
+
282
+ | Format | Extension | Layout | Images | Charts | Tables | Lists |
283
+ |--------|-----------|---------|---------|---------|---------|--------|
284
+ | PDF | `.pdf` | ✅ | ✅* | ❌ | ✅ | ✅ |
285
+ | Word | `.docx` | ✅ | ✅ | ✅ | ✅ | ✅ |
286
+ | Excel | `.xlsx` | ✅ | ❌ | ✅ | ✅ | ❌ |
287
+ | PowerPoint | `.pptx` | ✅ | ✅ | ✅ | ✅ | ❌ |
288
+
289
+ *PDF images via page-to-image conversion
290
+
291
+ ---
292
+
293
+ **Made with ❤️ and TypeScript**
package/dist/index.d.ts ADDED
@@ -0,0 +1,33 @@
1
+ import type { ConvertInput, ConvertOptions, ConversionResult } from './types/index.js';
2
+ /**
3
+ * Convert a document (PDF, DOCX, XLSX, PPTX) to Markdown. The format is detected from the document bytes, not from the file name.
4
+ *
5
+ * @param input - File path (string), which is read from disk, or a Buffer containing the document data
6
+ * @param options - Conversion options; the implementation defaults to imageDir 'images' and preserveLayout/extractCharts/extractImages true
7
+ * @returns Promise resolving to `{ markdown, images, charts, metadata }`
8
+ *
9
+ * @throws {FileNotFoundError} When reading the file path fails with ENOENT
10
+ * @throws {UnsupportedFormatError} When the type cannot be detected or the detected MIME type is not supported
11
+ * @throws {InvalidFileError} When the file cannot be read, the input is neither a string nor a Buffer, or any other error occurs during conversion (wrapped)
12
+ * @throws {ParseError} When document parsing fails. NOTE(review): the implementation wraps errors other than the three above in InvalidFileError; confirm ParseError can actually reach callers
13
+ *
14
+ * @example
15
+ * ```typescript
16
+ * // Convert from file path
17
+ * const result = await convert('./document.pdf');
18
+ * console.log(result.markdown);
19
+ *
20
+ * // Convert from buffer with options
21
+ * const buffer = await fs.readFile('./document.docx');
22
+ * const docxResult = await convert(buffer, {
23
+ * imageDir: 'extracted-images',
24
+ * preserveLayout: true
25
+ * });
26
+ * ```
27
+ */
28
+ export declare function convert(input: ConvertInput, options?: ConvertOptions): Promise<ConversionResult>;
29
+ export type * from './types/index.js';
30
+ export { ImageExtractor } from './utils/image-extractor.js';
31
+ export { ChartExtractor } from './utils/chart-extractor.js';
32
+ export { LayoutParser } from './utils/layout-parser.js';
33
+ //# sourceMappingURL=index.d.ts.map
package/dist/index.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EACV,YAAY,EACZ,cAAc,EACd,gBAAgB,EAGjB,MAAM,kBAAkB,CAAC;AAS1B;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAsB,OAAO,CAAC,KAAK,EAAE,YAAY,EAAE,OAAO,GAAE,cAAmB,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAmI1G;AAGD,mBAAmB,kBAAkB,CAAC;AAGtC,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,153 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import { Buffer } from 'node:buffer';
3
+ const fileType = require('file-type'); // NOTE(review): this file is an ES module (it uses import/export); Node.js does not define `require` in ES modules unless it is created with module.createRequire. Confirm this line works at runtime.
4
+ import { ImageExtractor } from './utils/image-extractor.js';
5
+ import { ChartExtractor } from './utils/chart-extractor.js';
6
+ import { parsePdf } from './parsers/pdf-parser.js';
7
+ import { parseDocx } from './parsers/docx-parser.js';
8
+ import { parseXlsx } from './parsers/xlsx-parser.js';
9
+ import { parsePptx } from './parsers/pptx-parser.js';
10
+ import { FileNotFoundError, UnsupportedFormatError, InvalidFileError, SUPPORTED_MIME_TYPES } from './types/index.js';
11
+ /**
12
+ * Convert a document (PDF, DOCX, XLSX, PPTX) to Markdown. The format is detected by passing the document bytes to fileType.fromBuffer, not from the file name.
13
+ *
14
+ * @param input - File path (string), which is read from disk, or a Buffer containing the document data
15
+ * @param options - Conversion options; defaults applied here: imageDir 'images', preserveLayout/extractCharts/extractImages true, maxPages unset
16
+ * @returns Promise resolving to `{ markdown, images, charts, metadata }`; metadata holds file type, MIME type, page count, image/chart counts and processing time in ms
17
+ *
18
+ * @throws {FileNotFoundError} When reading the file path fails with ENOENT
19
+ * @throws {UnsupportedFormatError} When the type cannot be detected or the detected MIME type is not supported
20
+ * @throws {InvalidFileError} When the file cannot be read, the input is neither a string nor a Buffer, or any other error occurs during conversion (wrapped)
21
+ * @throws {ParseError} When document parsing fails. NOTE(review): errors other than the three above are wrapped in InvalidFileError by the catch block; confirm ParseError can actually reach callers
22
+ *
23
+ * @example
24
+ * ```typescript
25
+ * // Convert from file path
26
+ * const result = await convert('./document.pdf');
27
+ * console.log(result.markdown);
28
+ *
29
+ * // Convert from buffer with options
30
+ * const buffer = await fs.readFile('./document.docx');
31
+ * const docxResult = await convert(buffer, {
32
+ * imageDir: 'extracted-images',
33
+ * preserveLayout: true
34
+ * });
35
+ * ```
36
+ */
37
+ export async function convert(input, options = {}) {
38
+ const startTime = Date.now();
39
+ try {
40
+ let buffer;
41
+ // Resolve the input to a Buffer: a string is treated as a file path and read from disk
42
+ if (typeof input === 'string') {
43
+ try {
44
+ buffer = await fs.readFile(input);
45
+ }
46
+ catch (error) {
47
+ if (error?.code === 'ENOENT') {
48
+ throw new FileNotFoundError(input);
49
+ }
50
+ throw new InvalidFileError(`Failed to read file: ${input}`, error);
51
+ }
52
+ }
53
+ else if (Buffer.isBuffer(input)) {
54
+ buffer = input;
55
+ }
56
+ else {
57
+ throw new InvalidFileError('Input must be a file path (string) or Buffer');
58
+ }
59
+ // Detect the file type from the buffer; a falsy result is reported as the 'unknown' format
60
+ const detectedType = await fileType.fromBuffer(buffer);
61
+ if (!detectedType) {
62
+ throw new UnsupportedFormatError('unknown');
63
+ }
64
+ // Reject any detected MIME type that is not one of the SUPPORTED_MIME_TYPES values
65
+ const supportedMimeTypes = Object.values(SUPPORTED_MIME_TYPES);
66
+ if (!supportedMimeTypes.includes(detectedType.mime)) {
67
+ throw new UnsupportedFormatError(detectedType.mime);
68
+ }
69
+ // Apply option defaults and create the extractors that are handed to the parsers
70
+ const { imageDir = 'images', preserveLayout = true, extractCharts = true, extractImages = true, maxPages } = options;
71
+ const imageExtractor = new ImageExtractor(imageDir);
72
+ const chartExtractor = new ChartExtractor(imageExtractor);
73
+ // Dispatch to the parser for the detected MIME type and collect its results
74
+ let markdown;
75
+ let images = [];
76
+ let charts = [];
77
+ let pageCount = 1;
78
+ let additionalMetadata = {};
79
+ switch (detectedType.mime) {
80
+ case SUPPORTED_MIME_TYPES.PDF: {
81
+ const result = await parsePdf(buffer, imageExtractor, { maxPages, preserveLayout });
82
+ markdown = result.markdown;
83
+ images = result.images || [];
84
+ pageCount = result.pageCount || 1;
85
+ additionalMetadata = result.metadata || {};
86
+ break;
87
+ }
88
+ case SUPPORTED_MIME_TYPES.DOCX: {
89
+ const result = await parseDocx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractImages, extractCharts });
90
+ markdown = result.markdown;
91
+ images = result.images || [];
92
+ charts = result.charts || [];
93
+ additionalMetadata = result.metadata || {};
94
+ break;
95
+ }
96
+ case SUPPORTED_MIME_TYPES.XLSX: {
97
+ const result = await parseXlsx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractCharts });
98
+ markdown = result.markdown;
99
+ charts = result.charts || [];
100
+ pageCount = result.sheetCount || 1; // the sheet count is reported as pageCount
101
+ additionalMetadata = result.metadata || {};
102
+ break;
103
+ }
104
+ case SUPPORTED_MIME_TYPES.PPTX: {
105
+ const result = await parsePptx(buffer, imageExtractor, chartExtractor, { preserveLayout, extractImages, extractCharts });
106
+ markdown = result.markdown;
107
+ images = result.images || [];
108
+ charts = result.charts || [];
109
+ pageCount = result.slideCount || 1; // the slide count is reported as pageCount
110
+ additionalMetadata = result.metadata || {};
111
+ break;
112
+ }
113
+ default: {
114
+ // This should never happen due to earlier validation, but TypeScript requires it
115
+ const exhaustiveCheck = detectedType.mime;
116
+ throw new UnsupportedFormatError(exhaustiveCheck);
117
+ }
118
+ }
119
+ const endTime = Date.now();
120
+ // Build metadata (processingTime is the elapsed wall-clock time of this call in milliseconds)
121
+ const metadata = {
122
+ fileType: detectedType.ext.toUpperCase(),
123
+ mimeType: detectedType.mime,
124
+ pageCount,
125
+ imageCount: images.length,
126
+ chartCount: charts.length,
127
+ processingTime: endTime - startTime,
128
+ additional: additionalMetadata
129
+ };
130
+ return {
131
+ markdown,
132
+ images,
133
+ charts,
134
+ metadata
135
+ };
136
+ }
137
+ catch (error) {
138
+ // Re-throw the three error types thrown above unchanged
139
+ if (error instanceof FileNotFoundError ||
140
+ error instanceof UnsupportedFormatError ||
141
+ error instanceof InvalidFileError) {
142
+ throw error;
143
+ }
144
+ // Wrap anything else in InvalidFileError, passing the original error as the second argument
145
+ const message = error instanceof Error ? error.message : 'Unknown conversion error';
146
+ throw new InvalidFileError(`Conversion failed: ${message}`, error);
147
+ }
148
+ }
149
+ // Export utility classes for advanced usage. NOTE(review): the error classes imported at the top of this file are not re-exported here, yet the README imports them from the package entry point; confirm whether a runtime re-export is missing.
150
+ export { ImageExtractor } from './utils/image-extractor.js';
151
+ export { ChartExtractor } from './utils/chart-extractor.js';
152
+ export { LayoutParser } from './utils/layout-parser.js';
153
+ //# sourceMappingURL=index.js.map
package/dist/index.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,MAAM,QAAQ,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;AAEtC,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,MAAM,0BAA0B,CAAC;AAUrD,OAAO,EACL,iBAAiB,EACjB,sBAAsB,EACtB,gBAAgB,EAChB,oBAAoB,EACrB,MAAM,kBAAkB,CAAC;AAE1B;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,KAAmB,EAAE,UAA0B,EAAE;IAC7E,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,IAAI,CAAC;QACH,IAAI,MAAc,CAAC;QAEnB,oBAAoB;QACpB,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;YAC9B,IAAI,CAAC;gBACH,MAAM,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;YACpC,CAAC;YAAC,OAAO,KAAc,EAAE,CAAC;gBACxB,IAAK,KAAa,EAAE,IAAI,KAAK,QAAQ,EAAE,CAAC;oBACtC,MAAM,IAAI,iBAAiB,CAAC,KAAK,CAAC,CAAC;gBACrC,CAAC;gBACD,MAAM,IAAI,gBAAgB,CAAC,wBAAwB,KAAK,EAAE,EAAE,KAAc,CAAC,CAAC;YAC9E,CAAC;QACH,CAAC;aAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAClC,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM,IAAI,gBAAgB,CAAC,8CAA8C,CAAC,CAAC;QAC7E,CAAC;QAED,mBAAmB;QACnB,MAAM,YAAY,GAAG,MAAM,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;QAEvD,IAAI,CAAC,YAAY,EAAE,CAAC;YAClB,MAAM,IAAI,sBAAsB,CAAC,SAAS,CAAC,CAAC;QAC9C,CAAC;QAED,4BAA4B;QAC5B,MAAM,kBAAkB,GAAG,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;QAC/D,IAAI,CAAC,kBAAkB,CAAC,QAAQ,CAAC,YAAY,CAAC,IAAyB,CAAC,EAAE,CAAC;YACzE,MAAM,IAAI,sBAAsB,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QACtD,CAAC;QAED,mBAAmB;QACnB,MAAM,EACJ,QAAQ,GAAG,QAAQ,EACnB,cAAc,GAAG,IAAI,EACrB,aAAa,GAAG,IAAI,EACpB,aAAa,GAAG,IAAI,EACpB,QAAQ,EACT,GAAG,OAAO,CAAC;QAEZ,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,QAAQ,CAAC,CAAC;QACpD,MAAM,cAAc,GAAG,IAAI,cAAc,CAAC,cAAc,CAAC,CAAC;QAE1D,+BAA+B;QAC/B,IAAI,QAAgB,CAAC;QACrB,IAAI,MAAM,GAAyD,EAAE,CAAC;QACtE,IAAI,MAAM,GAAyD,EAAE,CAAC;QACtE,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,IAA
I,kBAAkB,GAA4B,EAAE,CAAC;QAErD,QAAQ,YAAY,CAAC,IAAyB,EAAE,CAAC;YAC/C,KAAK,oBAAoB,CAAC,GAAG,CAAC,CAAC,CAAC;gBAC9B,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,MAAM,EAAE,cAAc,EAAE,EAAE,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;gBACpF,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,CAAC,CAAC;gBAClC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC,CAAC;gBACzH,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,CAAC,CAAC;gBAC1G,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACnC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,KAAK,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,EAAE,cAAc,EAAE,cAAc,EAAE,EAAE,cAAc,EAAE,aAAa,EAAE,aAAa,EAAE,CAAC,CAAC;gBACzH,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC;gBAC7B,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,CAAC,CAAC;gBACnC,kBAAkB,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;gBAC3C,MAAM;YACR,CAAC;YAED,OAAO,CAAC,CAAC,CAAC;gBACR,iFAAiF;gBACjF,MAAM,eAAe,GAAU,YAAY,CAAC,IAAa,CAAC;gBAC1D,MAAM,IAAI,sBAAsB,CAAC,eAAe,CAAC,CAAC;YACpD,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE3B,iBAAiB;QACjB,MAAM,QAAQ,GAAqB;YACjC,QAAQ,EAAE,YAAY,CAAC,GAAG,CAAC,WAAW,EAAE;YACxC,QAAQ,EAAE,YAAY,CAAC,IAAI;YAC3B,SAAS;YACT,UAAU,EAAE,MAAM,CAAC,MAAM;YACzB,UAAU,EAAE,MAAM,CAAC,MAAM;YACzB,cAAc,EAAE,OAAO,GAAG,SAAS;YACnC,UAAU,EAAE,kBAAkB;SAC/B,CAAC;QAEF,OAAO;YACL,QAAQ;YACR,MAAM;YACN,MAAM;YACN,QAAQ;SACT,CA
AC;IAEJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,wBAAwB;QACxB,IAAI,KAAK,YAAY,iBAAiB;YAClC,KAAK,YAAY,sBAAsB;YACvC,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,sBAAsB;QACtB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,CAAC;QACpF,MAAM,IAAI,gBAAgB,CAAC,sBAAsB,OAAO,EAAE,EAAE,KAAc,CAAC,CAAC;IAC9E,CAAC;AACH,CAAC;AAKD,4CAA4C;AAC5C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAC5D,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC"}
package/dist/parsers/docx-parser.d.ts ADDED
@@ -0,0 +1,20 @@
1
+ import type { Buffer } from 'node:buffer';
2
+ import type { ImageExtractor } from '../utils/image-extractor.js';
3
+ import type { ChartExtractor } from '../utils/chart-extractor.js';
4
+ import type { ImageData, ChartData } from '../types/interfaces.js';
5
+ export interface DocxParseOptions { // Optional flags accepted by parseDocx; every field may be omitted
6
+ readonly preserveLayout?: boolean; // convert() forwards ConvertOptions.preserveLayout here (README: "Maintain document layout")
7
+ readonly extractImages?: boolean; // convert() forwards ConvertOptions.extractImages here (README: "Extract embedded images")
8
+ readonly extractCharts?: boolean; // convert() forwards ConvertOptions.extractCharts here (README: "Convert charts to tables")
9
+ }
10
+ export interface DocxParseResult { // Value that parseDocx resolves to
11
+ readonly markdown: string; // Markdown text; convert() returns it as the result's markdown
12
+ readonly images: readonly ImageData[]; // convert() returns these as the result's images and counts them in metadata.imageCount
13
+ readonly charts: readonly ChartData[]; // convert() returns these as the result's charts and counts them in metadata.chartCount
14
+ readonly metadata: Record<string, unknown>; // Free-form key/value data; convert() exposes it as metadata.additional
15
+ }
16
+ /**
17
+ * Parse a DOCX buffer and convert it to Markdown with layout preservation; resolves to the markdown plus images, charts and metadata (see DocxParseResult). `options` is optional.
18
+ */
19
+ export declare function parseDocx(buffer: Buffer, imageExtractor: ImageExtractor, chartExtractor: ChartExtractor, options?: DocxParseOptions): Promise<DocxParseResult>;
20
+ //# sourceMappingURL=docx-parser.d.ts.map
package/dist/parsers/docx-parser.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAKV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AA4BD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CA6D1B"}