file2md 1.2.22 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -9
- package/dist/parsers/docx-parser.d.ts.map +1 -1
- package/dist/parsers/docx-parser.js +12 -17
- package/dist/parsers/docx-parser.js.map +1 -1
- package/dist/parsers/hwp-parser.js +0 -40
- package/dist/parsers/hwp-parser.js.map +1 -1
- package/dist/parsers/pdf-parser.d.ts.map +1 -1
- package/dist/parsers/pdf-parser.js +125 -13
- package/dist/parsers/pdf-parser.js.map +1 -1
- package/dist/parsers/pptx-parser.js +0 -83
- package/dist/parsers/pptx-parser.js.map +1 -1
- package/dist/utils/image-extractor.d.ts.map +1 -1
- package/dist/utils/image-extractor.js +4 -21
- package/dist/utils/image-extractor.js.map +1 -1
- package/dist/utils/pptx-visual-parser.d.ts.map +1 -1
- package/dist/utils/pptx-visual-parser.js +0 -2
- package/dist/utils/pptx-visual-parser.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -4,18 +4,20 @@
|
|
4
4
|
[](https://www.typescriptlang.org/)
|
5
5
|
[](https://opensource.org/licenses/MIT)
|
6
6
|
|
7
|
-
A modern TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX) into Markdown with **advanced layout preservation**, **image extraction**, and **
|
7
|
+
A modern TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with **advanced layout preservation**, **image extraction**, **chart conversion**, and **Korean language support**.
|
8
8
|
|
9
9
|
## ✨ Features
|
10
10
|
|
11
|
-
- 🔄 **Multiple Format Support**: PDF, DOCX, XLSX, PPTX
|
12
|
-
- 🎨 **Layout Preservation**: Maintains document structure, tables, and formatting
|
11
|
+
- 🔄 **Multiple Format Support**: PDF, DOCX, XLSX, PPTX, HWP, HWPX
|
12
|
+
- 🎨 **Layout Preservation**: Maintains document structure, tables, and formatting
|
13
13
|
- 🖼️ **Image Extraction**: Automatically extracts and references images
|
14
14
|
- 📊 **Chart Conversion**: Converts charts to Markdown tables
|
15
15
|
- 📝 **List & Table Support**: Proper nested lists and complex tables
|
16
|
+
- 🌏 **Korean Language Support**: Full support for HWP/HWPX Korean document formats
|
16
17
|
- 🔒 **Type Safety**: Full TypeScript support with comprehensive types
|
17
18
|
- ⚡ **Modern ESM**: ES2022 modules with CommonJS compatibility
|
18
19
|
- 🚀 **Zero Config**: Works out of the box
|
20
|
+
- 🎯 **Visual Parsing**: Enhanced PPTX parsing with visual layout analysis
|
19
21
|
|
20
22
|
## 📦 Installation
|
21
23
|
|
@@ -38,13 +40,38 @@ console.log(result.markdown);
|
|
38
40
|
const result = await convert('./presentation.pptx', {
|
39
41
|
imageDir: 'extracted-images',
|
40
42
|
preserveLayout: true,
|
41
|
-
extractCharts: true
|
43
|
+
extractCharts: true,
|
44
|
+
useVisualParser: true // Enhanced PPTX parsing
|
42
45
|
});
|
43
46
|
|
44
47
|
console.log(`✅ Converted successfully!`);
|
45
48
|
console.log(`📄 Markdown length: ${result.markdown.length}`);
|
46
49
|
console.log(`🖼️ Images extracted: ${result.images.length}`);
|
47
50
|
console.log(`📊 Charts found: ${result.charts.length}`);
|
51
|
+
console.log(`⏱️ Processing time: ${result.metadata.processingTime}ms`);
|
52
|
+
```
|
53
|
+
|
54
|
+
### Korean Document Support (HWP/HWPX)
|
55
|
+
|
56
|
+
```typescript
|
57
|
+
import { convert } from 'file2md';
|
58
|
+
|
59
|
+
// Convert Korean HWP document
|
60
|
+
const hwpResult = await convert('./document.hwp', {
|
61
|
+
imageDir: 'hwp-images',
|
62
|
+
preserveLayout: true,
|
63
|
+
extractImages: true
|
64
|
+
});
|
65
|
+
|
66
|
+
// Convert Korean HWPX document (XML-based format)
|
67
|
+
const hwpxResult = await convert('./document.hwpx', {
|
68
|
+
imageDir: 'hwpx-images',
|
69
|
+
preserveLayout: true,
|
70
|
+
extractImages: true
|
71
|
+
});
|
72
|
+
|
73
|
+
console.log(`🇰🇷 HWP content: ${hwpResult.markdown.substring(0, 100)}...`);
|
74
|
+
console.log(`📄 HWPX pages: ${hwpxResult.metadata.pageCount}`);
|
48
75
|
```
|
49
76
|
|
50
77
|
### CommonJS
|
@@ -83,10 +110,12 @@ const result = await convert(buffer, {
|
|
83
110
|
```typescript
|
84
111
|
interface ConvertOptions {
|
85
112
|
imageDir?: string; // Directory for extracted images (default: 'images')
|
113
|
+
outputDir?: string; // Output directory for slide screenshots (PPTX, falls back to imageDir)
|
86
114
|
preserveLayout?: boolean; // Maintain document layout (default: true)
|
87
115
|
extractCharts?: boolean; // Convert charts to tables (default: true)
|
88
116
|
extractImages?: boolean; // Extract embedded images (default: true)
|
89
117
|
maxPages?: number; // Max pages for PDFs (default: unlimited)
|
118
|
+
useVisualParser?: boolean; // Enhanced visual parsing for PPTX (default: true)
|
90
119
|
}
|
91
120
|
```
|
92
121
|
|
@@ -97,7 +126,7 @@ interface ConversionResult {
|
|
97
126
|
markdown: string; // Generated Markdown content
|
98
127
|
images: ImageData[]; // Extracted image information
|
99
128
|
charts: ChartData[]; // Extracted chart data
|
100
|
-
metadata: DocumentMetadata; // Document metadata
|
129
|
+
metadata: DocumentMetadata; // Document metadata with processing info
|
101
130
|
}
|
102
131
|
```
|
103
132
|
|
@@ -109,21 +138,24 @@ interface ConversionResult {
|
|
109
138
|
- ✅ **List recognition** (bullets, numbers)
|
110
139
|
- ✅ **Heading detection** (ALL CAPS, colons)
|
111
140
|
- ✅ **Page-to-image fallback** for complex layouts
|
141
|
+
- ✅ **Embedded image extraction** when available
|
112
142
|
|
113
|
-
### 📝 DOCX
|
143
|
+
### 📝 DOCX
|
114
144
|
- ✅ **Heading hierarchy** (H1-H6)
|
115
145
|
- ✅ **Text formatting** (bold, italic)
|
116
146
|
- ✅ **Complex tables** with merged cells
|
117
147
|
- ✅ **Nested lists** with proper indentation
|
118
148
|
- ✅ **Embedded images** and charts
|
119
149
|
- ✅ **Cell styling** (alignment, colors)
|
150
|
+
- ✅ **Font size preservation** and formatting
|
120
151
|
|
121
152
|
### 📊 XLSX
|
122
153
|
- ✅ **Multiple worksheets** as separate sections
|
123
154
|
- ✅ **Cell formatting** (bold, colors, alignment)
|
124
|
-
- ✅ **Data type preservation**
|
155
|
+
- ✅ **Data type preservation**
|
125
156
|
- ✅ **Chart extraction** to data tables
|
126
157
|
- ✅ **Conditional formatting** notes
|
158
|
+
- ✅ **Shared strings** handling for large files
|
127
159
|
|
128
160
|
### 🎬 PPTX
|
129
161
|
- ✅ **Slide-by-slide** organization
|
@@ -131,6 +163,24 @@ interface ConversionResult {
|
|
131
163
|
- ✅ **Image placement** per slide
|
132
164
|
- ✅ **Table extraction** from slides
|
133
165
|
- ✅ **Multi-column layouts**
|
166
|
+
- ✅ **Visual parsing** with enhanced layout analysis
|
167
|
+
- ✅ **Title extraction** from document properties
|
168
|
+
- ✅ **Chart and image** inline embedding
|
169
|
+
|
170
|
+
### 🇰🇷 HWP (Korean)
|
171
|
+
- ✅ **Binary format** parsing using hwp.js
|
172
|
+
- ✅ **Korean text extraction** with proper encoding
|
173
|
+
- ✅ **Image extraction** from embedded content
|
174
|
+
- ✅ **Layout preservation** for Korean documents
|
175
|
+
- ✅ **Copyright message filtering** for clean output
|
176
|
+
|
177
|
+
### 🇰🇷 HWPX (Korean XML)
|
178
|
+
- ✅ **XML-based format** parsing with JSZip
|
179
|
+
- ✅ **Multiple section support** for large documents
|
180
|
+
- ✅ **Relationship mapping** for image references
|
181
|
+
- ✅ **OWPML structure** parsing
|
182
|
+
- ✅ **Enhanced Korean text** processing
|
183
|
+
- ✅ **BinData image extraction** from ZIP archive
|
134
184
|
|
135
185
|
## 🖼️ Image Handling
|
136
186
|
|
@@ -285,9 +335,161 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
|
|
285
335
|
| Word | `.docx` | ✅ | ✅ | ✅ | ✅ | ✅ |
|
286
336
|
| Excel | `.xlsx` | ✅ | ❌ | ✅ | ✅ | ❌ |
|
287
337
|
| PowerPoint | `.pptx` | ✅ | ✅ | ✅ | ✅ | ❌ |
|
338
|
+
| HWP | `.hwp` | ✅ | ✅ | ❌ | ❌ | ✅ |
|
339
|
+
| HWPX | `.hwpx` | ✅ | ✅ | ❌ | ❌ | ✅ |
|
340
|
+
|
341
|
+
*PDF images via page-to-image conversion or embedded extraction
|
342
|
+
|
343
|
+
## 🌏 Korean Document Support
|
344
|
+
|
345
|
+
file2md includes comprehensive support for Korean document formats:
|
346
|
+
|
347
|
+
### HWP (한글)
|
348
|
+
- **Binary format** used by Hangul (한글) word processor
|
349
|
+
- **Legacy format** still widely used in Korean organizations
|
350
|
+
- **Full text extraction** with Korean character encoding
|
351
|
+
- **Image and chart** extraction support
|
352
|
+
|
353
|
+
### HWPX (한글 XML)
|
354
|
+
- **Modern XML-based** format, successor to HWP
|
355
|
+
- **ZIP archive structure** with XML content files
|
356
|
+
- **Enhanced parsing** with relationship mapping
|
357
|
+
- **Multiple sections** and complex document support
|
358
|
+
|
359
|
+
### Usage Examples
|
360
|
+
|
361
|
+
```typescript
|
362
|
+
// Convert Korean documents
|
363
|
+
const koreanDocs = [
|
364
|
+
'report.hwp', // Legacy binary format
|
365
|
+
'document.hwpx', // Modern XML format
|
366
|
+
'presentation.pptx'
|
367
|
+
];
|
368
|
+
|
369
|
+
for (const doc of koreanDocs) {
|
370
|
+
const result = await convert(doc, {
|
371
|
+
imageDir: 'korean-docs-images',
|
372
|
+
preserveLayout: true
|
373
|
+
});
|
374
|
+
|
375
|
+
console.log(`📄 ${doc}: ${result.markdown.length} characters`);
|
376
|
+
console.log(`🖼️ Images: ${result.images.length}`);
|
377
|
+
console.log(`⏱️ Processed in ${result.metadata.processingTime}ms`);
|
378
|
+
}
|
379
|
+
```
|
380
|
+
|
381
|
+
## 🔧 Advanced Configuration
|
382
|
+
|
383
|
+
### Performance Optimization
|
384
|
+
|
385
|
+
```typescript
|
386
|
+
import { convert } from 'file2md';
|
387
|
+
|
388
|
+
// Optimize for large documents
|
389
|
+
const result = await convert('./large-document.pdf', {
|
390
|
+
maxPages: 50, // Limit PDF processing
|
391
|
+
extractImages: false, // Disable images for speed
|
392
|
+
preserveLayout: true // Keep layout analysis
|
393
|
+
});
|
394
|
+
|
395
|
+
// Enhanced PPTX processing
|
396
|
+
const pptxResult = await convert('./presentation.pptx', {
|
397
|
+
useVisualParser: true, // Enable visual layout analysis
|
398
|
+
outputDir: 'slides', // Separate directory for slides
|
399
|
+
extractCharts: true, // Extract chart data
|
400
|
+
extractImages: true // Extract embedded images
|
401
|
+
});
|
402
|
+
```
|
403
|
+
|
404
|
+
### Error Handling for Korean Documents
|
405
|
+
|
406
|
+
```typescript
|
407
|
+
import { convert, ParseError } from 'file2md';
|
408
|
+
|
409
|
+
try {
|
410
|
+
const result = await convert('./korean-document.hwp');
|
411
|
+
console.log('Korean document converted successfully');
|
412
|
+
} catch (error) {
|
413
|
+
if (error instanceof ParseError) {
|
414
|
+
console.error(`Failed to parse ${error.format} document:`, error.message);
|
415
|
+
// Handle Korean-specific parsing errors
|
416
|
+
if (error.format === 'HWP' || error.format === 'HWPX') {
|
417
|
+
console.log('Try converting to HWPX format for better compatibility');
|
418
|
+
}
|
419
|
+
}
|
420
|
+
}
|
421
|
+
```
|
422
|
+
|
423
|
+
## 📈 Performance Metrics
|
424
|
+
|
425
|
+
The library provides detailed performance metrics in the metadata:
|
426
|
+
|
427
|
+
```typescript
|
428
|
+
const result = await convert('./document.docx');
|
429
|
+
|
430
|
+
console.log('Performance Metrics:');
|
431
|
+
console.log(`- Processing time: ${result.metadata.processingTime}ms`);
|
432
|
+
console.log(`- Pages processed: ${result.metadata.pageCount}`);
|
433
|
+
console.log(`- Images extracted: ${result.metadata.imageCount}`);
|
434
|
+
console.log(`- Charts found: ${result.metadata.chartCount}`);
|
435
|
+
console.log(`- File type: ${result.metadata.fileType}`);
|
436
|
+
console.log(`- MIME type: ${result.metadata.mimeType}`);
|
437
|
+
```
|
438
|
+
|
439
|
+
## 🤝 Contributing
|
440
|
+
|
441
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
442
|
+
|
443
|
+
1. Fork the repository
|
444
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
445
|
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
446
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
447
|
+
5. Open a Pull Request
|
448
|
+
|
449
|
+
### Development Setup
|
450
|
+
|
451
|
+
```bash
|
452
|
+
# Clone the repository
|
453
|
+
git clone https://github.com/ricky-clevi/file2md.git
|
454
|
+
cd file2md
|
455
|
+
|
456
|
+
# Install dependencies
|
457
|
+
npm install
|
458
|
+
|
459
|
+
# Run tests
|
460
|
+
npm test
|
461
|
+
|
462
|
+
# Build the project
|
463
|
+
npm run build
|
288
464
|
|
289
|
-
|
465
|
+
# Run linting
|
466
|
+
npm run lint
|
467
|
+
```
|
468
|
+
|
469
|
+
### Testing Korean Documents
|
470
|
+
|
471
|
+
When testing Korean document support:
|
472
|
+
|
473
|
+
```bash
|
474
|
+
# Run specific tests for Korean formats
|
475
|
+
npm test -- --testNamePattern="HWP"
|
476
|
+
|
477
|
+
# Run with coverage for Korean parsers
|
478
|
+
npm run test:coverage -- --collectCoverageFrom="src/parsers/hwp-*.ts"
|
479
|
+
```
|
480
|
+
|
481
|
+
## 📄 License
|
482
|
+
|
483
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
484
|
+
|
485
|
+
## 🔗 Links
|
486
|
+
|
487
|
+
- [npm package](https://www.npmjs.com/package/file2md)
|
488
|
+
- [GitHub repository](https://github.com/ricky-clevi/file2md)
|
489
|
+
- [Issues & Bug Reports](https://github.com/ricky-clevi/file2md/issues)
|
490
|
+
- [Korean Document Format Info](https://www.hancom.com/)
|
290
491
|
|
291
492
|
---
|
292
493
|
|
293
|
-
**Made with ❤️ and TypeScript**
|
494
|
+
**Made with ❤️ and TypeScript**
|
495
|
+
**🇰🇷 Enhanced with Korean document support**
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"file":"docx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAKV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AAuBD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,
|
1
|
+
{"version":3,"file":"docx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAKV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AAuBD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CAoG1B"}
|
@@ -24,17 +24,12 @@ export async function parseDocx(buffer, imageExtractor, chartExtractor, options
|
|
24
24
|
// Initialize layout parser
|
25
25
|
const layoutParser = new LayoutParser();
|
26
26
|
const xmlContent = await documentXml.async('string');
|
27
|
-
console.log('[DEBUG] DOCX XML content length:', xmlContent.length);
|
28
|
-
console.log('[DEBUG] DOCX XML content preview (first 500 chars):', xmlContent.substring(0, 500));
|
29
|
-
// Check for XML namespaces in the content
|
30
|
-
const namespaceMatches = xmlContent.match(/xmlns:[^=]+="[^"]+"/g);
|
31
|
-
console.log('[DEBUG] Found XML namespaces:', namespaceMatches);
|
32
27
|
// Try parsing with different options to handle namespaces
|
33
28
|
const parseOptions = {
|
34
29
|
explicitCharkey: false,
|
35
30
|
trim: true,
|
36
31
|
normalize: true,
|
37
|
-
explicitRoot:
|
32
|
+
explicitRoot: true, // Keep the root element
|
38
33
|
emptyTag: null,
|
39
34
|
explicitChildren: false,
|
40
35
|
charsAsChildren: false,
|
@@ -45,18 +40,20 @@ export async function parseDocx(buffer, imageExtractor, chartExtractor, options
|
|
45
40
|
tagNameProcessors: [],
|
46
41
|
valueProcessors: []
|
47
42
|
};
|
48
|
-
console.log('[DEBUG] Attempting XML parse with options:', JSON.stringify(parseOptions, null, 2));
|
49
43
|
const result = await parseStringPromise(xmlContent, parseOptions);
|
50
|
-
console.log('[DEBUG] Parsed XML result keys:', Object.keys(result));
|
51
|
-
console.log('[DEBUG] Full parsed XML structure:', JSON.stringify(result, null, 2));
|
52
44
|
// Handle both array and non-array XML parsing results
|
53
|
-
|
54
|
-
|
55
|
-
if (
|
56
|
-
|
45
|
+
// The structure should be: result['w:document'] -> document element
|
46
|
+
let document;
|
47
|
+
if (result['w:document']) {
|
48
|
+
// If w:document exists directly
|
49
|
+
document = Array.isArray(result['w:document']) ? result['w:document'][0] : result['w:document'];
|
50
|
+
}
|
51
|
+
else if (result['w:body']) {
|
52
|
+
// If w:body is at the top level (no document wrapper), create a synthetic document
|
53
|
+
document = { 'w:body': Array.isArray(result['w:body']) ? result['w:body'] : [result['w:body']] };
|
54
|
+
}
|
55
|
+
else {
|
57
56
|
// Check if document exists under a different key
|
58
|
-
const alternativeKeys = Object.keys(result).filter(key => key.toLowerCase().includes('document'));
|
59
|
-
console.log('[DEBUG] Alternative document-related keys:', alternativeKeys);
|
60
57
|
throw new ParseError('DOCX', 'Invalid DOCX structure - Missing document element', new Error('Missing document element'));
|
61
58
|
}
|
62
59
|
const body = document['w:body']?.[0];
|
@@ -297,7 +294,6 @@ async function extractImageFromRun(run, imageExtractor, extractedImages) {
|
|
297
294
|
img.originalPath.includes('image'));
|
298
295
|
if (matchingImage && matchingImage.savedPath) {
|
299
296
|
const filename = path.basename(matchingImage.savedPath);
|
300
|
-
console.log(`📎 Found DOCX image: ${filename} (ID: ${imageId})`);
|
301
297
|
return imageExtractor.getImageMarkdown('Document Image', filename);
|
302
298
|
}
|
303
299
|
}
|
@@ -306,7 +302,6 @@ async function extractImageFromRun(run, imageExtractor, extractedImages) {
|
|
306
302
|
const img = extractedImages.find(img => img.savedPath);
|
307
303
|
if (img) {
|
308
304
|
const filename = path.basename(img.savedPath);
|
309
|
-
console.log(`📎 Using fallback DOCX image: ${filename}`);
|
310
305
|
return imageExtractor.getImageMarkdown('Document Image', filename);
|
311
306
|
}
|
312
307
|
}
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"file":"docx-parser.js","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,kBAAkB,EAAE,MAAM,QAAQ,CAAC;AAC5C,OAAO,IAAI,MAAM,WAAW,CAAC;AAK7B,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AA4ClE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,MAAc,EACd,cAA8B,EAC9B,cAA8B,EAC9B,UAA4B,EAAE;IAE9B,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAElD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,gBAAgB,CAAC,yCAAyC,CAAC,CAAC;QACxE,CAAC;QAED,uBAAuB;QACvB,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,4BAA4B;QAC5B,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,2BAA2B;QAC3B,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QAExC,MAAM,UAAU,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,kCAAkC,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC;QACnE,OAAO,CAAC,GAAG,CAAC,qDAAqD,EAAE,UAAU,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;QAEjG,0CAA0C;QAC1C,MAAM,gBAAgB,GAAG,UAAU,CAAC,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAClE,OAAO,CAAC,GAAG,CAAC,+BAA+B,EAAE,gBAAgB,CAAC,CAAC;QAE/D,0DAA0D;QAC1D,MAAM,YAAY,GAAG;YACnB,eAAe,EAAE,KAAK;YACtB,IAAI,EAAE,IAAI;YACV,SAAS,EAAE,IAAI;YACf,YAAY,EAAE,KAAK;YACnB,QAAQ,EAAE,IAAW;YACrB,gBAAgB,EAAE,KAAK;YACvB,eAAe,EAAE,KAAK;YACtB,iBAAiB,EAAE,KAAK;YACxB,UAAU,EAAE,KAAK;YACjB,kBAAkB,EAAE,EAAW;YAC/B,mBAAmB,EAAE,EAAW;YAChC,iBAAiB,EAAE,EAAW;YAC9B,eAAe,EAAE,EAAW;SAC7B,CAAC;QAEF,OAAO,CAAC,GAAG,CAAC,4CAA4C,EAAE,IAAI,CAAC,SAAS,CAAC,YAAY,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QACjG,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,YAAY,CAAiB,CAAC;QAClF,OAAO,CAAC,GAAG,CAAC,iCAAiC,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QACpE,OAAO,CAAC,GAAG,CAAC,oCAAoC,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAEnF,sDAA
sD;QACtD,MAAM,QAAQ,GAAkD,MAAM,CAAC,YAAY,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1F,OAAO,CAAC,GAAG,CAAC,iCAAiC,EAAE,CAAC,CAAC,QAAQ,CAAC,CAAC;QAE3D,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,CAAC,GAAG,CAAC,oDAAoD,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;YACvF,iDAAiD;YACjD,MAAM,eAAe,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC;YAClG,OAAO,CAAC,GAAG,CAAC,4CAA4C,EAAE,eAAe,CAAC,CAAC;YAC3E,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,mDAAmD,EAAE,IAAI,KAAK,CAAC,0BAA0B,CAAC,CAAC,CAAC;QAC3H,CAAC;QAED,MAAM,IAAI,GAAyB,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3D,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,gDAAgD,EAAE,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC,CAAC;QACrH,CAAC;QACD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,qBAAqB;QACrB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACjF,IAAI,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;gBACrB,QAAQ,IAAI,GAAG,SAAS,MAAM,CAAC;YACjC,CAAC;QACH,CAAC;QAED,iBAAiB;QACjB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,aAAa,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACrG,IAAI,aAAa,CAAC,IAAI,EAAE,EAAE,CAAC;gBACzB,QAAQ,IAAI,GAAG,aAAa,MAAM,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;YACzB,MAAM,EAAE,eAAe;YACvB,MAAM,EAAE,eAAe,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;YAChD,QAAQ,EAAE;gBACR,cAAc,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;gBAC1C,UAAU,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;aACzC;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QACzE,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACxD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,KAAc,EACd,YAA0B,EAC1B,cAA8B,EAC9B,eAAqC;IAErC,MAAM,SAAS,GAAG,KAAwC,CAAC;IAC3D,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,I
AAI,EAAE,CAAC;IACrC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,WAAW,GAAc,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IAE5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAI,GAAuC,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACrE,MAAM,OAAO,GAAY,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,QAAQ,GAAa;gBACzB,IAAI,EAAE,EAAE;gBACR,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,KAAK;gBACb,SAAS,EAAE,MAAuB;gBAClC,eAAe,EAAE,SAAS;gBAC1B,OAAO,EAAE,CAAC;gBACV,OAAO,EAAE,CAAC;gBACV,MAAM,EAAE,KAAK;aACd,CAAC;YAEF,0BAA0B;YAC1B,MAAM,IAAI,GAAI,IAAsK,CAAC,QAAQ,CAAC,CAAC;YAC/L,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACd,yBAAyB;gBACzB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,EAAE,CAAC;oBAC1B,QAAQ,CAAC,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;gBACvE,CAAC;gBACD,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC;oBACxB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;gBACzB,CAAC;gBAED,6BAA6B;gBAC7B,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;oBAClC,QAAQ,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gBACxD,CAAC;YACH,CAAC;YAED,uBAAuB;YACvB,MAAM,WAAW,GAAI,IAAuC,CAAC;YAC7D,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,SAAS,GAAa,EAAE,CAAC;gBAC/B,KAAK,MAAM,SAAS,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC3C,MAAM,aAAa,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;oBAC/F,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;wBAC9B,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;wBAEnC,oCAAoC;wBACpC,IAAI,aAAa,CAAC,IAAI;4BAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;wBAC7C,IAAI,aAAa,CAAC,MAAM;4BAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;wBACjD,IAAI,aAAa,CAAC,SAAS,KAAK,MAAM;4BAAE,QAAQ,CAAC,SAAS,GAAG,aAAa,CAAC,SAAS,CAAC;oBACvF,CAAC;gBACH,CAAC;gBACD,QAAQ,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,YAAY,CAAC,kBAAkB,CAAC
,WAAW,EAAE;QAClD,iBAAiB,EAAE,IAAI;QACvB,WAAW,EAAE,IAAI;QACjB,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,sBAAsB,CACnC,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,IAAI,GAAG,SAAyO,CAAC;IACvP,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,IAAI,IAAI,GAAG,KAAK,CAAC;IACjB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAkB,MAAM,CAAC;IACtC,IAAI,QAAQ,GAAoB,QAAQ,CAAC;IACzC,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,6BAA6B;IAC7B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;IAC1B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACb,kBAAkB;QAClB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAC3C,SAAS,GAAG,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC;gBACtE,CAAC,CAAC,UAAU;gBACZ,CAAC,CAAC,MAAM,CAAkB,CAAC;QAC/B,CAAC;QAED,uBAAuB;QACvB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;YACtB,MAAM,GAAG,IAAI,CAAC;YACd,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnC,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC9B,4BAA4B;YAC5B,IAAK,GAAqD,CAAC,WAAW,CAAC,IAAK,GAAqD,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5I,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;gBACjF,IAAI,QAAQ,EAAE,CAAC;oBACb,IAAI,IAAI,GAAG,QAAQ,IAAI,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,+BAA+B;YAC/B,MAAM,WAAW,GAAI,GAAuD,CAAC;YAC7E,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;gBACvB,IAAI,OAAO,GAAG,EAAE,CAAC;gBACjB,KAAK,MAAM,WAAW,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7C,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;wBACpC,OAAO,IAAI,WAAW,CAAC;oBACzB,CAAC;yBAAM,IAAI,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC;wBAChF,OAAO,IAAK,WAA6B,CAAC,CAAC,CAAC;oBAC9C,CAAC;gBACH,CAAC;gBAED,mBAAmB;gBACnB,MAAM,GAAG,GAAI,
GAAiH,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC7I,IAAI,GAAG,EAAE,CAAC;oBACR,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,KAAK,OAAO,IAAI,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC;oBACd,CAAC;oBACD,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,IAAI,OAAO,GAAG,CAAC;wBACzB,MAAM,GAAG,IAAI,CAAC;oBAChB,CAAC;oBACD,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;wBAC5B,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,gCAAgC;oBACrF,CAAC;gBACH,CAAC;gBAED,IAAI,IAAI,OAAO,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACtC,IAAI,GAAG,GAAG,MAAM,KAAK,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;IACrC,CAAC;IAED,2BAA2B;IAC3B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC7C,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACtC,IAAI,KAAK,EAAE,CAAC;gBACV,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC5C,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;gBACrD,IAAI,GAAG,GAAG,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACpC,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,QAAQ,KAAK,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QACxC,IAAI,GAAG,YAAY,CAAC,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACrD,CAAC;IAED,OAAO;QACL,IAAI;QACJ,IAAI;QACJ,MAAM;QACN,SAAS;QACT,QAAQ;QACR,MAAM;QACN,SAAS;KACV,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;IAC9F,OAAO,YAAY,CAAC,IAAI,CAAC;AAC3B,CAAC;AAED,KAAK,UAAU,mBAAmB,CAChC,GAAY,EACZ,cAA8B,EAC9B,eAAqC;IAErC,MAAM,OAAO,GAAG,GAmBf,CAAC;IAEF,IAAI,OAAO,GA
AkB,IAAI,CAAC;IAElC,oDAAoD;IACpD,IAAI,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC;QACzB,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,WAAW,GAAG,OAAO,EAAE,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,GAAG,GAAG,WAAW,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,QAAQ,GAAG,GAAG,EAAE,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,IAAI,GAAG,QAAQ,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,IAAI,IAAI,EAAE,CAAC,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC;YACzB,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,4CAA4C;IAC5C,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,KAAK,EAAE,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE9C,IAAI,SAAS,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,EAAE,CAAC;YAC3B,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,IAAI,OAAO,EAAE,CAAC;QACZ,8DAA8D;QAC9D,MAAM,aAAa,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAC/C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,OAAO,CAAC;YAClC,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,MAAM,CAAC;YAC3C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,MAAM,CAAC;YAC3C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,OAAO,CAAC;YAC5C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,OAAO,CAAC,CACnC,CAAC;QAEF,IAAI,aAAa,IAAI,aAAa,CAAC,SAAS,EAAE,CAAC;YAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;YACxD,OAAO,CAAC,GAAG,CAAC,wBAAwB,QAAQ,SAAS,OAAO,GAAG,CAAC,CAAC;YACjE,OAAO,cAAc,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvD,IAAI,GAAG,EAAE,CAAC;YACR,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;YAC9C,OAAO,CAAC,GAAG,CAAC,iCAAiC,QAAQ,EAAE,CAAC,CAAC;YACzD,OAAO,cAAc,CAAC,
gBAAgB,CAAC,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
1
|
+
{"version":3,"file":"docx-parser.js","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,kBAAkB,EAAE,MAAM,QAAQ,CAAC;AAC5C,OAAO,IAAI,MAAM,WAAW,CAAC;AAK7B,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AA4ClE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,MAAc,EACd,cAA8B,EAC9B,cAA8B,EAC9B,UAA4B,EAAE;IAE9B,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAElD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,gBAAgB,CAAC,yCAAyC,CAAC,CAAC;QACxE,CAAC;QAED,uBAAuB;QACvB,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,4BAA4B;QAC5B,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,2BAA2B;QAC3B,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QAExC,MAAM,UAAU,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAGrD,0DAA0D;QAC1D,MAAM,YAAY,GAAG;YACnB,eAAe,EAAE,KAAK;YACtB,IAAI,EAAE,IAAI;YACV,SAAS,EAAE,IAAI;YACf,YAAY,EAAE,IAAI,EAAG,wBAAwB;YAC7C,QAAQ,EAAE,IAAW;YACrB,gBAAgB,EAAE,KAAK;YACvB,eAAe,EAAE,KAAK;YACtB,iBAAiB,EAAE,KAAK;YACxB,UAAU,EAAE,KAAK;YACjB,kBAAkB,EAAE,EAAW;YAC/B,mBAAmB,EAAE,EAAW;YAChC,iBAAiB,EAAE,EAAW;YAC9B,eAAe,EAAE,EAAW;SAC7B,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,YAAY,CAAQ,CAAC;QAEzE,sDAAsD;QACtD,oEAAoE;QACpE,IAAI,QAAuD,CAAC;QAE5D,IAAI,MAAM,CAAC,YAAY,CAAC,EAAE,CAAC;YACzB,gCAAgC;YAChC,QAAQ,GAAG,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC;QAClG,CAAC;aAAM,IAAI,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC5B,mFAAmF;YACnF,QAAQ,GAAG,EAAE,QAAQ,EAAE,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAQ,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;QAC1G,CAAC;aAAM,CAAC;YACN,iDAAiD;YACjD,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,
mDAAmD,EAAE,IAAI,KAAK,CAAC,0BAA0B,CAAC,CAAC,CAAC;QAC3H,CAAC;QAGD,MAAM,IAAI,GAAyB,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE3D,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,gDAAgD,EAAE,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC,CAAC;QACrH,CAAC;QACD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,qBAAqB;QACrB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACjF,IAAI,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;gBACrB,QAAQ,IAAI,GAAG,SAAS,MAAM,CAAC;YACjC,CAAC;QACH,CAAC;QAED,iBAAiB;QACjB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,aAAa,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACrG,IAAI,aAAa,CAAC,IAAI,EAAE,EAAE,CAAC;gBACzB,QAAQ,IAAI,GAAG,aAAa,MAAM,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;YACzB,MAAM,EAAE,eAAe;YACvB,MAAM,EAAE,eAAe,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;YAChD,QAAQ,EAAE;gBACR,cAAc,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;gBAC1C,UAAU,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;aACzC;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QACzE,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACxD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,KAAc,EACd,YAA0B,EAC1B,cAA8B,EAC9B,eAAqC;IAErC,MAAM,SAAS,GAAG,KAAwC,CAAC;IAC3D,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IACrC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,WAAW,GAAc,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IAE5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAI,GAAuC,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QACrE,MAAM,OAAO,GAAY,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,QAAQ,GAAa;gBACzB,IAAI,EAAE,EAAE;gBACR,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,KAAK;gBACb,SAAS,EAAE,MAAuB;gBAClC,eAAe,EAAE,SAAS;gBAC1B,OAAO,EAAE,CAAC;gBACV,OAAO,EAAE,CAAC;g
BACV,MAAM,EAAE,KAAK;aACd,CAAC;YAEF,0BAA0B;YAC1B,MAAM,IAAI,GAAI,IAAsK,CAAC,QAAQ,CAAC,CAAC;YAC/L,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACd,yBAAyB;gBACzB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,EAAE,CAAC;oBAC1B,QAAQ,CAAC,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;gBACvE,CAAC;gBACD,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC;oBACxB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;gBACzB,CAAC;gBAED,6BAA6B;gBAC7B,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;oBAClC,QAAQ,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gBACxD,CAAC;YACH,CAAC;YAED,uBAAuB;YACvB,MAAM,WAAW,GAAI,IAAuC,CAAC;YAC7D,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,SAAS,GAAa,EAAE,CAAC;gBAC/B,KAAK,MAAM,SAAS,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC3C,MAAM,aAAa,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;oBAC/F,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;wBAC9B,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;wBAEnC,oCAAoC;wBACpC,IAAI,aAAa,CAAC,IAAI;4BAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;wBAC7C,IAAI,aAAa,CAAC,MAAM;4BAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;wBACjD,IAAI,aAAa,CAAC,SAAS,KAAK,MAAM;4BAAE,QAAQ,CAAC,SAAS,GAAG,aAAa,CAAC,SAAS,CAAC;oBACvF,CAAC;gBACH,CAAC;gBACD,QAAQ,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,YAAY,CAAC,kBAAkB,CAAC,WAAW,EAAE;QAClD,iBAAiB,EAAE,IAAI;QACvB,WAAW,EAAE,IAAI;QACjB,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,sBAAsB,CACnC,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,IAAI,GAAG,SAAyO,CAAC;IACvP,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,IAAI,IAAI,GAAG,KAAK,CAAC;IACjB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAkB,MAAM,CAAC;IACtC,IAAI,QAAQ,GAAoB,QAAQ,CAAC;IACzC,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,6BAA6B;IAC7B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;IAC1B,IAAI,GAAG
,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACb,kBAAkB;QAClB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAC3C,SAAS,GAAG,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC;gBACtE,CAAC,CAAC,UAAU;gBACZ,CAAC,CAAC,MAAM,CAAkB,CAAC;QAC/B,CAAC;QAED,uBAAuB;QACvB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;YACtB,MAAM,GAAG,IAAI,CAAC;YACd,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnC,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC9B,4BAA4B;YAC5B,IAAK,GAAqD,CAAC,WAAW,CAAC,IAAK,GAAqD,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5I,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;gBACjF,IAAI,QAAQ,EAAE,CAAC;oBACb,IAAI,IAAI,GAAG,QAAQ,IAAI,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,+BAA+B;YAC/B,MAAM,WAAW,GAAI,GAAuD,CAAC;YAC7E,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;gBACvB,IAAI,OAAO,GAAG,EAAE,CAAC;gBACjB,KAAK,MAAM,WAAW,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7C,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;wBACpC,OAAO,IAAI,WAAW,CAAC;oBACzB,CAAC;yBAAM,IAAI,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC;wBAChF,OAAO,IAAK,WAA6B,CAAC,CAAC,CAAC;oBAC9C,CAAC;gBACH,CAAC;gBAED,mBAAmB;gBACnB,MAAM,GAAG,GAAI,GAAiH,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC7I,IAAI,GAAG,EAAE,CAAC;oBACR,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,KAAK,OAAO,IAAI,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC;oBACd,CAAC;oBACD,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,IAAI,OAAO,GAAG,CAAC;wBACzB,MAAM,GAAG,IAAI,CAAC;oBAChB,CAAC;oBACD,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;wBAC5B,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GA
AG,CAAC,CAAC,CAAC,gCAAgC;oBACrF,CAAC;gBACH,CAAC;gBAED,IAAI,IAAI,OAAO,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACtC,IAAI,GAAG,GAAG,MAAM,KAAK,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;IACrC,CAAC;IAED,2BAA2B;IAC3B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC7C,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACtC,IAAI,KAAK,EAAE,CAAC;gBACV,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBAC5C,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;gBACrD,IAAI,GAAG,GAAG,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACpC,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,QAAQ,KAAK,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QACxC,IAAI,GAAG,YAAY,CAAC,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACrD,CAAC;IAED,OAAO;QACL,IAAI;QACJ,IAAI;QACJ,MAAM;QACN,SAAS;QACT,QAAQ;QACR,MAAM;QACN,SAAS;KACV,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;IAC9F,OAAO,YAAY,CAAC,IAAI,CAAC;AAC3B,CAAC;AAED,KAAK,UAAU,mBAAmB,CAChC,GAAY,EACZ,cAA8B,EAC9B,eAAqC;IAErC,MAAM,OAAO,GAAG,GAmBf,CAAC;IAEF,IAAI,OAAO,GAAkB,IAAI,CAAC;IAElC,oDAAoD;IACpD,IAAI,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC;QACzB,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,MAAM,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC3C,MAAM,WAAW,GAAG,OAAO,EAAE,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,GAAG,GAAG,WAAW,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,QAAQ,GAAG,GAAG,EAAE,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5C
,MAAM,IAAI,GAAG,QAAQ,EAAE,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAEvC,IAAI,IAAI,EAAE,CAAC,EAAE,CAAC,SAAS,CAAC,EAAE,CAAC;YACzB,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IAED,4CAA4C;IAC5C,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClC,MAAM,IAAI,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,KAAK,EAAE,CAAC,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAE9C,IAAI,SAAS,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,EAAE,CAAC;YAC3B,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,IAAI,OAAO,EAAE,CAAC;QACZ,8DAA8D;QAC9D,MAAM,aAAa,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAC/C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,OAAO,CAAC;YAClC,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,MAAM,CAAC;YAC3C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,MAAM,CAAC;YAC3C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,OAAO,OAAO,CAAC;YAC5C,GAAG,CAAC,YAAY,CAAC,QAAQ,CAAC,OAAO,CAAC,CACnC,CAAC;QAEF,IAAI,aAAa,IAAI,aAAa,CAAC,SAAS,EAAE,CAAC;YAC7C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;YACxD,OAAO,cAAc,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvD,IAAI,GAAG,EAAE,CAAC;YACR,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;YAC9C,OAAO,cAAc,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,QAAQ,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
@@ -200,11 +200,6 @@ async function parseHwpBinary(buffer, imageExtractor, _chartExtractor, options)
|
|
200
200
|
if (!viewer) {
|
201
201
|
throw new Error('Viewer instance is null or undefined');
|
202
202
|
}
|
203
|
-
// Verify viewer has expected properties
|
204
|
-
const viewerObj = viewer;
|
205
|
-
if (viewerObj && typeof viewerObj === 'object') {
|
206
|
-
console.log('Viewer created successfully');
|
207
|
-
}
|
208
203
|
}
|
209
204
|
catch (viewerError) {
|
210
205
|
console.warn('Failed to initialize hwp.js Viewer:', viewerError);
|
@@ -268,7 +263,6 @@ async function parseHwpxXml(buffer, imageExtractor, _chartExtractor, options) {
|
|
268
263
|
const zip = await JSZip.loadAsync(buffer);
|
269
264
|
// Log all files in the ZIP for debugging
|
270
265
|
const allFiles = Object.keys(zip.files);
|
271
|
-
console.log('HWPX archive contains files:', allFiles);
|
272
266
|
// Find main content files in HWPX (OWPML format)
|
273
267
|
// HWPX structure typically has sections in Contents/section0.xml, section1.xml, etc.
|
274
268
|
const contentFiles = [
|
@@ -287,14 +281,8 @@ async function parseHwpxXml(buffer, imageExtractor, _chartExtractor, options) {
|
|
287
281
|
];
|
288
282
|
// Try to find any section files
|
289
283
|
const sectionFiles = allFiles.filter(f => f.match(/Contents\/section\d+\.xml/));
|
290
|
-
if (sectionFiles.length > 0) {
|
291
|
-
console.log('Found section files:', sectionFiles);
|
292
|
-
}
|
293
284
|
// Try to find XML files
|
294
285
|
const xmlFiles = allFiles.filter(f => f.endsWith('.xml'));
|
295
|
-
if (xmlFiles.length > 0) {
|
296
|
-
console.log('Found XML files:', xmlFiles);
|
297
|
-
}
|
298
286
|
let contentFile = null;
|
299
287
|
let contentFileName = '';
|
300
288
|
// First try section files
|
@@ -353,7 +341,6 @@ async function parseHwpxXml(buffer, imageExtractor, _chartExtractor, options) {
|
|
353
341
|
trimValues: true
|
354
342
|
});
|
355
343
|
const parsedXml = parser.parse(xmlContent);
|
356
|
-
console.log(`Parsed HWPX section: ${sectionFileName}`);
|
357
344
|
// Convert each section to markdown and combine
|
358
345
|
const sectionMarkdown = convertOwpmlToMarkdown(parsedXml, images, relationshipMap);
|
359
346
|
if (sectionMarkdown && sectionMarkdown.trim()) {
|
@@ -373,7 +360,6 @@ async function parseHwpxXml(buffer, imageExtractor, _chartExtractor, options) {
|
|
373
360
|
trimValues: true
|
374
361
|
});
|
375
362
|
const parsedXml = parser.parse(xmlContent);
|
376
|
-
console.log(`Parsed HWPX XML from ${contentFileName}`);
|
377
363
|
allContent = convertOwpmlToMarkdown(parsedXml, images, relationshipMap);
|
378
364
|
}
|
379
365
|
const markdown = allContent.trim() || '*No readable content found in HWPX file*';
|
@@ -616,7 +602,6 @@ async function extractHwpxImages(zip, imageExtractor) {
|
|
616
602
|
format: extension,
|
617
603
|
size: imageBuffer.length
|
618
604
|
});
|
619
|
-
console.log(`Extracted and saved image: ${fileName} -> ${savedPath}`);
|
620
605
|
}
|
621
606
|
}
|
622
607
|
catch (e) {
|
@@ -788,12 +773,6 @@ function convertOwpmlToMarkdown(owpmlData, images = [], relationshipMap = {}) {
|
|
788
773
|
const positionCounter = { value: 0 };
|
789
774
|
// Extract all text content and image references recursively
|
790
775
|
extractContentNodes(owpmlData, contentItems, positionCounter, images, relationshipMap);
|
791
|
-
// DEBUG: Log extracted content items
|
792
|
-
console.log('[DEBUG] Total content items extracted:', contentItems.length);
|
793
|
-
console.log('[DEBUG] Content items breakdown:');
|
794
|
-
contentItems.forEach((item, index) => {
|
795
|
-
console.log(`[DEBUG] Item ${index}: type=${item.type}, position=${item.position}, content="${item.content.substring(0, 50)}${item.content.length > 50 ? '...' : ''}"`);
|
796
|
-
});
|
797
776
|
// Sort by position to maintain document order
|
798
777
|
contentItems.sort((a, b) => a.position - b.position);
|
799
778
|
// Build markdown with text and image references
|
@@ -807,13 +786,6 @@ function convertOwpmlToMarkdown(owpmlData, images = [], relationshipMap = {}) {
|
|
807
786
|
markdownParts.push(item.content);
|
808
787
|
}
|
809
788
|
}
|
810
|
-
// DEBUG: Log how parts are being joined
|
811
|
-
console.log('[DEBUG] Total markdown parts:', markdownParts.length);
|
812
|
-
console.log('[DEBUG] Markdown parts before joining:');
|
813
|
-
markdownParts.forEach((part, index) => {
|
814
|
-
console.log(`[DEBUG] Part ${index}: "${part.substring(0, 50)}${part.length > 50 ? '...' : ''}"`);
|
815
|
-
});
|
816
|
-
console.log('[DEBUG] Joining with smart spacing - single line breaks between paragraphs');
|
817
789
|
// Smart joining: use double line breaks for paragraph separation, single line breaks for flow
|
818
790
|
markdown = smartJoinMarkdownParts(markdownParts);
|
819
791
|
}
|
@@ -929,7 +901,6 @@ function findImageReference(drawingObj, images, relationshipMap) {
|
|
929
901
|
// This is a simple approach - in production you might want a more sophisticated reset mechanism
|
930
902
|
if (images.length > 0 && (!('__globalImageCounter' in findImageReference) || findImageReference.__globalImageCounter >= images.length * 2)) {
|
931
903
|
findImageReference.__globalImageCounter = 0;
|
932
|
-
console.log('[DEBUG] Reset global image counter for new conversion');
|
933
904
|
}
|
934
905
|
if (!images || images.length === 0)
|
935
906
|
return null;
|
@@ -979,7 +950,6 @@ function findImageReference(drawingObj, images, relationshipMap) {
|
|
979
950
|
if (matchingImage) {
|
980
951
|
const imageName = path.basename(matchingImage.savedPath);
|
981
952
|
const markdownRef = ``;
|
982
|
-
console.log(`[DEBUG] Found matching image: ${markdownRef} (originalPath: ${matchingImage.originalPath}, savedPath: ${matchingImage.savedPath})`);
|
983
953
|
return markdownRef;
|
984
954
|
}
|
985
955
|
}
|
@@ -1026,7 +996,6 @@ function findImageReference(drawingObj, images, relationshipMap) {
|
|
1026
996
|
if (matchingImage) {
|
1027
997
|
const imageName = path.basename(matchingImage.savedPath);
|
1028
998
|
const markdownRef = ``;
|
1029
|
-
console.log(`[DEBUG] Found direct target match: ${markdownRef} (originalPath: ${matchingImage.originalPath}, savedPath: ${matchingImage.savedPath})`);
|
1030
999
|
return markdownRef;
|
1031
1000
|
}
|
1032
1001
|
}
|
@@ -1041,11 +1010,8 @@ function findImageReference(drawingObj, images, relationshipMap) {
|
|
1041
1010
|
if (selected) {
|
1042
1011
|
const imageName = path.basename(selected.savedPath);
|
1043
1012
|
const markdownRef = ``;
|
1044
|
-
console.log(`[DEBUG] Using sequential image reference: ${markdownRef} (counter: ${globalCounter}, total images: ${images.length})`);
|
1045
|
-
console.log(`[DEBUG] Selected image: originalPath=${selected.originalPath}, savedPath=${selected.savedPath}`);
|
1046
1013
|
return markdownRef;
|
1047
1014
|
}
|
1048
|
-
console.log(`[DEBUG] No fallback image available (total images: ${images.length})`);
|
1049
1015
|
return null;
|
1050
1016
|
}
|
1051
1017
|
catch (e) {
|
@@ -1059,7 +1025,6 @@ function findImageReference(drawingObj, images, relationshipMap) {
|
|
1059
1025
|
function extractParagraphContent(para, contentItems, positionCounter, images, relationshipMap) {
|
1060
1026
|
if (!para)
|
1061
1027
|
return;
|
1062
|
-
console.log('[DEBUG] Processing new paragraph');
|
1063
1028
|
// Check for images/drawings in paragraph first
|
1064
1029
|
const obj = para;
|
1065
1030
|
if (obj['hp:pic'] || obj['PICTURE'] || obj['IMAGE'] || obj['hp:draw'] || obj['DRAWING']) {
|
@@ -1075,7 +1040,6 @@ function extractParagraphContent(para, contentItems, positionCounter, images, re
|
|
1075
1040
|
// Extract and combine all text content from this paragraph into a single content item
|
1076
1041
|
const combinedText = extractCombinedParagraphText(para);
|
1077
1042
|
if (combinedText && combinedText.trim().length > 0) {
|
1078
|
-
console.log(`[DEBUG] Adding combined paragraph text: "${combinedText.substring(0, 50)}${combinedText.length > 50 ? '...' : ''}"`);
|
1079
1043
|
contentItems.push({
|
1080
1044
|
type: 'text',
|
1081
1045
|
content: combinedText.trim(),
|
@@ -1094,7 +1058,6 @@ function extractCombinedParagraphText(para) {
|
|
1094
1058
|
const runs = para['hp:run'] || para['run'] || para['RUN'];
|
1095
1059
|
if (runs) {
|
1096
1060
|
const runArray = Array.isArray(runs) ? runs : [runs];
|
1097
|
-
console.log(`[DEBUG] Processing ${runArray.length} runs in paragraph for combination`);
|
1098
1061
|
for (const run of runArray) {
|
1099
1062
|
// Look for hp:t or t nodes (text content)
|
1100
1063
|
const textNode = run['hp:t'] || run['t'] || run['T'] || run['#text'];
|
@@ -1102,14 +1065,12 @@ function extractCombinedParagraphText(para) {
|
|
1102
1065
|
if (typeof textNode === 'string') {
|
1103
1066
|
const text = textNode.trim();
|
1104
1067
|
if (text && !isMetadata(text)) {
|
1105
|
-
console.log(`[DEBUG] Adding text segment from run: "${text.substring(0, 30)}${text.length > 30 ? '...' : ''}"`);
|
1106
1068
|
textSegments.push(text);
|
1107
1069
|
}
|
1108
1070
|
}
|
1109
1071
|
else if (textNode['#text']) {
|
1110
1072
|
const text = textNode['#text'].trim();
|
1111
1073
|
if (text && !isMetadata(text)) {
|
1112
|
-
console.log(`[DEBUG] Adding text segment from run.#text: "${text.substring(0, 30)}${text.length > 30 ? '...' : ''}"`);
|
1113
1074
|
textSegments.push(text);
|
1114
1075
|
}
|
1115
1076
|
}
|
@@ -1143,7 +1104,6 @@ function extractCombinedParagraphText(para) {
|
|
1143
1104
|
}
|
1144
1105
|
// Combine all text segments with single spaces
|
1145
1106
|
const combinedText = textSegments.join(' ');
|
1146
|
-
console.log(`[DEBUG] Combined ${textSegments.length} segments into: "${combinedText.substring(0, 50)}${combinedText.length > 50 ? '...' : ''}"`);
|
1147
1107
|
return combinedText;
|
1148
1108
|
}
|
1149
1109
|
/**
|