file2md 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +293 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +153 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/parsers/docx-parser.d.ts +20 -0
  8. package/dist/parsers/docx-parser.d.ts.map +1 -0
  9. package/dist/parsers/docx-parser.js +237 -0
  10. package/dist/parsers/docx-parser.js.map +1 -0
  11. package/dist/parsers/pdf-parser.d.ts +8 -0
  12. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  13. package/dist/parsers/pdf-parser.js +98 -0
  14. package/dist/parsers/pdf-parser.js.map +1 -0
  15. package/dist/parsers/pptx-parser.d.ts +21 -0
  16. package/dist/parsers/pptx-parser.d.ts.map +1 -0
  17. package/dist/parsers/pptx-parser.js +264 -0
  18. package/dist/parsers/pptx-parser.js.map +1 -0
  19. package/dist/parsers/xlsx-parser.d.ts +19 -0
  20. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  21. package/dist/parsers/xlsx-parser.js +267 -0
  22. package/dist/parsers/xlsx-parser.js.map +1 -0
  23. package/dist/types/errors.d.ts +52 -0
  24. package/dist/types/errors.d.ts.map +1 -0
  25. package/dist/types/errors.js +76 -0
  26. package/dist/types/errors.js.map +1 -0
  27. package/dist/types/index.d.ts +5 -0
  28. package/dist/types/index.d.ts.map +1 -0
  29. package/dist/types/index.js +5 -0
  30. package/dist/types/index.js.map +1 -0
  31. package/dist/types/interfaces.d.ts +228 -0
  32. package/dist/types/interfaces.d.ts.map +1 -0
  33. package/dist/types/interfaces.js +10 -0
  34. package/dist/types/interfaces.js.map +1 -0
  35. package/dist/utils/chart-extractor.d.ts +44 -0
  36. package/dist/utils/chart-extractor.d.ts.map +1 -0
  37. package/dist/utils/chart-extractor.js +258 -0
  38. package/dist/utils/chart-extractor.js.map +1 -0
  39. package/dist/utils/image-extractor.d.ts +50 -0
  40. package/dist/utils/image-extractor.d.ts.map +1 -0
  41. package/dist/utils/image-extractor.js +136 -0
  42. package/dist/utils/image-extractor.js.map +1 -0
  43. package/dist/utils/layout-parser.d.ts +55 -0
  44. package/dist/utils/layout-parser.d.ts.map +1 -0
  45. package/dist/utils/layout-parser.js +244 -0
  46. package/dist/utils/layout-parser.js.map +1 -0
  47. package/dist/utils/pdf-extractor.d.ts +46 -0
  48. package/dist/utils/pdf-extractor.d.ts.map +1 -0
  49. package/dist/utils/pdf-extractor.js +235 -0
  50. package/dist/utils/pdf-extractor.js.map +1 -0
  51. package/package.json +70 -0
@@ -0,0 +1,237 @@
1
+ import JSZip from 'jszip';
2
+ import { parseStringPromise } from 'xml2js';
3
+ import { LayoutParser } from '../utils/layout-parser.js';
4
+ import { ParseError, InvalidFileError } from '../types/errors.js';
5
/**
 * Parse a DOCX buffer and convert it to markdown with layout preservation.
 *
 * Reads `word/document.xml` from the archive, optionally extracts images and
 * charts stored under `word/`, then renders every paragraph followed by every
 * table found directly inside the document body.
 *
 * NOTE: paragraphs are all emitted before tables, so the original
 * interleaving of paragraphs and tables in the document is not preserved.
 *
 * @param {Buffer} buffer - DOCX file contents, handed to `JSZip.loadAsync`.
 * @param {object} imageExtractor - Must provide `extractImagesFromZip(zip, basePath)`.
 * @param {object} chartExtractor - Must provide `extractChartsFromZip(zip, basePath)`.
 * @param {object} [options]
 * @param {boolean} [options.extractImages] - Images are extracted unless this is exactly `false`.
 * @param {boolean} [options.extractCharts] - Charts are extracted unless this is exactly `false`.
 * @returns {Promise<{markdown: string, images: Array, charts: Array, metadata: {paragraphCount: number, tableCount: number}}>}
 * @throws {InvalidFileError} When the archive has no `word/document.xml`.
 * @throws {ParseError} For any other failure; the underlying error is passed as the third constructor argument.
 */
export async function parseDocx(buffer, imageExtractor, chartExtractor, options = {}) {
  try {
    const zip = await JSZip.loadAsync(buffer);
    const documentXml = zip.file('word/document.xml');
    if (!documentXml) {
      throw new InvalidFileError('Invalid DOCX file: missing document.xml');
    }

    // Extract images first so paragraph/table rendering can reference them.
    const extractedImages = options.extractImages !== false
      ? await imageExtractor.extractImagesFromZip(zip, 'word/')
      : [];

    // Extract charts if enabled.
    const extractedCharts = options.extractCharts !== false
      ? await chartExtractor.extractChartsFromZip(zip, 'word/')
      : [];

    const layoutParser = new LayoutParser();
    const xmlContent = await documentXml.async('string');
    const result = await parseStringPromise(xmlContent);

    // xml2js wraps *child* elements in arrays but not the root element, so
    // `result['w:document']` is normally a plain object. Accept both shapes
    // instead of blindly indexing `[0]` (which yields undefined on an object).
    const firstOf = (node) => (Array.isArray(node) ? node[0] : node);
    const documentNode = firstOf(result?.['w:document']);
    const body = firstOf(documentNode?.['w:body']);
    if (body == null) {
      // Wrapped into a ParseError by the catch block below.
      throw new Error('document.xml does not contain a w:document/w:body element');
    }

    const paragraphs = body['w:p'] || [];
    const tables = body['w:tbl'] || [];
    let markdown = '';

    // Process paragraphs
    for (const element of paragraphs) {
      const paragraph = await parseParagraph(element, imageExtractor, extractedImages);
      if (paragraph.trim()) {
        markdown += paragraph + '\n\n';
      }
    }

    // Process tables
    for (const table of tables) {
      const tableMarkdown = await parseAdvancedTable(table, layoutParser, imageExtractor, extractedImages);
      if (tableMarkdown.trim()) {
        markdown += tableMarkdown + '\n\n';
      }
    }

    return {
      markdown: markdown.trim(),
      images: extractedImages,
      charts: extractedCharts.map((chart) => chart.data),
      metadata: {
        paragraphCount: paragraphs.length,
        tableCount: tables.length,
      },
    };
  }
  catch (error) {
    // Invalid-file errors are already in their final form; everything else
    // is reported as a DOCX parse failure.
    if (error instanceof InvalidFileError) {
      throw error;
    }
    const message = error instanceof Error ? error.message : 'Unknown error';
    throw new ParseError('DOCX', message, error);
  }
}
61
/**
 * Convert one parsed `w:tbl` element into markdown via the layout parser.
 *
 * Builds a `{ rows: [{ cells: [...] }] }` structure in which every cell
 * carries its text, bold/italic flags, alignment, background color and span
 * information, then delegates rendering to `layoutParser.parseAdvancedTable`.
 *
 * @param {object} table - xml2js object for a `w:tbl` element.
 * @param {object} layoutParser - Must provide `parseAdvancedTable(tableStruct, options)`.
 * @param {object} imageExtractor - Forwarded to paragraph parsing.
 * @param {Array} extractedImages - Forwarded to paragraph parsing.
 * @returns {Promise<string>} Table markdown, or `''` when the table has no rows.
 */
async function parseAdvancedTable(table, layoutParser, imageExtractor, extractedImages) {
  // xml2js keeps namespace prefixes on attribute names, so OOXML attributes
  // arrive as `w:val` / `w:fill`. The unprefixed name is still accepted as a
  // fallback. Elements without attributes have no `$` at all, hence the `?.`.
  const readAttr = (node, name) => node?.$?.[`w:${name}`] ?? node?.$?.[name];

  const rows = table?.['w:tr'] || [];
  if (rows.length === 0) {
    return '';
  }

  const tableStruct = { rows: [] };
  for (const row of rows) {
    const rowData = { cells: [] };
    for (const cell of row['w:tc'] || []) {
      const cellData = {
        text: '',
        bold: false,
        italic: false,
        alignment: 'left',
        backgroundColor: undefined,
        colSpan: 1,
        rowSpan: 1,
        merged: false,
      };

      // Extract cell properties
      const cellProps = cell['w:tcPr']?.[0];
      if (cellProps) {
        // Horizontal span; anything unparsable falls back to 1.
        if (cellProps['w:gridSpan']) {
          const spanValue = readAttr(cellProps['w:gridSpan'][0], 'val');
          cellData.colSpan = Number.parseInt(spanValue, 10) || 1;
        }
        // Any w:vMerge element marks the cell as part of a vertical merge.
        if (cellProps['w:vMerge']) {
          cellData.merged = true;
        }
        // Background color
        const fill = readAttr(cellProps['w:shd']?.[0], 'fill');
        if (fill) {
          cellData.backgroundColor = fill;
        }
      }

      // Extract cell content
      if (cell['w:p']) {
        const cellTexts = [];
        for (const paragraph of cell['w:p']) {
          const paragraphData = await parseAdvancedParagraph(paragraph, imageExtractor, extractedImages);
          if (paragraphData.text.trim()) {
            cellTexts.push(paragraphData.text);
            // The cell inherits formatting from any non-empty paragraph.
            if (paragraphData.bold) {
              cellData.bold = true;
            }
            if (paragraphData.italic) {
              cellData.italic = true;
            }
            if (paragraphData.alignment !== 'left') {
              cellData.alignment = paragraphData.alignment;
            }
          }
        }
        cellData.text = cellTexts.join(' ');
      }

      rowData.cells.push(cellData);
    }
    tableStruct.rows.push(rowData);
  }

  return layoutParser.parseAdvancedTable(tableStruct, {
    preserveAlignment: true,
    showBorders: true,
    preserveColors: true,
  });
}
124
/**
 * Convert one parsed `w:p` element into markdown text plus formatting flags.
 *
 * Runs are concatenated in order; bold/italic runs are wrapped in markdown
 * emphasis, list paragraphs get a `- ` marker indented by their level,
 * heading styles get a `#` prefix, and an explicit font size is handed to
 * `LayoutParser.formatWithSize`.
 *
 * @param {object} paragraph - xml2js object for a `w:p` element.
 * @param {object} imageExtractor - Forwarded to `extractImageFromRun` for runs containing drawings.
 * @param {Array} extractedImages - Forwarded to `extractImageFromRun`.
 * @returns {Promise<{text: string, bold: boolean, italic: boolean, alignment: string, fontSize: (number|string), isList: boolean, listLevel: number}>}
 *   `fontSize` is `'normal'` unless a run declares a size, in which case it is that size in points.
 */
async function parseAdvancedParagraph(paragraph, imageExtractor, extractedImages) {
  // xml2js keeps namespace prefixes on attribute names, so OOXML attributes
  // arrive as `w:val`; the bare name is accepted as a fallback. Elements
  // without attributes have no `$`, hence the optional chaining.
  const readAttr = (node, name) => node?.$?.[`w:${name}`] ?? node?.$?.[name];

  // OOXML on/off properties (w:b, w:i): present without a value means "on",
  // but an explicit 0/false/off value switches the property off.
  const isToggleOn = (toggle) => {
    if (!toggle) {
      return false;
    }
    const value = readAttr(toggle[0], 'val');
    return value === undefined || !['0', 'false', 'off'].includes(String(value).toLowerCase());
  };

  // Wrap only the visible part of the run: markdown emphasis does not render
  // when a marker touches whitespace, and empty runs must not become `****`.
  const emphasize = (value, marker) => {
    if (value.trim() === '') {
      return value;
    }
    const leadLength = value.length - value.trimStart().length;
    const coreEnd = value.trimEnd().length;
    return value.slice(0, leadLength) + marker + value.slice(leadLength, coreEnd) + marker + value.slice(coreEnd);
  };

  // WordprocessingML justification names that differ from the markdown-side names.
  const alignmentAliases = new Map([
    ['both', 'justify'],
    ['start', 'left'],
    ['end', 'right'],
  ]);
  const knownAlignments = ['left', 'center', 'right', 'justify'];

  let text = '';
  let bold = false;
  let italic = false;
  let alignment = 'left';
  let fontSize = 'normal';
  let isList = false;
  let listLevel = 0;

  // Check paragraph properties
  const pPr = paragraph?.['w:pPr']?.[0];
  if (pPr) {
    // Alignment: unknown values fall back to 'left'.
    const alignValue = readAttr(pPr['w:jc']?.[0], 'val');
    if (alignValue) {
      const normalized = alignmentAliases.get(alignValue) ?? alignValue;
      alignment = knownAlignments.includes(normalized) ? normalized : 'left';
    }
    // Numbering properties mark the paragraph as a list item.
    if (pPr['w:numPr']) {
      isList = true;
      const levelValue = readAttr(pPr['w:numPr'][0]?.['w:ilvl']?.[0], 'val');
      listLevel = Number.parseInt(levelValue, 10) || 0;
    }
  }

  for (const run of paragraph?.['w:r'] || []) {
    // Check for images/drawings
    if (run['w:drawing'] || run['w:pict']) {
      const imageRef = await extractImageFromRun(run, imageExtractor, extractedImages);
      if (imageRef) {
        text += imageRef + '\n';
      }
    }

    if (!run['w:t']) {
      continue;
    }

    // A text node is a plain string, or an object with `_` when it has attributes.
    let runText = '';
    for (const textElement of run['w:t']) {
      if (typeof textElement === 'string') {
        runText += textElement;
      }
      else if (textElement && typeof textElement === 'object' && '_' in textElement) {
        runText += textElement._;
      }
    }

    // Apply formatting
    const rPr = run['w:rPr']?.[0];
    if (rPr) {
      if (isToggleOn(rPr['w:b'])) {
        runText = emphasize(runText, '**');
        bold = true;
      }
      if (isToggleOn(rPr['w:i'])) {
        runText = emphasize(runText, '*');
        italic = true;
      }
      const halfPoints = Number.parseInt(readAttr(rPr['w:sz']?.[0], 'val'), 10);
      if (!Number.isNaN(halfPoints)) {
        fontSize = halfPoints / 2; // Convert half-points to points
      }
    }
    text += runText;
  }

  // Apply list formatting
  if (isList && text.trim()) {
    const indent = ' '.repeat(listLevel);
    text = `${indent}- ${text.trim()}`;
  }

  // Apply heading formatting: the first number in the style id is the level.
  const styleVal = readAttr(pPr?.['w:pStyle']?.[0], 'val');
  if (typeof styleVal === 'string' && (styleVal.includes('Heading') || styleVal.includes('heading'))) {
    const match = styleVal.match(/(\d+)/);
    if (match) {
      const headingLevel = Number.parseInt(match[1], 10);
      // Skip level 0 and empty headings, which would emit a bare prefix.
      if (headingLevel >= 1 && text.trim()) {
        const hashes = '#'.repeat(Math.min(headingLevel, 6));
        text = `${hashes} ${text.trim()}`;
      }
    }
  }

  // Apply font size formatting
  if (fontSize !== 'normal' && text.trim()) {
    const layoutParser = new LayoutParser();
    text = layoutParser.formatWithSize(text, fontSize);
  }

  return {
    text,
    bold,
    italic,
    alignment,
    fontSize,
    isList,
    listLevel,
  };
}
222
/**
 * Render a `w:p` element to markdown, discarding the formatting flags that
 * `parseAdvancedParagraph` reports alongside the text.
 *
 * @returns {Promise<string>} The paragraph's markdown text.
 */
async function parseParagraph(paragraph, imageExtractor, extractedImages) {
  const { text } = await parseAdvancedParagraph(paragraph, imageExtractor, extractedImages);
  return text;
}
226
/**
 * Produce a markdown image reference for a run that contains a drawing.
 *
 * Simplified lookup: the run itself is not inspected. The first extracted
 * image that has a `savedPath` is used for every drawing; a faithful
 * implementation would parse the drawing XML and match it to its image.
 *
 * @returns {Promise<string|null>} Result of `imageExtractor.getImageMarkdown`,
 *   or `null` when no extracted image has a saved path.
 */
async function extractImageFromRun(run, imageExtractor, extractedImages) {
  const hasImages = extractedImages.length > 0;
  if (!hasImages) {
    return null;
  }
  const firstSaved = extractedImages.find((candidate) => candidate.savedPath);
  return firstSaved
    ? imageExtractor.getImageMarkdown('Document Image', firstSaved.savedPath)
    : null;
}
237
+ //# sourceMappingURL=docx-parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"docx-parser.js","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,kBAAkB,EAAE,MAAM,QAAQ,CAAC;AAK5C,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAiDlE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,MAAc,EACd,cAA8B,EAC9B,cAA8B,EAC9B,UAA4B,EAAE;IAE9B,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAElD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,gBAAgB,CAAC,yCAAyC,CAAC,CAAC;QACxE,CAAC;QAED,uBAAuB;QACvB,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,4BAA4B;QAC5B,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,2BAA2B;QAC3B,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QAExC,MAAM,UAAU,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,UAAU,CAAiB,CAAC;QAEpE,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,qBAAqB;QACrB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACjF,IAAI,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;gBACrB,QAAQ,IAAI,SAAS,GAAG,MAAM,CAAC;YACjC,CAAC;QACH,CAAC;QAED,iBAAiB;QACjB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,aAAa,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACrG,IAAI,aAAa,CAAC,IAAI,EAAE,EAAE,CAAC;gBACzB,QAAQ,IAAI,aAAa,GAAG,MAAM,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;YACzB,MAAM,EAAE,eAAe;YACvB,MAAM,EAAE,eAAe,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;YAChD,QAAQ,EAAE;gBACR,cAAc,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;gBAC1C,UAAU,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;aACzC
;SACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QACzE,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACxD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,KAAc,EACd,YAA0B,EAC1B,cAA8B,EAC9B,eAAqC;IAErC,MAAM,SAAS,GAAG,KAAY,CAAC;IAC/B,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IACrC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,WAAW,GAAc,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IAE5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QAChC,MAAM,OAAO,GAAY,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,QAAQ,GAAa;gBACzB,IAAI,EAAE,EAAE;gBACR,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,KAAK;gBACb,SAAS,EAAE,MAAuB;gBAClC,eAAe,EAAE,SAAS;gBAC1B,OAAO,EAAE,CAAC;gBACV,OAAO,EAAE,CAAC;gBACV,MAAM,EAAE,KAAK;aACd,CAAC;YAEF,0BAA0B;YAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC5B,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACd,yBAAyB;gBACzB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,EAAE,CAAC;oBAC1B,QAAQ,CAAC,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBACnE,CAAC;gBACD,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC;oBACxB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;gBACzB,CAAC;gBAED,6BAA6B;gBAC7B,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;oBAClC,QAAQ,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gBACxD,CAAC;YACH,CAAC;YAED,uBAAuB;YACvB,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBAChB,MAAM,SAAS,GAAa,EAAE,CAAC;gBAC/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;oBACpC,MAAM,aAAa,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;oBAC/F,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;wBAC9B,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;wBAEnC,oCAAoC;wBACpC,IAAI,aAAa,CAAC,IAAI;4BAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;wBAC7C,IAA
I,aAAa,CAAC,MAAM;4BAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;wBACjD,IAAI,aAAa,CAAC,SAAS,KAAK,MAAM;4BAAE,QAAQ,CAAC,SAAS,GAAG,aAAa,CAAC,SAAS,CAAC;oBACvF,CAAC;gBACH,CAAC;gBACD,QAAQ,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,YAAY,CAAC,kBAAkB,CAAC,WAAW,EAAE;QAClD,iBAAiB,EAAE,IAAI;QACvB,WAAW,EAAE,IAAI;QACjB,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,sBAAsB,CACnC,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,IAAI,GAAG,SAAgB,CAAC;IAC9B,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,IAAI,IAAI,GAAG,KAAK,CAAC;IACjB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAkB,MAAM,CAAC;IACtC,IAAI,QAAQ,GAAoB,QAAQ,CAAC;IACzC,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,6BAA6B;IAC7B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;IAC1B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACb,kBAAkB;QAClB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAC3C,SAAS,GAAG,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC;gBACtE,CAAC,CAAC,UAAU;gBACZ,CAAC,CAAC,MAAM,CAAkB,CAAC;QAC/B,CAAC;QAED,uBAAuB;QACvB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;YACtB,MAAM,GAAG,IAAI,CAAC;YACd,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnC,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrE,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC9B,4BAA4B;YAC5B,IAAI,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACtC,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;gBACjF,IAAI,QAAQ,EAAE,CAAC;oBACb,IAAI,IAAI,QAAQ,GAAG,IAAI,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,+BAA+B;YAC/B,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CA
AC;gBACf,IAAI,OAAO,GAAG,EAAE,CAAC;gBACjB,KAAK,MAAM,WAAW,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;oBACrC,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;wBACpC,OAAO,IAAI,WAAW,CAAC;oBACzB,CAAC;yBAAM,IAAI,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC;wBAChF,OAAO,IAAK,WAAmB,CAAC,CAAC,CAAC;oBACpC,CAAC;gBACH,CAAC;gBAED,mBAAmB;gBACnB,MAAM,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC9B,IAAI,GAAG,EAAE,CAAC;oBACR,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,KAAK,OAAO,IAAI,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC;oBACd,CAAC;oBACD,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,IAAI,OAAO,GAAG,CAAC;wBACzB,MAAM,GAAG,IAAI,CAAC;oBAChB,CAAC;oBACD,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;wBAC5B,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,gCAAgC;oBACjF,CAAC;gBACH,CAAC;gBAED,IAAI,IAAI,OAAO,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACtC,IAAI,GAAG,GAAG,MAAM,KAAK,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;IACrC,CAAC;IAED,2BAA2B;IAC3B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC7C,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACtC,IAAI,KAAK,EAAE,CAAC;gBACV,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;gBACrD,IAAI,GAAG,GAAG,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACpC,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,QAAQ,KAAK,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QACxC,IAAI,GAAG,YAAY,CAAC,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACrD,CAAC;IAED,OAAO;QACL,IAAI;QACJ,IAAI;QACJ,MAAM;QACN,SAAS;QACT,QAA
Q;QACR,MAAM;QACN,SAAS;KACV,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;IAC9F,OAAO,YAAY,CAAC,IAAI,CAAC;AAC3B,CAAC;AAED,KAAK,UAAU,mBAAmB,CAChC,GAAY,EACZ,cAA8B,EAC9B,eAAqC;IAErC,yFAAyF;IACzF,yCAAyC;IACzC,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvD,IAAI,GAAG,EAAE,CAAC;YACR,OAAO,cAAc,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,GAAG,CAAC,SAAS,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { Buffer } from 'node:buffer';
2
+ import { type PDFParseOptions, type PDFParseResult } from '../utils/pdf-extractor.js';
3
+ import type { ImageExtractor } from '../utils/image-extractor.js';
4
+ /**
5
+ * Parse a PDF buffer and convert it to markdown with enhanced layout preservation.
+ *
+ * @param buffer - PDF file contents.
+ * @param imageExtractor - Image extractor handed to the PDF extractor used internally.
+ * @param options - Optional PDF parsing settings; the implementation defaults this to `{}`.
+ * @returns Promise resolving to the markdown, page images, page count and PDF metadata.
6
+ */
7
+ export declare function parsePdf(buffer: Buffer, imageExtractor: ImageExtractor, options?: PDFParseOptions): Promise<PDFParseResult>;
8
+ //# sourceMappingURL=pdf-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/pdf-parser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAgB,KAAK,eAAe,EAAE,KAAK,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAEpG,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE;;GAEG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CAuFzB"}
@@ -0,0 +1,98 @@
1
+ import pdfParse from 'pdf-parse';
2
+ import { PDFExtractor } from '../utils/pdf-extractor.js';
3
+ import { ParseError, InvalidFileError } from '../types/errors.js';
4
/**
 * Parse a PDF buffer and convert it to markdown with enhanced layout preservation.
 *
 * Strategy:
 *  1. Extract text with pdf-parse and try to enhance it with layout hints;
 *     on failure fall back to whitespace-cleaned plain text.
 *  2. When the PDF yields less than 100 characters of text, additionally
 *     render the pages as images and append them to the markdown.
 *  3. When nothing at all could be produced, report the file as invalid.
 *
 * @param {Buffer} buffer - PDF file contents.
 * @param {object} imageExtractor - Passed to the `PDFExtractor` constructor.
 * @param {object} [options]
 * @param {number} [options.maxPages] - When >= 1, limits both the pages pdf-parse
 *   extracts text from and the reported `pageCount`.
 * @returns {Promise<{markdown: string, images: Array, pageCount: number, metadata: {version: string, info: object, textLength: number}}>}
 * @throws {InvalidFileError} When the PDF has no extractable content, or when
 *   the failure message mentions "PDF".
 * @throws {ParseError} For any other failure.
 */
export async function parsePdf(buffer, imageExtractor, options = {}) {
  try {
    // Normalize maxPages to an integer >= 1; 0 means "no limit" (NaN fails the comparison).
    const requestedMax = Math.floor(Number(options.maxPages));
    const maxPages = requestedMax >= 1 ? requestedMax : 0;

    // pdf-parse treats `max: 0` as "all pages", so the limit is applied to
    // the extracted text itself rather than only to the reported page count.
    const data = await pdfParse(buffer, { max: maxPages });
    const pdfExtractor = new PDFExtractor(imageExtractor);
    let markdown = '';
    const images = [];

    let pageCount = data.numpages || 1;
    if (maxPages > 0) {
      pageCount = Math.min(pageCount, maxPages);
    }

    const hasText = Boolean(data.text && data.text.trim());

    // Try to extract text with enhanced layout
    if (hasText) {
      console.log('📄 Extracting text with layout enhancement...');
      try {
        const enhancedText = await pdfExtractor.enhanceTextWithLayout(data.text, data);
        markdown += enhancedText;
      }
      catch (layoutError) {
        console.warn('Layout enhancement failed, falling back to basic text extraction', layoutError instanceof Error ? layoutError.message : 'Unknown error');
        // Fall back to basic text extraction
        markdown = extractBasicText(data.text);
      }
    }

    // If text is minimal or extraction failed, convert pages to images
    if (!data.text || data.text.trim().length < 100) {
      console.log('📸 Converting PDF pages to images for better preservation...');
      try {
        const pageImages = await pdfExtractor.extractImagesFromPDF(buffer);
        if (pageImages.length > 0) {
          // Convert page images to ImageData format
          for (const page of pageImages) {
            images.push({
              originalPath: `page_${page.pageNumber}`,
              savedPath: page.imagePath,
              basePath: '',
              format: 'png',
              dimensions: page.dimensions,
            });
          }
          if (markdown.trim()) {
            markdown += '\n\n---\n\n## Visual Content\n\n';
          }
          markdown += await pdfExtractor.createPageBreaks(pageImages);
        }
      }
      catch (imageError) {
        // Best effort: image rendering is optional, so only warn.
        console.warn('Failed to extract PDF as images:', imageError instanceof Error ? imageError.message : 'Unknown error');
      }
    }

    // Fallback to basic text if everything else fails
    if (!markdown.trim()) {
      if (!hasText) {
        throw new InvalidFileError('PDF file appears to be empty or contains no extractable text');
      }
      markdown = extractBasicText(data.text);
    }

    return {
      markdown,
      images,
      pageCount,
      metadata: {
        version: data.version || 'unknown',
        info: data.info || {},
        textLength: data.text?.length || 0,
      },
    };
  }
  catch (error) {
    if (error instanceof InvalidFileError) {
      throw error;
    }
    const message = error instanceof Error ? error.message : 'Unknown error';
    // Any failure whose message mentions "PDF" (this includes "Invalid PDF")
    // is reported as an invalid file.
    if (message.includes('PDF')) {
      throw new InvalidFileError('Invalid or corrupted PDF file', error);
    }
    throw new ParseError('PDF', message, error);
  }
}
88
/**
 * Minimal-formatting text cleanup for PDF text: every line is trimmed and
 * lines that end up empty are dropped; the survivors are re-joined with `\n`.
 *
 * @param {string} text - Raw text extracted from the PDF.
 * @returns {string} The cleaned text.
 */
function extractBasicText(text) {
  const keptLines = [];
  for (const rawLine of text.split('\n')) {
    const stripped = rawLine.trim();
    if (stripped.length > 0) {
      keptLines.push(stripped);
    }
  }
  return keptLines.join('\n');
}
98
+ //# sourceMappingURL=pdf-parser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-parser.js","sourceRoot":"","sources":["../../src/parsers/pdf-parser.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,YAAY,EAA6C,MAAM,2BAA2B,CAAC;AACpG,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAIlE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,MAAc,EACd,cAA8B,EAC9B,UAA2B,EAAE;IAE7B,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,cAAc,CAAC,CAAC;QAEtD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,IAAI,SAAS,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;QAEnC,oCAAoC;QACpC,IAAI,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,QAAQ,GAAG,CAAC,EAAE,CAAC;YAC7C,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpD,CAAC;QAED,2CAA2C;QAC3C,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC;YAC7D,IAAI,CAAC;gBACH,MAAM,YAAY,GAAG,MAAM,YAAY,CAAC,qBAAqB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;gBAC/E,QAAQ,IAAI,YAAY,CAAC;YAC3B,CAAC;YAAC,OAAO,WAAoB,EAAE,CAAC;gBAC9B,OAAO,CAAC,IAAI,CAAC,kEAAkE,CAAC,CAAC;gBACjF,qCAAqC;gBACrC,QAAQ,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzC,CAAC;QACH,CAAC;QAED,mEAAmE;QACnE,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAChD,OAAO,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC;YAC5E,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,MAAM,YAAY,CAAC,oBAAoB,CAAC,MAAM,CAAC,CAAC;gBACnE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1B,0CAA0C;oBAC1C,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;wBAC9B,MAAM,CAAC,IAAI,CAAC;4BACV,YAAY,EAAE,QAAQ,IAAI,CAAC,UAAU,EAAE;4BACvC,SAAS,EAAE,IAAI,CAAC,SAAS;4BACzB,QAAQ,EAAE,EAAE;4BACZ,MAAM,EAAE,KAAK;4BACb,UAAU,EAAE,IAAI,CAAC,UAAU;yBAC5B,CAAC,CAAC;oBACL,CAAC;oBAED,IAAI,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;wBACpB,QAAQ,IAAI,kCAAkC,CAAC;oBACjD,CAAC;oBACD,QAAQ,IAAI,MAAM,YAAY,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;gBAC9D,CAAC;YACH,CAAC;YAAC,OAAO,UAAmB,EAAE,CAAC;gBAC7B,OAAO,CAAC,IAAI,CAAC,kCAAkC,EAAE,UAAU,YAAY,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;YACvH,CAAC;QACH,
CAAC;QAED,kDAAkD;QAClD,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;YACrB,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;gBAClC,QAAQ,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzC,CAAC;iBAAM,CAAC;gBACN,MAAM,IAAI,gBAAgB,CAAC,8DAA8D,CAAC,CAAC;YAC7F,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ;YACR,MAAM;YACN,SAAS;YACT,QAAQ,EAAE;gBACR,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,SAAS;gBAClC,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;gBACrB,UAAU,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,IAAI,CAAC;aACnC;SACF,CAAC;IAEJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QAEzE,IAAI,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/D,MAAM,IAAI,gBAAgB,CAAC,+BAA+B,EAAE,KAAc,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,IAAI,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACvD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC/B,MAAM,YAAY,GAAG,KAAK;SACvB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAEnC,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC"}
@@ -0,0 +1,21 @@
1
+ import type { Buffer } from 'node:buffer';
2
+ import type { ImageExtractor } from '../utils/image-extractor.js';
3
+ import type { ChartExtractor } from '../utils/chart-extractor.js';
4
+ import type { ImageData, ChartData } from '../types/interfaces.js';
5
+ export interface PptxParseOptions { // Optional settings accepted by parsePptx.
6
+ readonly preserveLayout?: boolean; // NOTE(review): not read in the visible part of parsePptx — confirm where it is consumed.
7
+ readonly extractImages?: boolean; // Images under ppt/ are extracted unless this is explicitly false.
8
+ readonly extractCharts?: boolean; // Charts under ppt/ are extracted unless this is explicitly false.
9
+ }
10
+ export interface PptxParseResult { // Value resolved by parsePptx.
11
+ readonly markdown: string; // Trimmed markdown; each slide is introduced by a "## Slide N" heading.
12
+ readonly images: readonly ImageData[]; // Images returned by the image extractor (empty when extraction is disabled).
13
+ readonly charts: readonly ChartData[]; // The `data` of each extracted chart (empty when extraction is disabled).
14
+ readonly slideCount: number; // Number of ppt/slides/slide*.xml entries found in the archive.
15
+ readonly metadata: Record<string, unknown>; // NOTE(review): contents are set outside the visible code — confirm.
16
+ }
17
+ /**
18
+ * Parse a PPTX buffer and convert it to markdown with layout preservation.
+ *
+ * @param buffer - PPTX file contents (a zip archive).
+ * @param imageExtractor - Used to extract images stored under `ppt/`.
+ * @param chartExtractor - Used to extract charts stored under `ppt/`.
+ * @param options - Optional settings; the implementation defaults this to `{}`.
+ * @returns Promise resolving to the markdown, extracted images and charts, slide count and metadata.
19
+ */
20
+ export declare function parsePptx(buffer: Buffer, imageExtractor: ImageExtractor, chartExtractor: ChartExtractor, options?: PptxParseOptions): Promise<PptxParseResult>;
21
+ //# sourceMappingURL=pptx-parser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pptx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/pptx-parser.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAIV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AAkBD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CAwE1B"}
@@ -0,0 +1,264 @@
1
+ import JSZip from 'jszip';
2
+ import { parseStringPromise } from 'xml2js';
3
+ import { LayoutParser } from '../utils/layout-parser.js';
4
+ import { ParseError } from '../types/errors.js';
5
/**
 * Parse a PPTX buffer and convert it to markdown with layout preservation.
 *
 * Slides are the `ppt/slides/slide<N>.xml` entries of the archive. They are ordered by
 * their numeric suffix and rendered one after another under `## Slide <n>` headings,
 * where `<n>` is the 1-based position in that order.
 *
 * @param buffer - PPTX file contents, handed to `JSZip.loadAsync`.
 * @param imageExtractor - Extracts the images below `ppt/`; forwarded to slide rendering.
 * @param chartExtractor - Extracts the charts below `ppt/`.
 * @param options - `extractImages` / `extractCharts` switch the respective extraction off
 *   only when explicitly set to `false`.
 * @returns Markdown, extracted images, chart data, slide count and summary metadata.
 * @throws {ParseError} On any failure. A ParseError raised while rendering a slide is
 *   rethrown as-is rather than being wrapped a second time.
 */
export async function parsePptx(buffer, imageExtractor, chartExtractor, options = {}) {
    try {
        const zip = await JSZip.loadAsync(buffer);
        // Extract images first
        const extractedImages = options.extractImages !== false
            ? await imageExtractor.extractImagesFromZip(zip, 'ppt/')
            : [];
        // Extract charts if enabled
        const extractedCharts = options.extractCharts !== false
            ? await chartExtractor.extractChartsFromZip(zip, 'ppt/')
            : [];
        // Initialize layout parser
        const layoutParser = new LayoutParser();
        // Collect the slide parts together with their numeric suffix so that the
        // comparator does not have to re-run the regular expression on every comparison.
        const slideFiles = [];
        zip.forEach((relativePath, file) => {
            if (relativePath.startsWith('ppt/slides/slide') && relativePath.endsWith('.xml')) {
                const suffix = relativePath.match(/slide(\d+)\.xml/)?.[1] ?? '0';
                slideFiles.push({
                    path: relativePath,
                    file,
                    order: Number.parseInt(suffix, 10),
                });
            }
        });
        // Numeric sort: slide10.xml must come after slide2.xml.
        slideFiles.sort((a, b) => a.order - b.order);
        let markdown = '';
        for (let i = 0; i < slideFiles.length; i++) {
            const slideNumber = i + 1;
            markdown += `## Slide ${slideNumber}\n\n`;
            const xmlContent = await slideFiles[i].file.async('string');
            const slideContent = await extractAdvancedSlideContent(xmlContent, imageExtractor, extractedImages, slideNumber, layoutParser);
            markdown += slideContent.trim() ? `${slideContent}\n\n` : '*No content*\n\n';
        }
        return {
            markdown: markdown.trim(),
            images: extractedImages,
            charts: extractedCharts.map(chart => chart.data),
            slideCount: slideFiles.length,
            metadata: {
                totalSlides: slideFiles.length,
                hasImages: extractedImages.length > 0,
                hasCharts: extractedCharts.length > 0,
            },
        };
    }
    catch (error) {
        // Slide extraction already reports failures as ParseError; wrapping those again
        // would only nest the message and bury the original error one level deeper.
        if (error instanceof ParseError) {
            throw error;
        }
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('PPTX', message, error);
    }
}
66
/**
 * Convert the XML of a single slide into markdown.
 *
 * The parsed XML tree is walked recursively, collecting text (`a:t`), tables (`a:tbl`)
 * and image references together with the offset of the shape that contains them. The
 * collected elements are ordered by `layoutParser.calculateRelativePosition`, grouped
 * into rows by their `y` offset and rendered row by row through `processSlideRow`.
 *
 * @param xmlContent - Raw slide XML, parsed with `parseStringPromise`.
 * @param imageExtractor - Supplies `getImageMarkdown` for the image-only fallback and is
 *   forwarded to `processSlideRow`.
 * @param extractedImages - Images already extracted from the archive; `originalPath` and
 *   `savedPath` are read from each entry.
 * @param slideNumber - 1-based slide number used to match images to the slide.
 * @param layoutParser - Orders the elements and is forwarded to `processSlideRow`.
 * @returns The trimmed markdown of the slide (possibly an empty string).
 * @throws {ParseError} When the XML cannot be parsed or rendering fails.
 */
async function extractAdvancedSlideContent(xmlContent, imageExtractor, extractedImages, slideNumber, layoutParser) {
    try {
        const result = await parseStringPromise(xmlContent);
        const elements = [];
        let imageCount = 0;
        // Images that may belong to this slide. The filter does not depend on the node
        // being visited, so it is evaluated once instead of once per picture node.
        // NOTE(review): any path containing 'media/' matches for every slide - confirm
        // whether images should be resolved through the slide relationships instead.
        const slideImages = extractedImages.filter(img => img.originalPath.includes(`slide${slideNumber}`) ||
            img.originalPath.includes('media/'));
        // Read an { x, y } offset from an `a:off` node list, or null when there is none.
        const readOffset = (offsetNodes) => {
            const attrs = offsetNodes?.[0]?.$;
            if (!attrs) {
                return null;
            }
            return {
                x: Number.parseInt(attrs.x, 10) || 0,
                y: Number.parseInt(attrs.y, 10) || 0,
            };
        };
        // The offset lives in `p:spPr > a:xfrm > a:off` (shapes, pictures) or in
        // `p:xfrm > a:off` (graphic frames), i.e. in a sibling of the text / table
        // content. It therefore has to be looked up from the shape node itself; reading it
        // only on the `a:xfrm` node would never hand it to the content.
        const ownOffset = (node) => readOffset(node['a:off'])
            ?? readOffset(node['p:spPr']?.[0]?.['a:xfrm']?.[0]?.['a:off'])
            ?? readOffset(node['p:xfrm']?.[0]?.['a:off']);
        // Concatenate the non-blank entries of an `a:t` list, each followed by a space.
        const collectText = (textNodes) => {
            let content = '';
            if (Array.isArray(textNodes)) {
                for (const textItem of textNodes) {
                    let text = '';
                    if (typeof textItem === 'string') {
                        text = textItem;
                    }
                    else if (textItem && typeof textItem === 'object' && '_' in textItem) {
                        text = textItem._;
                    }
                    if (text && text.trim()) {
                        content += text.trim() + ' ';
                    }
                }
            }
            return content;
        };
        // Extract all shapes and their positions
        const extractShapes = (node, parentPos = { x: 0, y: 0 }) => {
            if (typeof node !== 'object' || node === null) {
                return;
            }
            if (Array.isArray(node)) {
                for (const item of node) {
                    extractShapes(item, parentPos);
                }
                return;
            }
            // Use the node's own offset when it has one, otherwise inherit the parent's.
            const position = ownOffset(node) ?? { ...parentPos };
            // Check for text content in shapes
            if (node['a:t']) {
                const content = collectText(node['a:t']);
                if (content.trim()) {
                    elements.push({ type: 'text', content, position });
                }
            }
            // Check for tables
            if (node['a:tbl']) {
                elements.push({ type: 'table', content: node['a:tbl'], position });
            }
            // Check for images
            if (node['a:blip'] || node['p:pic'] || node['a:pic']) {
                if (slideImages.length > imageCount) {
                    const img = slideImages[imageCount];
                    if (img?.savedPath) {
                        elements.push({ type: 'image', content: img.savedPath, position });
                        imageCount++;
                    }
                }
            }
            // Recursively process nested objects. `a:t` was consumed above, and the table
            // is rendered as a whole from its element: descending into `a:tbl` would emit
            // every cell's text a second time as loose text.
            for (const key of Object.keys(node)) {
                if (key === 'a:t' || key === 'a:tbl') {
                    continue;
                }
                extractShapes(node[key], position);
            }
        };
        extractShapes(result);
        // Sort elements by position (top to bottom, left to right)
        const sortedElements = layoutParser.calculateRelativePosition(elements);
        let markdown = '';
        let currentRow = null;
        const rowThreshold = 50; // EMUs tolerance for same row
        for (const element of sortedElements) {
            const elementY = element.position?.y || 0;
            if (currentRow && Math.abs(elementY - currentRow.y) < rowThreshold) {
                // Same row - add as column
                currentRow.elements.push(element);
                continue;
            }
            // New row: flush the previous one first
            if (currentRow && currentRow.elements.length > 0) {
                markdown += processSlideRow(currentRow, layoutParser, imageExtractor);
            }
            currentRow = { y: elementY, elements: [element] };
        }
        // Process the last row
        if (currentRow && currentRow.elements.length > 0) {
            markdown += processSlideRow(currentRow, layoutParser, imageExtractor);
        }
        // If no organized content, fall back to simple extraction
        if (!markdown.trim() && extractedImages.length > 0) {
            const fallbackImages = extractedImages.filter(img => img.originalPath.includes(`slide${slideNumber}`) ||
                (slideNumber === 1 && img.originalPath.includes('media/')));
            for (const img of fallbackImages) {
                if (img.savedPath) {
                    markdown += imageExtractor.getImageMarkdown(`Slide ${slideNumber} Image`, img.savedPath) + '\n\n';
                }
            }
        }
        return markdown.trim();
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('PPTX', `Failed to extract advanced content from slide: ${message}`, error);
    }
}
192
/**
 * Render one row of slide elements as markdown.
 *
 * Every element is formatted with `formatSlideElement`. Elements that render to nothing
 * are dropped so that they neither create blank columns nor stray blank lines.
 *
 * @param row - Row descriptor; only `row.elements` is read.
 * @param layoutParser - Supplies `createColumns` and is forwarded to `formatSlideElement`.
 * @param imageExtractor - Forwarded to `formatSlideElement`.
 * @returns An empty string when nothing is left to render, the single remaining element
 *   followed by a blank line, or the `createColumns` output followed by a newline.
 */
function processSlideRow(row, layoutParser, imageExtractor) {
    const rendered = (row?.elements ?? [])
        .map(element => formatSlideElement(element, layoutParser, imageExtractor))
        .filter(content => String(content ?? '').trim() !== '');
    if (rendered.length === 0) {
        // Nothing to show: do not hand an empty column list to the layout parser.
        return '';
    }
    if (rendered.length === 1) {
        // Single element in row
        return `${rendered[0]}\n\n`;
    }
    // Multiple elements - create columns
    const columns = rendered.map(content => ({ content }));
    return `${layoutParser.createColumns(columns)}\n`;
}
206
/**
 * Render a single slide element as markdown.
 *
 * - `text`: trimmed text; rendered as a `###` heading when the element sits near the top
 *   of the slide and the trimmed text is short. Blank text renders to an empty string.
 * - `table`: delegated to `parseSlideTable`.
 * - `image`: delegated to `imageExtractor.getImageMarkdown` with the alt text `Slide Image`.
 * - anything else: the content itself when it is a string, otherwise an empty string.
 *
 * @param element - Element with `type`, `content` and an optional `position`.
 * @param layoutParser - Forwarded to `parseSlideTable`.
 * @param imageExtractor - Supplies `getImageMarkdown`.
 * @returns The markdown for the element.
 */
function formatSlideElement(element, layoutParser, imageExtractor) {
    // Title heuristic: vertical offset below this value (presumably EMUs, as elsewhere in
    // this parser - TODO confirm) and fewer characters than the length limit.
    const TITLE_MAX_Y = 1000000;
    const TITLE_MAX_LENGTH = 100;
    switch (element.type) {
        case 'text': {
            // Trim before measuring so surrounding whitespace cannot change the
            // classification, and coerce so unexpected non-string content does not throw.
            const text = String(element.content ?? '').trim();
            if (text === '') {
                // Avoid emitting a heading that consists of "### " only.
                return '';
            }
            // Determine if it's a title based on position and length
            const isTitle = (element.position?.y || 0) < TITLE_MAX_Y && text.length < TITLE_MAX_LENGTH;
            return isTitle ? `### ${text}` : text;
        }
        case 'table':
            // Parse PowerPoint table (simplified)
            return parseSlideTable(element.content, layoutParser);
        case 'image':
            return imageExtractor.getImageMarkdown('Slide Image', element.content);
        default:
            return typeof element.content === 'string' ? element.content : '';
    }
}
227
/**
 * Convert a parsed `a:tbl` node list into markdown via `layoutParser.parseAdvancedTable`.
 *
 * Simplified table parsing for PowerPoint: only cell text is carried over. The text of a
 * cell is built from all runs (`a:r`) of all its paragraphs (`a:p`); runs of one paragraph
 * are concatenated as-is and paragraphs are joined with a single space. Formatting,
 * spans and merges are reported with fixed defaults.
 *
 * @param tableData - The `a:tbl` value of the parsed slide (a list whose first entry
 *   holds the `a:tr` rows).
 * @param layoutParser - Supplies `parseAdvancedTable`, which receives `{ rows: [{ cells }] }`.
 * @returns The markdown produced by the layout parser, or an empty string when the table
 *   has no rows.
 */
function parseSlideTable(tableData, layoutParser) {
    const rows = tableData?.[0]?.['a:tr'];
    if (!Array.isArray(rows)) {
        return '';
    }
    // An `a:t` entry is a plain string, or an object carrying the text in `_` when the
    // element has attributes.
    const runText = (textNode) => {
        if (typeof textNode === 'string') {
            return textNode;
        }
        if (textNode && typeof textNode === 'object' && typeof textNode._ === 'string') {
            return textNode._;
        }
        return '';
    };
    // All runs of a paragraph, not just the first one.
    const paragraphText = (para) => (para?.['a:r'] ?? [])
        .map(run => runText(run?.['a:t']?.[0]))
        .join('');
    const cellText = (cell) => (cell?.['a:txBody']?.[0]?.['a:p'] ?? [])
        .map(para => paragraphText(para).trim())
        .filter(text => text !== '')
        .join(' ');
    const tableStruct = {
        rows: rows.map(row => ({
            cells: (row?.['a:tc'] ?? []).map(cell => ({
                text: cellText(cell),
                bold: false,
                italic: false,
                alignment: 'left',
                backgroundColor: undefined,
                colSpan: 1,
                rowSpan: 1,
                merged: false,
            })),
        })),
    };
    return layoutParser.parseAdvancedTable(tableStruct);
}
264
+ //# sourceMappingURL=pptx-parser.js.map