file2md 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +293 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +153 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers/docx-parser.d.ts +20 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +237 -0
- package/dist/parsers/docx-parser.js.map +1 -0
- package/dist/parsers/pdf-parser.d.ts +8 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +98 -0
- package/dist/parsers/pdf-parser.js.map +1 -0
- package/dist/parsers/pptx-parser.d.ts +21 -0
- package/dist/parsers/pptx-parser.d.ts.map +1 -0
- package/dist/parsers/pptx-parser.js +264 -0
- package/dist/parsers/pptx-parser.js.map +1 -0
- package/dist/parsers/xlsx-parser.d.ts +19 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +267 -0
- package/dist/parsers/xlsx-parser.js.map +1 -0
- package/dist/types/errors.d.ts +52 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +76 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/interfaces.d.ts +228 -0
- package/dist/types/interfaces.d.ts.map +1 -0
- package/dist/types/interfaces.js +10 -0
- package/dist/types/interfaces.js.map +1 -0
- package/dist/utils/chart-extractor.d.ts +44 -0
- package/dist/utils/chart-extractor.d.ts.map +1 -0
- package/dist/utils/chart-extractor.js +258 -0
- package/dist/utils/chart-extractor.js.map +1 -0
- package/dist/utils/image-extractor.d.ts +50 -0
- package/dist/utils/image-extractor.d.ts.map +1 -0
- package/dist/utils/image-extractor.js +136 -0
- package/dist/utils/image-extractor.js.map +1 -0
- package/dist/utils/layout-parser.d.ts +55 -0
- package/dist/utils/layout-parser.d.ts.map +1 -0
- package/dist/utils/layout-parser.js +244 -0
- package/dist/utils/layout-parser.js.map +1 -0
- package/dist/utils/pdf-extractor.d.ts +46 -0
- package/dist/utils/pdf-extractor.d.ts.map +1 -0
- package/dist/utils/pdf-extractor.js +235 -0
- package/dist/utils/pdf-extractor.js.map +1 -0
- package/package.json +70 -0
@@ -0,0 +1,237 @@
|
|
1
|
+
import JSZip from 'jszip';
|
2
|
+
import { parseStringPromise } from 'xml2js';
|
3
|
+
import { LayoutParser } from '../utils/layout-parser.js';
|
4
|
+
import { ParseError, InvalidFileError } from '../types/errors.js';
|
5
|
+
/**
 * Parse a DOCX buffer and convert it to markdown with layout preservation.
 *
 * Reads `word/document.xml` from the archive, renders every body-level paragraph
 * and then every body-level table, and returns the markdown together with the
 * extracted images and chart data.
 *
 * NOTE(review): paragraphs are emitted before tables because the parsed tree is
 * walked per tag (`w:p`, then `w:tbl`), so the original interleaving of
 * paragraphs and tables is not reproduced.
 *
 * @param {Buffer} buffer - Raw DOCX (zip) file contents.
 * @param {object} imageExtractor - Provides `extractImagesFromZip(zip, prefix)`.
 * @param {object} chartExtractor - Provides `extractChartsFromZip(zip, prefix)`.
 * @param {object} [options] - `extractImages` / `extractCharts` skip the matching
 *   extraction step only when set to exactly `false`.
 * @returns {Promise<object>} `{ markdown, images, charts, metadata }` where
 *   `metadata` holds `paragraphCount` and `tableCount`.
 * @throws {InvalidFileError} When `word/document.xml` or its `w:body` is missing.
 * @throws {ParseError} For any other failure (the original error is passed along).
 */
export async function parseDocx(buffer, imageExtractor, chartExtractor, options = {}) {
    try {
        const zip = await JSZip.loadAsync(buffer);
        const documentXml = zip.file('word/document.xml');
        if (!documentXml) {
            throw new InvalidFileError('Invalid DOCX file: missing document.xml');
        }
        // Extract images first
        const extractedImages = options.extractImages !== false
            ? await imageExtractor.extractImagesFromZip(zip, 'word/')
            : [];
        // Extract charts if enabled
        const extractedCharts = options.extractCharts !== false
            ? await chartExtractor.extractChartsFromZip(zip, 'word/')
            : [];
        // Initialize layout parser
        const layoutParser = new LayoutParser();
        const xmlContent = await documentXml.async('string');
        const result = await parseStringPromise(xmlContent);
        // xml2js wraps child elements in arrays but not the root element, so
        // `result['w:document']` is normally a plain object. Accept both shapes.
        const documentNode = result?.['w:document'];
        const root = Array.isArray(documentNode) ? documentNode[0] : documentNode;
        const bodyNodes = root?.['w:body'];
        if (!Array.isArray(bodyNodes) || bodyNodes.length === 0) {
            throw new InvalidFileError('Invalid DOCX file: missing document body');
        }
        // An empty <w:body/> is parsed as an empty string; treat it as "no children".
        const firstBody = bodyNodes[0];
        const body = firstBody !== null && typeof firstBody === 'object' ? firstBody : {};
        const paragraphs = body['w:p'] || [];
        const tables = body['w:tbl'] || [];
        let markdown = '';
        // Process paragraphs
        for (const element of paragraphs) {
            const paragraph = await parseParagraph(element, imageExtractor, extractedImages);
            if (paragraph.trim()) {
                markdown += paragraph + '\n\n';
            }
        }
        // Process tables
        for (const table of tables) {
            const tableMarkdown = await parseAdvancedTable(table, layoutParser, imageExtractor, extractedImages);
            if (tableMarkdown.trim()) {
                markdown += tableMarkdown + '\n\n';
            }
        }
        return {
            markdown: markdown.trim(),
            images: extractedImages,
            charts: extractedCharts.map(chart => chart.data),
            metadata: {
                paragraphCount: paragraphs.length,
                tableCount: tables.length
            }
        };
    }
    catch (error) {
        // Validation failures are already in their final form; everything else is wrapped.
        if (error instanceof InvalidFileError) {
            throw error;
        }
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('DOCX', message, error);
    }
}
|
61
|
+
/**
 * Convert one xml2js-parsed `w:tbl` node into markdown.
 *
 * Builds a `{ rows: [{ cells: [...] }] }` structure (text, bold, italic,
 * alignment, backgroundColor, colSpan, rowSpan, merged per cell) and hands it to
 * `layoutParser.parseAdvancedTable` with alignment, borders and colours enabled.
 *
 * @param {object} table - Parsed `w:tbl` element.
 * @param {object} layoutParser - Provides `parseAdvancedTable(struct, options)`.
 * @param {object} imageExtractor - Forwarded to paragraph parsing.
 * @param {Array} extractedImages - Forwarded to paragraph parsing.
 * @returns {Promise<string>} Table markdown, or `''` when the table has no rows.
 */
async function parseAdvancedTable(table, layoutParser, imageExtractor, extractedImages) {
    // WordprocessingML attributes are namespace-prefixed (`w:val`, `w:fill`) and
    // xml2js keeps that prefix; the bare name is still accepted as a fallback.
    // Attribute-less elements are parsed as strings, hence the optional chaining.
    const readAttr = (node, name) => node?.$?.[`w:${name}`] ?? node?.$?.[name];
    const rows = table['w:tr'] || [];
    if (rows.length === 0) {
        return '';
    }
    const tableStruct = { rows: [] };
    for (const row of rows) {
        const cells = row['w:tc'] || [];
        const rowData = { cells: [] };
        for (const cell of cells) {
            const cellData = {
                text: '',
                bold: false,
                italic: false,
                alignment: 'left',
                backgroundColor: undefined,
                colSpan: 1,
                rowSpan: 1,
                merged: false
            };
            // Extract cell properties
            const tcPr = cell['w:tcPr']?.[0];
            if (tcPr) {
                // Horizontal merge: number of grid columns the cell spans.
                if (tcPr['w:gridSpan']) {
                    cellData.colSpan = Number.parseInt(readAttr(tcPr['w:gridSpan'][0], 'val'), 10) || 1;
                }
                // Vertical merge: any w:vMerge marks the cell as part of a merged range.
                if (tcPr['w:vMerge']) {
                    cellData.merged = true;
                }
                // Check for background color
                const fill = readAttr(tcPr['w:shd']?.[0], 'fill');
                if (fill) {
                    cellData.backgroundColor = fill;
                }
            }
            // Extract cell content
            if (cell['w:p']) {
                const cellTexts = [];
                for (const paragraph of cell['w:p']) {
                    const paragraphData = await parseAdvancedParagraph(paragraph, imageExtractor, extractedImages);
                    if (paragraphData.text.trim()) {
                        cellTexts.push(paragraphData.text);
                        // The cell inherits formatting from any non-empty paragraph.
                        if (paragraphData.bold) {
                            cellData.bold = true;
                        }
                        if (paragraphData.italic) {
                            cellData.italic = true;
                        }
                        if (paragraphData.alignment !== 'left') {
                            cellData.alignment = paragraphData.alignment;
                        }
                    }
                }
                cellData.text = cellTexts.join(' ');
            }
            rowData.cells.push(cellData);
        }
        tableStruct.rows.push(rowData);
    }
    return layoutParser.parseAdvancedTable(tableStruct, {
        preserveAlignment: true,
        showBorders: true,
        preserveColors: true
    });
}
|
124
|
+
/**
 * Convert one xml2js-parsed `w:p` node into markdown text plus formatting flags.
 *
 * Runs are concatenated in order; bold/italic runs are wrapped in `**` / `*`,
 * drawings are replaced by an image reference, numbered paragraphs become `- `
 * list items indented by level, and `Heading<N>` styles become `#` headings.
 * When any run declares a size, the text is passed through
 * `LayoutParser.formatWithSize`.
 *
 * @param {object|string} paragraph - Parsed `w:p` element (an empty element is a string).
 * @param {object} imageExtractor - Forwarded to `extractImageFromRun`.
 * @param {Array} extractedImages - Forwarded to `extractImageFromRun`.
 * @returns {Promise<object>} `{ text, bold, italic, alignment, fontSize, isList, listLevel }`;
 *   `fontSize` is `'normal'` or a size in points.
 */
async function parseAdvancedParagraph(paragraph, imageExtractor, extractedImages) {
    // WordprocessingML attributes are namespace-prefixed (`w:val`) and xml2js keeps
    // that prefix; the bare name is still accepted as a fallback.
    const readAttr = (node, name) => node?.$?.[`w:${name}`] ?? node?.$?.[name];
    // Toggle properties (<w:b/>, <w:i/>) are on when present unless w:val turns them off.
    const isToggleOn = (nodes) => {
        if (!nodes) {
            return false;
        }
        const value = readAttr(nodes[0], 'val');
        return value === undefined || !['false', '0', 'off'].includes(String(value));
    };
    // OOXML justification values mapped onto the four alignments used downstream.
    const alignmentByJc = new Map([
        ['left', 'left'],
        ['start', 'left'],
        ['center', 'center'],
        ['right', 'right'],
        ['end', 'right'],
        ['both', 'justify'],
        ['justify', 'justify'],
    ]);
    let text = '';
    let bold = false;
    let italic = false;
    let alignment = 'left';
    let fontSize = 'normal';
    let isList = false;
    let listLevel = 0;
    // Check paragraph properties
    const pPr = paragraph['w:pPr']?.[0];
    if (pPr) {
        // Check alignment; unknown values fall back to 'left'.
        const alignValue = readAttr(pPr['w:jc']?.[0], 'val');
        if (alignValue) {
            alignment = alignmentByJc.get(alignValue) ?? 'left';
        }
        // A numbering reference marks the paragraph as a list item.
        const numPr = pPr['w:numPr'];
        if (numPr) {
            isList = true;
            listLevel = Number.parseInt(readAttr(numPr[0]?.['w:ilvl']?.[0], 'val'), 10) || 0;
        }
    }
    for (const run of paragraph['w:r'] || []) {
        // Check for images/drawings
        if (run['w:drawing'] || run['w:pict']) {
            const imageRef = await extractImageFromRun(run, imageExtractor, extractedImages);
            if (imageRef) {
                text += imageRef + '\n';
            }
        }
        // Extract text with formatting
        if (run['w:t']) {
            let runText = '';
            for (const textElement of run['w:t']) {
                if (typeof textElement === 'string') {
                    runText += textElement;
                }
                else if (textElement && typeof textElement === 'object' && '_' in textElement) {
                    // Text nodes that carry attributes keep their content under `_`.
                    runText += textElement._;
                }
            }
            // Apply formatting
            const rPr = run['w:rPr']?.[0];
            if (rPr) {
                if (isToggleOn(rPr['w:b'])) {
                    // Skip the markers for empty runs so no stray `****` is emitted.
                    if (runText) {
                        runText = `**${runText}**`;
                    }
                    bold = true;
                }
                if (isToggleOn(rPr['w:i'])) {
                    if (runText) {
                        runText = `*${runText}*`;
                    }
                    italic = true;
                }
                const halfPoints = Number.parseInt(readAttr(rPr['w:sz']?.[0], 'val'), 10);
                if (Number.isFinite(halfPoints)) {
                    fontSize = halfPoints / 2; // Convert half-points to points
                }
            }
            text += runText;
        }
    }
    // Apply list formatting; two spaces per level so nested items nest in Markdown.
    if (isList && text.trim()) {
        const indent = '  '.repeat(listLevel);
        text = `${indent}- ${text.trim()}`;
    }
    // Apply heading formatting
    const styleVal = readAttr(pPr?.['w:pStyle']?.[0], 'val');
    if (styleVal && (styleVal.includes('Heading') || styleVal.includes('heading'))) {
        const match = styleVal.match(/(\d+)/);
        if (match) {
            // Markdown supports heading levels 1 through 6 only.
            const headingLevel = Math.min(Math.max(Number.parseInt(match[1], 10), 1), 6);
            text = `${'#'.repeat(headingLevel)} ${text.trim()}`;
        }
    }
    // Apply font size formatting
    if (fontSize !== 'normal' && text.trim()) {
        const layoutParser = new LayoutParser();
        text = layoutParser.formatWithSize(text, fontSize);
    }
    return {
        text,
        bold,
        italic,
        alignment,
        fontSize,
        isList,
        listLevel
    };
}
|
222
|
+
/**
 * Render one paragraph node to markdown text only.
 *
 * Delegates to `parseAdvancedParagraph` and discards the formatting flags it
 * returns alongside the text.
 *
 * @param {object} paragraph - Parsed `w:p` element.
 * @param {object} imageExtractor - Forwarded unchanged.
 * @param {Array} extractedImages - Forwarded unchanged.
 * @returns {Promise<string>} The paragraph's markdown text.
 */
async function parseParagraph(paragraph, imageExtractor, extractedImages) {
    const { text } = await parseAdvancedParagraph(paragraph, imageExtractor, extractedImages);
    return text;
}
|
226
|
+
/**
 * Produce an image markdown reference for a run that contains a drawing.
 *
 * Simplified lookup: the run itself is not inspected. The first extracted image
 * that has a `savedPath` is used for every drawing; properly matching a drawing
 * to its image would require parsing the drawing XML.
 *
 * @param {object} run - Parsed `w:r` element (currently unused).
 * @param {object} imageExtractor - Provides `getImageMarkdown(altText, path)`.
 * @param {Array} extractedImages - Images extracted from the archive.
 * @returns {Promise<string|null>} Image markdown, or `null` when no saved image exists.
 */
async function extractImageFromRun(run, imageExtractor, extractedImages) {
    for (const candidate of extractedImages) {
        if (candidate.savedPath) {
            return imageExtractor.getImageMarkdown('Document Image', candidate.savedPath);
        }
    }
    return null;
}
|
237
|
+
//# sourceMappingURL=docx-parser.js.map
|
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"docx-parser.js","sourceRoot":"","sources":["../../src/parsers/docx-parser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,EAAE,kBAAkB,EAAE,MAAM,QAAQ,CAAC;AAK5C,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AACzD,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAiDlE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,MAAc,EACd,cAA8B,EAC9B,cAA8B,EAC9B,UAA4B,EAAE;IAE9B,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,WAAW,GAAG,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAElD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,gBAAgB,CAAC,yCAAyC,CAAC,CAAC;QACxE,CAAC;QAED,uBAAuB;QACvB,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,4BAA4B;QAC5B,MAAM,eAAe,GAAG,OAAO,CAAC,aAAa,KAAK,KAAK;YACrD,CAAC,CAAC,MAAM,cAAc,CAAC,oBAAoB,CAAC,GAAG,EAAE,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QAEP,2BAA2B;QAC3B,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QAExC,MAAM,UAAU,GAAG,MAAM,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,UAAU,CAAiB,CAAC;QAEpE,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,qBAAqB;QACrB,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACjF,IAAI,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;gBACrB,QAAQ,IAAI,SAAS,GAAG,MAAM,CAAC;YACjC,CAAC;QACH,CAAC;QAED,iBAAiB;QACjB,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;YACxC,MAAM,aAAa,GAAG,MAAM,kBAAkB,CAAC,KAAK,EAAE,YAAY,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;YACrG,IAAI,aAAa,CAAC,IAAI,EAAE,EAAE,CAAC;gBACzB,QAAQ,IAAI,aAAa,GAAG,MAAM,CAAC;YACrC,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ,CAAC,IAAI,EAAE;YACzB,MAAM,EAAE,eAAe;YACvB,MAAM,EAAE,eAAe,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;YAChD,QAAQ,EAAE;gBACR,cAAc,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;gBAC1C,UAAU,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM;aACzC;S
ACF,CAAC;IACJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QACzE,MAAM,IAAI,UAAU,CAAC,MAAM,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACxD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,kBAAkB,CAC/B,KAAc,EACd,YAA0B,EAC1B,cAA8B,EAC9B,eAAqC;IAErC,MAAM,SAAS,GAAG,KAAY,CAAC;IAC/B,MAAM,IAAI,GAAG,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;IACrC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEjC,MAAM,WAAW,GAAc,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IAE5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;QAChC,MAAM,OAAO,GAAY,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,QAAQ,GAAa;gBACzB,IAAI,EAAE,EAAE;gBACR,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,KAAK;gBACb,SAAS,EAAE,MAAuB;gBAClC,eAAe,EAAE,SAAS;gBAC1B,OAAO,EAAE,CAAC;gBACV,OAAO,EAAE,CAAC;gBACV,MAAM,EAAE,KAAK;aACd,CAAC;YAEF,0BAA0B;YAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC5B,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACd,yBAAyB;gBACzB,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,EAAE,CAAC;oBAC1B,QAAQ,CAAC,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBACnE,CAAC;gBACD,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC;oBACxB,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;gBACzB,CAAC;gBAED,6BAA6B;gBAC7B,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;oBAClC,QAAQ,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;gBACxD,CAAC;YACH,CAAC;YAED,uBAAuB;YACvB,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBAChB,MAAM,SAAS,GAAa,EAAE,CAAC;gBAC/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;oBACpC,MAAM,aAAa,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;oBAC/F,IAAI,aAAa,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;wBAC9B,SAAS,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;wBAEnC,oCAAoC;wBACpC,IAAI,aAAa,CAAC,IAAI;4BAAE,QAAQ,CAAC,IAAI,GAAG,IAAI,CAAC;wBAC7C,IAAI,
aAAa,CAAC,MAAM;4BAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC;wBACjD,IAAI,aAAa,CAAC,SAAS,KAAK,MAAM;4BAAE,QAAQ,CAAC,SAAS,GAAG,aAAa,CAAC,SAAS,CAAC;oBACvF,CAAC;gBACH,CAAC;gBACD,QAAQ,CAAC,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACtC,CAAC;YAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC/B,CAAC;QAED,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAED,OAAO,YAAY,CAAC,kBAAkB,CAAC,WAAW,EAAE;QAClD,iBAAiB,EAAE,IAAI;QACvB,WAAW,EAAE,IAAI;QACjB,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,sBAAsB,CACnC,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,IAAI,GAAG,SAAgB,CAAC;IAC9B,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,IAAI,IAAI,GAAG,KAAK,CAAC;IACjB,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAkB,MAAM,CAAC;IACtC,IAAI,QAAQ,GAAoB,QAAQ,CAAC;IACzC,IAAI,MAAM,GAAG,KAAK,CAAC;IACnB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,6BAA6B;IAC7B,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;IAC1B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACb,kBAAkB;QAClB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;YAC/B,MAAM,UAAU,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAC3C,SAAS,GAAG,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC;gBACtE,CAAC,CAAC,UAAU;gBACZ,CAAC,CAAC,MAAM,CAAkB,CAAC;QAC/B,CAAC;QAED,uBAAuB;QACvB,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;YACtB,MAAM,GAAG,IAAI,CAAC;YACd,IAAI,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACnC,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACrE,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QAChB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;YAC9B,4BAA4B;YAC5B,IAAI,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;gBACtC,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,GAAG,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;gBACjF,IAAI,QAAQ,EAAE,CAAC;oBACb,IAAI,IAAI,QAAQ,GAAG,IAAI,CAAC;gBAC1B,CAAC;YACH,CAAC;YAED,+BAA+B;YAC/B,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC
;gBACf,IAAI,OAAO,GAAG,EAAE,CAAC;gBACjB,KAAK,MAAM,WAAW,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;oBACrC,IAAI,OAAO,WAAW,KAAK,QAAQ,EAAE,CAAC;wBACpC,OAAO,IAAI,WAAW,CAAC;oBACzB,CAAC;yBAAM,IAAI,WAAW,IAAI,OAAO,WAAW,KAAK,QAAQ,IAAI,GAAG,IAAI,WAAW,EAAE,CAAC;wBAChF,OAAO,IAAK,WAAmB,CAAC,CAAC,CAAC;oBACpC,CAAC;gBACH,CAAC;gBAED,mBAAmB;gBACnB,MAAM,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC9B,IAAI,GAAG,EAAE,CAAC;oBACR,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,KAAK,OAAO,IAAI,CAAC;wBAC3B,IAAI,GAAG,IAAI,CAAC;oBACd,CAAC;oBACD,IAAI,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,GAAG,IAAI,OAAO,GAAG,CAAC;wBACzB,MAAM,GAAG,IAAI,CAAC;oBAChB,CAAC;oBACD,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;wBAC5B,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,gCAAgC;oBACjF,CAAC;gBACH,CAAC;gBAED,IAAI,IAAI,OAAO,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAED,wBAAwB;IACxB,IAAI,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QAC1B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QACtC,IAAI,GAAG,GAAG,MAAM,KAAK,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;IACrC,CAAC;IAED,2BAA2B;IAC3B,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAC7C,IAAI,QAAQ,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACtC,IAAI,KAAK,EAAE,CAAC;gBACV,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxC,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;gBACrD,IAAI,GAAG,GAAG,MAAM,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YACpC,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,QAAQ,KAAK,QAAQ,IAAI,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;QACzC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC;QACxC,IAAI,GAAG,YAAY,CAAC,cAAc,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;IACrD,CAAC;IAED,OAAO;QACL,IAAI;QACJ,IAAI;QACJ,MAAM;QACN,SAAS;QACT,QAAQ;
QACR,MAAM;QACN,SAAS;KACV,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,cAAc,CAC3B,SAAkB,EAClB,cAA8B,EAC9B,eAAqC;IAErC,MAAM,YAAY,GAAG,MAAM,sBAAsB,CAAC,SAAS,EAAE,cAAc,EAAE,eAAe,CAAC,CAAC;IAC9F,OAAO,YAAY,CAAC,IAAI,CAAC;AAC3B,CAAC;AAED,KAAK,UAAU,mBAAmB,CAChC,GAAY,EACZ,cAA8B,EAC9B,eAAqC;IAErC,yFAAyF;IACzF,yCAAyC;IACzC,IAAI,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,GAAG,GAAG,eAAe,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvD,IAAI,GAAG,EAAE,CAAC;YACR,OAAO,cAAc,CAAC,gBAAgB,CAAC,gBAAgB,EAAE,GAAG,CAAC,SAAS,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}
|
@@ -0,0 +1,8 @@
|
|
1
|
+
import type { Buffer } from 'node:buffer';
|
2
|
+
import { type PDFParseOptions, type PDFParseResult } from '../utils/pdf-extractor.js';
|
3
|
+
import type { ImageExtractor } from '../utils/image-extractor.js';
|
4
|
+
/**
 * Parse PDF buffer and convert to markdown with enhanced layout preservation.
 *
 * @param buffer - Raw PDF file contents.
 * @param imageExtractor - Image extractor used while processing the PDF.
 * @param options - Optional parse settings; may be omitted.
 * @returns A promise resolving to the PDF parse result.
 */
|
7
|
+
export declare function parsePdf(buffer: Buffer, imageExtractor: ImageExtractor, options?: PDFParseOptions): Promise<PDFParseResult>;
|
8
|
+
//# sourceMappingURL=pdf-parser.d.ts.map
|
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"pdf-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/pdf-parser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAgB,KAAK,eAAe,EAAE,KAAK,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAEpG,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE;;GAEG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,cAAc,CAAC,CAuFzB"}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
import pdfParse from 'pdf-parse';
|
2
|
+
import { PDFExtractor } from '../utils/pdf-extractor.js';
|
3
|
+
import { ParseError, InvalidFileError } from '../types/errors.js';
|
4
|
+
/**
 * Parse PDF buffer and convert to markdown with enhanced layout preservation.
 *
 * Strategy:
 *  1. Extract text with pdf-parse and try to enhance it through `PDFExtractor`;
 *     if enhancement throws, fall back to line-trimmed plain text.
 *  2. When there is no text or fewer than 100 non-blank characters, also render
 *     the pages as images and append them (best effort; failures are only logged).
 *  3. If nothing was produced, throw `InvalidFileError`.
 *
 * @param {Buffer} buffer - Raw PDF file contents.
 * @param {object} imageExtractor - Passed to the `PDFExtractor` constructor.
 * @param {object} [options] - `maxPages` (> 0) limits how many pages pdf-parse
 *   reads and caps the reported `pageCount`.
 * @returns {Promise<object>} `{ markdown, images, pageCount, metadata }` where
 *   `metadata` holds `version`, `info` and `textLength`.
 * @throws {InvalidFileError} When the PDF yields no content, or when the failure
 *   message mentions "PDF".
 * @throws {ParseError} For any other failure.
 */
export async function parsePdf(buffer, imageExtractor, options = {}) {
    try {
        const requestedMax = Number(options.maxPages);
        const maxPages = requestedMax > 0 ? requestedMax : 0;
        // Pass the limit to pdf-parse so only the requested pages are read;
        // previously every page was parsed and only the reported count was clamped.
        const data = await pdfParse(buffer, maxPages > 0 ? { max: maxPages } : undefined);
        const pdfExtractor = new PDFExtractor(imageExtractor);
        const hasText = Boolean(data.text && data.text.trim());
        let markdown = '';
        const images = [];
        let pageCount = data.numpages || 1;
        // Apply maxPages limit if specified
        if (maxPages > 0) {
            pageCount = Math.min(pageCount, maxPages);
        }
        // Try to extract text with enhanced layout
        if (hasText) {
            console.log('📄 Extracting text with layout enhancement...');
            try {
                const enhancedText = await pdfExtractor.enhanceTextWithLayout(data.text, data);
                markdown += enhancedText;
            }
            catch (layoutError) {
                // Report the cause, matching how the image-extraction failure is logged.
                console.warn('Layout enhancement failed, falling back to basic text extraction:', layoutError instanceof Error ? layoutError.message : 'Unknown error');
                // Fall back to basic text extraction
                markdown = extractBasicText(data.text);
            }
        }
        // If text is minimal or extraction failed, convert pages to images
        if (!data.text || data.text.trim().length < 100) {
            console.log('📸 Converting PDF pages to images for better preservation...');
            try {
                const pageImages = await pdfExtractor.extractImagesFromPDF(buffer);
                if (pageImages.length > 0) {
                    // Convert page images to ImageData format
                    for (const page of pageImages) {
                        images.push({
                            originalPath: `page_${page.pageNumber}`,
                            savedPath: page.imagePath,
                            basePath: '',
                            format: 'png',
                            dimensions: page.dimensions
                        });
                    }
                    if (markdown.trim()) {
                        markdown += '\n\n---\n\n## Visual Content\n\n';
                    }
                    markdown += await pdfExtractor.createPageBreaks(pageImages);
                }
            }
            catch (imageError) {
                // Deliberately best effort: the text result (if any) is still returned.
                console.warn('Failed to extract PDF as images:', imageError instanceof Error ? imageError.message : 'Unknown error');
            }
        }
        // Fallback to basic text if everything else fails
        if (!markdown.trim()) {
            if (hasText) {
                markdown = extractBasicText(data.text);
            }
            else {
                throw new InvalidFileError('PDF file appears to be empty or contains no extractable text');
            }
        }
        return {
            markdown,
            images,
            pageCount,
            metadata: {
                version: data.version || 'unknown',
                info: data.info || {},
                textLength: data.text?.length || 0
            }
        };
    }
    catch (error) {
        if (error instanceof InvalidFileError) {
            throw error;
        }
        const message = error instanceof Error ? error.message : 'Unknown error';
        // Any message mentioning "PDF" (which includes "Invalid PDF") is treated as a bad file.
        if (message.includes('PDF')) {
            throw new InvalidFileError('Invalid or corrupted PDF file', error);
        }
        throw new ParseError('PDF', message, error);
    }
}
|
88
|
+
/**
 * Reduce raw PDF text to minimal formatting.
 *
 * Splits on `\n`, trims each line, drops lines that end up empty and joins the
 * remainder with single newlines.
 *
 * @param {string} text - Raw text as returned by the PDF text extraction.
 * @returns {string} The cleaned text.
 */
function extractBasicText(text) {
    const kept = [];
    for (const rawLine of text.split('\n')) {
        const line = rawLine.trim();
        if (line.length > 0) {
            kept.push(line);
        }
    }
    return kept.join('\n');
}
|
98
|
+
//# sourceMappingURL=pdf-parser.js.map
|
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"pdf-parser.js","sourceRoot":"","sources":["../../src/parsers/pdf-parser.ts"],"names":[],"mappings":"AAAA,OAAO,QAAQ,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,YAAY,EAA6C,MAAM,2BAA2B,CAAC;AACpG,OAAO,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAIlE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,MAAc,EACd,cAA8B,EAC9B,UAA2B,EAAE;IAE7B,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,MAAM,CAAC,CAAC;QACpC,MAAM,YAAY,GAAG,IAAI,YAAY,CAAC,cAAc,CAAC,CAAC;QAEtD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,MAAM,MAAM,GAAgB,EAAE,CAAC;QAC/B,IAAI,SAAS,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;QAEnC,oCAAoC;QACpC,IAAI,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,QAAQ,GAAG,CAAC,EAAE,CAAC;YAC7C,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QACpD,CAAC;QAED,2CAA2C;QAC3C,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAC;YAC7D,IAAI,CAAC;gBACH,MAAM,YAAY,GAAG,MAAM,YAAY,CAAC,qBAAqB,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;gBAC/E,QAAQ,IAAI,YAAY,CAAC;YAC3B,CAAC;YAAC,OAAO,WAAoB,EAAE,CAAC;gBAC9B,OAAO,CAAC,IAAI,CAAC,kEAAkE,CAAC,CAAC;gBACjF,qCAAqC;gBACrC,QAAQ,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzC,CAAC;QACH,CAAC;QAED,mEAAmE;QACnE,IAAI,CAAC,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAChD,OAAO,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC;YAC5E,IAAI,CAAC;gBACH,MAAM,UAAU,GAAG,MAAM,YAAY,CAAC,oBAAoB,CAAC,MAAM,CAAC,CAAC;gBACnE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1B,0CAA0C;oBAC1C,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;wBAC9B,MAAM,CAAC,IAAI,CAAC;4BACV,YAAY,EAAE,QAAQ,IAAI,CAAC,UAAU,EAAE;4BACvC,SAAS,EAAE,IAAI,CAAC,SAAS;4BACzB,QAAQ,EAAE,EAAE;4BACZ,MAAM,EAAE,KAAK;4BACb,UAAU,EAAE,IAAI,CAAC,UAAU;yBAC5B,CAAC,CAAC;oBACL,CAAC;oBAED,IAAI,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;wBACpB,QAAQ,IAAI,kCAAkC,CAAC;oBACjD,CAAC;oBACD,QAAQ,IAAI,MAAM,YAAY,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;gBAC9D,CAAC;YACH,CAAC;YAAC,OAAO,UAAmB,EAAE,CAAC;gBAC7B,OAAO,CAAC,IAAI,CAAC,kCAAkC,EAAE,UAAU,YAAY,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC;YACvH,CAAC;QACH,CA
AC;QAED,kDAAkD;QAClD,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC;YACrB,IAAI,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC;gBAClC,QAAQ,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACzC,CAAC;iBAAM,CAAC;gBACN,MAAM,IAAI,gBAAgB,CAAC,8DAA8D,CAAC,CAAC;YAC7F,CAAC;QACH,CAAC;QAED,OAAO;YACL,QAAQ;YACR,MAAM;YACN,SAAS;YACT,QAAQ,EAAE;gBACR,OAAO,EAAE,IAAI,CAAC,OAAO,IAAI,SAAS;gBAClC,IAAI,EAAE,IAAI,CAAC,IAAI,IAAI,EAAE;gBACrB,UAAU,EAAE,IAAI,CAAC,IAAI,EAAE,MAAM,IAAI,CAAC;aACnC;SACF,CAAC;IAEJ,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,IAAI,KAAK,YAAY,gBAAgB,EAAE,CAAC;YACtC,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;QAEzE,IAAI,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/D,MAAM,IAAI,gBAAgB,CAAC,+BAA+B,EAAE,KAAc,CAAC,CAAC;QAC9E,CAAC;QAED,MAAM,IAAI,UAAU,CAAC,KAAK,EAAE,OAAO,EAAE,KAAc,CAAC,CAAC;IACvD,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC/B,MAAM,YAAY,GAAG,KAAK;SACvB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;SACxB,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAEnC,OAAO,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACjC,CAAC"}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
import type { Buffer } from 'node:buffer';
|
2
|
+
import type { ImageExtractor } from '../utils/image-extractor.js';
|
3
|
+
import type { ChartExtractor } from '../utils/chart-extractor.js';
|
4
|
+
import type { ImageData, ChartData } from '../types/interfaces.js';
|
5
|
+
/** Options accepted by `parsePptx`; every field is optional. */
export interface PptxParseOptions {
|
6
|
+
/** NOTE(review): not read by the visible part of `parsePptx` — confirm where this is consumed. */
readonly preserveLayout?: boolean;
|
7
|
+
/** Image extraction from the archive is skipped only when this is exactly `false`. */
readonly extractImages?: boolean;
|
8
|
+
/** Chart extraction from the archive is skipped only when this is exactly `false`. */
readonly extractCharts?: boolean;
|
9
|
+
}
|
10
|
+
/** Result returned by `parsePptx`. */
export interface PptxParseResult {
|
11
|
+
/** Trimmed markdown containing one `## Slide N` section per slide. */
readonly markdown: string;
|
12
|
+
/** Images extracted from the archive (empty when extraction is disabled). */
readonly images: readonly ImageData[];
|
13
|
+
/** The `data` payload of each extracted chart (empty when extraction is disabled). */
readonly charts: readonly ChartData[];
|
14
|
+
/** Number of slide XML files found in the archive. */
readonly slideCount: number;
|
15
|
+
/** Summary values; the implementation sets `totalSlides`, `hasImages` and `hasCharts`. */
readonly metadata: Record<string, unknown>;
|
16
|
+
}
|
17
|
+
/**
 * Parse PPTX buffer and convert to markdown with layout preservation.
 *
 * @param buffer - Raw PPTX (zip) file contents.
 * @param imageExtractor - Used to extract images from the archive.
 * @param chartExtractor - Used to extract charts from the archive.
 * @param options - Optional parse settings; may be omitted.
 * @returns A promise resolving to the markdown, images, charts, slide count and metadata.
 */
|
20
|
+
export declare function parsePptx(buffer: Buffer, imageExtractor: ImageExtractor, chartExtractor: ChartExtractor, options?: PptxParseOptions): Promise<PptxParseResult>;
|
21
|
+
//# sourceMappingURL=pptx-parser.d.ts.map
|
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"pptx-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/pptx-parser.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAGlE,OAAO,KAAK,EACV,SAAS,EACT,SAAS,EAIV,MAAM,wBAAwB,CAAC;AAEhC,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,cAAc,CAAC,EAAE,OAAO,CAAC;IAClC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;IACjC,QAAQ,CAAC,aAAa,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,SAAS,SAAS,EAAE,CAAC;IACtC,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC5C;AAkBD;;GAEG;AACH,wBAAsB,SAAS,CAC7B,MAAM,EAAE,MAAM,EACd,cAAc,EAAE,cAAc,EAC9B,cAAc,EAAE,cAAc,EAC9B,OAAO,GAAE,gBAAqB,GAC7B,OAAO,CAAC,eAAe,CAAC,CAwE1B"}
|
@@ -0,0 +1,264 @@
|
|
1
|
+
import JSZip from 'jszip';
|
2
|
+
import { parseStringPromise } from 'xml2js';
|
3
|
+
import { LayoutParser } from '../utils/layout-parser.js';
|
4
|
+
import { ParseError } from '../types/errors.js';
|
5
|
+
/**
 * Parse PPTX buffer and convert to markdown with layout preservation.
 *
 * Collects every `ppt/slides/slide*.xml` entry, orders the entries by the number
 * in the file name and renders each one under a `## Slide N` heading, where N is
 * the 1-based position in that ordering. Slides without content are rendered as
 * `*No content*`.
 *
 * @param {Buffer} buffer - Raw PPTX (zip) file contents.
 * @param {object} imageExtractor - Provides `extractImagesFromZip(zip, prefix)`.
 * @param {object} chartExtractor - Provides `extractChartsFromZip(zip, prefix)`.
 * @param {object} [options] - `extractImages` / `extractCharts` skip the matching
 *   extraction step only when set to exactly `false`.
 * @returns {Promise<object>} `{ markdown, images, charts, slideCount, metadata }`.
 * @throws {ParseError} Wraps any failure raised while reading the archive or slides.
 */
export async function parsePptx(buffer, imageExtractor, chartExtractor, options = {}) {
    // Numeric part of ".../slideN.xml"; 0 when the name carries no number.
    const slideNumberOf = (path) => Number.parseInt(path.match(/slide(\d+)\.xml/)?.[1] || '0', 10);
    try {
        const zip = await JSZip.loadAsync(buffer);
        // Extract images first
        let extractedImages = [];
        if (options.extractImages !== false) {
            extractedImages = await imageExtractor.extractImagesFromZip(zip, 'ppt/');
        }
        // Extract charts if enabled
        let extractedCharts = [];
        if (options.extractCharts !== false) {
            extractedCharts = await chartExtractor.extractChartsFromZip(zip, 'ppt/');
        }
        // Initialize layout parser
        const layoutParser = new LayoutParser();
        const slideFiles = [];
        zip.forEach((relativePath, file) => {
            const isSlide = relativePath.startsWith('ppt/slides/slide') && relativePath.endsWith('.xml');
            if (isSlide) {
                slideFiles.push({ path: relativePath, file });
            }
        });
        slideFiles.sort((left, right) => slideNumberOf(left.path) - slideNumberOf(right.path));
        const sections = [];
        for (const [index, slideFile] of slideFiles.entries()) {
            const slideNumber = index + 1;
            const xmlContent = await slideFile.file.async('string');
            const slideContent = await extractAdvancedSlideContent(xmlContent, imageExtractor, extractedImages, slideNumber, layoutParser);
            const bodyText = slideContent.trim() ? slideContent + '\n\n' : '*No content*\n\n';
            sections.push(`## Slide ${slideNumber}\n\n` + bodyText);
        }
        const slideTotal = slideFiles.length;
        return {
            markdown: sections.join('').trim(),
            images: extractedImages,
            charts: extractedCharts.map((chart) => chart.data),
            slideCount: slideTotal,
            metadata: {
                totalSlides: slideTotal,
                hasImages: extractedImages.length > 0,
                hasCharts: extractedCharts.length > 0
            }
        };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('PPTX', message, error);
    }
}
|
66
|
+
/**
 * Convert the XML of a single slide into markdown.
 *
 * The parsed XML tree is walked recursively. Text runs (`a:t`), tables
 * (`a:tbl`) and pictures (`a:blip`) are collected together with the offset of
 * the shape that contains them, ordered through
 * `layoutParser.calculateRelativePosition`, grouped into rows by their `y`
 * offset and rendered row by row with `processSlideRow`.
 *
 * @param {string} xmlContent - Slide XML; handed to xml2js `parseStringPromise`.
 * @param {object} imageExtractor - Must provide `getImageMarkdown(alt, path)`;
 *   also forwarded to `processSlideRow`.
 * @param {Array<{originalPath: string, savedPath?: string}>} extractedImages -
 *   Images previously extracted from the presentation.
 * @param {number} slideNumber - Used to match image paths and to label fallback images.
 * @param {object} layoutParser - Must provide `calculateRelativePosition(elements)`;
 *   also forwarded to `processSlideRow`.
 * @returns {Promise<string>} Trimmed markdown for the slide (possibly empty).
 * @throws {ParseError} When the XML cannot be parsed or rendering fails.
 */
async function extractAdvancedSlideContent(xmlContent, imageExtractor, extractedImages, slideNumber, layoutParser) {
    try {
        const result = await parseStringPromise(xmlContent);
        const elements = [];
        let imageCount = 0;

        // Candidate images for pictures on this slide; computed once because it
        // does not change during the walk.
        // NOTE(review): every path containing 'media/' matches, so candidates
        // are not tied to this slide's relationships - confirm that is intended.
        const slideImages = extractedImages.filter(img => img.originalPath.includes(`slide${slideNumber}`) ||
            img.originalPath.includes('media/'));

        // Read the offset of a shape-like node from its own transform. The
        // transform is a child of the shape (`p:xfrm` on graphic frames,
        // `a:xfrm` inside `p:spPr` / `p:grpSpPr` otherwise), so it must be
        // looked up here for the shape's content to inherit it.
        const readShapeOffset = (node) => {
            const transform = node['p:xfrm']?.[0]
                ?? node['p:spPr']?.[0]?.['a:xfrm']?.[0]
                ?? node['p:grpSpPr']?.[0]?.['a:xfrm']?.[0]
                ?? (node['a:off'] ? node : undefined);
            const attrs = transform?.['a:off']?.[0]?.$;
            if (!attrs) {
                return null;
            }
            return {
                x: Number.parseInt(attrs.x, 10) || 0,
                y: Number.parseInt(attrs.y, 10) || 0
            };
        };

        // Pull the character data out of one parsed `a:t` entry.
        const readText = (textItem) => {
            if (typeof textItem === 'string') {
                return textItem;
            }
            if (textItem && typeof textItem === 'object' && '_' in textItem) {
                return textItem._;
            }
            return '';
        };

        // Walk the tree, recording text, tables and pictures with their position.
        function extractShapes(obj, parentPos = { x: 0, y: 0 }) {
            if (typeof obj !== 'object' || obj === null) {
                return;
            }
            if (Array.isArray(obj)) {
                for (const item of obj) {
                    extractShapes(item, parentPos);
                }
                return;
            }

            // Nodes without their own transform inherit the enclosing position.
            const position = readShapeOffset(obj) ?? { ...parentPos };

            // Text content in shapes
            if (obj['a:t']) {
                const textItems = Array.isArray(obj['a:t']) ? obj['a:t'] : [obj['a:t']];
                let content = '';
                for (const textItem of textItems) {
                    const text = readText(textItem);
                    if (text && text.trim()) {
                        content += text.trim() + ' ';
                    }
                }
                if (content.trim()) {
                    elements.push({ type: 'text', content, position });
                }
            }

            // Tables
            if (obj['a:tbl']) {
                elements.push({ type: 'table', content: obj['a:tbl'], position });
            }

            // Pictures: each one carries exactly one `a:blip`, so keying on it
            // counts every picture once (also matching `p:pic` would count the
            // container and the blip separately).
            if (obj['a:blip'] && imageCount < slideImages.length) {
                const img = slideImages[imageCount];
                // Always advance, otherwise one image without a saved path
                // would block every later picture.
                imageCount++;
                if (img?.savedPath) {
                    elements.push({ type: 'image', content: img.savedPath, position });
                }
            }

            // Recurse into children. Text was handled above; a table is
            // rendered as a whole, so descending into it would emit its cell
            // text a second time as loose text elements.
            for (const key of Object.keys(obj)) {
                if (key !== 'a:t' && key !== 'a:tbl') {
                    extractShapes(obj[key], position);
                }
            }
        }
        extractShapes(result);

        // Sort elements by position (top to bottom, left to right)
        const sortedElements = layoutParser.calculateRelativePosition(elements);

        let markdown = '';
        let currentRow = null;
        // Tolerance (in the units of the `a:off` y attribute) for treating two
        // elements as belonging to the same row.
        const rowThreshold = 50;
        for (const element of sortedElements) {
            const elementY = element.position?.y || 0;
            if (currentRow && Math.abs(elementY - currentRow.y) < rowThreshold) {
                // Same row - add as column
                currentRow.elements.push(element);
            }
            else {
                // New row - flush the previous one first
                if (currentRow && currentRow.elements.length > 0) {
                    markdown += processSlideRow(currentRow, layoutParser, imageExtractor);
                }
                currentRow = { y: elementY, elements: [element] };
            }
        }
        // Flush the last row
        if (currentRow && currentRow.elements.length > 0) {
            markdown += processSlideRow(currentRow, layoutParser, imageExtractor);
        }

        // Nothing was rendered: fall back to listing the slide's images.
        if (!markdown.trim() && extractedImages.length > 0) {
            const fallbackImages = extractedImages.filter(img => img.originalPath.includes(`slide${slideNumber}`) ||
                (slideNumber === 1 && img.originalPath.includes('media/')));
            for (const img of fallbackImages) {
                if (img.savedPath) {
                    markdown += imageExtractor.getImageMarkdown(`Slide ${slideNumber} Image`, img.savedPath) + '\n\n';
                }
            }
        }
        return markdown.trim();
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('PPTX', `Failed to extract advanced content from slide: ${message}`, error);
    }
}
|
192
|
+
/**
 * Render one row of slide elements as markdown.
 *
 * A row holding exactly one element is rendered on its own, followed by a
 * blank line. Any other row is rendered as columns through
 * `layoutParser.createColumns`, followed by a single newline.
 *
 * @param {{elements: Array<object>}} row - Elements that share a row.
 * @param {object} layoutParser - Must provide `createColumns(columns)`.
 * @param {object} imageExtractor - Forwarded to `formatSlideElement`.
 * @returns {string} Markdown for the row.
 */
function processSlideRow(row, layoutParser, imageExtractor) {
    const render = (item) => formatSlideElement(item, layoutParser, imageExtractor);
    const items = row.elements;

    if (items.length !== 1) {
        // Several elements side by side: one column per element.
        const columns = items.map((item) => {
            return { content: render(item) };
        });
        return layoutParser.createColumns(columns) + '\n';
    }

    // A lone element occupies the whole row.
    return render(items[0]) + '\n\n';
}
|
206
|
+
/**
 * Turn a single collected slide element into markdown.
 *
 * - `text`: trimmed text; rendered as a `###` heading when its `y` offset is
 *   below 1000000 and the untrimmed text is shorter than 100 characters.
 * - `table`: delegated to `parseSlideTable`.
 * - `image`: delegated to `imageExtractor.getImageMarkdown` with the alt text
 *   `Slide Image`.
 * - anything else: the content itself when it is a string, otherwise `''`.
 *
 * @param {{type: string, content: *, position?: {x: number, y: number}}} element
 * @param {object} layoutParser - Forwarded to `parseSlideTable`.
 * @param {object} imageExtractor - Must provide `getImageMarkdown(alt, path)`.
 * @returns {string} Markdown for the element.
 */
function formatSlideElement(element, layoutParser, imageExtractor) {
    const kind = element.type;

    if (kind === 'text') {
        const raw = element.content;
        const top = element.position?.y || 0;
        // Short text near the top of the slide is treated as a title.
        const isTitle = top < 1000000 && raw.length < 100;
        const text = raw.trim();
        return isTitle ? `### ${text}` : text;
    }

    if (kind === 'table') {
        // Parse PowerPoint table (simplified)
        return parseSlideTable(element.content, layoutParser);
    }

    if (kind === 'image') {
        return imageExtractor.getImageMarkdown('Slide Image', element.content);
    }

    const fallback = element.content;
    if (typeof fallback === 'string') {
        return fallback;
    }
    return '';
}
|
227
|
+
/**
 * Convert a parsed PowerPoint table (`a:tbl`, in xml2js array form) into
 * markdown via `layoutParser.parseAdvancedTable`.
 *
 * Only plain text is extracted. Every cell is reported as non-bold,
 * non-italic, left-aligned and unmerged with a span of 1.
 *
 * Cell text is built from ALL runs (`a:r`) of every paragraph (`a:p`): runs
 * of one paragraph are concatenated as-is, paragraphs are separated by a
 * single space, and the result is trimmed. A text node that carries
 * attributes (parsed as an object with a `_` property) is supported as well.
 *
 * @param {Array<object>|undefined} tableData - The `a:tbl` entry of a graphic frame.
 * @param {object} layoutParser - Must provide `parseAdvancedTable(tableStruct)`.
 * @returns {string} Whatever `parseAdvancedTable` returns, or `''` when the
 *   table has no rows.
 */
function parseSlideTable(tableData, layoutParser) {
    const rows = tableData?.[0]?.['a:tr'];
    if (!rows) {
        return '';
    }

    // Text of one run; handles both plain-string and `{ _, $ }` text nodes.
    const runText = (run) => {
        const node = run?.['a:t']?.[0];
        if (typeof node === 'string') {
            return node;
        }
        if (node && typeof node === 'object' && typeof node._ === 'string') {
            return node._;
        }
        return '';
    };

    // Runs split a paragraph by formatting, not by words, so they are joined
    // without a separator.
    const paragraphText = (para) => (para?.['a:r'] ?? []).map(runText).join('');

    const cellText = (cell) => (cell?.['a:txBody']?.[0]?.['a:p'] ?? [])
        .map(paragraphText)
        .filter(text => text !== '')
        .join(' ')
        .trim();

    const tableStruct = { rows: [] };
    for (const row of rows) {
        const cells = row['a:tc'] || [];
        tableStruct.rows.push({
            cells: cells.map(cell => ({
                text: cellText(cell),
                bold: false,
                italic: false,
                alignment: 'left',
                backgroundColor: undefined,
                colSpan: 1,
                rowSpan: 1,
                merged: false
            }))
        });
    }
    return layoutParser.parseAdvancedTable(tableStruct);
}
|
264
|
+
//# sourceMappingURL=pptx-parser.js.map
|