file2md 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +293 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +153 -0
- package/dist/index.js.map +1 -0
- package/dist/parsers/docx-parser.d.ts +20 -0
- package/dist/parsers/docx-parser.d.ts.map +1 -0
- package/dist/parsers/docx-parser.js +237 -0
- package/dist/parsers/docx-parser.js.map +1 -0
- package/dist/parsers/pdf-parser.d.ts +8 -0
- package/dist/parsers/pdf-parser.d.ts.map +1 -0
- package/dist/parsers/pdf-parser.js +98 -0
- package/dist/parsers/pdf-parser.js.map +1 -0
- package/dist/parsers/pptx-parser.d.ts +21 -0
- package/dist/parsers/pptx-parser.d.ts.map +1 -0
- package/dist/parsers/pptx-parser.js +264 -0
- package/dist/parsers/pptx-parser.js.map +1 -0
- package/dist/parsers/xlsx-parser.d.ts +19 -0
- package/dist/parsers/xlsx-parser.d.ts.map +1 -0
- package/dist/parsers/xlsx-parser.js +267 -0
- package/dist/parsers/xlsx-parser.js.map +1 -0
- package/dist/types/errors.d.ts +52 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +76 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/interfaces.d.ts +228 -0
- package/dist/types/interfaces.d.ts.map +1 -0
- package/dist/types/interfaces.js +10 -0
- package/dist/types/interfaces.js.map +1 -0
- package/dist/utils/chart-extractor.d.ts +44 -0
- package/dist/utils/chart-extractor.d.ts.map +1 -0
- package/dist/utils/chart-extractor.js +258 -0
- package/dist/utils/chart-extractor.js.map +1 -0
- package/dist/utils/image-extractor.d.ts +50 -0
- package/dist/utils/image-extractor.d.ts.map +1 -0
- package/dist/utils/image-extractor.js +136 -0
- package/dist/utils/image-extractor.js.map +1 -0
- package/dist/utils/layout-parser.d.ts +55 -0
- package/dist/utils/layout-parser.d.ts.map +1 -0
- package/dist/utils/layout-parser.js +244 -0
- package/dist/utils/layout-parser.js.map +1 -0
- package/dist/utils/pdf-extractor.d.ts +46 -0
- package/dist/utils/pdf-extractor.d.ts.map +1 -0
- package/dist/utils/pdf-extractor.js +235 -0
- package/dist/utils/pdf-extractor.js.map +1 -0
- package/package.json +70 -0
@@ -0,0 +1,235 @@
|
|
1
|
+
import path from 'node:path';
|
2
|
+
import { ImageExtractionError } from '../types/errors.js';
|
3
|
+
/**
 * Helpers for turning PDF content into Markdown: renders pages to PNG images
 * through `pdf2pic` and applies line-based heuristics (headings, tables, lists)
 * to plain extracted text.
 */
export class PDFExtractor {
    /** Collaborator supplying `imageDirectory` and `getImageMarkdown(alt, path)`. */
    imageExtractor;
    /** Counter exposed via `currentPageCount`; nothing in this class increments it. */
    pageCounter = 0;

    /**
     * @param {object} imageExtractor - Object providing `imageDirectory` (output
     *   folder for rendered pages) and `getImageMarkdown(altText, imagePath)`.
     */
    constructor(imageExtractor) {
        this.imageExtractor = imageExtractor;
    }

    /**
     * Extract images from PDF by converting pages to images.
     *
     * @param {Buffer} buffer - Raw PDF bytes.
     * @returns {Promise<Array<{pageNumber: number, imagePath: string, fullPath: string}>>}
     *   One entry per rendered page that reported an output path; `imagePath`
     *   is the bare file name and `fullPath` the path reported by pdf2pic.
     * @throws {ImageExtractionError} When pdf2pic cannot be loaded or the conversion fails.
     */
    async extractImagesFromPDF(buffer) {
        try {
            // Dynamic import to handle potential missing dependency
            const loaded = await import('pdf2pic');
            // pdf2pic may be loaded through CommonJS interop, where the API can
            // end up on the default export instead of the module namespace.
            const pdf2pic = typeof loaded.fromBuffer === 'function' ? loaded : loaded.default;
            if (typeof pdf2pic?.fromBuffer !== 'function') {
                throw new Error('pdf2pic does not expose fromBuffer');
            }
            const convert = pdf2pic.fromBuffer(buffer, {
                density: 150, // Output resolution
                saveFilename: "page",
                savePath: this.imageExtractor.imageDirectory,
                format: "png",
                width: 800, // Max width
                height: 1200 // Max height
            });
            const results = await convert.bulk(-1); // Convert all pages
            return results
                .filter((result) => result.path)
                .map((result) => ({
                    pageNumber: result.page,
                    imagePath: path.basename(result.path),
                    fullPath: result.path
                }));
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            throw new ImageExtractionError(`Failed to convert PDF pages to images: ${message}`, error);
        }
    }

    /**
     * Enhance text with layout detection.
     *
     * Each trimmed line is classified, in this order, as blank, heading,
     * table row, list item or plain paragraph text. Consecutive table rows are
     * buffered and emitted as one Markdown table as soon as a non-table line
     * (or the end of the text) is reached.
     *
     * @param {string} text - Plain text, one logical line per `\n`; `null`/`undefined` is treated as empty.
     * @param {unknown} [pdfData] - Accepted for interface compatibility; not used.
     * @returns {Promise<string>} Markdown text.
     */
    async enhanceTextWithLayout(text, pdfData) {
        const lines = (text ?? '').split('\n');
        let enhancedText = '';
        let tableRows = [];
        // Emits the buffered table (if any) and clears the buffer.
        const flushTable = () => {
            if (tableRows.length > 0) {
                enhancedText += this.formatTableRows(tableRows);
                tableRows = [];
            }
        };
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i].trim();
            if (!line) {
                // A blank line always terminates a table in progress.
                flushTable();
                enhancedText += '\n';
                continue;
            }
            // Detect headings (lines that are short and followed by content)
            if (this.isLikelyHeading(line, lines, i)) {
                flushTable();
                const headingLevel = this.determineHeadingLevel(line);
                enhancedText += `${'#'.repeat(headingLevel)} ${line}\n\n`;
                continue;
            }
            // Detect table-like content
            if (this.isLikelyTableRow(line)) {
                tableRows.push({ cells: this.parseTableRow(line) });
                continue;
            }
            // Any other content ends the current table.
            flushTable();
            // Detect lists
            if (this.isListItem(line)) {
                enhancedText += this.formatListItem(line) + '\n';
                continue;
            }
            // Regular paragraph
            enhancedText += line + '\n';
        }
        // Handle any remaining table
        flushTable();
        return enhancedText;
    }

    /**
     * True when the line contains at least one cased letter and no lowercase
     * letters. Lines made only of digits or punctuation are NOT all-caps, even
     * though `toUpperCase()` leaves them unchanged.
     *
     * @param {string} line
     * @returns {boolean}
     */
    isAllCaps(line) {
        return line === line.toUpperCase() && line !== line.toLowerCase();
    }

    /**
     * Heuristically decide whether a trimmed line is a heading.
     *
     * @param {string} line - Trimmed current line.
     * @param {string[]} allLines - All (untrimmed) lines of the text.
     * @param {number} index - Position of `line` within `allLines`.
     * @returns {boolean}
     */
    isLikelyHeading(line, allLines, index) {
        if (line.length > 80) {
            return false; // Too long to be a heading
        }
        if (line.length < 3) {
            return false; // Too short
        }
        // All caps (common for headings); must contain real letters
        if (this.isAllCaps(line) && line.length > 5) {
            return true;
        }
        // Followed by a noticeably longer line (heading above a paragraph)
        const nextLine = allLines[index + 1];
        if (nextLine && nextLine.trim().length > line.length * 1.5) {
            return true;
        }
        // Ends with a colon (section header)
        return line.endsWith(':');
    }

    /**
     * Pick a Markdown heading level (1-3) for a line already judged a heading.
     *
     * @param {string} line - Trimmed heading text.
     * @returns {number} 1 for all-caps, 2 for a trailing colon, 3 for short lines, otherwise 2.
     */
    determineHeadingLevel(line) {
        if (this.isAllCaps(line)) {
            return 1; // All caps = major heading
        }
        if (line.endsWith(':')) {
            return 2; // Ends with colon = section
        }
        if (line.length < 30) {
            return 3; // Short = subsection
        }
        return 2; // Default
    }

    /**
     * Heuristically decide whether a trimmed line is a row of tabular data.
     *
     * A separator pattern must be present AND the line must split into at
     * least two cells; this keeps numbered list items ("1. foo") and ordinary
     * sentences that merely contain a number from being rendered as
     * single-column tables.
     *
     * @param {string} line - Trimmed line.
     * @returns {boolean}
     */
    isLikelyTableRow(line) {
        const patterns = [
            /\t+/, // Tab separated
            /\s{3,}/, // Multiple spaces
            /\|/, // Pipe separated
            /\s+\d+\s+/, // Numbers with spaces
        ];
        if (!patterns.some((pattern) => pattern.test(line))) {
            return false;
        }
        return this.parseTableRow(line).length >= 2;
    }

    /**
     * Split a line into table cells. Tabs take precedence, then pipes, then
     * runs of two or more whitespace characters. Empty cells are dropped.
     *
     * @param {string} line
     * @returns {string[]} Trimmed, non-empty cell texts.
     */
    parseTableRow(line) {
        let separator = /\s{2,}/; // Split on multiple spaces
        if (line.includes('\t')) {
            separator = '\t';
        }
        else if (line.includes('|')) {
            separator = '|';
        }
        return line
            .split(separator)
            .map((col) => col.trim())
            .filter((col) => col.length > 0);
    }

    /**
     * Render buffered rows as a Markdown table. The first row is used as the
     * header; shorter rows are padded with empty cells up to the widest row.
     *
     * @param {Array<{cells: string[]}>} rows
     * @returns {string} Markdown table followed by a blank line, or '' when `rows` is empty.
     */
    formatTableRows(rows) {
        if (rows.length === 0) {
            return '';
        }
        // Find maximum number of columns
        const maxCols = Math.max(...rows.map((row) => row.cells.length));
        const renderRow = (cells) => `|${cells.map((cell) => ` ${cell} |`).join('')}`;
        const outputLines = [];
        rows.forEach((row, rowIndex) => {
            const cells = Array.from({ length: maxCols }, (_, col) =>
                // A literal pipe inside a cell would be read as a column
                // boundary by Markdown renderers, so escape it.
                (row.cells[col] ?? '').replace(/\|/g, '\\|'));
            outputLines.push(renderRow(cells));
            // Add header separator after first row
            if (rowIndex === 0) {
                outputLines.push(renderRow(Array.from({ length: maxCols }, () => '---')));
            }
        });
        return outputLines.join('\n') + '\n\n';
    }

    /**
     * Check whether a line starts with a bullet, number, letter or roman-numeral list marker.
     *
     * @param {string} line
     * @returns {boolean}
     */
    isListItem(line) {
        const listPatterns = [
            /^\s*[-•·]\s+/, // Bullet points
            /^\s*\d+\.\s+/, // Numbered lists
            /^\s*[a-zA-Z]\.\s+/, // Lettered lists
            /^\s*[ivx]+\.\s+/i, // Roman numerals
        ];
        return listPatterns.some((pattern) => pattern.test(line));
    }

    /**
     * Normalize a list item's marker to Markdown: numbered items become
     * `1. `, every other recognized marker becomes `- `.
     *
     * @param {string} line
     * @returns {string}
     */
    formatListItem(line) {
        if (/^\s*\d+\.\s+/.test(line)) {
            return line.replace(/^\s*\d+\.\s+/, '1. ');
        }
        if (/^\s*[a-zA-Z]\.\s+/.test(line)) {
            return line.replace(/^\s*[a-zA-Z]\.\s+/, '- ');
        }
        if (/^\s*[ivx]+\.\s+/i.test(line)) {
            return line.replace(/^\s*[ivx]+\.\s+/i, '- ');
        }
        return line.replace(/^\s*[-•·]\s+/, '- ');
    }

    /**
     * Create page breaks with images: a `## Page N` heading and the page image
     * for every entry, with a `---` rule between consecutive pages.
     *
     * @param {Array<{pageNumber: number, imagePath: string}>} pageImages
     * @returns {Promise<string>} Markdown for all pages.
     */
    async createPageBreaks(pageImages) {
        const lastIndex = pageImages.length - 1;
        let markdown = '';
        pageImages.forEach((page, pageIndex) => {
            markdown += `## Page ${page.pageNumber}\n\n`;
            markdown += this.imageExtractor.getImageMarkdown(`Page ${page.pageNumber}`, page.imagePath);
            markdown += '\n\n';
            if (pageIndex < lastIndex) {
                markdown += '---\n\n'; // Page separator
            }
        });
        return markdown;
    }

    /**
     * Reset internal counters.
     */
    reset() {
        this.pageCounter = 0;
    }

    /**
     * Get current page counter.
     */
    get currentPageCount() {
        return this.pageCounter;
    }
}
|
235
|
+
//# sourceMappingURL=pdf-extractor.js.map
|
@@ -0,0 +1 @@
|
|
1
|
+
{"version":3,"file":"pdf-extractor.js","sourceRoot":"","sources":["../../src/utils/pdf-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAI7B,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAwB1D,MAAM,OAAO,YAAY;IACN,cAAc,CAAiB;IACxC,WAAW,GAAW,CAAC,CAAC;IAEhC,YAAY,cAA8B;QACxC,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,oBAAoB,CAAC,MAAc;QACvC,IAAI,CAAC;YACH,wDAAwD;YACxD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YAExC,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,MAAM,EAAE;gBACzC,OAAO,EAAE,GAAG,EAAY,oBAAoB;gBAC5C,YAAY,EAAE,MAAM;gBACpB,QAAQ,EAAE,IAAI,CAAC,cAAc,CAAC,cAAc;gBAC5C,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,GAAG,EAAa,YAAY;gBACnC,MAAM,EAAE,IAAI,CAAW,aAAa;aACrC,CAAC,CAAC;YAEH,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAuB,CAAC,CAAC,oBAAoB;YAElF,MAAM,cAAc,GAAe,EAAE,CAAC;YACtC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;oBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;oBAC5C,cAAc,CAAC,IAAI,CAAC;wBAClB,UAAU,EAAE,MAAM,CAAC,IAAI;wBACvB,SAAS,EAAE,QAAQ;wBACnB,QAAQ,EAAE,MAAM,CAAC,IAAI;qBACtB,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,OAAO,cAAc,CAAC;QACxB,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YACzE,MAAM,IAAI,oBAAoB,CAAC,0CAA0C,OAAO,EAAE,EAAE,KAAc,CAAC,CAAC;QACtG,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,qBAAqB,CAAC,IAAY,EAAE,OAAiB;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,IAAI,OAAO,GAAG,KAAK,CAAC;QACpB,IAAI,SAAS,GAAe,EAAE,CAAC;QAE/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAE7B,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,qBAAqB;gBACrB,IAAI,OAAO,EAAE,CAAC;oBACZ,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;oBAChD,SAAS,GAAG,EAAE,CAAC;oBACf,OAAO,GAAG,KAAK,CAAC;gBAClB,CAAC;gBACD,YAAY,IAAI,IAAI,CAAC;gBACrB,SAAS;YACX,CAAC;YAED,iEAAiE;YACjE,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,EAAE
,CAAC;gBACzC,IAAI,OAAO,EAAE,CAAC;oBACZ,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;oBAChD,SAAS,GAAG,EAAE,CAAC;oBACf,OAAO,GAAG,KAAK,CAAC;gBAClB,CAAC;gBAED,MAAM,YAAY,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC;gBACtD,YAAY,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,IAAI,MAAM,CAAC;gBAC1D,SAAS;YACX,CAAC;YAED,4BAA4B;YAC5B,IAAI,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC;gBAChC,IAAI,CAAC,OAAO,EAAE,CAAC;oBACb,OAAO,GAAG,IAAI,CAAC;gBACjB,CAAC;gBACD,SAAS,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACpD,SAAS;YACX,CAAC;iBAAM,IAAI,OAAO,EAAE,CAAC;gBACnB,eAAe;gBACf,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;gBAChD,SAAS,GAAG,EAAE,CAAC;gBACf,OAAO,GAAG,KAAK,CAAC;YAClB,CAAC;YAED,eAAe;YACf,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1B,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;gBACjD,SAAS;YACX,CAAC;YAED,oBAAoB;YACpB,YAAY,IAAI,IAAI,GAAG,IAAI,CAAC;QAC9B,CAAC;QAED,6BAA6B;QAC7B,IAAI,OAAO,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpC,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QAClD,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAEO,eAAe,CAAC,IAAY,EAAE,QAA2B,EAAE,KAAa;QAC9E,qCAAqC;QACrC,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,KAAK,CAAC,CAAC,2BAA2B;QAC/D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC,CAAE,YAAY;QAEhD,+CAA+C;QAC/C,IAAI,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QAEhE,0CAA0C;QAC1C,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACrC,IAAI,QAAQ,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC3D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QAEpC,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,qBAAqB,CAAC,IAAY;QACxC,IAAI,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE;YAAE,OAAO,CAAC,CAAC,CAAC,2BAA2B;QACtE,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,CAAC,CAAS,4BAA4B;QACtE,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,CAAC,CAAC,CAAW,qBAAqB;QAC/D,OAAO,CAAC,CAAC,CAAC,UAAU;IACtB,CAAC;IAEO,gBAAgB,CAAC,IAAY;QACnC,8CAA8C;QAC9C,MAAM,QAAQ,GAAG;YACf,KAAK,EAAqB,gB
AAgB;YAC1C,QAAQ,EAAkB,kBAAkB;YAC5C,IAAI,EAAsB,iBAAiB;YAC3C,WAAW,EAAe,sBAAsB;YAChD,cAAc,EAAY,gCAAgC;SAC3D,CAAC;QAEF,OAAO,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACtD,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,sDAAsD;QACtD,IAAI,OAAO,GAAa,EAAE,CAAC;QAE3B,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACpD,CAAC;aAAM,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC9B,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACnD,CAAC;aAAM,CAAC;YACN,2BAA2B;YAC3B,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC/C,CAAC;IAEO,eAAe,CAAC,IAAyB;QAC/C,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEjC,iCAAiC;QACjC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;QAE/D,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YACpB,IAAI,WAAW,GAAG,GAAG,CAAC;YAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAChC,WAAW,IAAI,IAAI,IAAI,IAAI,CAAC;YAC9B,CAAC;YAED,QAAQ,IAAI,WAAW,GAAG,IAAI,CAAC;YAE/B,uCAAuC;YACvC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACZ,IAAI,SAAS,GAAG,GAAG,CAAC;gBACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;oBACjC,SAAS,IAAI,QAAQ,CAAC;gBACxB,CAAC;gBACD,QAAQ,IAAI,SAAS,GAAG,IAAI,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,GAAG,IAAI,CAAC;IACzB,CAAC;IAEO,UAAU,CAAC,IAAY;QAC7B,kCAAkC;QAClC,MAAM,YAAY,GAAG;YACnB,cAAc,EAAY,gBAAgB;YAC1C,cAAc,EAAY,iBAAiB;YAC3C,mBAAmB,EAAO,iBAAiB;YAC3C,kBAAkB,EAAQ,iBAAiB;SAC5C,CAAC;QAEF,OAAO,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC
,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC1D,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,2CAA2C;QAC3C,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC;QAC7C,CAAC;aAAM,IAAI,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1C,OAAO,IAAI,CAAC,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC,CAAC;QACjD,CAAC;aAAM,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACzC,OAAO,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,CAAC;QAChD,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,UAA+B;QACpD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,QAAQ,IAAI,WAAW,IAAI,CAAC,UAAU,MAAM,CAAC;YAC7C,QAAQ,IAAI,IAAI,CAAC,cAAc,CAAC,gBAAgB,CAAC,QAAQ,IAAI,CAAC,UAAU,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;YAC5F,QAAQ,IAAI,MAAM,CAAC;YAEnB,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC9B,QAAQ,IAAI,SAAS,CAAC,CAAC,iBAAiB;YAC1C,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,IAAI,gBAAgB;QAClB,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;CACF"}
|
package/package.json
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
{
|
2
|
+
"name": "file2md",
|
3
|
+
"version": "1.0.3",
|
4
|
+
"description": "A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX) into Markdown with image and layout preservation",
|
5
|
+
"main": "dist/index.js",
|
6
|
+
"types": "dist/index.d.ts",
|
7
|
+
"type": "module",
|
8
|
+
"scripts": {
|
9
|
+
"build": "tsc -p tsconfig.build.json",
|
10
|
+
"dev": "ts-node --esm src/index.ts",
|
11
|
+
"test": "jest",
|
12
|
+
"test:watch": "jest --watch",
|
13
|
+
"test:coverage": "jest --coverage",
|
14
|
+
"lint": "eslint src/**/*.ts",
|
15
|
+
"lint:fix": "eslint src/**/*.ts --fix",
|
16
|
+
"clean": "rimraf dist",
|
17
|
+
"prepublishOnly": "npm run clean && npm run build && npm test",
|
18
|
+
"typecheck": "tsc --noEmit"
|
19
|
+
},
|
20
|
+
"keywords": [
|
21
|
+
"markdown",
|
22
|
+
"converter",
|
23
|
+
"pdf",
|
24
|
+
"docx",
|
25
|
+
"xlsx",
|
26
|
+
"pptx",
|
27
|
+
"document",
|
28
|
+
"typescript",
|
29
|
+
"layout-preservation",
|
30
|
+
"image-extraction"
|
31
|
+
],
|
32
|
+
"author": "",
|
33
|
+
"license": "MIT",
|
34
|
+
"dependencies": {
|
35
|
+
"file-type": "^16.5.4",
|
36
|
+
"jszip": "^3.10.1",
|
37
|
+
"pdf-parse": "^1.1.1",
|
38
|
+
"pdf2pic": "^2.1.4",
|
39
|
+
"xml2js": "^0.6.2"
|
40
|
+
},
|
41
|
+
"engines": {
|
42
|
+
"node": ">=18.0.0"
|
43
|
+
},
|
44
|
+
"devDependencies": {
|
45
|
+
"@types/jest": "^29.5.0",
|
46
|
+
"@types/jszip": "^3.4.1",
|
47
|
+
"@types/node": "^20.0.0",
|
48
|
+
"@types/pdf-parse": "^1.1.5",
|
49
|
+
"@types/xml2js": "^0.4.14",
|
50
|
+
"@typescript-eslint/eslint-plugin": "^6.0.0",
|
51
|
+
"@typescript-eslint/parser": "^6.0.0",
|
52
|
+
"eslint": "^8.50.0",
|
53
|
+
"jest": "^29.7.0",
|
54
|
+
"rimraf": "^5.0.0",
|
55
|
+
"ts-jest": "^29.1.0",
|
56
|
+
"ts-node": "^10.9.0",
|
57
|
+
"typescript": "^5.3.0"
|
58
|
+
},
|
59
|
+
"exports": {
|
60
|
+
".": {
|
61
|
+
"import": "./dist/index.js",
|
62
|
+
"require": "./dist/index.js",
|
63
|
+
"types": "./dist/index.d.ts"
|
64
|
+
}
|
65
|
+
},
|
66
|
+
"files": [
|
67
|
+
"dist",
|
68
|
+
"README.md"
|
69
|
+
]
|
70
|
+
}
|