file2md 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +293 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +153 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/parsers/docx-parser.d.ts +20 -0
  8. package/dist/parsers/docx-parser.d.ts.map +1 -0
  9. package/dist/parsers/docx-parser.js +237 -0
  10. package/dist/parsers/docx-parser.js.map +1 -0
  11. package/dist/parsers/pdf-parser.d.ts +8 -0
  12. package/dist/parsers/pdf-parser.d.ts.map +1 -0
  13. package/dist/parsers/pdf-parser.js +98 -0
  14. package/dist/parsers/pdf-parser.js.map +1 -0
  15. package/dist/parsers/pptx-parser.d.ts +21 -0
  16. package/dist/parsers/pptx-parser.d.ts.map +1 -0
  17. package/dist/parsers/pptx-parser.js +264 -0
  18. package/dist/parsers/pptx-parser.js.map +1 -0
  19. package/dist/parsers/xlsx-parser.d.ts +19 -0
  20. package/dist/parsers/xlsx-parser.d.ts.map +1 -0
  21. package/dist/parsers/xlsx-parser.js +267 -0
  22. package/dist/parsers/xlsx-parser.js.map +1 -0
  23. package/dist/types/errors.d.ts +52 -0
  24. package/dist/types/errors.d.ts.map +1 -0
  25. package/dist/types/errors.js +76 -0
  26. package/dist/types/errors.js.map +1 -0
  27. package/dist/types/index.d.ts +5 -0
  28. package/dist/types/index.d.ts.map +1 -0
  29. package/dist/types/index.js +5 -0
  30. package/dist/types/index.js.map +1 -0
  31. package/dist/types/interfaces.d.ts +228 -0
  32. package/dist/types/interfaces.d.ts.map +1 -0
  33. package/dist/types/interfaces.js +10 -0
  34. package/dist/types/interfaces.js.map +1 -0
  35. package/dist/utils/chart-extractor.d.ts +44 -0
  36. package/dist/utils/chart-extractor.d.ts.map +1 -0
  37. package/dist/utils/chart-extractor.js +258 -0
  38. package/dist/utils/chart-extractor.js.map +1 -0
  39. package/dist/utils/image-extractor.d.ts +50 -0
  40. package/dist/utils/image-extractor.d.ts.map +1 -0
  41. package/dist/utils/image-extractor.js +136 -0
  42. package/dist/utils/image-extractor.js.map +1 -0
  43. package/dist/utils/layout-parser.d.ts +55 -0
  44. package/dist/utils/layout-parser.d.ts.map +1 -0
  45. package/dist/utils/layout-parser.js +244 -0
  46. package/dist/utils/layout-parser.js.map +1 -0
  47. package/dist/utils/pdf-extractor.d.ts +46 -0
  48. package/dist/utils/pdf-extractor.d.ts.map +1 -0
  49. package/dist/utils/pdf-extractor.js +235 -0
  50. package/dist/utils/pdf-extractor.js.map +1 -0
  51. package/package.json +70 -0
@@ -0,0 +1,235 @@
1
+ import path from 'node:path';
2
+ import { ImageExtractionError } from '../types/errors.js';
3
/**
 * Heuristic helpers for turning PDF content into Markdown.
 *
 * Covers three things: rasterising pages to PNG files through `pdf2pic`,
 * re-flowing extracted plain text into headings / tables / lists, and
 * emitting a per-page Markdown section that embeds each page image.
 *
 * The collaborator passed to the constructor must expose `imageDirectory`
 * (used as the pdf2pic `savePath`) and `getImageMarkdown(alt, imagePath)`;
 * those are the only members this class reads from it.
 */
export class PDFExtractor {
    imageExtractor;
    pageCounter = 0;

    // List markers. None of these use the `g` flag, so sharing them is stateless.
    static #BULLET_MARKER = /^\s*[-•·]\s+/; // Bullet points
    static #NUMBER_MARKER = /^\s*\d+\.\s+/; // Numbered lists
    static #LETTER_MARKER = /^\s*[a-zA-Z]\.\s+/; // Lettered lists
    static #ROMAN_MARKER = /^\s*[ivx]+\.\s+/i; // Roman numerals

    // Hints that a line *may* be tabular; see isLikelyTableRow for the final say.
    static #TABLE_HINTS = [
        /\t+/, // Tab separated
        /\s{3,}/, // Multiple spaces
        /\|/, // Pipe separated
        /\s+\d+\s+/, // Numbers with spaces
        /^\s*\d+\.\s+/, // Numbered items with alignment
    ];

    /**
     * @param {object} imageExtractor - Collaborator providing `imageDirectory`
     *   and `getImageMarkdown(alt, imagePath)`.
     */
    constructor(imageExtractor) {
        this.imageExtractor = imageExtractor;
    }

    /**
     * True when the line contains at least one cased letter and no lowercase
     * form of any letter. Comparing against `toLowerCase()` as well rules out
     * strings that are merely unaffected by case mapping (digits, punctuation,
     * caseless scripts), which would otherwise all count as "all caps".
     */
    static #isAllCaps(line) {
        return line === line.toUpperCase() && line !== line.toLowerCase();
    }

    /**
     * Extract images from PDF by converting pages to images.
     *
     * @param {Buffer} buffer - Raw PDF bytes handed to `pdf2pic.fromBuffer`.
     * @returns {Promise<Array<{pageNumber: number, imagePath: string, fullPath: string}>>}
     *   One entry per conversion result that reported a `path`; `imagePath` is
     *   the basename of that path, `fullPath` the path as reported.
     * @throws {ImageExtractionError} Wraps any failure (including a missing or
     *   unusable `pdf2pic` module) with the original error attached.
     */
    async extractImagesFromPDF(buffer) {
        try {
            // Dynamic import to handle potential missing dependency
            const pdf2pic = await import('pdf2pic');
            // pdf2pic is consumed through ESM/CJS interop: fall back to the
            // default export if the named export was not surfaced.
            const fromBuffer = pdf2pic.fromBuffer ?? pdf2pic.default?.fromBuffer;
            if (typeof fromBuffer !== 'function') {
                throw new Error('pdf2pic does not expose fromBuffer');
            }
            const convert = fromBuffer(buffer, {
                density: 150, // Output resolution
                saveFilename: "page",
                savePath: this.imageExtractor.imageDirectory,
                format: "png",
                width: 800, // Max width
                height: 1200 // Max height
            });
            const results = await convert.bulk(-1); // Convert all pages
            return results
                .filter((result) => result.path)
                .map((result) => ({
                    pageNumber: result.page,
                    imagePath: path.basename(result.path),
                    fullPath: result.path
                }));
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            throw new ImageExtractionError(`Failed to convert PDF pages to images: ${message}`, error);
        }
    }

    /**
     * Enhance text with layout detection.
     *
     * Each line is trimmed and classified, in this order: blank line, heading,
     * table row, list item, plain paragraph line. Consecutive table rows are
     * buffered and rendered as one Markdown table as soon as any non-table
     * line (or the end of input) is reached.
     *
     * @param {string} text - Plain text; `null`/`undefined` is treated as empty.
     * @param {*} pdfData - Accepted for interface compatibility; not read here.
     * @returns {Promise<string>} The Markdown rendering of `text`.
     */
    async enhanceTextWithLayout(text, pdfData) {
        const lines = String(text ?? '').split('\n');
        const output = [];
        let tableRows = [];
        // Emits the buffered table (if any) and starts a fresh buffer.
        const flushTable = () => {
            if (tableRows.length > 0) {
                output.push(this.formatTableRows(tableRows));
                tableRows = [];
            }
        };
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i].trim();
            if (!line) {
                flushTable();
                output.push('\n');
                continue;
            }
            // The untrimmed line array is passed so the heading check can peek at the next line.
            if (this.isLikelyHeading(line, lines, i)) {
                flushTable();
                const headingLevel = this.determineHeadingLevel(line);
                output.push(`${'#'.repeat(headingLevel)} ${line}\n\n`);
                continue;
            }
            if (this.isLikelyTableRow(line)) {
                tableRows.push({ cells: this.parseTableRow(line) });
                continue;
            }
            flushTable();
            output.push(this.isListItem(line) ? `${this.formatListItem(line)}\n` : `${line}\n`);
        }
        flushTable();
        return output.join('');
    }

    /**
     * Decide whether a (trimmed) line should be rendered as a heading.
     *
     * @param {string} line - The trimmed candidate line.
     * @param {string[]} allLines - Every line of the text being processed.
     * @param {number} index - Position of `line` within `allLines`.
     * @returns {boolean}
     */
    isLikelyHeading(line, allLines, index) {
        // Too long or too short to be a heading
        if (line.length > 80 || line.length < 3) {
            return false;
        }
        // A bullet item stays a list item even if the next line is longer.
        if (PDFExtractor.#BULLET_MARKER.test(line)) {
            return false;
        }
        // All caps (common for headings)
        if (line.length > 5 && PDFExtractor.#isAllCaps(line)) {
            return true;
        }
        // Followed by a noticeably longer line
        const nextLine = allLines[index + 1];
        if (nextLine && nextLine.trim().length > line.length * 1.5) {
            return true;
        }
        // Ends with a colon (section header)
        return line.endsWith(':');
    }

    /**
     * Pick the Markdown heading depth for a line already judged a heading.
     *
     * @param {string} line
     * @returns {number} 1 for all-caps, 2 for a trailing colon, 3 for lines
     *   shorter than 30 characters, otherwise 2.
     */
    determineHeadingLevel(line) {
        if (PDFExtractor.#isAllCaps(line)) {
            return 1; // All caps = major heading
        }
        if (line.endsWith(':')) {
            return 2; // Ends with colon = section
        }
        if (line.length < 30) {
            return 3; // Short = subsection
        }
        return 2; // Default
    }

    /**
     * Decide whether a line looks like a row of tabular data.
     *
     * A line qualifies only if it matches one of the table hints AND actually
     * splits into two or more cells. Without the second condition, ordinary
     * prose containing a number and every numbered list item would be rendered
     * as a single-column table.
     *
     * @param {string} line
     * @returns {boolean}
     */
    isLikelyTableRow(line) {
        if (!PDFExtractor.#TABLE_HINTS.some((pattern) => pattern.test(line))) {
            return false;
        }
        return this.parseTableRow(line).length >= 2;
    }

    /**
     * Split a line into trimmed, non-empty cells. The separator is chosen by
     * precedence: tabs, then pipes, then runs of two or more whitespace
     * characters.
     *
     * @param {string} line
     * @returns {string[]}
     */
    parseTableRow(line) {
        let separator = /\s{2,}/;
        if (line.includes('\t')) {
            separator = '\t';
        }
        else if (line.includes('|')) {
            separator = '|';
        }
        return line
            .split(separator)
            .map((col) => col.trim())
            .filter((col) => col.length > 0);
    }

    /**
     * Render buffered rows as a Markdown table. The first row is treated as
     * the header; shorter rows are padded with empty cells to the widest row.
     *
     * @param {Array<{cells: string[]}>} rows
     * @returns {string} The table followed by a blank line, or '' when there
     *   are no rows or no cells at all.
     */
    formatTableRows(rows) {
        if (rows.length === 0) {
            return '';
        }
        // reduce instead of Math.max(...spread): no argument-count limit on large tables.
        const maxCols = rows.reduce((widest, row) => Math.max(widest, row.cells.length), 0);
        if (maxCols === 0) {
            return '';
        }
        // An unescaped pipe inside a cell would be read as a column boundary.
        const escapeCell = (cell) => String(cell ?? '').replace(/(?<!\\)\|/g, '\\|');
        const renderRow = (cells) => {
            let rowMarkdown = '|';
            for (let j = 0; j < maxCols; j++) {
                rowMarkdown += ` ${escapeCell(cells[j])} |`;
            }
            return rowMarkdown;
        };
        const tableLines = [];
        rows.forEach((row, i) => {
            tableLines.push(renderRow(row.cells));
            // Add header separator after first row
            if (i === 0) {
                tableLines.push(`|${' --- |'.repeat(maxCols)}`);
            }
        });
        return `${tableLines.map((tableLine) => `${tableLine}\n`).join('')}\n`;
    }

    /**
     * @param {string} line
     * @returns {boolean} True when the line starts with a bullet, a number,
     *   a single letter, or a roman numeral followed by `.` and whitespace
     *   (bullets need only the whitespace).
     */
    isListItem(line) {
        return (PDFExtractor.#BULLET_MARKER.test(line)
            || PDFExtractor.#NUMBER_MARKER.test(line)
            || PDFExtractor.#LETTER_MARKER.test(line)
            || PDFExtractor.#ROMAN_MARKER.test(line));
    }

    /**
     * Convert various list formats to markdown: numbered markers become
     * `1. `, every other recognised marker becomes `- `.
     *
     * @param {string} line
     * @returns {string}
     */
    formatListItem(line) {
        if (PDFExtractor.#NUMBER_MARKER.test(line)) {
            return line.replace(PDFExtractor.#NUMBER_MARKER, '1. ');
        }
        if (PDFExtractor.#LETTER_MARKER.test(line)) {
            return line.replace(PDFExtractor.#LETTER_MARKER, '- ');
        }
        if (PDFExtractor.#ROMAN_MARKER.test(line)) {
            return line.replace(PDFExtractor.#ROMAN_MARKER, '- ');
        }
        return line.replace(PDFExtractor.#BULLET_MARKER, '- ');
    }

    /**
     * Create page breaks with images: one `## Page N` section per entry,
     * embedding the page image, with a `---` rule between sections.
     *
     * @param {Array<{pageNumber: number, imagePath: string}>} pageImages
     * @returns {Promise<string>}
     */
    async createPageBreaks(pageImages) {
        const lastIndex = pageImages.length - 1;
        return pageImages
            .map((page, i) => {
                const label = `Page ${page.pageNumber}`;
                const image = this.imageExtractor.getImageMarkdown(label, page.imagePath);
                const separator = i < lastIndex ? '---\n\n' : ''; // Page separator
                return `## ${label}\n\n${image}\n\n${separator}`;
            })
            .join('');
    }

    /**
     * Reset internal counters
     */
    reset() {
        this.pageCounter = 0;
    }

    /**
     * Get current page counter. Nothing in this class increments the counter.
     */
    get currentPageCount() {
        return this.pageCounter;
    }
}
235
+ //# sourceMappingURL=pdf-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-extractor.js","sourceRoot":"","sources":["../../src/utils/pdf-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAI7B,OAAO,EAAE,oBAAoB,EAAE,MAAM,oBAAoB,CAAC;AAwB1D,MAAM,OAAO,YAAY;IACN,cAAc,CAAiB;IACxC,WAAW,GAAW,CAAC,CAAC;IAEhC,YAAY,cAA8B;QACxC,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,oBAAoB,CAAC,MAAc;QACvC,IAAI,CAAC;YACH,wDAAwD;YACxD,MAAM,OAAO,GAAG,MAAM,MAAM,CAAC,SAAS,CAAC,CAAC;YAExC,MAAM,OAAO,GAAG,OAAO,CAAC,UAAU,CAAC,MAAM,EAAE;gBACzC,OAAO,EAAE,GAAG,EAAY,oBAAoB;gBAC5C,YAAY,EAAE,MAAM;gBACpB,QAAQ,EAAE,IAAI,CAAC,cAAc,CAAC,cAAc;gBAC5C,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,GAAG,EAAa,YAAY;gBACnC,MAAM,EAAE,IAAI,CAAW,aAAa;aACrC,CAAC,CAAC;YAEH,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAuB,CAAC,CAAC,oBAAoB;YAElF,MAAM,cAAc,GAAe,EAAE,CAAC;YACtC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;gBAC7B,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;oBAChB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;oBAC5C,cAAc,CAAC,IAAI,CAAC;wBAClB,UAAU,EAAE,MAAM,CAAC,IAAI;wBACvB,SAAS,EAAE,QAAQ;wBACnB,QAAQ,EAAE,MAAM,CAAC,IAAI;qBACtB,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAED,OAAO,cAAc,CAAC;QACxB,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YACzE,MAAM,IAAI,oBAAoB,CAAC,0CAA0C,OAAO,EAAE,EAAE,KAAc,CAAC,CAAC;QACtG,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,qBAAqB,CAAC,IAAY,EAAE,OAAiB;QACzD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,YAAY,GAAG,EAAE,CAAC;QACtB,IAAI,OAAO,GAAG,KAAK,CAAC;QACpB,IAAI,SAAS,GAAe,EAAE,CAAC;QAE/B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAE7B,IAAI,CAAC,IAAI,EAAE,CAAC;gBACV,qBAAqB;gBACrB,IAAI,OAAO,EAAE,CAAC;oBACZ,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;oBAChD,SAAS,GAAG,EAAE,CAAC;oBACf,OAAO,GAAG,KAAK,CAAC;gBAClB,CAAC;gBACD,YAAY,IAAI,IAAI,CAAC;gBACrB,SAAS;YACX,CAAC;YAED,iEAAiE;YACjE,IAAI,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC,EA
AE,CAAC;gBACzC,IAAI,OAAO,EAAE,CAAC;oBACZ,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;oBAChD,SAAS,GAAG,EAAE,CAAC;oBACf,OAAO,GAAG,KAAK,CAAC;gBAClB,CAAC;gBAED,MAAM,YAAY,GAAG,IAAI,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC;gBACtD,YAAY,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,IAAI,MAAM,CAAC;gBAC1D,SAAS;YACX,CAAC;YAED,4BAA4B;YAC5B,IAAI,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC;gBAChC,IAAI,CAAC,OAAO,EAAE,CAAC;oBACb,OAAO,GAAG,IAAI,CAAC;gBACjB,CAAC;gBACD,SAAS,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACpD,SAAS;YACX,CAAC;iBAAM,IAAI,OAAO,EAAE,CAAC;gBACnB,eAAe;gBACf,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;gBAChD,SAAS,GAAG,EAAE,CAAC;gBACf,OAAO,GAAG,KAAK,CAAC;YAClB,CAAC;YAED,eAAe;YACf,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1B,YAAY,IAAI,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;gBACjD,SAAS;YACX,CAAC;YAED,oBAAoB;YACpB,YAAY,IAAI,IAAI,GAAG,IAAI,CAAC;QAC9B,CAAC;QAED,6BAA6B;QAC7B,IAAI,OAAO,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpC,YAAY,IAAI,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC,CAAC;QAClD,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAEO,eAAe,CAAC,IAAY,EAAE,QAA2B,EAAE,KAAa;QAC9E,qCAAqC;QACrC,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,KAAK,CAAC,CAAC,2BAA2B;QAC/D,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC,CAAE,YAAY;QAEhD,+CAA+C;QAC/C,IAAI,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QAEhE,0CAA0C;QAC1C,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC;QACrC,IAAI,QAAQ,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;YAC3D,OAAO,IAAI,CAAC;QACd,CAAC;QAED,iDAAiD;QACjD,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QAEpC,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,qBAAqB,CAAC,IAAY;QACxC,IAAI,IAAI,KAAK,IAAI,CAAC,WAAW,EAAE;YAAE,OAAO,CAAC,CAAC,CAAC,2BAA2B;QACtE,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,CAAC,CAAS,4BAA4B;QACtE,IAAI,IAAI,CAAC,MAAM,GAAG,EAAE;YAAE,OAAO,CAAC,CAAC,CAAW,qBAAqB;QAC/D,OAAO,CAAC,CAAC,CAAC,UAAU;IACtB,CAAC;IAEO,gBAAgB,CAAC,IAAY;QACnC,8CAA8C;QAC9C,MAAM,QAAQ,GAAG;YACf,KAAK,EAAqB,
gBAAgB;YAC1C,QAAQ,EAAkB,kBAAkB;YAC5C,IAAI,EAAsB,iBAAiB;YAC3C,WAAW,EAAe,sBAAsB;YAChD,cAAc,EAAY,gCAAgC;SAC3D,CAAC;QAEF,OAAO,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IACtD,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,sDAAsD;QACtD,IAAI,OAAO,GAAa,EAAE,CAAC;QAE3B,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACpD,CAAC;aAAM,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC9B,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACnD,CAAC;aAAM,CAAC;YACN,2BAA2B;YAC3B,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,OAAO,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC/C,CAAC;IAEO,eAAe,CAAC,IAAyB;QAC/C,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEjC,iCAAiC;QACjC,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;QAE/D,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;YACpB,IAAI,WAAW,GAAG,GAAG,CAAC;YAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAChC,WAAW,IAAI,IAAI,IAAI,IAAI,CAAC;YAC9B,CAAC;YAED,QAAQ,IAAI,WAAW,GAAG,IAAI,CAAC;YAE/B,uCAAuC;YACvC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACZ,IAAI,SAAS,GAAG,GAAG,CAAC;gBACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC;oBACjC,SAAS,IAAI,QAAQ,CAAC;gBACxB,CAAC;gBACD,QAAQ,IAAI,SAAS,GAAG,IAAI,CAAC;YAC/B,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,GAAG,IAAI,CAAC;IACzB,CAAC;IAEO,UAAU,CAAC,IAAY;QAC7B,kCAAkC;QAClC,MAAM,YAAY,GAAG;YACnB,cAAc,EAAY,gBAAgB;YAC1C,cAAc,EAAY,iBAAiB;YAC3C,mBAAmB,EAAO,iBAAiB;YAC3C,kBAAkB,EAAQ,iBAAiB;SAC5C,CAAC;QAEF,OAAO,YAAY,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CA
AC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC1D,CAAC;IAEO,cAAc,CAAC,IAAY;QACjC,2CAA2C;QAC3C,IAAI,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC;QAC7C,CAAC;aAAM,IAAI,mBAAmB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1C,OAAO,IAAI,CAAC,OAAO,CAAC,mBAAmB,EAAE,IAAI,CAAC,CAAC;QACjD,CAAC;aAAM,IAAI,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YACzC,OAAO,IAAI,CAAC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,CAAC;QAChD,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,UAA+B;QACpD,IAAI,QAAQ,GAAG,EAAE,CAAC;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC3B,QAAQ,IAAI,WAAW,IAAI,CAAC,UAAU,MAAM,CAAC;YAC7C,QAAQ,IAAI,IAAI,CAAC,cAAc,CAAC,gBAAgB,CAAC,QAAQ,IAAI,CAAC,UAAU,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,CAAC;YAC5F,QAAQ,IAAI,MAAM,CAAC;YAEnB,IAAI,CAAC,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC9B,QAAQ,IAAI,SAAS,CAAC,CAAC,iBAAiB;YAC1C,CAAC;QACH,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,WAAW,GAAG,CAAC,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,IAAI,gBAAgB;QAClB,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;CACF"}
package/package.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "name": "file2md",
3
+ "version": "1.0.3",
4
+ "description": "A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX) into Markdown with image and layout preservation",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "type": "module",
8
+ "scripts": {
9
+ "build": "tsc -p tsconfig.build.json",
10
+ "dev": "ts-node --esm src/index.ts",
11
+ "test": "jest",
12
+ "test:watch": "jest --watch",
13
+ "test:coverage": "jest --coverage",
14
+ "lint": "eslint src/**/*.ts",
15
+ "lint:fix": "eslint src/**/*.ts --fix",
16
+ "clean": "rimraf dist",
17
+ "prepublishOnly": "npm run clean && npm run build && npm test",
18
+ "typecheck": "tsc --noEmit"
19
+ },
20
+ "keywords": [
21
+ "markdown",
22
+ "converter",
23
+ "pdf",
24
+ "docx",
25
+ "xlsx",
26
+ "pptx",
27
+ "document",
28
+ "typescript",
29
+ "layout-preservation",
30
+ "image-extraction"
31
+ ],
32
+ "author": "",
33
+ "license": "MIT",
34
+ "dependencies": {
35
+ "file-type": "^16.5.4",
36
+ "jszip": "^3.10.1",
37
+ "pdf-parse": "^1.1.1",
38
+ "pdf2pic": "^2.1.4",
39
+ "xml2js": "^0.6.2"
40
+ },
41
+ "engines": {
42
+ "node": ">=18.0.0"
43
+ },
44
+ "devDependencies": {
45
+ "@types/jest": "^29.5.0",
46
+ "@types/jszip": "^3.4.1",
47
+ "@types/node": "^20.0.0",
48
+ "@types/pdf-parse": "^1.1.5",
49
+ "@types/xml2js": "^0.4.14",
50
+ "@typescript-eslint/eslint-plugin": "^6.0.0",
51
+ "@typescript-eslint/parser": "^6.0.0",
52
+ "eslint": "^8.50.0",
53
+ "jest": "^29.7.0",
54
+ "rimraf": "^5.0.0",
55
+ "ts-jest": "^29.1.0",
56
+ "ts-node": "^10.9.0",
57
+ "typescript": "^5.3.0"
58
+ },
59
+ "exports": {
60
+ ".": {
61
+ "import": "./dist/index.js",
62
+ "require": "./dist/index.js",
63
+ "types": "./dist/index.d.ts"
64
+ }
65
+ },
66
+ "files": [
67
+ "dist",
68
+ "README.md"
69
+ ]
70
+ }