markitdown-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1160 @@
1
+ 'use strict';
2
+
3
+ const mime = require('mime-types');
4
+ const path = require('path');
5
+ const fs = require('fs');
6
+ const jsdom = require('jsdom');
7
+ const TurndownService = require('turndown');
8
+ const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
9
+ const xmldom = require('@xmldom/xmldom');
10
+ const url = require('url');
11
+ const pdfTs = require('pdf-ts');
12
+ const Mammoth = require('mammoth');
13
+ const XLSX = require('xlsx');
14
+ const childProcess = require('child_process');
15
+ const util = require('util');
16
+ const fs$1 = require('fs/promises');
17
+ const os = require('os');
18
+ const ai = require('ai');
19
+ const unzipper = require('unzipper');
20
+
21
/**
 * Unwraps the `default` member of a module-like object.
 *
 * @param {*} e - Value returned by `require`.
 * @returns {*} `e.default` when `e` is a truthy object that has a `default`
 *   key; otherwise `e` itself, unchanged.
 */
function _interopDefaultCompat(e) {
  const hasDefault = Boolean(e) && typeof e === 'object' && 'default' in e;
  if (hasDefault) {
    return e.default;
  }
  return e;
}
22
+
23
/**
 * Produces a namespace-style object for a required module.
 *
 * @param {*} e - Value returned by `require`.
 * @returns {object} `e` itself when it is a truthy object that already has a
 *   `default` key; otherwise a prototype-less object carrying every
 *   enumerable property of `e` plus a `default` property pointing at `e`.
 */
function _interopNamespaceCompat(e) {
  const alreadyNamespace = Boolean(e) && typeof e === 'object' && 'default' in e;
  if (alreadyNamespace) {
    return e;
  }
  const namespace = Object.create(null);
  if (e) {
    // for...in is intentional: inherited enumerable properties are copied too.
    for (const key in e) {
      namespace[key] = e[key];
    }
  }
  namespace.default = e;
  return namespace;
}
34
+
35
+ const mime__namespace = /*#__PURE__*/_interopNamespaceCompat(mime);
36
+ const path__namespace = /*#__PURE__*/_interopNamespaceCompat(path);
37
+ const path__default = /*#__PURE__*/_interopDefaultCompat(path);
38
+ const fs__default = /*#__PURE__*/_interopDefaultCompat(fs);
39
+ const fs__namespace = /*#__PURE__*/_interopNamespaceCompat(fs);
40
+ const TurndownService__default = /*#__PURE__*/_interopDefaultCompat(TurndownService);
41
+ const turndownPluginGfm__default = /*#__PURE__*/_interopDefaultCompat(turndownPluginGfm);
42
+ const Mammoth__default = /*#__PURE__*/_interopDefaultCompat(Mammoth);
43
+ const XLSX__default = /*#__PURE__*/_interopDefaultCompat(XLSX);
44
+ const childProcess__namespace = /*#__PURE__*/_interopNamespaceCompat(childProcess);
45
+ const util__namespace = /*#__PURE__*/_interopNamespaceCompat(util);
46
+ const fs__namespace$1 = /*#__PURE__*/_interopNamespaceCompat(fs$1);
47
+ const os__namespace = /*#__PURE__*/_interopNamespaceCompat(os);
48
+ const unzipper__namespace = /*#__PURE__*/_interopNamespaceCompat(unzipper);
49
+
50
/**
 * Converter for files whose extension resolves to a `text/*` MIME type.
 */
class PlainTextConverter {
  /**
   * Reads the file as UTF-8 and returns its content verbatim.
   *
   * @param {string} local_path - Path of the file to read.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension has no known MIME type or the type does not contain `text/`.
   */
  async convert(local_path, options = {}) {
    const extension = options.file_extension || "";
    const mimeType = mime__namespace.lookup(extension);
    const isText = Boolean(mimeType) && mimeType.toLowerCase().includes("text/");
    if (!isText) {
      return null;
    }
    return {
      title: null,
      text_content: fs__default.readFileSync(local_path, { encoding: "utf-8" })
    };
  }
}
66
+
67
/**
 * Thin wrapper around Turndown (with the GFM plugin) that overrides how
 * anchors and images are rendered to Markdown.
 */
class CustomTurnDown {
  /**
   * Converts a DOM node (or HTML string accepted by Turndown) to Markdown.
   *
   * @param {*} doc - Input handed straight to `TurndownService#turndown`.
   * @returns {string} The generated Markdown.
   */
  convert_soup(doc) {
    const service = new TurndownService__default({
      headingStyle: "atx"
    });
    service.use(turndownPluginGfm__default.gfm);
    service.addRule("anchor tags", {
      filter: ["a"],
      replacement: (content, node) => CustomTurnDown._anchorToMarkdown(content, node)
    });
    service.addRule("img tags", {
      filter: ["img"],
      replacement: (_, node) => CustomTurnDown._imageToMarkdown(node)
    });
    return service.turndown(doc);
  }

  /**
   * Renders one `<a>` element as Markdown.
   *
   * @param {string} content - Already-converted Markdown of the anchor's children.
   * @param {{getAttribute: Function, title: string}} node - The anchor element.
   * @returns {string} Markdown link, autolink, plain text, or `""` for empty anchors.
   */
  static _anchorToMarkdown(content, node) {
    if (content === "") {
      return "";
    }
    // Preserve a single leading/trailing space so adjacent words do not fuse.
    const prefix = content[0] === " " ? " " : "";
    const suffix = content[content.length - 1] === " " ? " " : "";
    // Only the first paragraph of the anchor text is kept.
    const text = content.trim().replace(/\n\n.*/g, "");
    if (text === "") {
      return "";
    }
    const href = node.getAttribute("href");
    let title = node.title;
    if (!href) {
      // No destination: emit the text instead of a link pointing at "null".
      return `${prefix}${text}${suffix}`;
    }
    try {
      const parsedUrl = new URL(href);
      if (!["https:", "http:", "file:"].includes(parsedUrl.protocol)) {
        // Non-web schemes (mailto:, javascript:, ...) are reduced to their text.
        return `${prefix}${text}${suffix}`;
      }
    } catch (e) {
      // `new URL` throws for relative references; those are kept as links.
      if (!/^https?:|^file:/.test(href)) {
        return `${prefix}[${text}](${href}${CustomTurnDown._titlePart(title)})${suffix}`;
      }
      return `${prefix}${text}${suffix}`;
    }
    if (text.replace(/\\_/g, "_") === href && !title) {
      return `<${href}>`;
    }
    if (!title) {
      title = href;
    }
    return `${prefix}[${text}](${href}${CustomTurnDown._titlePart(title)})${suffix}`;
  }

  /**
   * Renders one `<img>` element as Markdown; `data:` URIs are truncated to
   * their header followed by `...`.
   *
   * @param {{nodeName: string, getAttribute: Function}} node - The image element.
   * @returns {string} Markdown image, or `""` when `node` is not an IMG element.
   */
  static _imageToMarkdown(node) {
    if (!node || node.nodeName !== "IMG") {
      return "";
    }
    const alt = node.getAttribute("alt") || "";
    let src = node.getAttribute("src") || "";
    const title = node.getAttribute("title") || "";
    const titlePart = title ? ` "${title}"` : "";
    if (src.startsWith("data:")) {
      src = src.split(",")[0] + "...";
    }
    return `![${alt}](${src}${titlePart})`;
  }

  /**
   * Formats the optional link title, escaping embedded double quotes.
   *
   * @param {string} title - Raw title text (may be empty).
   * @returns {string} ` "title"` or `""` when there is no title.
   */
  static _titlePart(title) {
    return title ? ` "${String(title).replace(/"/g, '\\"')}"` : "";
  }
}
136
+
137
/**
 * Converts `.html` / `.htm` files to Markdown.
 */
class HtmlConverter {
  /**
   * Reads an HTML file and converts it to Markdown.
   *
   * @param {string} local_path - Path of the HTML file.
   * @param {object} [options] - Conversion options; only `file_extension` is
   *   read. Defaults to `{}` so the method can be called without options, like
   *   the other converters in this file.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   when the extension is not handled or reading/converting fails (the
   *   error is logged with `console.error`).
   */
  async convert(local_path, options = {}) {
    const extension = options.file_extension || "";
    if (![".html", ".htm"].includes(extension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs__namespace.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const content = fs__namespace.readFileSync(local_path, { encoding: "utf-8" });
      return await this._convert(content);
    } catch (e) {
      console.error(e);
      return null;
    }
  }

  /**
   * Converts an HTML string to Markdown after dropping `<script>` and
   * `<style>` elements.
   *
   * @param {string} htmlContent - Raw HTML.
   * @returns {Promise<{title: string, text_content: string}>} Document title
   *   and the Markdown of `<body>` (or of the whole document if no body exists).
   */
  async _convert(htmlContent) {
    const soup = new jsdom.JSDOM(htmlContent);
    const doc = soup.window.document;
    doc.querySelectorAll("script, style").forEach((node) => {
      node.remove();
    });
    const bodyElm = doc.querySelector("body");
    const webpageText = new CustomTurnDown().convert_soup(bodyElm || doc);
    return {
      title: doc.title,
      text_content: webpageText
    };
  }
}
174
+
175
/**
 * Converts RSS and Atom feeds (`.xml`, `.rss`, `.atom`) to Markdown.
 */
class RSSConverter {
  /**
   * Parses the feed file and dispatches to the RSS or Atom renderer.
   *
   * @param {string} localPath - Path of the feed file.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: string|null, text_content: string} | null>}
   *   `null` when the extension is not handled, the XML is neither an RSS
   *   document nor an Atom feed with entries, or parsing fails.
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".xml", ".rss", ".atom"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      const xmlString = fs__namespace.readFileSync(localPath, { encoding: "utf-8" });
      const doc = new xmldom.DOMParser().parseFromString(xmlString, "text/xml");
      if (doc.getElementsByTagName("rss").length > 0) {
        return this._parseRssType(doc);
      }
      const feeds = doc.getElementsByTagName("feed");
      if (feeds.length > 0 && feeds[0].getElementsByTagName("entry").length > 0) {
        return this._parseAtomType(doc);
      }
      // Explicit null (not undefined) so callers can treat "not handled" uniformly.
      return null;
    } catch (error) {
      console.error("RSS Parsing Error:", error);
      return null;
    }
  }

  /**
   * Renders an Atom document.
   *
   * @param {*} doc - Parsed XML document containing a `<feed>` element.
   * @returns {{title: string|null, text_content: string} | null} `null` when
   *   rendering throws (the error is logged).
   */
  _parseAtomType(doc) {
    try {
      const root = doc.getElementsByTagName("feed")[0];
      const title = this._getDataByTagName(root, "title");
      const subtitle = this._getDataByTagName(root, "subtitle");
      const entries = root.getElementsByTagName("entry");
      // A feed without a title gets no heading rather than "# null".
      let mdText = title ? `# ${title}\n` : "";
      if (subtitle) {
        mdText += `${subtitle}\n`;
      }
      for (let i = 0; i < entries.length; i++) {
        const entry = entries[i];
        const entryTitle = this._getDataByTagName(entry, "title");
        const entrySummary = this._getDataByTagName(entry, "summary");
        const entryUpdated = this._getDataByTagName(entry, "updated");
        const entryContent = this._getDataByTagName(entry, "content");
        if (entryTitle) {
          mdText += `\n## ${entryTitle}\n`;
        }
        if (entryUpdated) {
          mdText += `Updated on: ${entryUpdated}\n`;
        }
        if (entrySummary) {
          mdText += this._parseContent(entrySummary);
        }
        if (entryContent) {
          mdText += this._parseContent(entryContent);
        }
      }
      return {
        title,
        text_content: mdText
      };
    } catch (error) {
      console.error("Atom Parsing Error:", error);
      return null;
    }
  }

  /**
   * Renders an RSS document.
   *
   * @param {*} doc - Parsed XML document containing an `<rss>` element.
   * @returns {{title: string|null, text_content: string} | null} `null` when
   *   there is no `<channel>` or rendering throws (the error is logged).
   */
  _parseRssType(doc) {
    try {
      const root = doc.getElementsByTagName("rss")[0];
      const channel = root.getElementsByTagName("channel");
      if (!channel || channel.length === 0) {
        return null;
      }
      const channelElement = channel[0];
      const channelTitle = this._getDataByTagName(channelElement, "title");
      const channelDescription = this._getDataByTagName(channelElement, "description");
      const items = channelElement.getElementsByTagName("item");
      let mdText = "";
      if (channelTitle) {
        mdText = `# ${channelTitle}\n`;
      }
      if (channelDescription) {
        mdText += `${channelDescription}\n`;
      }
      for (let i = 0; i < items.length; i++) {
        const item = items[i];
        const title = this._getDataByTagName(item, "title");
        const description = this._getDataByTagName(item, "description");
        const pubDate = this._getDataByTagName(item, "pubDate");
        const content = this._getDataByTagName(item, "content:encoded");
        if (title) {
          mdText += `\n## ${title}\n`;
        }
        if (pubDate) {
          mdText += `Published on: ${pubDate}\n`;
        }
        if (description) {
          mdText += this._parseContent(description);
        }
        if (content) {
          mdText += this._parseContent(content);
        }
      }
      return {
        title: channelTitle,
        text_content: mdText
      };
    } catch (error) {
      console.error("RSS Parsing Error:", error);
      return null;
    }
  }

  /**
   * Converts an HTML fragment found inside a feed to Markdown.
   *
   * @param {string} content - HTML (or plain text) fragment.
   * @returns {string} Markdown, or the unchanged input when conversion throws.
   */
  _parseContent(content) {
    try {
      const dom = new jsdom.JSDOM(content);
      return new CustomTurnDown().convert_soup(dom.window.document);
    } catch (error) {
      console.warn("Parsing content error", error);
      return content;
    }
  }

  /**
   * Reads the value of the first child node of the first descendant with the
   * given tag name.
   *
   * @param {*} element - Element to search below.
   * @param {string} tagName - Tag name to look up.
   * @returns {string|null} The first child's `nodeValue`, or `null` when the
   *   tag is absent or its first child has no value.
   */
  _getDataByTagName(element, tagName) {
    const nodes = element.getElementsByTagName(tagName);
    if (!nodes || nodes.length === 0) {
      return null;
    }
    const fc = nodes[0].firstChild;
    if (fc && fc.nodeValue) {
      return fc.nodeValue;
    }
    return null;
  }
}
315
+
316
/**
 * Converts saved Wikipedia article pages to Markdown, keeping only the main
 * article content.
 */
class WikipediaConverter {
  /**
   * @param {string} localPath - Path of the saved HTML page.
   * @param {object} [options] - Reads `file_extension` and `url`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` matches
   *   `http(s)://<2-3 letters>.wikipedia.org/`; also `null` when reading or
   *   converting throws (the error is logged).
   */
  async convert(localPath, options = {}) {
    const extension = (options.file_extension || "").toLowerCase();
    const isHtml = [".html", ".htm"].includes(extension);
    if (!isHtml) {
      return null;
    }
    const pageUrl = options.url || "";
    const isWikipedia = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//.test(pageUrl);
    if (!isWikipedia) {
      return null;
    }
    try {
      const html = fs__namespace.readFileSync(localPath, { encoding: "utf-8" });
      return this._convert(html);
    } catch (error) {
      console.error("Wikipedia Parsing Error:", error);
      return null;
    }
  }

  /**
   * @param {string} htmlContent - Raw page HTML.
   * @returns {{title: string, text_content: string}} When `div#mw-content-text`
   *   exists, its Markdown under a `# <title>` heading; otherwise the Markdown
   *   of the whole document.
   */
  _convert(htmlContent) {
    const doc = new jsdom.JSDOM(htmlContent).window.document;
    doc.querySelectorAll("script, style").forEach((node) => {
      node.remove();
    });
    const contentElm = doc.querySelector("div#mw-content-text");
    const headingElm = doc.querySelector("span.mw-page-title-main");
    if (!contentElm) {
      // Not a recognisable article layout: fall back to the full page.
      const pageTitle = doc.title;
      return {
        title: pageTitle,
        text_content: new CustomTurnDown().convert_soup(doc)
      };
    }
    const mainTitle = headingElm && headingElm.textContent ? headingElm.textContent : doc.title;
    const articleMarkdown = new CustomTurnDown().convert_soup(contentElm);
    return {
      title: mainTitle,
      text_content: `# ${mainTitle}\n\n` + articleMarkdown
    };
  }
}
360
+
361
/**
 * Converts saved YouTube watch pages to Markdown: title, selected metadata,
 * description and (optionally) the transcript.
 */
class YouTubeConverter {
  /**
   * @param {string} localPath - Path of the saved HTML page.
   * @param {object} [options] - Reads `file_extension`, `url`,
   *   `enableYoutubeTranscript` and `youtubeTranscriptLanguage`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` starts with
   *   `https://www.youtube.com/watch?`; also `null` when conversion throws.
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".html", ".htm"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    const pageUrl = options.url || "";
    if (!pageUrl.startsWith("https://www.youtube.com/watch?")) {
      return null;
    }
    try {
      const htmlContent = fs__namespace.readFileSync(localPath, { encoding: "utf-8" });
      // Awaited so that rejections of the async _convert reach this catch.
      return await this._convert(htmlContent, pageUrl, options);
    } catch (error) {
      console.error("YouTube Parsing Error:", error);
      return null;
    }
  }

  /**
   * @param {string} htmlContent - Raw page HTML.
   * @param {string} url$1 - URL of the watch page (source of the `v` video id).
   * @param {object} [options] - Same options object as `convert`.
   * @returns {Promise<{title: string, text_content: string}>}
   */
  async _convert(htmlContent, url$1, options = {}) {
    const dom = new jsdom.JSDOM(htmlContent);
    const doc = dom.window.document;
    const metadata = {
      title: doc.title
    };
    // Index each <meta content=...> by the value of its first
    // itemprop/property/name attribute.
    doc.querySelectorAll("meta").forEach((meta) => {
      const attributeContent = meta.getAttribute("content");
      for (const a of meta.attributes) {
        if (["itemprop", "property", "name"].includes(a.name) && attributeContent) {
          metadata[a.value] = attributeContent;
          break;
        }
      }
    });
    try {
      for (const script of doc.querySelectorAll("script")) {
        const content = script.textContent || "";
        if (content.includes("ytInitialData")) {
          // Only the first line of the script is inspected for the JSON object.
          const lines = content.split(/\r?\n/);
          const objStart = lines[0].indexOf("{");
          const objEnd = lines[0].lastIndexOf("}");
          if (objStart >= 0 && objEnd >= 0) {
            const data = JSON.parse(lines[0].substring(objStart, objEnd + 1));
            const attrdesc = this._findKey(data, "attributedDescriptionBodyText");
            if (attrdesc) {
              metadata["description"] = attrdesc["content"];
            }
          }
          break;
        }
      }
    } catch (e) {
      console.warn("Error while parsing Youtube description");
    }
    let webpageText = "# YouTube\n";
    const title = this._get(metadata, ["title", "og:title", "name"]);
    if (title) {
      webpageText += `\n## ${title}\n`;
    }
    let stats = "";
    const views = this._get(metadata, ["interactionCount"]);
    if (views) {
      stats += `- **Views:** ${views}\n`;
    }
    const keywords = this._get(metadata, ["keywords"]);
    if (keywords) {
      stats += `- **Keywords:** ${keywords}\n`;
    }
    const runtime = this._get(metadata, ["duration"]);
    if (runtime) {
      stats += `- **Runtime:** ${runtime}\n`;
    }
    if (stats.length > 0) {
      webpageText += `\n### Video Metadata\n${stats}\n`;
    }
    const description = this._get(metadata, ["description", "og:description"]);
    if (description) {
      webpageText += `\n### Description\n${description}\n`;
    }
    if (options.enableYoutubeTranscript) {
      const transcriptText = await this._fetchTranscript(url$1, options);
      if (transcriptText) {
        webpageText += `\n### Transcript\n${transcriptText}\n`;
      }
    }
    return {
      title: title ? title : doc.title,
      text_content: webpageText
    };
  }

  /**
   * Fetches the transcript through the optional `youtube-transcript` package.
   * A missing package or a failed fetch only logs a warning, so the metadata
   * already collected for the page is still returned by the caller.
   *
   * @param {string} videoUrl - Watch-page URL carrying the `v` parameter.
   * @param {object} options - Reads `youtubeTranscriptLanguage` (default `"en"`).
   * @returns {Promise<string>} Transcript parts joined by spaces, or `""`.
   */
  async _fetchTranscript(videoUrl, options) {
    const videoId = new url.URL(videoUrl).searchParams.get("v");
    let ytTranscript;
    try {
      ytTranscript = await import('youtube-transcript').then((mod) => mod.YoutubeTranscript);
    } catch (error) {
      console.warn(
        "Optional dependency 'youtube-transcript' is not installed. Run `npm install youtube-transcript` to enable this feature."
      );
      return "";
    }
    if (!videoId) {
      return "";
    }
    try {
      const transcript = await ytTranscript.fetchTranscript(videoId, {
        lang: options.youtubeTranscriptLanguage || "en"
      });
      return transcript.map((part) => part.text).join(" ");
    } catch (error) {
      console.warn("Error while extracting the Youtube Transcript", error);
      return "";
    }
  }

  /**
   * Returns the first truthy `metadata[k]` for `k` in `keys`.
   *
   * @param {object} metadata - Key/value metadata.
   * @param {string[]} keys - Keys to try, in order.
   * @param {*} [default_value] - Fallback; a falsy fallback becomes `null`.
   * @returns {*}
   */
  _get(metadata, keys, default_value) {
    for (const k of keys) {
      if (metadata[k]) {
        return metadata[k];
      }
    }
    return default_value || null;
  }

  /**
   * Depth-first search for `key` in nested arrays/objects.
   *
   * @param {*} json - Value to search.
   * @param {string} key - Property name to find.
   * @returns {*} The first matching property's value, or `null`. Matches found
   *   through recursion are ignored when their value is falsy.
   */
  _findKey(json, key) {
    if (Array.isArray(json)) {
      for (const elm of json) {
        const ret = this._findKey(elm, key);
        if (ret) {
          return ret;
        }
      }
    } else if (typeof json === "object" && json !== null) {
      for (const k in json) {
        if (k === key) {
          return json[k];
        }
        const ret = this._findKey(json[k], key);
        if (ret) {
          return ret;
        }
      }
    }
    return null;
  }
}
519
+
520
/**
 * Converts Jupyter notebooks (`.ipynb`) to Markdown.
 */
class IpynbConverter {
  /**
   * @param {string} localPath - Path of the notebook file.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: string|null, text_content: string} | null>}
   *   `null` when the extension is not `.ipynb` or reading/parsing/converting
   *   fails (the error is logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".ipynb") {
      return null;
    }
    try {
      const notebookContent = JSON.parse(fs__namespace.readFileSync(localPath, { encoding: "utf-8" }));
      return this._convert(notebookContent);
    } catch (error) {
      console.error("Error converting .ipynb file:", error);
      return null;
    }
  }

  /**
   * Renders the notebook cells: markdown cells verbatim, code cells in a
   * ```python fence, raw cells in a plain fence, joined by blank lines.
   *
   * @param {object} notebookContent - Parsed notebook JSON.
   * @returns {{title: string|null, text_content: string}} The title is
   *   `metadata.title` when set, otherwise the first `# ` heading line found
   *   in a markdown cell, otherwise `null`.
   * @throws {Error} When rendering fails.
   */
  _convert(notebookContent) {
    try {
      const mdOutput = [];
      let title = null;
      for (const cell of notebookContent.cells || []) {
        const cellType = cell.cell_type || "";
        const sourceLines = IpynbConverter._sourceLines(cell.source);
        const sourceText = sourceLines.join("");
        if (cellType === "markdown") {
          mdOutput.push(sourceText);
          if (!title) {
            const heading = sourceLines.find((line) => line.startsWith("# "));
            if (heading !== undefined) {
              title = heading.substring(heading.indexOf("# ") + 2).trim();
            }
          }
        } else if (cellType === "code") {
          mdOutput.push("```python\n" + sourceText + "\n```");
        } else if (cellType === "raw") {
          mdOutput.push("```\n" + sourceText + "\n```");
        }
      }
      const mdText = mdOutput.join("\n\n");
      title = notebookContent.metadata?.title || title;
      return {
        title,
        text_content: mdText
      };
    } catch (e) {
      console.error("Error converting .ipynb file:", e);
      throw new Error(`Error converting .ipynb file: ${e}`);
    }
  }

  /**
   * Normalises a cell's `source`, which may be stored either as a list of
   * lines or as one multi-line string, to a list of lines (each keeping its
   * trailing newline).
   *
   * @param {string[]|string|undefined} source - Raw `cell.source`.
   * @returns {string[]} Lines; empty when `source` is missing.
   */
  static _sourceLines(source) {
    if (Array.isArray(source)) {
      return source;
    }
    if (typeof source === "string") {
      // Split after each newline so joining the pieces restores the string.
      return source.split(/(?<=\n)/);
    }
    return [];
  }
}
573
+
574
/**
 * Converts saved Bing search result pages to a Markdown list of results.
 */
class BingSerpConverter {
  /**
   * @param {string} localPath - Path of the saved HTML page.
   * @param {object} [options] - Reads `file_extension` and `url`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` starts with
   *   `https://www.bing.com/search?q=`; also `null` when conversion throws.
   */
  async convert(localPath, options = {}) {
    const extension = (options.file_extension || "").toLowerCase();
    if (extension !== ".html" && extension !== ".htm") {
      return null;
    }
    const pageUrl = options.url || "";
    const isBingSearch = /^https:\/\/www\.bing\.com\/search\?q=/.test(pageUrl);
    if (!isBingSearch) {
      return null;
    }
    try {
      const html = fs__namespace.readFileSync(localPath, { encoding: "utf-8" });
      return this._convert(html, pageUrl);
    } catch (error) {
      console.error("Bing SERP Parsing Error:", error);
      return null;
    }
  }

  /**
   * @param {string} htmlContent - Raw page HTML.
   * @param {string} url$1 - Search URL; its `q` parameter is echoed in the heading.
   * @returns {{title: string, text_content: string}}
   */
  _convert(htmlContent, url$1) {
    const doc = new jsdom.JSDOM(htmlContent).window.document;
    const query = new url.URL(url$1).searchParams.get("q") || "";
    // Pad `.tptt` text with a trailing space and drop `.algoSlug_icon` nodes
    // before converting.
    for (const tptt of Array.from(doc.querySelectorAll(".tptt"))) {
      if (tptt.textContent) {
        tptt.textContent += " ";
      }
    }
    for (const slug of Array.from(doc.querySelectorAll(".algoSlug_icon"))) {
      slug.remove();
    }
    const markdownify = new CustomTurnDown();
    const results = Array.from(doc.querySelectorAll(".b_algo"), (result) => {
      for (const anchor of Array.from(result.querySelectorAll("a[href]"))) {
        try {
          const encodedTarget = new url.URL(anchor.getAttribute("href")).searchParams.get("u");
          if (encodedTarget) {
            anchor.setAttribute("href", this._decodeBase64Url(encodedTarget));
          }
        } catch (e) {
          // Hrefs that cannot be parsed as absolute URLs are left untouched.
        }
      }
      const nonEmptyLines = markdownify
        .convert_soup(result)
        .trim()
        .split(/\n+/)
        .map((line) => line.trim())
        .filter((line) => line.length > 0);
      return nonEmptyLines.join("\n");
    });
    const heading = `## A Bing search for '${query}' found the following results:`;
    return {
      title: doc.title,
      text_content: `${heading}\n\n${results.join("\n\n")}`
    };
  }

  /**
   * Decodes the `u` redirect parameter: drops its first two characters,
   * appends `==` padding and base64-decodes the rest as UTF-8.
   *
   * @param {string} encodedUrl - Raw `u` parameter value.
   * @returns {string} Decoded text, or the input unchanged if decoding throws.
   */
  _decodeBase64Url(encodedUrl) {
    const padded = `${encodedUrl.slice(2).trim()}==`;
    try {
      return Buffer.from(padded, "base64").toString("utf-8");
    } catch (error) {
      console.error("Error decoding Base64URL:", error);
      return encodedUrl;
    }
  }
}
643
+
644
/**
 * Extracts the text of `.pdf` files.
 */
class PdfConverter {
  /**
   * @param {string} localPath - Path of the PDF file.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension is not `.pdf` or reading/extraction fails (logged).
   */
  async convert(localPath, options = {}) {
    const extension = (options.file_extension || "").toLowerCase();
    if (extension !== ".pdf") {
      return null;
    }
    try {
      const bytes = fs__default.readFileSync(localPath);
      return this._convert(bytes);
    } catch (error) {
      console.error("PDF Parsing Error:", error);
      return null;
    }
  }

  /**
   * @param {*} pdfContent - PDF bytes as returned by `readFileSync`.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   `pdfToText` rejects (the error is logged).
   */
  async _convert(pdfContent) {
    let extracted;
    try {
      extracted = await pdfTs.pdfToText(pdfContent);
    } catch (error) {
      console.error("PDF Parsing Error:", error);
      return null;
    }
    return {
      title: null,
      text_content: extracted
    };
  }
}
671
+
672
/**
 * Converts `.docx` files to Markdown by first rendering them to HTML with
 * Mammoth and then reusing the inherited HTML conversion.
 */
class DocxConverter extends HtmlConverter {
  /**
   * @param {string} local_path - Path of the .docx file.
   * @param {object} [options] - Reads `file_extension`; the whole object is
   *   also forwarded to `Mammoth.convertToHtml` as its options. Defaults to
   *   `{}` so the method can be called without options, like the other
   *   converters in this file.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   when the extension is not `.docx` or conversion fails (logged).
   */
  async convert(local_path, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".docx"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs__namespace.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const htmlContent = await Mammoth__default.convertToHtml(
        {
          path: local_path
        },
        {
          ...options
        }
      );
      return await this._convert(htmlContent.value);
    } catch (e) {
      console.error(e);
      return null;
    }
  }
}
698
+
699
/**
 * Converts `.xlsx` workbooks to Markdown, one `## <sheet name>` section per
 * sheet, by rendering each sheet to HTML and reusing the inherited HTML
 * conversion.
 */
class XlsxConverter extends HtmlConverter {
  /**
   * @param {string} local_path - Path of the workbook.
   * @param {object} [options] - Conversion options; only `file_extension` is
   *   read. Defaults to `{}` so the method can be called without options,
   *   like the other converters in this file.
   * @returns {Promise<{title: string, text_content: string} | null>} The title
   *   is the workbook's `Props.Title` or `"Untitled"`; `null` when the
   *   extension is not `.xlsx` or conversion fails (logged).
   */
  async convert(local_path, options = {}) {
    const extension = options.file_extension || "";
    if (![".xlsx"].includes(extension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs__namespace.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const workbook = XLSX__default.readFile(local_path);
      let mdContent = "";
      for (const sheetName of workbook.SheetNames) {
        mdContent += `## ${sheetName}\n`;
        const htmlContent = XLSX__default.utils.sheet_to_html(workbook.Sheets[sheetName]);
        mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
      }
      return {
        title: workbook?.Props?.Title || "Untitled",
        text_content: mdContent
      };
    } catch (e) {
      console.error(e);
      return null;
    }
  }
}
728
+
729
// Kept for any other code in this file that relies on it; MediaConverter
// itself no longer goes through a shell.
const exec = util__namespace.promisify(childProcess__namespace.exec);

/**
 * Base class for media converters; extracts file metadata with the external
 * `exiftool` program.
 */
class MediaConverter {
  /**
   * Runs `exiftool -json <localPath>` and returns the first record.
   *
   * The path is passed as a separate argument (no shell), so quotes or shell
   * metacharacters in file names cannot alter the command.
   *
   * @param {string} localPath - Path of the media file.
   * @returns {Promise<object|null>} Parsed metadata, or `null` when exiftool
   *   is not found or fails (logged).
   */
  async _getMetadata(localPath) {
    const exiftool = await this._which("exiftool");
    if (!exiftool) {
      console.error("exiftool is not found on this system so metadata cannot be extracted");
      return null;
    }
    try {
      const result = await MediaConverter._run(exiftool, ["-json", localPath]);
      return JSON.parse(result.stdout)[0];
    } catch (error) {
      console.error("Exiftool error:", error);
      return null;
    }
  }

  /**
   * Locates an executable on the PATH using `which` (`where` on Windows).
   *
   * @param {string} command - Executable name.
   * @returns {Promise<string|null>} First path reported by the locator, or
   *   `null` when the lookup fails (logged as a warning).
   */
  async _which(command) {
    try {
      const locator = process.platform === "win32" ? "where" : "which";
      const result = await MediaConverter._run(locator, [command]);
      // `where` may list several matches, one per line; use the first.
      return result.stdout.trim().split(/\r?\n/)[0].trim();
    } catch (error) {
      console.warn("Which command error:", error);
      return null;
    }
  }

  /**
   * Executes a program directly, without a shell.
   *
   * @param {string} file - Program to run.
   * @param {string[]} args - Arguments, passed verbatim.
   * @returns {Promise<{stdout: string, stderr: string}>}
   */
  static _run(file, args) {
    return util__namespace.promisify(childProcess__namespace.execFile)(file, args);
  }
}
755
+
756
/**
 * Converts `.wav` files to Markdown: selected metadata fields followed by an
 * audio transcript section.
 */
class WavConverter extends MediaConverter {
  /**
   * @param {string} localPath - Path of the audio file.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension is not `.wav` or conversion throws (logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".wav") {
      return null;
    }
    try {
      // Awaited so that rejections of the async _convert reach this catch.
      return await this._convert(localPath, options);
    } catch (error) {
      console.error("WAV Conversion Error:", error);
      return null;
    }
  }

  /**
   * @param {string} localPath - Path of the audio file.
   * @param {object} _ - Unused.
   * @returns {Promise<{title: null, text_content: string}>} Metadata lines
   *   (`Field: value`) plus an `### Audio Transcript:` section; the section
   *   carries an error notice when transcription throws.
   */
  async _convert(localPath, _) {
    let mdContent = "";
    const metadata = await this._getMetadata(localPath);
    if (metadata) {
      for (const f of [
        "Title",
        "Artist",
        "Author",
        "Band",
        "Album",
        "Genre",
        "Track",
        "DateTimeOriginal",
        "CreateDate",
        "Duration"
      ]) {
        if (metadata[f]) {
          mdContent += `${f}: ${metadata[f]}\n`;
        }
      }
    }
    try {
      const transcript = await this._transcribeAudio(localPath);
      mdContent += `\n\n### Audio Transcript:\n${transcript === "" ? "[No speech detected]" : transcript}`;
    } catch (error) {
      console.error("Error loading speech recognition module:", error);
      mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio.";
    }
    return {
      title: null,
      text_content: mdContent.trim()
    };
  }

  // TODO: Add speech to text
  /**
   * Placeholder for speech-to-text; currently always throws.
   *
   * @param {string} _ - Path of the audio file (unused).
   * @returns {Promise<string>}
   * @throws {Error} Always, until transcription is implemented.
   */
  async _transcribeAudio(_) {
    throw new Error("TODO: Audio transcription not implemented yet");
  }
}
811
+
812
/**
 * Converts `.mp3` files to Markdown: selected metadata fields followed by an
 * audio transcript section.
 */
class Mp3Converter extends WavConverter {
  /**
   * @param {string} localPath - Path of the audio file.
   * @param {object} [options] - Conversion options; only `file_extension` is read.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension is not `.mp3` or conversion throws (logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".mp3") {
      return null;
    }
    try {
      return await this._convert$(localPath, options);
    } catch (error) {
      console.error("MP3 Conversion Error:", error);
      return null;
    }
  }

  /**
   * @param {string} localPath - Path of the audio file.
   * @param {object} options - Unused.
   * @returns {Promise<{title: null, text_content: string}>} Metadata lines
   *   (`Field: value`) plus an `### Audio Transcript:` section; the section
   *   carries an error notice when transcription throws.
   */
  async _convert$(localPath, options) {
    let mdContent = "";
    const metadata = await this._getMetadata(localPath);
    if (metadata) {
      for (const f of [
        "Title",
        "Artist",
        "Author",
        "Band",
        "Album",
        "Genre",
        "Track",
        "DateTimeOriginal",
        "CreateDate",
        "Duration"
      ]) {
        if (metadata[f]) {
          mdContent += `${f}: ${metadata[f]}\n`;
        }
      }
    }
    // Scratch directory for a wav rendition of the mp3. Nothing in this block
    // writes the wav file, so cleanup must not assume it exists.
    const tempPath = await fs__namespace$1.mkdtemp(path__namespace.join(os__namespace.tmpdir(), "temp_"));
    const wavPath = path__namespace.join(tempPath, "audio.wav");
    try {
      const transcript = await super._transcribeAudio(wavPath);
      mdContent += `\n\n### Audio Transcript:\n${transcript === "" ? "[No speech detected]" : transcript}`;
    } catch (e) {
      mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio.";
    } finally {
      // Remove the directory and whatever it contains; `force` tolerates a
      // missing wav file instead of throwing ENOENT and discarding the result.
      await fs__namespace$1.rm(tempPath, { recursive: true, force: true });
    }
    return {
      title: null,
      text_content: mdContent.trim()
    };
  }
}
867
+
868
/**
 * Converts `.jpg`, `.jpeg` and `.png` images to Markdown: selected metadata
 * fields and, when a model is supplied, an LLM-generated description.
 */
class ImageConverter extends MediaConverter {
  /**
   * @param {string} localPath - Path of the image file.
   * @param {object} [options] - Reads `file_extension`, `llmModel` and `llmPrompt`.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension is not handled or conversion throws (logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".jpg", ".jpeg", ".png"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      // Awaited so that rejections of the async _convert (for example a
      // failing model call) reach this catch.
      return await this._convert(localPath, options);
    } catch (error) {
      console.error("Image Conversion Error:", error);
      return null;
    }
  }

  /**
   * @param {string} localPath - Path of the image file.
   * @param {object} options - Reads `llmModel`; forwarded to `_getLLMDescription`.
   * @returns {Promise<{title: null, text_content: string}>}
   */
  async _convert(localPath, options) {
    let mdContent = "";
    const metadata = await this._getMetadata(localPath);
    if (metadata) {
      for (const f of [
        "ImageSize",
        "Title",
        "Caption",
        "Description",
        "Keywords",
        "Artist",
        "Author",
        "DateTimeOriginal",
        "CreateDate",
        "GPSPosition"
      ]) {
        if (metadata[f]) {
          mdContent += `${f}: ${metadata[f]}\n`;
        }
      }
    }
    if (options.llmModel) {
      const description = (await this._getLLMDescription(localPath, options)).trim();
      mdContent += `\n# Description:\n${description}\n`;
    }
    return {
      title: null,
      text_content: mdContent.trim()
    };
  }

  /**
   * Asks the configured model to describe the image.
   *
   * @param {string} localPath - Path of the image file.
   * @param {object} options - Reads `llmModel` and `llmPrompt`; a missing or
   *   blank prompt falls back to a default caption request. The object is not
   *   modified.
   * @returns {Promise<string>} Trimmed model output.
   */
  async _getLLMDescription(localPath, options) {
    const hasPrompt = Boolean(options.llmPrompt) && options.llmPrompt.trim() !== "";
    const prompt = hasPrompt ? options.llmPrompt : "Write a detailed caption for this image.";
    const imageFile = fs__namespace.readFileSync(localPath).toString("base64");
    const result = await ai.generateText({
      model: options.llmModel,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: prompt },
            {
              type: "image",
              image: imageFile
            }
          ]
        }
      ]
    });
    return result.text.trim();
  }
}
937
+
938
/**
 * Converts `.zip` archives by extracting them next to the archive and running
 * every extracted file through the supplied converters.
 */
class ZipConverter {
  /**
   * @param {string} localPath - Path of the zip file.
   * @param {object} [options] - Reads `file_extension`, `_parent_converters`
   *   (converters applied to the extracted files) and `cleanupExtracted`
   *   (set to `false` to keep the extracted folder). The options are also
   *   forwarded to the nested converters with `file_extension` replaced.
   * @returns {Promise<{title: null, text_content: string} | null>} `null` when
   *   the extension is not `.zip`; otherwise the combined Markdown, or an
   *   `[ERROR] ...` text when no converters are available or processing fails.
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".zip") {
      return null;
    }
    const parentConverters = options._parent_converters || [];
    // An empty array is truthy, so emptiness is checked through its length.
    if (parentConverters.length === 0) {
      return {
        title: null,
        text_content: `[ERROR] No converters available to process zip contents from: ${localPath}`
      };
    }
    const extractedZipFolderName = `extracted_${path__namespace.basename(localPath).replace(".zip", "_zip")}`;
    const newFolder = path__namespace.normalize(path__namespace.join(path__namespace.dirname(localPath), extractedZipFolderName));
    let mdContent = `Content from the zip file \`${path__namespace.basename(localPath)}\`:\n\n`;
    if (!newFolder.startsWith(path__namespace.dirname(localPath))) {
      return {
        title: null,
        text_content: `[ERROR] Invalid zip file path: ${localPath}`
      };
    }
    try {
      await fs__namespace$1.mkdir(newFolder, { recursive: true });
      const zip = await unzipper__namespace.Open.file(localPath);
      await zip.extract({ path: newFolder });
      const files = await this._walk(newFolder);
      for (const { root, name } of files) {
        const filePath = path__namespace.join(root, name);
        const relativePath = path__namespace.relative(newFolder, filePath);
        const fileOptions = {
          ...options,
          file_extension: path__namespace.extname(name),
          _parent_converters: parentConverters
        };
        for (const converter of parentConverters) {
          // Nested archives are not expanded.
          if (converter instanceof ZipConverter) {
            continue;
          }
          const result = await converter.convert(filePath, fileOptions);
          if (result) {
            mdContent += `\n## File: ${relativePath}\n\n`;
            mdContent += result.text_content + "\n\n";
            break;
          }
        }
      }
      return {
        title: null,
        text_content: mdContent.trim()
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      if (message.includes("invalid signature")) {
        return {
          title: null,
          text_content: `[ERROR] Invalid or corrupted zip file: ${localPath}`
        };
      }
      return {
        title: null,
        text_content: `[ERROR] Failed to process zip file ${localPath}: ${String(error)}`
      };
    } finally {
      // Clean up on failure as well, so a broken archive does not leave a
      // partially extracted folder behind.
      if (options.cleanupExtracted !== false) {
        try {
          await fs__namespace$1.rm(newFolder, { recursive: true, force: true });
        } catch (cleanupError) {
          console.warn("Failed to remove extracted zip folder:", cleanupError);
        }
      }
    }
  }

  /**
   * Recursively lists the files below `dir`.
   *
   * @param {string} dir - Directory to scan.
   * @returns {Promise<Array<{root: string, name: string}>>} One record per
   *   non-directory entry: the containing directory and the entry name.
   */
  async _walk(dir) {
    const results = [];
    const entries = await fs__namespace$1.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      if (entry.isDirectory()) {
        results.push(...(await this._walk(path__namespace.join(dir, entry.name))));
      } else {
        results.push({ root: dir, name: entry.name });
      }
    }
    return results;
  }
}
1024
+
1025
/**
 * Converts local files, URLs and fetch-style responses to Markdown by
 * delegating to a list of registered converters.
 *
 * For every candidate file extension the converters are tried in list order;
 * the first converter that returns a non-null result wins.
 */
class MarkItDown {
  /** Registered converters, in the order in which they are tried. */
  converters = [];

  constructor() {
    // register_converter() prepends, so converters registered later are tried
    // first (ZipConverter first, PlainTextConverter last).
    this.register_converter(new PlainTextConverter());
    this.register_converter(new HtmlConverter());
    this.register_converter(new RSSConverter());
    this.register_converter(new WikipediaConverter());
    this.register_converter(new YouTubeConverter());
    this.register_converter(new BingSerpConverter());
    this.register_converter(new DocxConverter());
    this.register_converter(new XlsxConverter());
    this.register_converter(new WavConverter());
    this.register_converter(new Mp3Converter());
    this.register_converter(new ImageConverter());
    this.register_converter(new IpynbConverter());
    this.register_converter(new PdfConverter());
    this.register_converter(new ZipConverter());
  }

  /**
   * Converts `source` to Markdown, dispatching on its kind.
   *
   * @param {string|Response} source - A `Response`, a URL starting with
   *   `http://`, `https://` or `file://`, or a local file path.
   * @param {object} [options] - Options forwarded to the converters.
   * @returns {Promise<object>} The result produced by the winning converter.
   */
  async convert(source, options = {}) {
    if (source instanceof Response) {
      return await this.convert_response(source, options);
    }
    if (source.startsWith("http://") || source.startsWith("https://") || source.startsWith("file://")) {
      return await this.convert_url(source, options);
    }
    return this.convert_local(source, options);
  }

  /**
   * Fetches `source` and converts the response body.
   *
   * @param {string} source - URL to fetch.
   * @param {object} [options] - Converter options; `options.fetch` may supply
   *   a custom fetch implementation (defaults to `globalThis.fetch`).
   * @returns {Promise<object>} The conversion result.
   * @throws {Error} If the response status is not OK.
   */
  async convert_url(source, { fetch = globalThis.fetch, ...options } = {}) {
    const response = await fetch(source);
    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${source}, status: ${response.status}`);
    }
    return await this.convert_response(response, options);
  }

  /**
   * Saves a response body to a temporary file and converts that file.
   *
   * Candidate extensions are gathered, in order, from `options.file_extension`,
   * the Content-Type header, the Content-Disposition filename and the URL path.
   *
   * @param {Response} response - Fetch-style response (`headers.get`, `body.getReader`, `url`).
   * @param {object} [options] - Converter options.
   * @returns {Promise<object>} The conversion result.
   * @throws {Error} If the Content-Type header is missing, the body cannot be
   *   written to disk, or no converter accepts the content.
   */
  async convert_response(response, options = {}) {
    const ext = options.file_extension;
    const extensions = ext ? new Set([ext]) : new Set();

    const contentType = response.headers?.get("content-type")?.split(";")[0];
    if (!contentType) {
      throw new Error("Response Content-Type header is missing");
    }
    const mimeExtension = mime__namespace.extension(contentType);
    if (mimeExtension) {
      extensions.add(`.${mimeExtension}`);
    }

    const contentDisposition = response.headers?.get("content-disposition") || "";
    const fname = contentDisposition.match(/filename="([^;]+)"/);
    if (fname) {
      extensions.add(path__default.extname(fname[1]));
    }
    // A hand-built Response has an empty `url`; `new URL("")` would throw.
    if (response.url) {
      extensions.add(path__default.extname(new URL(response.url).pathname));
    }

    // The filename comes from the server, so keep only its last path segment
    // and never let it escape the private temporary directory.
    const requestedName = fname ? path__default.basename(fname[1]) : "";
    const safeName =
      requestedName === "" || requestedName === "." || requestedName === ".." ? "temp" : requestedName;
    // A unique directory per call avoids collisions between concurrent conversions.
    const tempDir = await fs__namespace.promises.mkdtemp(
      path__default.join(os.tmpdir(), "markitdown-")
    );
    const file = path__default.join(tempDir, safeName);

    try {
      try {
        if (response.body == null) {
          throw new Error("Response body is empty");
        }
        const handle = await fs__namespace.promises.open(file, "w");
        try {
          const reader = response.body.getReader();
          while (true) {
            const { done, value } = await reader.read();
            if (done) break;
            // Awaiting each write guarantees the data is on disk before converting.
            await handle.writeFile(value);
          }
        } finally {
          await handle.close();
        }
      } catch (e) {
        throw new Error(`Could not write to file: ${e}`, { cause: e });
      }
      // Conversion failures propagate with _convert's own message rather than
      // being reported as write failures.
      return await this._convert(file, extensions, {
        ...options,
        url: response.url
      });
    } finally {
      // Best-effort cleanup; a failure here must not mask the real outcome.
      await fs__namespace.promises.rm(tempDir, { recursive: true, force: true }).catch(() => {});
    }
  }

  /**
   * Converts a file on the local file system.
   *
   * @param {string} source - Path of the file; it must exist and have an extension.
   * @param {object} [options] - Converter options; `options.file_extension`
   *   is tried before the extension taken from `source`.
   * @returns {Promise<object>} The conversion result.
   * @throws {Error} If the file does not exist, has no extension, or cannot be converted.
   */
  async convert_local(source, options = {}) {
    const ext = options.file_extension;
    // Wrap in an array: `new Set(".pdf")` would yield the characters ".", "p", "d", "f".
    const extensions = ext ? new Set([ext]) : new Set();
    if (!fs__namespace.existsSync(source)) {
      throw new Error(`File not found: ${source}`);
    }
    const extname = path__default.extname(source);
    if (extname === "") {
      throw new Error(`File extension not found: ${source}`);
    }
    extensions.add(extname);
    return await this._convert(source, extensions, options);
  }

  /**
   * Tries every converter for every candidate extension and returns the
   * first non-null result with its line endings and blank lines normalized.
   *
   * @param {string} source - Path of the file to convert.
   * @param {Set<string>} extensions - Candidate extensions, tried in insertion order.
   * @param {object} [options] - Converter options.
   * @returns {Promise<object>} The normalized conversion result.
   * @throws {Error} If no converter produced a result.
   */
  async _convert(source, extensions, options = {}) {
    let error;
    for (const ext of extensions) {
      for (const converter of this.converters) {
        let res;
        try {
          res = await converter.convert(source, {
            ...options,
            file_extension: ext,
            _parent_converters: this.converters
          });
        } catch (e) {
          // Remember the failure but keep trying the remaining converters.
          error = e;
        }
        if (res != null) {
          res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
          // Collapse runs of three or more newlines into a single blank line.
          res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
          return res;
        }
      }
    }
    if (error) {
      throw new Error(
        `Could not convert ${source} to markdown. While converting the following error occurred: ${error}`,
        { cause: error }
      );
    }
    throw new Error(
      `Could not convert ${source} to markdown format. The ${Array.from(extensions).join(
        ", "
      )} are not supported.`
    );
  }

  // NOTE: Inserts the converter at the beginning of the list
  register_converter(converter) {
    this.converters.unshift(converter);
  }
}
1159
+
1160
+ exports.MarkItDown = MarkItDown;