markitdown-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,1131 @@
1
+ import * as mime from 'mime-types';
2
+ import * as path from 'path';
3
+ import path__default from 'path';
4
+ import * as fs from 'fs';
5
+ import fs__default from 'fs';
6
+ import { JSDOM } from 'jsdom';
7
+ import TurndownService from 'turndown';
8
+ import turndownPluginGfm from '@joplin/turndown-plugin-gfm';
9
+ import { DOMParser } from '@xmldom/xmldom';
10
+ import { URL as URL$1 } from 'url';
11
+ import { pdfToText } from 'pdf-ts';
12
+ import Mammoth from 'mammoth';
13
+ import XLSX from 'xlsx';
14
+ import * as childProcess from 'child_process';
15
+ import * as util from 'util';
16
+ import * as fs$1 from 'fs/promises';
17
+ import * as os from 'os';
18
+ import { generateText } from 'ai';
19
+ import * as unzipper from 'unzipper';
20
+
21
/**
 * Converter for files whose extension maps to a `text/*` MIME type.
 */
class PlainTextConverter {
  /**
   * Read a text file verbatim.
   *
   * @param {string} local_path - Path of the file to read.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: null, text_content: string} | null>} The file's
   *   UTF-8 content, or `null` when the extension does not resolve to a
   *   MIME type containing "text/".
   */
  async convert(local_path, options = {}) {
    const mimeType = mime.lookup(options.file_extension || "");
    const looksLikeText = Boolean(mimeType) && mimeType.toLowerCase().includes("text/");
    if (!looksLikeText) {
      return null;
    }
    return {
      title: null,
      text_content: fs__default.readFileSync(local_path, { encoding: "utf-8" })
    };
  }
}
37
+
38
/**
 * Wrapper around Turndown (with the GFM plugin) that installs custom rules
 * for anchors and images before converting HTML to Markdown.
 */
class CustomTurnDown {
  /**
   * Convert a DOM node to Markdown.
   *
   * @param {*} doc - Value handed straight to `TurndownService#turndown`.
   * @returns {string} The Markdown produced by Turndown.
   */
  convert_soup(doc) {
    const turnDownService = new TurndownService({
      headingStyle: "atx"
    });
    turnDownService.use(turndownPluginGfm.gfm);

    // Render a Markdown link title part, escaping embedded double quotes so
    // the title cannot terminate early.
    const formatTitle = (title) => (title ? ` "${String(title).replace(/"/g, '\\"')}"` : "");

    turnDownService.addRule("anchor tags", {
      filter: ["a"],
      replacement: function(content, node) {
        if (content === "") {
          return "";
        }
        // Preserve a single leading/trailing space that trimming would drop.
        const prefix = content[0] === " " ? " " : "";
        const suffix = content[content.length - 1] === " " ? " " : "";
        // Keep only the text up to the first blank line.
        const text = content.trim().replace(/\n\n.*/g, "");
        if (text === "") {
          return "";
        }
        const href = node.getAttribute("href");
        let title = node.title;
        if (!href) {
          // Anchors without a target (e.g. named anchors) have nothing to
          // link to; emitting "[text](null)" would be a broken link.
          return `${prefix}${text}${suffix}`;
        }
        try {
          const parsed_url = new URL(href);
          if (!["https:", "http:", "file:"].includes(parsed_url.protocol)) {
            // Drop links with other schemes (javascript:, mailto:, ...).
            return `${prefix}${text}${suffix}`;
          }
        } catch (e) {
          // `new URL` throws for relative references; keep those as links.
          if (!/^https?:|^file:/.test(href)) {
            return `${prefix}[${text}](${href}${formatTitle(title)})${suffix}`;
          }
          return `${prefix}${text}${suffix}`;
        }
        // Turndown escapes underscores in text; undo that for the comparison.
        if (text.replace(/\\_/g, "_") === href && !title) {
          return `<${href}>`;
        }
        if (!title) {
          title = href;
        }
        return `${prefix}[${text}](${href}${formatTitle(title)})${suffix}`;
      }
    });
    turnDownService.addRule("img tags", {
      filter: ["img"],
      replacement: function(_, node) {
        if (!node || node.nodeName !== "IMG") {
          return "";
        }
        const alt = node.getAttribute("alt") || "";
        let src = node.getAttribute("src") || "";
        const title = node.getAttribute("title") || "";
        const titlePart = title ? ` "${title}"` : "";
        if (src.startsWith("data:")) {
          // Keep only the data-URI header; the payload is replaced by "...".
          src = src.split(",")[0] + "...";
        }
        return `![${alt}](${src}${titlePart})`;
      }
    });
    return turnDownService.turndown(doc);
  }
}
107
+
108
/**
 * Converts `.html` / `.htm` files to Markdown.
 */
class HtmlConverter {
  /**
   * Convert an HTML file on disk.
   *
   * @param {string} local_path - Path of the HTML file.
   * @param {object} [options] - Conversion options; only `file_extension`
   *   is consulted. Defaults to `{}` so the method can be called without it.
   * @returns {Promise<{title: string, text_content: string} | null>} The
   *   conversion result, or `null` when the extension does not match or the
   *   file cannot be read/converted (the error is logged).
   */
  async convert(local_path, options = {}) {
    const extension = options.file_extension || "";
    if (![".html", ".htm"].includes(extension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const content = fs.readFileSync(local_path, { encoding: "utf-8" });
      return await this._convert(content);
    } catch (e) {
      console.error(e);
      return null;
    }
  }

  /**
   * Convert an HTML string to Markdown.
   *
   * @param {string} htmlContent - HTML markup.
   * @returns {Promise<{title: string, text_content: string}>} The document
   *   title and the Markdown of `<body>` (or of the whole document when no
   *   body element is found).
   */
  async _convert(htmlContent) {
    const soup = new JSDOM(htmlContent);
    const doc = soup.window.document;
    // Scripts and styles carry no readable content.
    doc.querySelectorAll("script, style").forEach((node) => {
      node.remove();
    });
    const bodyElm = doc.querySelector("body");
    const webpageText = new CustomTurnDown().convert_soup(bodyElm || doc);
    return {
      title: doc.title,
      text_content: webpageText
    };
  }
}
145
+
146
/**
 * Converts RSS and Atom feeds (`.xml`, `.rss`, `.atom`) to Markdown.
 */
class RSSConverter {
  /**
   * Convert a feed file on disk.
   *
   * @param {string} localPath - Path of the XML file.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: string | null, text_content: string} | null>}
   *   The conversion result, or `null` when the extension does not match,
   *   the XML is neither an RSS feed nor an Atom feed with entries, or
   *   parsing fails (the error is logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".xml", ".rss", ".atom"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      const xmlString = fs.readFileSync(localPath, { encoding: "utf-8" });
      const doc = new DOMParser().parseFromString(xmlString, "text/xml");
      if (doc.getElementsByTagName("rss").length > 0) {
        return this._parseRssType(doc);
      }
      const feeds = doc.getElementsByTagName("feed");
      if (feeds.length > 0 && feeds[0].getElementsByTagName("entry").length > 0) {
        return this._parseAtomType(doc);
      }
      // Not a feed this converter understands: decline explicitly with
      // null (like the other converters) instead of returning undefined.
      return null;
    } catch (error) {
      console.error("RSS Parsing Error:", error);
      return null;
    }
  }

  /**
   * Render an Atom document (`<feed>` root with `<entry>` children).
   *
   * @param {*} doc - Parsed XML document.
   * @returns {{title: string | null, text_content: string} | null} `null` on error.
   */
  _parseAtomType(doc) {
    try {
      const root = doc.getElementsByTagName("feed")[0];
      const title = this._getDataByTagName(root, "title");
      const subtitle = this._getDataByTagName(root, "subtitle");
      const entries = root.getElementsByTagName("entry");
      // Skip the heading when the feed has no title rather than emitting
      // "# null" (same guard the RSS branch uses).
      let mdText = title ? `# ${title}\n` : "";
      if (subtitle) {
        mdText += `${subtitle}\n`;
      }
      for (let i = 0; i < entries.length; i++) {
        const entry = entries[i];
        const entryTitle = this._getDataByTagName(entry, "title");
        const entrySummary = this._getDataByTagName(entry, "summary");
        const entryUpdated = this._getDataByTagName(entry, "updated");
        const entryContent = this._getDataByTagName(entry, "content");
        if (entryTitle) {
          mdText += `\n## ${entryTitle}\n`;
        }
        if (entryUpdated) {
          mdText += `Updated on: ${entryUpdated}\n`;
        }
        if (entrySummary) {
          mdText += this._parseContent(entrySummary);
        }
        if (entryContent) {
          mdText += this._parseContent(entryContent);
        }
      }
      return {
        title,
        text_content: mdText
      };
    } catch (error) {
      console.error("Atom Parsing Error:", error);
      return null;
    }
  }

  /**
   * Render an RSS document (`<rss>` root with a `<channel>`).
   *
   * @param {*} doc - Parsed XML document.
   * @returns {{title: string | null, text_content: string} | null} `null`
   *   when there is no channel or on error.
   */
  _parseRssType(doc) {
    try {
      const root = doc.getElementsByTagName("rss")[0];
      const channel = root.getElementsByTagName("channel");
      if (!channel || channel.length === 0) {
        return null;
      }
      const channelElement = channel[0];
      const channelTitle = this._getDataByTagName(channelElement, "title");
      const channelDescription = this._getDataByTagName(channelElement, "description");
      const items = channelElement.getElementsByTagName("item");
      let mdText = "";
      if (channelTitle) {
        mdText = `# ${channelTitle}\n`;
      }
      if (channelDescription) {
        mdText += `${channelDescription}\n`;
      }
      for (let i = 0; i < items.length; i++) {
        const item = items[i];
        const title = this._getDataByTagName(item, "title");
        const description = this._getDataByTagName(item, "description");
        const pubDate = this._getDataByTagName(item, "pubDate");
        const content = this._getDataByTagName(item, "content:encoded");
        if (title) {
          mdText += `\n## ${title}\n`;
        }
        if (pubDate) {
          mdText += `Published on: ${pubDate}\n`;
        }
        if (description) {
          mdText += this._parseContent(description);
        }
        if (content) {
          mdText += this._parseContent(content);
        }
      }
      return {
        title: channelTitle,
        text_content: mdText
      };
    } catch (error) {
      console.error("RSS Parsing Error:", error);
      return null;
    }
  }

  /**
   * Convert an HTML fragment found inside a feed element to Markdown.
   *
   * @param {string} content - Fragment text.
   * @returns {string} Markdown, or the input unchanged when conversion throws.
   */
  _parseContent(content) {
    try {
      const dom = new JSDOM(content);
      return new CustomTurnDown().convert_soup(dom.window.document);
    } catch (error) {
      console.warn("Parsing content error", error);
      return content;
    }
  }

  /**
   * Read the value of the first child node of the first descendant element
   * with the given tag name.
   *
   * @param {*} element - Element to search in.
   * @param {string} tagName - Tag name to look up.
   * @returns {string | null} The node value, or `null` when the tag is
   *   absent or its first child has no value.
   */
  _getDataByTagName(element, tagName) {
    const nodes = element.getElementsByTagName(tagName);
    if (!nodes || nodes.length === 0) {
      return null;
    }
    const fc = nodes[0].firstChild;
    if (fc && fc.nodeValue) {
      return fc.nodeValue;
    }
    return null;
  }
}
286
+
287
/**
 * Converts saved Wikipedia article pages, keeping only the main content area.
 */
class WikipediaConverter {
  /**
   * Convert an HTML file that was fetched from a Wikipedia URL.
   *
   * @param {string} localPath - Path of the HTML file.
   * @param {object} [options] - Uses `file_extension` and `url`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` matches a
   *   `<2-3 letters>.wikipedia.org` address; also `null` when reading or
   *   converting throws (the error is logged).
   */
  async convert(localPath, options = {}) {
    const ext = (options.file_extension || "").toLowerCase();
    if (ext !== ".html" && ext !== ".htm") {
      return null;
    }
    const pageUrl = options.url || "";
    const isWikipedia = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//.test(pageUrl);
    if (!isWikipedia) {
      return null;
    }
    try {
      return this._convert(fs.readFileSync(localPath, { encoding: "utf-8" }));
    } catch (error) {
      console.error("Wikipedia Parsing Error:", error);
      return null;
    }
  }

  /**
   * Convert Wikipedia HTML markup to Markdown.
   *
   * @param {string} htmlContent - HTML markup.
   * @returns {{title: string, text_content: string}} When
   *   `div#mw-content-text` exists, the Markdown of that element under a
   *   `# <title>` heading; otherwise the Markdown of the whole document.
   */
  _convert(htmlContent) {
    const { document: doc } = new JSDOM(htmlContent).window;
    for (const node of doc.querySelectorAll("script, style")) {
      node.remove();
    }
    const contentElm = doc.querySelector("div#mw-content-text");
    const headingElm = doc.querySelector("span.mw-page-title-main");
    const turndown = new CustomTurnDown();

    if (!contentElm) {
      return {
        title: doc.title,
        text_content: turndown.convert_soup(doc)
      };
    }

    // Prefer the on-page heading over <title> when it has text.
    const mainTitle = headingElm && headingElm.textContent ? headingElm.textContent : doc.title;
    return {
      title: mainTitle,
      text_content: `# ${mainTitle}\n\n` + turndown.convert_soup(contentElm)
    };
  }
}
331
+
332
/**
 * Converts saved YouTube watch pages into a Markdown summary (title,
 * metadata, description and, optionally, the transcript).
 */
class YouTubeConverter {
  /**
   * Convert an HTML file that was fetched from a YouTube watch URL.
   *
   * @param {string} localPath - Path of the HTML file.
   * @param {object} [options] - Uses `file_extension`, `url`,
   *   `enableYoutubeTranscript` and `youtubeTranscriptLanguage`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` starts with
   *   `https://www.youtube.com/watch?`; also `null` when reading or
   *   converting fails (the error is logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".html", ".htm"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    const url = options.url || "";
    if (!url.startsWith("https://www.youtube.com/watch?")) {
      return null;
    }
    try {
      const htmlContent = fs.readFileSync(localPath, { encoding: "utf-8" });
      // `_convert` is async: it must be awaited here, otherwise a rejection
      // escapes this try/catch and the handler below never runs.
      return await this._convert(htmlContent, url, options);
    } catch (error) {
      console.error("YouTube Parsing Error:", error);
      return null;
    }
  }

  /**
   * Build the Markdown summary from the page markup.
   *
   * @param {string} htmlContent - HTML markup of the watch page.
   * @param {string} url - The page URL; its `v` query parameter is used as
   *   the video id for the transcript lookup.
   * @param {object} options - See {@link YouTubeConverter#convert}.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   when a transcript was requested but the optional
   *   `youtube-transcript` package cannot be imported.
   */
  async _convert(htmlContent, url, options) {
    const dom = new JSDOM(htmlContent);
    const doc = dom.window.document;
    const metadata = {
      title: doc.title
    };
    // Collect <meta> tags keyed by the value of their first
    // itemprop/property/name attribute.
    doc.querySelectorAll("meta").forEach((meta) => {
      for (const a of meta.attributes) {
        const attributeContent = meta.getAttribute("content");
        if (["itemprop", "property", "name"].includes(a.name) && attributeContent) {
          metadata[a.value] = attributeContent;
          break;
        }
      }
    });
    try {
      for (const script of doc.querySelectorAll("script")) {
        const content = script.textContent || "";
        if (content.includes("ytInitialData")) {
          // Only the first line of the script is inspected for the JSON object.
          const lines = content.split(/\r?\n/);
          const objStart = lines[0].indexOf("{");
          const objEnd = lines[0].lastIndexOf("}");
          if (objStart >= 0 && objEnd >= 0) {
            const data = JSON.parse(lines[0].substring(objStart, objEnd + 1));
            const attrdesc = this._findKey(data, "attributedDescriptionBodyText");
            if (attrdesc) {
              metadata["description"] = attrdesc["content"];
            }
          }
          break;
        }
      }
    } catch (e) {
      // Best effort: the meta-tag description (if any) is used instead.
      console.warn("Error while parsing Youtube description");
    }

    let webpageText = "# YouTube\n";
    const title = this._get(metadata, ["title", "og:title", "name"]);
    if (title) {
      webpageText += `\n## ${title}\n`;
    }

    let stats = "";
    const views = this._get(metadata, ["interactionCount"]);
    if (views) {
      stats += `- **Views:** ${views}\n`;
    }
    const keywords = this._get(metadata, ["keywords"]);
    if (keywords) {
      stats += `- **Keywords:** ${keywords}\n`;
    }
    const runtime = this._get(metadata, ["duration"]);
    if (runtime) {
      stats += `- **Runtime:** ${runtime}\n`;
    }
    if (stats.length > 0) {
      webpageText += `\n### Video Metadata\n${stats}\n`;
    }

    const description = this._get(metadata, ["description", "og:description"]);
    if (description) {
      webpageText += `\n### Description\n${description}\n`;
    }

    if (options.enableYoutubeTranscript) {
      let transcriptText = "";
      const videoId = new URL$1(url).searchParams.get("v");
      let ytTranscript;
      try {
        ytTranscript = await import('youtube-transcript').then((mod) => mod.YoutubeTranscript);
      } catch (error) {
        console.warn(
          "Optional dependency 'youtube-transcript' is not installed. Run `npm install youtube-transcript` to enable this feature."
        );
        return null;
      }
      if (videoId) {
        try {
          const youtubeTranscriptLanguage = options.youtubeTranscriptLanguage || "en";
          const transcript = await ytTranscript.fetchTranscript(videoId, {
            lang: youtubeTranscriptLanguage
          });
          transcriptText = transcript.map((part) => part.text).join(" ");
        } catch (error) {
          // A missing transcript should not fail the whole conversion.
          console.warn("Error while extracting the Youtube Transcript", error);
        }
      }
      if (transcriptText) {
        webpageText += `\n### Transcript\n${transcriptText}\n`;
      }
    }

    return {
      title: title ? title : doc.title,
      text_content: webpageText
    };
  }

  /**
   * Return the first truthy value stored under any of `keys`.
   *
   * @param {object} metadata - Lookup table.
   * @param {string[]} keys - Keys to try, in order.
   * @param {*} [default_value] - Fallback when no key has a truthy value.
   * @returns {*} The value found, else `default_value`, else `null`.
   */
  _get(metadata, keys, default_value) {
    for (const k of keys) {
      if (metadata[k]) {
        return metadata[k];
      }
    }
    return default_value || null;
  }

  /**
   * Depth-first search of a parsed JSON value for a property named `key`.
   *
   * @param {*} json - Array, object or primitive to search.
   * @param {string} key - Property name to find.
   * @returns {*} The value stored under the first match (whatever it is),
   *   or `null` when nothing is found.
   */
  _findKey(json, key) {
    if (Array.isArray(json)) {
      for (const elm of json) {
        const ret = this._findKey(elm, key);
        if (ret) {
          return ret;
        }
      }
    } else if (typeof json === "object" && json !== null) {
      for (const k in json) {
        if (k === key) {
          return json[k];
        }
        const ret = this._findKey(json[k], key);
        if (ret) {
          return ret;
        }
      }
    }
    return null;
  }
}
490
+
491
/**
 * Converts Jupyter notebooks (`.ipynb`) to Markdown.
 */
class IpynbConverter {
  /**
   * Convert a notebook file on disk.
   *
   * @param {string} localPath - Path of the `.ipynb` file.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: string | null, text_content: string} | null>}
   *   `null` when the extension does not match or the file cannot be
   *   read/parsed/converted (the error is logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".ipynb") {
      return null;
    }
    try {
      const notebookContent = JSON.parse(fs.readFileSync(localPath, { encoding: "utf-8" }));
      return this._convert(notebookContent);
    } catch (error) {
      console.error("Error converting .ipynb file:", error);
      return null;
    }
  }

  /**
   * Render a parsed notebook object as Markdown.
   *
   * Markdown cells are emitted as-is, code cells are wrapped in a
   * ```python fence and raw cells in a plain fence. The title is
   * `metadata.title` when set, otherwise the first `# ` heading line found
   * in a markdown cell.
   *
   * @param {object} notebookContent - Parsed notebook JSON.
   * @returns {{title: string | null, text_content: string}} Conversion result.
   * @throws {Error} When the notebook structure cannot be processed.
   */
  _convert(notebookContent) {
    try {
      const mdOutput = [];
      let title = null;
      for (const cell of notebookContent.cells || []) {
        const cellType = cell.cell_type || "";
        // `source` is usually an array of lines but may also be a single
        // string; calling `.join` on a string would throw and fail the
        // whole notebook, so normalize both shapes here.
        const rawSource = cell.source || [];
        const isList = Array.isArray(rawSource);
        const sourceText = isList ? rawSource.join("") : String(rawSource);
        const sourceLines = isList ? rawSource : sourceText.split("\n");

        if (cellType === "markdown") {
          mdOutput.push(sourceText);
          if (!title) {
            const heading = sourceLines.find((line) => line.startsWith("# "));
            if (heading !== undefined) {
              title = heading.substring(2).trim();
            }
          }
        } else if (cellType === "code") {
          mdOutput.push("```python\n" + sourceText + "\n```");
        } else if (cellType === "raw") {
          mdOutput.push("```\n" + sourceText + "\n```");
        }
      }
      const mdText = mdOutput.join("\n\n");
      title = notebookContent.metadata?.title || title;
      return {
        title,
        text_content: mdText
      };
    } catch (e) {
      console.error("Error converting .ipynb file:", e);
      throw new Error(`Error converting .ipynb file: ${e}`, { cause: e });
    }
  }
}
544
+
545
/**
 * Converts saved Bing search result pages into a Markdown list of results.
 */
class BingSerpConverter {
  /**
   * Convert an HTML file that was fetched from a Bing search URL.
   *
   * @param {string} localPath - Path of the HTML file.
   * @param {object} [options] - Uses `file_extension` and `url`.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   unless the extension is `.html`/`.htm` and `url` starts with
   *   `https://www.bing.com/search?q=`; also `null` when reading or
   *   converting throws (the error is logged).
   */
  async convert(localPath, options = {}) {
    const ext = (options.file_extension || "").toLowerCase();
    if (ext !== ".html" && ext !== ".htm") {
      return null;
    }
    const pageUrl = options.url || "";
    const isBingSearch = /^https:\/\/www\.bing\.com\/search\?q=/.test(pageUrl);
    if (!isBingSearch) {
      return null;
    }
    try {
      return this._convert(fs.readFileSync(localPath, { encoding: "utf-8" }), pageUrl);
    } catch (error) {
      console.error("Bing SERP Parsing Error:", error);
      return null;
    }
  }

  /**
   * Render every `.b_algo` element of the page as one Markdown result.
   *
   * @param {string} htmlContent - HTML markup of the results page.
   * @param {string} url - The search URL; its `q` parameter is echoed in the heading.
   * @returns {{title: string, text_content: string}} Conversion result.
   */
  _convert(htmlContent, url) {
    const { document: doc } = new JSDOM(htmlContent).window;
    const query = new URL$1(url).searchParams.get("q") || "";

    // Append a space to `.tptt` elements that have text.
    for (const tooltip of doc.querySelectorAll(".tptt")) {
      if (tooltip.textContent) {
        tooltip.textContent += " ";
      }
    }
    for (const icon of doc.querySelectorAll(".algoSlug_icon")) {
      icon.remove();
    }

    const markdownify = new CustomTurnDown();
    const results = [];
    for (const hit of doc.querySelectorAll(".b_algo")) {
      // Replace each link's href with the decoded value of its `u` query
      // parameter, when it has one.
      for (const anchor of hit.querySelectorAll("a[href]")) {
        try {
          const encodedTarget = new URL$1(anchor.getAttribute("href")).searchParams.get("u");
          if (encodedTarget) {
            anchor.setAttribute("href", this._decodeBase64Url(encodedTarget));
          }
        } catch (e) {
          // Anything that throws here (e.g. an href that is not an
          // absolute URL) leaves the link untouched.
        }
      }
      const nonEmptyLines = [];
      for (const rawLine of markdownify.convert_soup(hit).trim().split(/\n+/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
          nonEmptyLines.push(line);
        }
      }
      results.push(nonEmptyLines.join("\n"));
    }

    const heading = `## A Bing search for '${query}' found the following results:`;
    return {
      title: doc.title,
      text_content: `${heading}\n\n${results.join("\n\n")}`
    };
  }

  /**
   * Decode a `u` parameter value: drop its first two characters, append
   * `==` padding and base64-decode the rest as UTF-8.
   *
   * @param {string} encodedUrl - Raw `u` parameter value.
   * @returns {string} The decoded string, or the input when decoding throws.
   */
  _decodeBase64Url(encodedUrl) {
    const padded = `${encodedUrl.slice(2).trim()}==`;
    try {
      return Buffer.from(padded, "base64").toString("utf-8");
    } catch (error) {
      console.error("Error decoding Base64URL:", error);
      return encodedUrl;
    }
  }
}
614
+
615
/**
 * Converts `.pdf` files to text via `pdfToText`.
 */
class PdfConverter {
  /**
   * Convert a PDF file on disk.
   *
   * @param {string} localPath - Path of the PDF file.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: null, text_content: *} | null>} `null` when
   *   the extension does not match or reading/extraction fails (the error
   *   is logged).
   */
  async convert(localPath, options = {}) {
    const ext = (options.file_extension || "").toLowerCase();
    if (ext !== ".pdf") {
      return null;
    }
    try {
      const bytes = fs__default.readFileSync(localPath);
      return this._convert(bytes);
    } catch (error) {
      console.error("PDF Parsing Error:", error);
      return null;
    }
  }

  /**
   * Extract the text of a PDF held in memory.
   *
   * @param {Buffer} pdfContent - Raw PDF bytes.
   * @returns {Promise<{title: null, text_content: *} | null>} The value
   *   produced by `pdfToText` wrapped in a result object, or `null` when
   *   extraction throws (the error is logged).
   */
  async _convert(pdfContent) {
    let extracted;
    try {
      extracted = await pdfToText(pdfContent);
    } catch (error) {
      console.error("PDF Parsing Error:", error);
      return null;
    }
    return { title: null, text_content: extracted };
  }
}
642
+
643
/**
 * Converts `.docx` files by rendering them to HTML with Mammoth and then
 * reusing the HTML-to-Markdown conversion inherited from HtmlConverter.
 */
class DocxConverter extends HtmlConverter {
  /**
   * Convert a Word document on disk.
   *
   * @param {string} local_path - Path of the `.docx` file.
   * @param {object} [options] - Conversion options. `file_extension` selects
   *   this converter; the whole object is also forwarded to
   *   `Mammoth.convertToHtml` as its options argument. Defaults to `{}` so
   *   the method can be called without it.
   * @returns {Promise<{title: string, text_content: string} | null>} `null`
   *   when the extension does not match, the file is missing, or
   *   conversion fails (the error is logged).
   */
  async convert(local_path, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".docx"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const htmlContent = await Mammoth.convertToHtml(
        {
          path: local_path
        },
        {
          ...options
        }
      );
      return await this._convert(htmlContent.value);
    } catch (e) {
      console.error(e);
      return null;
    }
  }
}
669
+
670
/**
 * Converts `.xlsx` workbooks: each sheet is rendered to HTML with SheetJS
 * and then to Markdown via the conversion inherited from HtmlConverter.
 */
class XlsxConverter extends HtmlConverter {
  /**
   * Convert a workbook on disk.
   *
   * @param {string} local_path - Path of the `.xlsx` file.
   * @param {object} [options] - Conversion options; only `file_extension`
   *   is consulted. Defaults to `{}` so the method can be called without it.
   * @returns {Promise<{title: string, text_content: string} | null>} One
   *   `## <sheet name>` section per sheet; the title is the workbook's
   *   `Props.Title` or "Untitled". `null` when the extension does not
   *   match, the file is missing, or conversion fails (the error is logged).
   */
  async convert(local_path, options = {}) {
    const extension = options.file_extension || "";
    if (![".xlsx"].includes(extension.toLowerCase())) {
      return null;
    }
    try {
      if (!fs.existsSync(local_path)) {
        throw new Error("File does'nt exists");
      }
      const workbook = XLSX.readFile(local_path);
      let mdContent = "";
      for (const sheetName of workbook.SheetNames) {
        mdContent += `## ${sheetName}\n`;
        const htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]);
        const sheet = await this._convert(htmlContent);
        // Fall back to an empty section rather than appending the literal
        // string "undefined" if the sheet produced no text.
        mdContent += `${sheet?.text_content?.trim() ?? ""}\n\n`;
      }
      return {
        title: workbook?.Props?.Title || "Untitled",
        text_content: mdContent
      };
    } catch (e) {
      console.error(e);
      return null;
    }
  }
}
699
+
700
// Promise-returning execFile: runs a program directly with an argument
// array, without going through a shell.
const execFile = util.promisify(childProcess.execFile);

/**
 * Base class for media converters; provides metadata extraction through
 * the external `exiftool` program.
 */
class MediaConverter {
  /**
   * Read a file's metadata with `exiftool -json`.
   *
   * The path is passed as a separate argument (no shell involved), so
   * quotes, `$(...)` or backticks in a file name cannot be interpreted as
   * shell syntax. File names can come from untrusted sources such as zip
   * archive entries.
   *
   * @param {string} localPath - Path of the media file.
   * @returns {Promise<object | null>} The first element of exiftool's JSON
   *   output, or `null` when exiftool is not found or fails (logged).
   */
  async _getMetadata(localPath) {
    const exiftool = await this._which("exiftool");
    if (!exiftool) {
      console.error("exiftool is not found on this system so metadata cannot be extracted");
      return null;
    }
    try {
      // Resolving to an absolute path also guarantees the argument cannot
      // start with "-" and be mistaken for an option.
      const result = await execFile(exiftool, ["-json", path.resolve(localPath)]);
      return JSON.parse(result.stdout)[0];
    } catch (error) {
      console.error("Exiftool error:", error);
      return null;
    }
  }

  /**
   * Locate an executable on the PATH.
   *
   * @param {string} command - Program name to look up.
   * @returns {Promise<string | null>} The first path reported by `which`
   *   (`where` on Windows), or `null` when the lookup fails (logged).
   */
  async _which(command) {
    const locator = process.platform === "win32" ? "where" : "which";
    try {
      const result = await execFile(locator, [command]);
      // `where` may list several matches, one per line; use the first.
      return result.stdout.trim().split(/\r?\n/)[0].trim();
    } catch (error) {
      console.warn("Which command error:", error);
      return null;
    }
  }
}
726
+
727
/**
 * Converts `.wav` files into a Markdown block of selected metadata fields
 * followed by an audio-transcript section.
 */
class WavConverter extends MediaConverter {
  /**
   * Convert a WAV file.
   *
   * @param {string} localPath - Path of the audio file.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: null, text_content: string} | null>} `null`
   *   when the extension is not `.wav`.
   */
  async convert(localPath, options = {}) {
    const ext = (options.file_extension || "").toLowerCase();
    if (ext !== ".wav") {
      return null;
    }
    try {
      return this._convert(localPath, options);
    } catch (error) {
      console.error("WAV Conversion Error:", error);
      return null;
    }
  }

  /**
   * Build the Markdown for one audio file.
   *
   * @param {string} localPath - Path of the audio file.
   * @param {object} _ - Unused.
   * @returns {Promise<{title: null, text_content: string}>} Metadata lines
   *   (`Field: value`) for the fields that are present, then the transcript
   *   or an error note when transcription throws.
   */
  async _convert(localPath, _) {
    const wantedFields = [
      "Title",
      "Artist",
      "Author",
      "Band",
      "Album",
      "Genre",
      "Track",
      "DateTimeOriginal",
      "CreateDate",
      "Duration"
    ];
    const metadata = await this._getMetadata(localPath);
    let mdContent = "";
    if (metadata) {
      mdContent = wantedFields
        .filter((field) => metadata[field])
        .map((field) => `${field}: ${metadata[field]}\n`)
        .join("");
    }

    try {
      const transcript = await this._transcribeAudio(localPath);
      const shown = transcript === "" ? "[No speech detected]" : transcript;
      mdContent += `\n\n### Audio Transcript:\n${shown}`;
    } catch (error) {
      console.error("Error loading speech recognition module:", error);
      mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio.";
    }

    return { title: null, text_content: mdContent.trim() };
  }

  // TODO: Add speech to text
  /**
   * Placeholder for speech-to-text; currently always rejects.
   *
   * @param {string} _ - Unused.
   * @throws {Error} Always, because transcription is not implemented.
   */
  async _transcribeAudio(_) {
    throw new Error("TODO: Audio transcription not implemented yet");
  }
}
782
+
783
/**
 * Converts `.mp3` files into a Markdown block of selected metadata fields
 * followed by an audio-transcript section.
 */
class Mp3Converter extends WavConverter {
  /**
   * Convert an MP3 file.
   *
   * @param {string} localPath - Path of the audio file.
   * @param {object} [options] - Conversion options; only `file_extension` is consulted.
   * @returns {Promise<{title: null, text_content: string} | null>} `null`
   *   when the extension is not `.mp3` or conversion fails (logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".mp3") {
      return null;
    }
    try {
      return await this._convert$(localPath, options);
    } catch (error) {
      console.error("MP3 Conversion Error:", error);
      return null;
    }
  }

  /**
   * Build the Markdown for one MP3 file.
   *
   * @param {string} localPath - Path of the audio file.
   * @param {object} options - Unused.
   * @returns {Promise<{title: null, text_content: string}>} Metadata lines
   *   (`Field: value`) for the fields that are present, then the transcript
   *   or an error note when transcription throws.
   */
  async _convert$(localPath, options) {
    let mdContent = "";
    const metadata = await this._getMetadata(localPath);
    if (metadata) {
      for (const f of [
        "Title",
        "Artist",
        "Author",
        "Band",
        "Album",
        "Genre",
        "Track",
        "DateTimeOriginal",
        "CreateDate",
        "Duration"
      ]) {
        if (metadata[f]) {
          mdContent += `${f}: ${metadata[f]}\n`;
        }
      }
    }
    // Scratch directory for a WAV rendition of the MP3.
    // NOTE(review): nothing in this method writes `wavPath` yet; presumably
    // an MP3-to-WAV step is planned before transcription. Confirm.
    const tempPath = await fs$1.mkdtemp(path.join(os.tmpdir(), "temp_"));
    const wavPath = path.join(tempPath, "audio.wav");
    try {
      const transcript = await super._transcribeAudio(wavPath);
      mdContent += `\n\n### Audio Transcript:\n${transcript === "" ? "[No speech detected]" : transcript}`;
    } catch (e) {
      mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio.";
    } finally {
      // Remove the scratch directory and whatever it contains. `force`
      // tolerates a missing WAV file: unlinking it unconditionally threw
      // ENOENT, which discarded the result and leaked the directory.
      await fs$1.rm(tempPath, { recursive: true, force: true });
    }
    return {
      title: null,
      text_content: mdContent.trim()
    };
  }
}
838
+
839
/**
 * Converts `.jpg`, `.jpeg` and `.png` files into Markdown made of selected
 * metadata fields and, when a model is supplied, an LLM-written description.
 */
class ImageConverter extends MediaConverter {
  /**
   * Convert an image file.
   *
   * @param {string} localPath - Path of the image.
   * @param {object} [options] - Uses `file_extension`, `llmModel` and `llmPrompt`.
   * @returns {Promise<{title: null, text_content: string} | null>} `null`
   *   when the extension does not match or conversion fails (logged).
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (![".jpg", ".jpeg", ".png"].includes(fileExtension.toLowerCase())) {
      return null;
    }
    try {
      // `_convert` is async: it must be awaited here, otherwise a rejection
      // (e.g. a failing LLM call) escapes this try/catch.
      return await this._convert(localPath, options);
    } catch (error) {
      console.error("Image Conversion Error:", error);
      return null;
    }
  }

  /**
   * Build the Markdown for one image.
   *
   * @param {string} localPath - Path of the image.
   * @param {object} options - See {@link ImageConverter#convert}.
   * @returns {Promise<{title: null, text_content: string}>} Metadata lines
   *   (`Field: value`) for the fields that are present, plus a
   *   `# Description:` section when `options.llmModel` is set.
   */
  async _convert(localPath, options) {
    let mdContent = "";
    const metadata = await this._getMetadata(localPath);
    if (metadata) {
      for (const f of [
        "ImageSize",
        "Title",
        "Caption",
        "Description",
        "Keywords",
        "Artist",
        "Author",
        "DateTimeOriginal",
        "CreateDate",
        "GPSPosition"
      ]) {
        if (metadata[f]) {
          mdContent += `${f}: ${metadata[f]}\n`;
        }
      }
    }
    if (options.llmModel) {
      const description = await this._getLLMDescription(localPath, options);
      mdContent += `\n# Description:\n${description.trim()}\n`;
    }
    return {
      title: null,
      text_content: mdContent.trim()
    };
  }

  /**
   * Ask the configured model to describe the image.
   *
   * @param {string} localPath - Path of the image; it is sent base64-encoded.
   * @param {object} options - `llmModel` is passed to `generateText`;
   *   `llmPrompt` overrides the default caption prompt when non-blank.
   *   The object is not modified.
   * @returns {Promise<string>} The trimmed text returned by the model.
   */
  async _getLLMDescription(localPath, options) {
    const hasPrompt = Boolean(options.llmPrompt) && options.llmPrompt.trim() !== "";
    const prompt = hasPrompt ? options.llmPrompt : "Write a detailed caption for this image.";
    const imageFile = fs.readFileSync(localPath).toString("base64");
    const result = await generateText({
      model: options.llmModel,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: prompt },
            {
              type: "image",
              image: imageFile
            }
          ]
        }
      ]
    });
    return result.text.trim();
  }
}
908
+
909
/**
 * Converts `.zip` archives by extracting them next to the archive and
 * running every extracted file through the parent converters.
 */
class ZipConverter {
  /**
   * Convert a zip archive.
   *
   * @param {string} localPath - Path of the archive.
   * @param {object} [options] - Uses `file_extension`, `_parent_converters`
   *   (the converters applied to the extracted files) and
   *   `cleanupExtracted` (set to `false` to keep the extracted folder).
   *   The remaining options are forwarded to the nested converters.
   * @returns {Promise<{title: null, text_content: string} | null>} `null`
   *   when the extension is not `.zip`. Problems are reported as a result
   *   whose text starts with `[ERROR]` rather than by throwing.
   */
  async convert(localPath, options = {}) {
    const fileExtension = options.file_extension || "";
    if (fileExtension.toLowerCase() !== ".zip") {
      return null;
    }
    const parentConverters = options._parent_converters || [];
    // An empty array is truthy, so test the length to detect "no converters".
    if (parentConverters.length === 0) {
      return {
        title: null,
        text_content: `[ERROR] No converters available to process zip contents from: ${localPath}`
      };
    }
    const extractedZipFolderName = `extracted_${path.basename(localPath).replace(".zip", "_zip")}`;
    // Compare absolute paths: with relative ones a bare "archive.zip" has
    // dirname "." and a prefix test against the normalized target
    // ("extracted_archive_zip") wrongly fails.
    const parentDir = path.resolve(path.dirname(localPath));
    const newFolder = path.join(parentDir, extractedZipFolderName);
    const relativeToParent = path.relative(parentDir, newFolder);
    if (
      relativeToParent === "" ||
      relativeToParent.startsWith("..") ||
      path.isAbsolute(relativeToParent)
    ) {
      return {
        title: null,
        text_content: `[ERROR] Invalid zip file path: ${localPath}`
      };
    }
    let mdContent = `Content from the zip file \`${path.basename(localPath)}\`:\n\n`;
    try {
      await fs$1.mkdir(newFolder, { recursive: true });
      const zip = await unzipper.Open.file(localPath);
      await zip.extract({ path: newFolder });
      const files = await this._walk(newFolder);
      for (const { root, name } of files) {
        const filePath = path.join(root, name);
        const relativePath = path.relative(newFolder, filePath);
        const fileOptions = {
          ...options,
          file_extension: path.extname(name),
          _parent_converters: parentConverters
        };
        for (const converter of parentConverters) {
          // Nested archives are not expanded.
          if (converter instanceof ZipConverter) {
            continue;
          }
          const result = await converter.convert(filePath, fileOptions);
          if (result) {
            mdContent += `\n## File: ${relativePath}\n\n`;
            mdContent += result.text_content + "\n\n";
            break;
          }
        }
      }
      return {
        title: null,
        text_content: mdContent.trim()
      };
    } catch (error) {
      const message = String(error?.message ?? "");
      if (message.includes("invalid signature")) {
        return {
          title: null,
          text_content: `[ERROR] Invalid or corrupted zip file: ${localPath}`
        };
      }
      return {
        title: null,
        text_content: `[ERROR] Failed to process zip file ${localPath}: ${String(error)}`
      };
    } finally {
      // Clean up on failure as well as on success so a broken archive does
      // not leave a half-extracted folder behind.
      if (options.cleanupExtracted !== false) {
        try {
          await fs$1.rm(newFolder, { recursive: true, force: true });
        } catch (cleanupError) {
          console.warn("Could not remove extracted zip folder:", cleanupError);
        }
      }
    }
  }

  /**
   * Recursively list the non-directory entries below a directory.
   *
   * @param {string} dir - Directory to walk.
   * @returns {Promise<Array<{root: string, name: string}>>} One record per
   *   entry: its containing directory and its name.
   */
  async _walk(dir) {
    const results = [];
    const entries = await fs$1.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      if (entry.isDirectory()) {
        results.push(...(await this._walk(path.join(dir, entry.name))));
      } else {
        results.push({ root: dir, name: entry.name });
      }
    }
    return results;
  }
}
995
+
996
/**
 * Entry point of the library: converts local files, URLs and fetch
 * responses to Markdown by trying each registered converter in turn.
 */
class MarkItDown {
  converters = [];

  /**
   * Register the built-in converters. `register_converter` prepends, so the
   * converter registered last (ZipConverter) is tried first.
   */
  constructor() {
    this.register_converter(new PlainTextConverter());
    this.register_converter(new HtmlConverter());
    this.register_converter(new RSSConverter());
    this.register_converter(new WikipediaConverter());
    this.register_converter(new YouTubeConverter());
    this.register_converter(new BingSerpConverter());
    this.register_converter(new DocxConverter());
    this.register_converter(new XlsxConverter());
    this.register_converter(new WavConverter());
    this.register_converter(new Mp3Converter());
    this.register_converter(new ImageConverter());
    this.register_converter(new IpynbConverter());
    this.register_converter(new PdfConverter());
    this.register_converter(new ZipConverter());
  }

  /**
   * Convert a source to Markdown.
   *
   * @param {string | Response} source - A fetch `Response`, a URL starting
   *   with `http://`, `https://` or `file://`, or a local file path.
   * @param {object} [options] - Options forwarded to the converters;
   *   `file_extension` adds an extension hint and `fetch` overrides the
   *   fetch implementation used for URLs.
   * @returns {Promise<{title: *, text_content: string}>} The conversion result.
   * @throws {Error} When the source cannot be fetched, found or converted.
   */
  async convert(source, options = {}) {
    // `Response` is not a global on every runtime; guard the instanceof test.
    if (typeof Response !== "undefined" && source instanceof Response) {
      return await this.convert_response(source, options);
    }
    if (source.startsWith("http://") || source.startsWith("https://") || source.startsWith("file://")) {
      return await this.convert_url(source, options);
    }
    return this.convert_local(source, options);
  }

  /**
   * Fetch a URL and convert the response.
   *
   * @param {string} source - URL to fetch.
   * @param {object} [options] - `fetch` selects the fetch implementation
   *   (default `globalThis.fetch`); the rest is forwarded.
   * @returns {Promise<{title: *, text_content: string}>} The conversion result.
   * @throws {Error} When the response status is not OK.
   */
  async convert_url(source, { fetch = globalThis.fetch, ...options } = {}) {
    const response = await fetch(source);
    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${source}, status: ${response.status}`);
    }
    return await this.convert_response(response, options);
  }

  /**
   * Save a fetch response to a temporary file and convert it.
   *
   * Candidate extensions come from, in order: `options.file_extension`,
   * the Content-Type header, the Content-Disposition file name and the URL
   * path.
   *
   * @param {Response} response - Response whose body is converted.
   * @param {object} [options] - Options forwarded to the converters; `url`
   *   is set to `response.url`.
   * @returns {Promise<{title: *, text_content: string}>} The conversion result.
   * @throws {Error} When the Content-Type header or the body is missing,
   *   the body cannot be written to disk, or no converter succeeds.
   */
  async convert_response(response, options = {}) {
    const extensions = new Set();
    if (options.file_extension) {
      extensions.add(options.file_extension);
    }
    const contentType = response.headers?.get("content-type")?.split(";")[0].trim();
    if (!contentType) {
      throw new Error("Response Content-Type header is missing");
    }
    const mimeExtension = mime.extension(contentType);
    if (mimeExtension) {
      extensions.add(`.${mimeExtension}`);
    }
    const content_disposition = response.headers?.get("content-disposition") || "";
    const fname = content_disposition.match(/filename="([^;]+)"/);
    // The header is remote-controlled: keep only the final path component
    // so a name like "../../x" cannot escape the temporary directory.
    const baseName = fname ? path__default.basename(fname[1]) : "";
    const safeName = baseName === "" || baseName === "." || baseName === ".." ? "temp" : baseName;
    if (fname && path__default.extname(safeName) !== "") {
      extensions.add(path__default.extname(safeName));
    }
    let url_ext = "";
    try {
      url_ext = path__default.extname(new URL(response.url).pathname);
    } catch (e) {
      // Responses without a usable URL simply contribute no extension.
    }
    if (url_ext !== "") {
      extensions.add(url_ext);
    }
    if (response.body == null) {
      throw new Error("Response body is empty");
    }

    // A private directory per call avoids clashes between concurrent
    // conversions and makes cleanup a single recursive delete.
    const tempDir = await fs$1.mkdtemp(path__default.join(os.tmpdir(), "markitdown-"));
    const file = path__default.join(tempDir, safeName);
    try {
      try {
        const handle = await fs$1.open(file, "w");
        try {
          const reader = response.body.getReader();
          while (true) {
            const { done, value } = await reader.read();
            if (done) break;
            // Awaiting each chunk guarantees the data is written before
            // the converters read the file.
            await handle.appendFile(value);
          }
        } finally {
          await handle.close();
        }
      } catch (e) {
        throw new Error(`Could not write to file: ${e}`);
      }
      return await this._convert(file, extensions, {
        ...options,
        url: response.url
      });
    } finally {
      await fs$1.rm(tempDir, { recursive: true, force: true });
    }
  }

  /**
   * Convert a local file.
   *
   * @param {string} source - Path of the file.
   * @param {object} [options] - Options forwarded to the converters;
   *   `file_extension` is tried before the extension taken from the path.
   * @returns {Promise<{title: *, text_content: string}>} The conversion result.
   * @throws {Error} When the file does not exist, no extension can be
   *   determined, or no converter succeeds.
   */
  async convert_local(source, options = {}) {
    // Wrap the hint in an array: `new Set(".md")` would iterate the string
    // and yield the characters ".", "m" and "d".
    const extensions = new Set(options.file_extension ? [options.file_extension] : []);
    if (!fs.existsSync(source)) {
      throw new Error(`File not found: ${source}`);
    }
    const extname = path__default.extname(source);
    if (extname !== "") {
      extensions.add(extname);
    }
    if (extensions.size === 0) {
      throw new Error(`File extension not found: ${source}`);
    }
    return await this._convert(source, extensions, options);
  }

  /**
   * Try every converter for every candidate extension and return the first
   * non-null result, with line endings normalized to "\n", outer whitespace
   * trimmed and runs of three or more newlines collapsed to two.
   *
   * @param {string} source - Path of the file to convert.
   * @param {Iterable<string>} extensions - Candidate extensions, in priority order.
   * @param {object} [options] - Options forwarded to the converters.
   * @returns {Promise<{title: *, text_content: string}>} The conversion result.
   * @throws {Error} When no converter produced a result; the message
   *   includes the last converter error, if any.
   */
  async _convert(source, extensions, options = {}) {
    let error;
    for (const ext of extensions) {
      for (const converter of this.converters) {
        let res;
        try {
          res = await converter.convert(source, {
            ...options,
            file_extension: ext,
            _parent_converters: this.converters
          });
        } catch (e) {
          // Remember the failure but keep trying the remaining converters.
          error = e;
        }
        if (res != null) {
          res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
          res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
          return res;
        }
      }
    }
    if (error) {
      throw new Error(
        `Could not convert ${source} to markdown. While converting the following error occurred: ${error}`
      );
    }
    throw new Error(
      `Could not convert ${source} to markdown format. The ${Array.from(extensions).join(
        ", "
      )} are not supported.`
    );
  }

  // NOTE: Inserts the converter at the beginning of the list
  /**
   * Register a converter with the highest priority.
   *
   * @param {{convert: Function}} converter - Object exposing
   *   `convert(path, options)`.
   */
  register_converter(converter) {
    this.converters.unshift(converter);
  }
}
1130
+
1131
+ export { MarkItDown };