markitdown-ts 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/dist/index.cjs +21 -52
- package/dist/index.d.cts +2 -0
- package/dist/index.d.mts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.mjs +21 -52
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -50,7 +50,7 @@ try {
|
|
|
50
50
|
const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf");
|
|
51
51
|
|
|
52
52
|
if (result) {
|
|
53
|
-
console.log(result.text_content);
|
|
53
|
+
console.log(result.markdown);
|
|
54
54
|
}
|
|
55
55
|
} catch (error) {
|
|
56
56
|
console.error("Conversion failed:", error);
|
|
@@ -158,6 +158,8 @@ class MarkItDown {
|
|
|
158
158
|
export type ConverterResult =
|
|
159
159
|
| {
|
|
160
160
|
title: string | null;
|
|
161
|
+
markdown: string;
|
|
162
|
+
/** @deprecated Use `markdown` instead. */
|
|
161
163
|
text_content: string;
|
|
162
164
|
}
|
|
163
165
|
| null
|
package/dist/index.cjs
CHANGED
|
@@ -63,6 +63,7 @@ class PlainTextConverter {
|
|
|
63
63
|
}
|
|
64
64
|
return {
|
|
65
65
|
title: null,
|
|
66
|
+
markdown: content,
|
|
66
67
|
text_content: content
|
|
67
68
|
};
|
|
68
69
|
}
|
|
@@ -176,6 +177,7 @@ class HtmlConverter {
|
|
|
176
177
|
}
|
|
177
178
|
return {
|
|
178
179
|
title: doc.title,
|
|
180
|
+
markdown: webpageText,
|
|
179
181
|
text_content: webpageText
|
|
180
182
|
};
|
|
181
183
|
}
|
|
@@ -239,10 +241,7 @@ class RSSConverter {
|
|
|
239
241
|
mdText += this._parseContent(entryContent);
|
|
240
242
|
}
|
|
241
243
|
}
|
|
242
|
-
return {
|
|
243
|
-
title,
|
|
244
|
-
text_content: mdText
|
|
245
|
-
};
|
|
244
|
+
return { title, markdown: mdText, text_content: mdText };
|
|
246
245
|
} catch (error) {
|
|
247
246
|
console.error("Atom Parsing Error:", error);
|
|
248
247
|
return null;
|
|
@@ -290,10 +289,7 @@ class RSSConverter {
|
|
|
290
289
|
mdText += this._parseContent(content);
|
|
291
290
|
}
|
|
292
291
|
}
|
|
293
|
-
return {
|
|
294
|
-
title: channelTitle,
|
|
295
|
-
text_content: mdText
|
|
296
|
-
};
|
|
292
|
+
return { title: channelTitle, markdown: mdText, text_content: mdText };
|
|
297
293
|
} catch (error) {
|
|
298
294
|
console.error("RSS Parsing Error:", error);
|
|
299
295
|
return null;
|
|
@@ -363,10 +359,7 @@ class WikipediaConverter {
|
|
|
363
359
|
} else {
|
|
364
360
|
webpageText = new CustomTurnDown().convert_soup(doc);
|
|
365
361
|
}
|
|
366
|
-
return {
|
|
367
|
-
title: mainTitle,
|
|
368
|
-
text_content: webpageText
|
|
369
|
-
};
|
|
362
|
+
return { title: mainTitle, markdown: webpageText, text_content: webpageText };
|
|
370
363
|
}
|
|
371
364
|
}
|
|
372
365
|
|
|
@@ -492,10 +485,7 @@ ${transcriptText}
|
|
|
492
485
|
}
|
|
493
486
|
}
|
|
494
487
|
const finalTitle = title ? title : doc.title;
|
|
495
|
-
return {
|
|
496
|
-
title: finalTitle,
|
|
497
|
-
text_content: webpageText
|
|
498
|
-
};
|
|
488
|
+
return { title: finalTitle, markdown: webpageText, text_content: webpageText };
|
|
499
489
|
}
|
|
500
490
|
_get(metadata, keys, default_value) {
|
|
501
491
|
for (const k of keys) {
|
|
@@ -573,10 +563,7 @@ ${sourceLines.join("")}
|
|
|
573
563
|
}
|
|
574
564
|
const mdText = mdOutput.join("\n\n");
|
|
575
565
|
title = notebookContent.metadata?.title || title;
|
|
576
|
-
return {
|
|
577
|
-
title,
|
|
578
|
-
text_content: mdText
|
|
579
|
-
};
|
|
566
|
+
return { title, markdown: mdText, text_content: mdText };
|
|
580
567
|
} catch (e) {
|
|
581
568
|
console.error("Error converting .ipynb file:", e);
|
|
582
569
|
throw new Error(`Error converting .ipynb file: ${e}`);
|
|
@@ -637,10 +624,7 @@ class BingSerpConverter {
|
|
|
637
624
|
const webpageText = `## A Bing search for '${query}' found the following results:
|
|
638
625
|
|
|
639
626
|
${results.join("\n\n")}`;
|
|
640
|
-
return {
|
|
641
|
-
title: doc.title,
|
|
642
|
-
text_content: webpageText
|
|
643
|
-
};
|
|
627
|
+
return { title: doc.title, markdown: webpageText, text_content: webpageText };
|
|
644
628
|
}
|
|
645
629
|
_decodeBase64Url(encodedUrl) {
|
|
646
630
|
let u = encodedUrl.slice(2).trim() + "==";
|
|
@@ -671,10 +655,7 @@ class PdfConverter {
|
|
|
671
655
|
async _convert(pdfContent) {
|
|
672
656
|
try {
|
|
673
657
|
const textContent = await pdfTs.pdfToText(pdfContent);
|
|
674
|
-
return {
|
|
675
|
-
title: null,
|
|
676
|
-
text_content: textContent
|
|
677
|
-
};
|
|
658
|
+
return { title: null, markdown: textContent, text_content: textContent };
|
|
678
659
|
} catch (error) {
|
|
679
660
|
console.error("PDF Parsing Error:", error);
|
|
680
661
|
return null;
|
|
@@ -730,12 +711,9 @@ class XlsxConverter extends HtmlConverter {
|
|
|
730
711
|
mdContent += `## ${sheetName}
|
|
731
712
|
`;
|
|
732
713
|
let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
|
|
733
|
-
mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
|
|
714
|
+
mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
|
|
734
715
|
}
|
|
735
|
-
return {
|
|
736
|
-
title: workbook?.Props?.Title || "Untitled",
|
|
737
|
-
text_content: mdContent
|
|
738
|
-
};
|
|
716
|
+
return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
|
|
739
717
|
} catch (e) {
|
|
740
718
|
console.error(e);
|
|
741
719
|
return null;
|
|
@@ -825,10 +803,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
|
|
|
825
803
|
} else {
|
|
826
804
|
mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
|
|
827
805
|
}
|
|
828
|
-
return {
|
|
829
|
-
title: null,
|
|
830
|
-
text_content: mdContent.trim()
|
|
831
|
-
};
|
|
806
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
832
807
|
}
|
|
833
808
|
// TODO: Add speech to text
|
|
834
809
|
async _transcribeAudio(_) {
|
|
@@ -895,10 +870,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
|
|
|
895
870
|
} else {
|
|
896
871
|
mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
|
|
897
872
|
}
|
|
898
|
-
return {
|
|
899
|
-
title: null,
|
|
900
|
-
text_content: mdContent.trim()
|
|
901
|
-
};
|
|
873
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
902
874
|
}
|
|
903
875
|
}
|
|
904
876
|
|
|
@@ -950,10 +922,7 @@ class ImageConverter extends MediaConverter {
|
|
|
950
922
|
${(await this._getLLMDescription(imageBuffer, options)).trim()}
|
|
951
923
|
`;
|
|
952
924
|
}
|
|
953
|
-
return {
|
|
954
|
-
title: null,
|
|
955
|
-
text_content: mdContent.trim()
|
|
956
|
-
};
|
|
925
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
957
926
|
}
|
|
958
927
|
async _getLLMDescription(imageBuffer, options) {
|
|
959
928
|
if (!options.llmPrompt || options.llmPrompt.trim() === "") {
|
|
@@ -989,6 +958,7 @@ class ZipConverter {
|
|
|
989
958
|
if (!parentConverters) {
|
|
990
959
|
return {
|
|
991
960
|
title: null,
|
|
961
|
+
markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
|
|
992
962
|
text_content: `[ERROR] No converters available to process zip contents from: ${source}`
|
|
993
963
|
};
|
|
994
964
|
}
|
|
@@ -1026,7 +996,7 @@ class ZipConverter {
|
|
|
1026
996
|
mdResults.push(`
|
|
1027
997
|
## File: ${relativePath}
|
|
1028
998
|
|
|
1029
|
-
${result.text_content}
|
|
999
|
+
${result.markdown}
|
|
1030
1000
|
|
|
1031
1001
|
`);
|
|
1032
1002
|
break;
|
|
@@ -1050,19 +1020,18 @@ ${result.text_content}
|
|
|
1050
1020
|
inputStream.pipe(parser);
|
|
1051
1021
|
});
|
|
1052
1022
|
mdContent += mdResults.join("");
|
|
1053
|
-
return {
|
|
1054
|
-
title: null,
|
|
1055
|
-
text_content: mdContent.trim()
|
|
1056
|
-
};
|
|
1023
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
1057
1024
|
} catch (error) {
|
|
1058
1025
|
if (error.message.includes("invalid signature")) {
|
|
1059
1026
|
return {
|
|
1060
1027
|
title: null,
|
|
1028
|
+
markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
|
|
1061
1029
|
text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
|
|
1062
1030
|
};
|
|
1063
1031
|
}
|
|
1064
1032
|
return {
|
|
1065
1033
|
title: null,
|
|
1034
|
+
markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
|
|
1066
1035
|
text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
|
|
1067
1036
|
};
|
|
1068
1037
|
}
|
|
@@ -1180,8 +1149,8 @@ class MarkItDown {
|
|
|
1180
1149
|
error = e;
|
|
1181
1150
|
}
|
|
1182
1151
|
if (res != null) {
|
|
1183
|
-
res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
|
|
1184
|
-
res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
|
|
1152
|
+
res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
|
|
1153
|
+
res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
|
|
1185
1154
|
return res;
|
|
1186
1155
|
}
|
|
1187
1156
|
}
|
package/dist/index.d.cts
CHANGED
package/dist/index.d.mts
CHANGED
package/dist/index.d.ts
CHANGED
package/dist/index.mjs
CHANGED
|
@@ -35,6 +35,7 @@ class PlainTextConverter {
|
|
|
35
35
|
}
|
|
36
36
|
return {
|
|
37
37
|
title: null,
|
|
38
|
+
markdown: content,
|
|
38
39
|
text_content: content
|
|
39
40
|
};
|
|
40
41
|
}
|
|
@@ -148,6 +149,7 @@ class HtmlConverter {
|
|
|
148
149
|
}
|
|
149
150
|
return {
|
|
150
151
|
title: doc.title,
|
|
152
|
+
markdown: webpageText,
|
|
151
153
|
text_content: webpageText
|
|
152
154
|
};
|
|
153
155
|
}
|
|
@@ -211,10 +213,7 @@ class RSSConverter {
|
|
|
211
213
|
mdText += this._parseContent(entryContent);
|
|
212
214
|
}
|
|
213
215
|
}
|
|
214
|
-
return {
|
|
215
|
-
title,
|
|
216
|
-
text_content: mdText
|
|
217
|
-
};
|
|
216
|
+
return { title, markdown: mdText, text_content: mdText };
|
|
218
217
|
} catch (error) {
|
|
219
218
|
console.error("Atom Parsing Error:", error);
|
|
220
219
|
return null;
|
|
@@ -262,10 +261,7 @@ class RSSConverter {
|
|
|
262
261
|
mdText += this._parseContent(content);
|
|
263
262
|
}
|
|
264
263
|
}
|
|
265
|
-
return {
|
|
266
|
-
title: channelTitle,
|
|
267
|
-
text_content: mdText
|
|
268
|
-
};
|
|
264
|
+
return { title: channelTitle, markdown: mdText, text_content: mdText };
|
|
269
265
|
} catch (error) {
|
|
270
266
|
console.error("RSS Parsing Error:", error);
|
|
271
267
|
return null;
|
|
@@ -335,10 +331,7 @@ class WikipediaConverter {
|
|
|
335
331
|
} else {
|
|
336
332
|
webpageText = new CustomTurnDown().convert_soup(doc);
|
|
337
333
|
}
|
|
338
|
-
return {
|
|
339
|
-
title: mainTitle,
|
|
340
|
-
text_content: webpageText
|
|
341
|
-
};
|
|
334
|
+
return { title: mainTitle, markdown: webpageText, text_content: webpageText };
|
|
342
335
|
}
|
|
343
336
|
}
|
|
344
337
|
|
|
@@ -464,10 +457,7 @@ ${transcriptText}
|
|
|
464
457
|
}
|
|
465
458
|
}
|
|
466
459
|
const finalTitle = title ? title : doc.title;
|
|
467
|
-
return {
|
|
468
|
-
title: finalTitle,
|
|
469
|
-
text_content: webpageText
|
|
470
|
-
};
|
|
460
|
+
return { title: finalTitle, markdown: webpageText, text_content: webpageText };
|
|
471
461
|
}
|
|
472
462
|
_get(metadata, keys, default_value) {
|
|
473
463
|
for (const k of keys) {
|
|
@@ -545,10 +535,7 @@ ${sourceLines.join("")}
|
|
|
545
535
|
}
|
|
546
536
|
const mdText = mdOutput.join("\n\n");
|
|
547
537
|
title = notebookContent.metadata?.title || title;
|
|
548
|
-
return {
|
|
549
|
-
title,
|
|
550
|
-
text_content: mdText
|
|
551
|
-
};
|
|
538
|
+
return { title, markdown: mdText, text_content: mdText };
|
|
552
539
|
} catch (e) {
|
|
553
540
|
console.error("Error converting .ipynb file:", e);
|
|
554
541
|
throw new Error(`Error converting .ipynb file: ${e}`);
|
|
@@ -609,10 +596,7 @@ class BingSerpConverter {
|
|
|
609
596
|
const webpageText = `## A Bing search for '${query}' found the following results:
|
|
610
597
|
|
|
611
598
|
${results.join("\n\n")}`;
|
|
612
|
-
return {
|
|
613
|
-
title: doc.title,
|
|
614
|
-
text_content: webpageText
|
|
615
|
-
};
|
|
599
|
+
return { title: doc.title, markdown: webpageText, text_content: webpageText };
|
|
616
600
|
}
|
|
617
601
|
_decodeBase64Url(encodedUrl) {
|
|
618
602
|
let u = encodedUrl.slice(2).trim() + "==";
|
|
@@ -643,10 +627,7 @@ class PdfConverter {
|
|
|
643
627
|
async _convert(pdfContent) {
|
|
644
628
|
try {
|
|
645
629
|
const textContent = await pdfToText(pdfContent);
|
|
646
|
-
return {
|
|
647
|
-
title: null,
|
|
648
|
-
text_content: textContent
|
|
649
|
-
};
|
|
630
|
+
return { title: null, markdown: textContent, text_content: textContent };
|
|
650
631
|
} catch (error) {
|
|
651
632
|
console.error("PDF Parsing Error:", error);
|
|
652
633
|
return null;
|
|
@@ -702,12 +683,9 @@ class XlsxConverter extends HtmlConverter {
|
|
|
702
683
|
mdContent += `## ${sheetName}
|
|
703
684
|
`;
|
|
704
685
|
let htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]);
|
|
705
|
-
mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
|
|
686
|
+
mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
|
|
706
687
|
}
|
|
707
|
-
return {
|
|
708
|
-
title: workbook?.Props?.Title || "Untitled",
|
|
709
|
-
text_content: mdContent
|
|
710
|
-
};
|
|
688
|
+
return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
|
|
711
689
|
} catch (e) {
|
|
712
690
|
console.error(e);
|
|
713
691
|
return null;
|
|
@@ -797,10 +775,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
|
|
|
797
775
|
} else {
|
|
798
776
|
mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
|
|
799
777
|
}
|
|
800
|
-
return {
|
|
801
|
-
title: null,
|
|
802
|
-
text_content: mdContent.trim()
|
|
803
|
-
};
|
|
778
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
804
779
|
}
|
|
805
780
|
// TODO: Add speech to text
|
|
806
781
|
async _transcribeAudio(_) {
|
|
@@ -867,10 +842,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
|
|
|
867
842
|
} else {
|
|
868
843
|
mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
|
|
869
844
|
}
|
|
870
|
-
return {
|
|
871
|
-
title: null,
|
|
872
|
-
text_content: mdContent.trim()
|
|
873
|
-
};
|
|
845
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
874
846
|
}
|
|
875
847
|
}
|
|
876
848
|
|
|
@@ -922,10 +894,7 @@ class ImageConverter extends MediaConverter {
|
|
|
922
894
|
${(await this._getLLMDescription(imageBuffer, options)).trim()}
|
|
923
895
|
`;
|
|
924
896
|
}
|
|
925
|
-
return {
|
|
926
|
-
title: null,
|
|
927
|
-
text_content: mdContent.trim()
|
|
928
|
-
};
|
|
897
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
929
898
|
}
|
|
930
899
|
async _getLLMDescription(imageBuffer, options) {
|
|
931
900
|
if (!options.llmPrompt || options.llmPrompt.trim() === "") {
|
|
@@ -961,6 +930,7 @@ class ZipConverter {
|
|
|
961
930
|
if (!parentConverters) {
|
|
962
931
|
return {
|
|
963
932
|
title: null,
|
|
933
|
+
markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
|
|
964
934
|
text_content: `[ERROR] No converters available to process zip contents from: ${source}`
|
|
965
935
|
};
|
|
966
936
|
}
|
|
@@ -998,7 +968,7 @@ class ZipConverter {
|
|
|
998
968
|
mdResults.push(`
|
|
999
969
|
## File: ${relativePath}
|
|
1000
970
|
|
|
1001
|
-
${result.text_content}
|
|
971
|
+
${result.markdown}
|
|
1002
972
|
|
|
1003
973
|
`);
|
|
1004
974
|
break;
|
|
@@ -1022,19 +992,18 @@ ${result.text_content}
|
|
|
1022
992
|
inputStream.pipe(parser);
|
|
1023
993
|
});
|
|
1024
994
|
mdContent += mdResults.join("");
|
|
1025
|
-
return {
|
|
1026
|
-
title: null,
|
|
1027
|
-
text_content: mdContent.trim()
|
|
1028
|
-
};
|
|
995
|
+
return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
|
|
1029
996
|
} catch (error) {
|
|
1030
997
|
if (error.message.includes("invalid signature")) {
|
|
1031
998
|
return {
|
|
1032
999
|
title: null,
|
|
1000
|
+
markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
|
|
1033
1001
|
text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
|
|
1034
1002
|
};
|
|
1035
1003
|
}
|
|
1036
1004
|
return {
|
|
1037
1005
|
title: null,
|
|
1006
|
+
markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
|
|
1038
1007
|
text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
|
|
1039
1008
|
};
|
|
1040
1009
|
}
|
|
@@ -1152,8 +1121,8 @@ class MarkItDown {
|
|
|
1152
1121
|
error = e;
|
|
1153
1122
|
}
|
|
1154
1123
|
if (res != null) {
|
|
1155
|
-
res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
|
|
1156
|
-
res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
|
|
1124
|
+
res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
|
|
1125
|
+
res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
|
|
1157
1126
|
return res;
|
|
1158
1127
|
}
|
|
1159
1128
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markitdown-ts",
|
|
3
|
-
"version": "0.0.6",
|
|
3
|
+
"version": "0.0.7",
|
|
4
4
|
"description": "",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"homepage": "https://github.com/dead8309/markitdown-ts#readme",
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
"@types/node": "^22.10.2",
|
|
35
35
|
"@types/turndown": "^5.0.5",
|
|
36
36
|
"@types/unzipper": "^0.10.10",
|
|
37
|
+
"zod": "^4.1.8",
|
|
37
38
|
"bumpp": "^9.9.1",
|
|
38
39
|
"is-ci": "^4.1.0",
|
|
39
40
|
"prettier": "^3.4.2",
|