markitdown-ts 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -50,7 +50,7 @@ try {
50
50
  const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf");
51
51
 
52
52
  if (result) {
53
- console.log(result.text_content);
53
+ console.log(result.markdown);
54
54
  }
55
55
  } catch (error) {
56
56
  console.error("Conversion failed:", error);
@@ -158,6 +158,8 @@ class MarkItDown {
158
158
  export type ConverterResult =
159
159
  | {
160
160
  title: string | null;
161
+ markdown: string;
162
+ /** @deprecated Use `markdown` instead. */
161
163
  text_content: string;
162
164
  }
163
165
  | null
package/dist/index.cjs CHANGED
@@ -63,6 +63,7 @@ class PlainTextConverter {
63
63
  }
64
64
  return {
65
65
  title: null,
66
+ markdown: content,
66
67
  text_content: content
67
68
  };
68
69
  }
@@ -176,6 +177,7 @@ class HtmlConverter {
176
177
  }
177
178
  return {
178
179
  title: doc.title,
180
+ markdown: webpageText,
179
181
  text_content: webpageText
180
182
  };
181
183
  }
@@ -239,10 +241,7 @@ class RSSConverter {
239
241
  mdText += this._parseContent(entryContent);
240
242
  }
241
243
  }
242
- return {
243
- title,
244
- text_content: mdText
245
- };
244
+ return { title, markdown: mdText, text_content: mdText };
246
245
  } catch (error) {
247
246
  console.error("Atom Parsing Error:", error);
248
247
  return null;
@@ -290,10 +289,7 @@ class RSSConverter {
290
289
  mdText += this._parseContent(content);
291
290
  }
292
291
  }
293
- return {
294
- title: channelTitle,
295
- text_content: mdText
296
- };
292
+ return { title: channelTitle, markdown: mdText, text_content: mdText };
297
293
  } catch (error) {
298
294
  console.error("RSS Parsing Error:", error);
299
295
  return null;
@@ -363,10 +359,7 @@ class WikipediaConverter {
363
359
  } else {
364
360
  webpageText = new CustomTurnDown().convert_soup(doc);
365
361
  }
366
- return {
367
- title: mainTitle,
368
- text_content: webpageText
369
- };
362
+ return { title: mainTitle, markdown: webpageText, text_content: webpageText };
370
363
  }
371
364
  }
372
365
 
@@ -492,10 +485,7 @@ ${transcriptText}
492
485
  }
493
486
  }
494
487
  const finalTitle = title ? title : doc.title;
495
- return {
496
- title: finalTitle,
497
- text_content: webpageText
498
- };
488
+ return { title: finalTitle, markdown: webpageText, text_content: webpageText };
499
489
  }
500
490
  _get(metadata, keys, default_value) {
501
491
  for (const k of keys) {
@@ -573,10 +563,7 @@ ${sourceLines.join("")}
573
563
  }
574
564
  const mdText = mdOutput.join("\n\n");
575
565
  title = notebookContent.metadata?.title || title;
576
- return {
577
- title,
578
- text_content: mdText
579
- };
566
+ return { title, markdown: mdText, text_content: mdText };
580
567
  } catch (e) {
581
568
  console.error("Error converting .ipynb file:", e);
582
569
  throw new Error(`Error converting .ipynb file: ${e}`);
@@ -637,10 +624,7 @@ class BingSerpConverter {
637
624
  const webpageText = `## A Bing search for '${query}' found the following results:
638
625
 
639
626
  ${results.join("\n\n")}`;
640
- return {
641
- title: doc.title,
642
- text_content: webpageText
643
- };
627
+ return { title: doc.title, markdown: webpageText, text_content: webpageText };
644
628
  }
645
629
  _decodeBase64Url(encodedUrl) {
646
630
  let u = encodedUrl.slice(2).trim() + "==";
@@ -671,10 +655,7 @@ class PdfConverter {
671
655
  async _convert(pdfContent) {
672
656
  try {
673
657
  const textContent = await pdfTs.pdfToText(pdfContent);
674
- return {
675
- title: null,
676
- text_content: textContent
677
- };
658
+ return { title: null, markdown: textContent, text_content: textContent };
678
659
  } catch (error) {
679
660
  console.error("PDF Parsing Error:", error);
680
661
  return null;
@@ -730,12 +711,9 @@ class XlsxConverter extends HtmlConverter {
730
711
  mdContent += `## ${sheetName}
731
712
  `;
732
713
  let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
733
- mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
714
+ mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
734
715
  }
735
- return {
736
- title: workbook?.Props?.Title || "Untitled",
737
- text_content: mdContent
738
- };
716
+ return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
739
717
  } catch (e) {
740
718
  console.error(e);
741
719
  return null;
@@ -825,10 +803,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
825
803
  } else {
826
804
  mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
827
805
  }
828
- return {
829
- title: null,
830
- text_content: mdContent.trim()
831
- };
806
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
832
807
  }
833
808
  // TODO: Add speech to text
834
809
  async _transcribeAudio(_) {
@@ -895,10 +870,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
895
870
  } else {
896
871
  mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
897
872
  }
898
- return {
899
- title: null,
900
- text_content: mdContent.trim()
901
- };
873
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
902
874
  }
903
875
  }
904
876
 
@@ -950,10 +922,7 @@ class ImageConverter extends MediaConverter {
950
922
  ${(await this._getLLMDescription(imageBuffer, options)).trim()}
951
923
  `;
952
924
  }
953
- return {
954
- title: null,
955
- text_content: mdContent.trim()
956
- };
925
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
957
926
  }
958
927
  async _getLLMDescription(imageBuffer, options) {
959
928
  if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -989,6 +958,7 @@ class ZipConverter {
989
958
  if (!parentConverters) {
990
959
  return {
991
960
  title: null,
961
+ markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
992
962
  text_content: `[ERROR] No converters available to process zip contents from: ${source}`
993
963
  };
994
964
  }
@@ -1026,7 +996,7 @@ class ZipConverter {
1026
996
  mdResults.push(`
1027
997
  ## File: ${relativePath}
1028
998
 
1029
- ${result.text_content}
999
+ ${result.markdown}
1030
1000
 
1031
1001
  `);
1032
1002
  break;
@@ -1050,19 +1020,18 @@ ${result.text_content}
1050
1020
  inputStream.pipe(parser);
1051
1021
  });
1052
1022
  mdContent += mdResults.join("");
1053
- return {
1054
- title: null,
1055
- text_content: mdContent.trim()
1056
- };
1023
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
1057
1024
  } catch (error) {
1058
1025
  if (error.message.includes("invalid signature")) {
1059
1026
  return {
1060
1027
  title: null,
1028
+ markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
1061
1029
  text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
1062
1030
  };
1063
1031
  }
1064
1032
  return {
1065
1033
  title: null,
1034
+ markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
1066
1035
  text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
1067
1036
  };
1068
1037
  }
@@ -1180,8 +1149,8 @@ class MarkItDown {
1180
1149
  error = e;
1181
1150
  }
1182
1151
  if (res != null) {
1183
- res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1184
- res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
1152
+ res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1153
+ res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
1185
1154
  return res;
1186
1155
  }
1187
1156
  }
package/dist/index.d.cts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.d.mts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.d.ts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.mjs CHANGED
@@ -35,6 +35,7 @@ class PlainTextConverter {
35
35
  }
36
36
  return {
37
37
  title: null,
38
+ markdown: content,
38
39
  text_content: content
39
40
  };
40
41
  }
@@ -148,6 +149,7 @@ class HtmlConverter {
148
149
  }
149
150
  return {
150
151
  title: doc.title,
152
+ markdown: webpageText,
151
153
  text_content: webpageText
152
154
  };
153
155
  }
@@ -211,10 +213,7 @@ class RSSConverter {
211
213
  mdText += this._parseContent(entryContent);
212
214
  }
213
215
  }
214
- return {
215
- title,
216
- text_content: mdText
217
- };
216
+ return { title, markdown: mdText, text_content: mdText };
218
217
  } catch (error) {
219
218
  console.error("Atom Parsing Error:", error);
220
219
  return null;
@@ -262,10 +261,7 @@ class RSSConverter {
262
261
  mdText += this._parseContent(content);
263
262
  }
264
263
  }
265
- return {
266
- title: channelTitle,
267
- text_content: mdText
268
- };
264
+ return { title: channelTitle, markdown: mdText, text_content: mdText };
269
265
  } catch (error) {
270
266
  console.error("RSS Parsing Error:", error);
271
267
  return null;
@@ -335,10 +331,7 @@ class WikipediaConverter {
335
331
  } else {
336
332
  webpageText = new CustomTurnDown().convert_soup(doc);
337
333
  }
338
- return {
339
- title: mainTitle,
340
- text_content: webpageText
341
- };
334
+ return { title: mainTitle, markdown: webpageText, text_content: webpageText };
342
335
  }
343
336
  }
344
337
 
@@ -464,10 +457,7 @@ ${transcriptText}
464
457
  }
465
458
  }
466
459
  const finalTitle = title ? title : doc.title;
467
- return {
468
- title: finalTitle,
469
- text_content: webpageText
470
- };
460
+ return { title: finalTitle, markdown: webpageText, text_content: webpageText };
471
461
  }
472
462
  _get(metadata, keys, default_value) {
473
463
  for (const k of keys) {
@@ -545,10 +535,7 @@ ${sourceLines.join("")}
545
535
  }
546
536
  const mdText = mdOutput.join("\n\n");
547
537
  title = notebookContent.metadata?.title || title;
548
- return {
549
- title,
550
- text_content: mdText
551
- };
538
+ return { title, markdown: mdText, text_content: mdText };
552
539
  } catch (e) {
553
540
  console.error("Error converting .ipynb file:", e);
554
541
  throw new Error(`Error converting .ipynb file: ${e}`);
@@ -609,10 +596,7 @@ class BingSerpConverter {
609
596
  const webpageText = `## A Bing search for '${query}' found the following results:
610
597
 
611
598
  ${results.join("\n\n")}`;
612
- return {
613
- title: doc.title,
614
- text_content: webpageText
615
- };
599
+ return { title: doc.title, markdown: webpageText, text_content: webpageText };
616
600
  }
617
601
  _decodeBase64Url(encodedUrl) {
618
602
  let u = encodedUrl.slice(2).trim() + "==";
@@ -643,10 +627,7 @@ class PdfConverter {
643
627
  async _convert(pdfContent) {
644
628
  try {
645
629
  const textContent = await pdfToText(pdfContent);
646
- return {
647
- title: null,
648
- text_content: textContent
649
- };
630
+ return { title: null, markdown: textContent, text_content: textContent };
650
631
  } catch (error) {
651
632
  console.error("PDF Parsing Error:", error);
652
633
  return null;
@@ -702,12 +683,9 @@ class XlsxConverter extends HtmlConverter {
702
683
  mdContent += `## ${sheetName}
703
684
  `;
704
685
  let htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]);
705
- mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
686
+ mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
706
687
  }
707
- return {
708
- title: workbook?.Props?.Title || "Untitled",
709
- text_content: mdContent
710
- };
688
+ return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
711
689
  } catch (e) {
712
690
  console.error(e);
713
691
  return null;
@@ -797,10 +775,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
797
775
  } else {
798
776
  mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
799
777
  }
800
- return {
801
- title: null,
802
- text_content: mdContent.trim()
803
- };
778
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
804
779
  }
805
780
  // TODO: Add speech to text
806
781
  async _transcribeAudio(_) {
@@ -867,10 +842,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
867
842
  } else {
868
843
  mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
869
844
  }
870
- return {
871
- title: null,
872
- text_content: mdContent.trim()
873
- };
845
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
874
846
  }
875
847
  }
876
848
 
@@ -922,10 +894,7 @@ class ImageConverter extends MediaConverter {
922
894
  ${(await this._getLLMDescription(imageBuffer, options)).trim()}
923
895
  `;
924
896
  }
925
- return {
926
- title: null,
927
- text_content: mdContent.trim()
928
- };
897
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
929
898
  }
930
899
  async _getLLMDescription(imageBuffer, options) {
931
900
  if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -961,6 +930,7 @@ class ZipConverter {
961
930
  if (!parentConverters) {
962
931
  return {
963
932
  title: null,
933
+ markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
964
934
  text_content: `[ERROR] No converters available to process zip contents from: ${source}`
965
935
  };
966
936
  }
@@ -998,7 +968,7 @@ class ZipConverter {
998
968
  mdResults.push(`
999
969
  ## File: ${relativePath}
1000
970
 
1001
- ${result.text_content}
971
+ ${result.markdown}
1002
972
 
1003
973
  `);
1004
974
  break;
@@ -1022,19 +992,18 @@ ${result.text_content}
1022
992
  inputStream.pipe(parser);
1023
993
  });
1024
994
  mdContent += mdResults.join("");
1025
- return {
1026
- title: null,
1027
- text_content: mdContent.trim()
1028
- };
995
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
1029
996
  } catch (error) {
1030
997
  if (error.message.includes("invalid signature")) {
1031
998
  return {
1032
999
  title: null,
1000
+ markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
1033
1001
  text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
1034
1002
  };
1035
1003
  }
1036
1004
  return {
1037
1005
  title: null,
1006
+ markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
1038
1007
  text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
1039
1008
  };
1040
1009
  }
@@ -1152,8 +1121,8 @@ class MarkItDown {
1152
1121
  error = e;
1153
1122
  }
1154
1123
  if (res != null) {
1155
- res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1156
- res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
1124
+ res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1125
+ res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
1157
1126
  return res;
1158
1127
  }
1159
1128
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markitdown-ts",
3
- "version": "0.0.6",
3
+ "version": "0.0.7",
4
4
  "description": "",
5
5
  "keywords": [],
6
6
  "homepage": "https://github.com/dead8309/markitdown-ts#readme",
@@ -34,6 +34,7 @@
34
34
  "@types/node": "^22.10.2",
35
35
  "@types/turndown": "^5.0.5",
36
36
  "@types/unzipper": "^0.10.10",
37
+ "zod": "^4.1.8",
37
38
  "bumpp": "^9.9.1",
38
39
  "is-ci": "^4.1.0",
39
40
  "prettier": "^3.4.2",