markitdown-ts 0.0.6 → 0.0.8

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/README.md CHANGED
@@ -50,7 +50,7 @@ try {
50
50
  const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf");
51
51
 
52
52
  if (result) {
53
- console.log(result.text_content);
53
+ console.log(result.markdown);
54
54
  }
55
55
  } catch (error) {
56
56
  console.error("Conversion failed:", error);
@@ -158,6 +158,8 @@ class MarkItDown {
158
158
  export type ConverterResult =
159
159
  | {
160
160
  title: string | null;
161
+ markdown: string;
162
+ /** @deprecated Use `markdown` instead. */
161
163
  text_content: string;
162
164
  }
163
165
  | null
package/dist/index.cjs CHANGED
@@ -8,7 +8,7 @@ const TurndownService = require('turndown');
8
8
  const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
9
9
  const xmldom = require('@xmldom/xmldom');
10
10
  const url = require('url');
11
- const pdfTs = require('pdf-ts');
11
+ const pdfParse = require('pdf-parse');
12
12
  const Mammoth = require('mammoth');
13
13
  const XLSX = require('xlsx');
14
14
  const childProcess = require('child_process');
@@ -63,6 +63,7 @@ class PlainTextConverter {
63
63
  }
64
64
  return {
65
65
  title: null,
66
+ markdown: content,
66
67
  text_content: content
67
68
  };
68
69
  }
@@ -176,6 +177,7 @@ class HtmlConverter {
176
177
  }
177
178
  return {
178
179
  title: doc.title,
180
+ markdown: webpageText,
179
181
  text_content: webpageText
180
182
  };
181
183
  }
@@ -239,10 +241,7 @@ class RSSConverter {
239
241
  mdText += this._parseContent(entryContent);
240
242
  }
241
243
  }
242
- return {
243
- title,
244
- text_content: mdText
245
- };
244
+ return { title, markdown: mdText, text_content: mdText };
246
245
  } catch (error) {
247
246
  console.error("Atom Parsing Error:", error);
248
247
  return null;
@@ -290,10 +289,7 @@ class RSSConverter {
290
289
  mdText += this._parseContent(content);
291
290
  }
292
291
  }
293
- return {
294
- title: channelTitle,
295
- text_content: mdText
296
- };
292
+ return { title: channelTitle, markdown: mdText, text_content: mdText };
297
293
  } catch (error) {
298
294
  console.error("RSS Parsing Error:", error);
299
295
  return null;
@@ -363,10 +359,7 @@ class WikipediaConverter {
363
359
  } else {
364
360
  webpageText = new CustomTurnDown().convert_soup(doc);
365
361
  }
366
- return {
367
- title: mainTitle,
368
- text_content: webpageText
369
- };
362
+ return { title: mainTitle, markdown: webpageText, text_content: webpageText };
370
363
  }
371
364
  }
372
365
 
@@ -492,10 +485,7 @@ ${transcriptText}
492
485
  }
493
486
  }
494
487
  const finalTitle = title ? title : doc.title;
495
- return {
496
- title: finalTitle,
497
- text_content: webpageText
498
- };
488
+ return { title: finalTitle, markdown: webpageText, text_content: webpageText };
499
489
  }
500
490
  _get(metadata, keys, default_value) {
501
491
  for (const k of keys) {
@@ -573,10 +563,7 @@ ${sourceLines.join("")}
573
563
  }
574
564
  const mdText = mdOutput.join("\n\n");
575
565
  title = notebookContent.metadata?.title || title;
576
- return {
577
- title,
578
- text_content: mdText
579
- };
566
+ return { title, markdown: mdText, text_content: mdText };
580
567
  } catch (e) {
581
568
  console.error("Error converting .ipynb file:", e);
582
569
  throw new Error(`Error converting .ipynb file: ${e}`);
@@ -637,10 +624,7 @@ class BingSerpConverter {
637
624
  const webpageText = `## A Bing search for '${query}' found the following results:
638
625
 
639
626
  ${results.join("\n\n")}`;
640
- return {
641
- title: doc.title,
642
- text_content: webpageText
643
- };
627
+ return { title: doc.title, markdown: webpageText, text_content: webpageText };
644
628
  }
645
629
  _decodeBase64Url(encodedUrl) {
646
630
  let u = encodedUrl.slice(2).trim() + "==";
@@ -670,11 +654,10 @@ class PdfConverter {
670
654
  }
671
655
  async _convert(pdfContent) {
672
656
  try {
673
- const textContent = await pdfTs.pdfToText(pdfContent);
674
- return {
675
- title: null,
676
- text_content: textContent
677
- };
657
+ const parser = new pdfParse.PDFParse({ data: pdfContent });
658
+ const result = await parser.getText();
659
+ await parser.destroy();
660
+ return { title: null, markdown: result.text, text_content: result.text };
678
661
  } catch (error) {
679
662
  console.error("PDF Parsing Error:", error);
680
663
  return null;
@@ -730,12 +713,9 @@ class XlsxConverter extends HtmlConverter {
730
713
  mdContent += `## ${sheetName}
731
714
  `;
732
715
  let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
733
- mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
716
+ mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
734
717
  }
735
- return {
736
- title: workbook?.Props?.Title || "Untitled",
737
- text_content: mdContent
738
- };
718
+ return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
739
719
  } catch (e) {
740
720
  console.error(e);
741
721
  return null;
@@ -825,10 +805,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
825
805
  } else {
826
806
  mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
827
807
  }
828
- return {
829
- title: null,
830
- text_content: mdContent.trim()
831
- };
808
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
832
809
  }
833
810
  // TODO: Add speech to text
834
811
  async _transcribeAudio(_) {
@@ -895,10 +872,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
895
872
  } else {
896
873
  mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
897
874
  }
898
- return {
899
- title: null,
900
- text_content: mdContent.trim()
901
- };
875
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
902
876
  }
903
877
  }
904
878
 
@@ -950,10 +924,7 @@ class ImageConverter extends MediaConverter {
950
924
  ${(await this._getLLMDescription(imageBuffer, options)).trim()}
951
925
  `;
952
926
  }
953
- return {
954
- title: null,
955
- text_content: mdContent.trim()
956
- };
927
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
957
928
  }
958
929
  async _getLLMDescription(imageBuffer, options) {
959
930
  if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -989,6 +960,7 @@ class ZipConverter {
989
960
  if (!parentConverters) {
990
961
  return {
991
962
  title: null,
963
+ markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
992
964
  text_content: `[ERROR] No converters available to process zip contents from: ${source}`
993
965
  };
994
966
  }
@@ -1026,7 +998,7 @@ class ZipConverter {
1026
998
  mdResults.push(`
1027
999
  ## File: ${relativePath}
1028
1000
 
1029
- ${result.text_content}
1001
+ ${result.markdown}
1030
1002
 
1031
1003
  `);
1032
1004
  break;
@@ -1050,19 +1022,18 @@ ${result.text_content}
1050
1022
  inputStream.pipe(parser);
1051
1023
  });
1052
1024
  mdContent += mdResults.join("");
1053
- return {
1054
- title: null,
1055
- text_content: mdContent.trim()
1056
- };
1025
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
1057
1026
  } catch (error) {
1058
1027
  if (error.message.includes("invalid signature")) {
1059
1028
  return {
1060
1029
  title: null,
1030
+ markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
1061
1031
  text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
1062
1032
  };
1063
1033
  }
1064
1034
  return {
1065
1035
  title: null,
1036
+ markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
1066
1037
  text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
1067
1038
  };
1068
1039
  }
@@ -1180,8 +1151,8 @@ class MarkItDown {
1180
1151
  error = e;
1181
1152
  }
1182
1153
  if (res != null) {
1183
- res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1184
- res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
1154
+ res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1155
+ res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
1185
1156
  return res;
1186
1157
  }
1187
1158
  }
package/dist/index.d.cts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.d.mts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.d.ts CHANGED
@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
3
3
 
4
4
  type ConverterResult = {
5
5
  title: string | null;
6
+ markdown: string;
7
+ /** @deprecated Use `markdown` instead. */
6
8
  text_content: string;
7
9
  } | null | undefined;
8
10
  type ConverterOptions = {
package/dist/index.mjs CHANGED
@@ -8,7 +8,7 @@ import TurndownService from 'turndown';
8
8
  import turndownPluginGfm from '@joplin/turndown-plugin-gfm';
9
9
  import { DOMParser } from '@xmldom/xmldom';
10
10
  import { URL as URL$1 } from 'url';
11
- import { pdfToText } from 'pdf-ts';
11
+ import { PDFParse } from 'pdf-parse';
12
12
  import Mammoth from 'mammoth';
13
13
  import * as XLSX from 'xlsx';
14
14
  import * as childProcess from 'child_process';
@@ -35,6 +35,7 @@ class PlainTextConverter {
35
35
  }
36
36
  return {
37
37
  title: null,
38
+ markdown: content,
38
39
  text_content: content
39
40
  };
40
41
  }
@@ -148,6 +149,7 @@ class HtmlConverter {
148
149
  }
149
150
  return {
150
151
  title: doc.title,
152
+ markdown: webpageText,
151
153
  text_content: webpageText
152
154
  };
153
155
  }
@@ -211,10 +213,7 @@ class RSSConverter {
211
213
  mdText += this._parseContent(entryContent);
212
214
  }
213
215
  }
214
- return {
215
- title,
216
- text_content: mdText
217
- };
216
+ return { title, markdown: mdText, text_content: mdText };
218
217
  } catch (error) {
219
218
  console.error("Atom Parsing Error:", error);
220
219
  return null;
@@ -262,10 +261,7 @@ class RSSConverter {
262
261
  mdText += this._parseContent(content);
263
262
  }
264
263
  }
265
- return {
266
- title: channelTitle,
267
- text_content: mdText
268
- };
264
+ return { title: channelTitle, markdown: mdText, text_content: mdText };
269
265
  } catch (error) {
270
266
  console.error("RSS Parsing Error:", error);
271
267
  return null;
@@ -335,10 +331,7 @@ class WikipediaConverter {
335
331
  } else {
336
332
  webpageText = new CustomTurnDown().convert_soup(doc);
337
333
  }
338
- return {
339
- title: mainTitle,
340
- text_content: webpageText
341
- };
334
+ return { title: mainTitle, markdown: webpageText, text_content: webpageText };
342
335
  }
343
336
  }
344
337
 
@@ -464,10 +457,7 @@ ${transcriptText}
464
457
  }
465
458
  }
466
459
  const finalTitle = title ? title : doc.title;
467
- return {
468
- title: finalTitle,
469
- text_content: webpageText
470
- };
460
+ return { title: finalTitle, markdown: webpageText, text_content: webpageText };
471
461
  }
472
462
  _get(metadata, keys, default_value) {
473
463
  for (const k of keys) {
@@ -545,10 +535,7 @@ ${sourceLines.join("")}
545
535
  }
546
536
  const mdText = mdOutput.join("\n\n");
547
537
  title = notebookContent.metadata?.title || title;
548
- return {
549
- title,
550
- text_content: mdText
551
- };
538
+ return { title, markdown: mdText, text_content: mdText };
552
539
  } catch (e) {
553
540
  console.error("Error converting .ipynb file:", e);
554
541
  throw new Error(`Error converting .ipynb file: ${e}`);
@@ -609,10 +596,7 @@ class BingSerpConverter {
609
596
  const webpageText = `## A Bing search for '${query}' found the following results:
610
597
 
611
598
  ${results.join("\n\n")}`;
612
- return {
613
- title: doc.title,
614
- text_content: webpageText
615
- };
599
+ return { title: doc.title, markdown: webpageText, text_content: webpageText };
616
600
  }
617
601
  _decodeBase64Url(encodedUrl) {
618
602
  let u = encodedUrl.slice(2).trim() + "==";
@@ -642,11 +626,10 @@ class PdfConverter {
642
626
  }
643
627
  async _convert(pdfContent) {
644
628
  try {
645
- const textContent = await pdfToText(pdfContent);
646
- return {
647
- title: null,
648
- text_content: textContent
649
- };
629
+ const parser = new PDFParse({ data: pdfContent });
630
+ const result = await parser.getText();
631
+ await parser.destroy();
632
+ return { title: null, markdown: result.text, text_content: result.text };
650
633
  } catch (error) {
651
634
  console.error("PDF Parsing Error:", error);
652
635
  return null;
@@ -702,12 +685,9 @@ class XlsxConverter extends HtmlConverter {
702
685
  mdContent += `## ${sheetName}
703
686
  `;
704
687
  let htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]);
705
- mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
688
+ mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
706
689
  }
707
- return {
708
- title: workbook?.Props?.Title || "Untitled",
709
- text_content: mdContent
710
- };
690
+ return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
711
691
  } catch (e) {
712
692
  console.error(e);
713
693
  return null;
@@ -797,10 +777,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
797
777
  } else {
798
778
  mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
799
779
  }
800
- return {
801
- title: null,
802
- text_content: mdContent.trim()
803
- };
780
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
804
781
  }
805
782
  // TODO: Add speech to text
806
783
  async _transcribeAudio(_) {
@@ -867,10 +844,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
867
844
  } else {
868
845
  mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
869
846
  }
870
- return {
871
- title: null,
872
- text_content: mdContent.trim()
873
- };
847
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
874
848
  }
875
849
  }
876
850
 
@@ -922,10 +896,7 @@ class ImageConverter extends MediaConverter {
922
896
  ${(await this._getLLMDescription(imageBuffer, options)).trim()}
923
897
  `;
924
898
  }
925
- return {
926
- title: null,
927
- text_content: mdContent.trim()
928
- };
899
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
929
900
  }
930
901
  async _getLLMDescription(imageBuffer, options) {
931
902
  if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -961,6 +932,7 @@ class ZipConverter {
961
932
  if (!parentConverters) {
962
933
  return {
963
934
  title: null,
935
+ markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
964
936
  text_content: `[ERROR] No converters available to process zip contents from: ${source}`
965
937
  };
966
938
  }
@@ -998,7 +970,7 @@ class ZipConverter {
998
970
  mdResults.push(`
999
971
  ## File: ${relativePath}
1000
972
 
1001
- ${result.text_content}
973
+ ${result.markdown}
1002
974
 
1003
975
  `);
1004
976
  break;
@@ -1022,19 +994,18 @@ ${result.text_content}
1022
994
  inputStream.pipe(parser);
1023
995
  });
1024
996
  mdContent += mdResults.join("");
1025
- return {
1026
- title: null,
1027
- text_content: mdContent.trim()
1028
- };
997
+ return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
1029
998
  } catch (error) {
1030
999
  if (error.message.includes("invalid signature")) {
1031
1000
  return {
1032
1001
  title: null,
1002
+ markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
1033
1003
  text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
1034
1004
  };
1035
1005
  }
1036
1006
  return {
1037
1007
  title: null,
1008
+ markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
1038
1009
  text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
1039
1010
  };
1040
1011
  }
@@ -1152,8 +1123,8 @@ class MarkItDown {
1152
1123
  error = e;
1153
1124
  }
1154
1125
  if (res != null) {
1155
- res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1156
- res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
1126
+ res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
1127
+ res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
1157
1128
  return res;
1158
1129
  }
1159
1130
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "markitdown-ts",
3
- "version": "0.0.6",
3
+ "version": "0.0.8",
4
4
  "description": "",
5
5
  "keywords": [],
6
6
  "homepage": "https://github.com/dead8309/markitdown-ts#readme",
@@ -40,7 +40,8 @@
40
40
  "typescript": "^5.7.2",
41
41
  "unbuild": "^3.0.1",
42
42
  "vite": "^6.0.4",
43
- "vitest": "^2.1.8"
43
+ "vitest": "^2.1.8",
44
+ "zod": "^4.1.8"
44
45
  },
45
46
  "dependencies": {
46
47
  "@joplin/turndown-plugin-gfm": "^1.0.60",
@@ -49,7 +50,7 @@
49
50
  "jsdom": "^25.0.1",
50
51
  "mammoth": "^1.8.0",
51
52
  "mime-types": "^2.1.35",
52
- "pdf-ts": "^0.0.2",
53
+ "pdf-parse": "^2.4.5",
53
54
  "turndown": "^7.2.0",
54
55
  "xlsx": "^0.18.5"
55
56
  },