npm - markitdown-ts - Versions diffs - 0.0.6 → 0.0.8 - Mend

markitdown-ts 0.0.6 → 0.0.8

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md CHANGED Viewed

@@ -50,7 +50,7 @@ try {
   const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf");
   if (result) {
-    console.log(result.text_content);
+    console.log(result.markdown);
   }
 } catch (error) {
   console.error("Conversion failed:", error);
@@ -158,6 +158,8 @@ class MarkItDown {
 export type ConverterResult =
   | {
       title: string | null;
+      markdown: string;
+      /** @deprecated Use `markdown` instead. */
       text_content: string;
     }
   | null

package/dist/index.cjs CHANGED Viewed

@@ -8,7 +8,7 @@ const TurndownService = require('turndown');
 const turndownPluginGfm = require('@joplin/turndown-plugin-gfm');
 const xmldom = require('@xmldom/xmldom');
 const url = require('url');
-const pdfTs = require('pdf-ts');
+const pdfParse = require('pdf-parse');
 const Mammoth = require('mammoth');
 const XLSX = require('xlsx');
 const childProcess = require('child_process');
@@ -63,6 +63,7 @@ class PlainTextConverter {
     }
     return {
       title: null,
+      markdown: content,
       text_content: content
     };
   }
@@ -176,6 +177,7 @@ class HtmlConverter {
     }
     return {
       title: doc.title,
+      markdown: webpageText,
       text_content: webpageText
     };
   }
@@ -239,10 +241,7 @@ class RSSConverter {
           mdText += this._parseContent(entryContent);
         }
       }
-      return {
-        title,
-        text_content: mdText
-      };
+      return { title, markdown: mdText, text_content: mdText };
     } catch (error) {
       console.error("Atom Parsing Error:", error);
       return null;
@@ -290,10 +289,7 @@ class RSSConverter {
           mdText += this._parseContent(content);
         }
       }
-      return {
-        title: channelTitle,
-        text_content: mdText
-      };
+      return { title: channelTitle, markdown: mdText, text_content: mdText };
     } catch (error) {
       console.error("RSS Parsing Error:", error);
       return null;
@@ -363,10 +359,7 @@ class WikipediaConverter {
     } else {
       webpageText = new CustomTurnDown().convert_soup(doc);
     }
-    return {
-      title: mainTitle,
-      text_content: webpageText
-    };
+    return { title: mainTitle, markdown: webpageText, text_content: webpageText };
   }
 }
@@ -492,10 +485,7 @@ ${transcriptText}
       }
     }
     const finalTitle = title ? title : doc.title;
-    return {
-      title: finalTitle,
-      text_content: webpageText
-    };
+    return { title: finalTitle, markdown: webpageText, text_content: webpageText };
   }
   _get(metadata, keys, default_value) {
     for (const k of keys) {
@@ -573,10 +563,7 @@ ${sourceLines.join("")}
       }
       const mdText = mdOutput.join("\n\n");
       title = notebookContent.metadata?.title || title;
-      return {
-        title,
-        text_content: mdText
-      };
+      return { title, markdown: mdText, text_content: mdText };
     } catch (e) {
       console.error("Error converting .ipynb file:", e);
       throw new Error(`Error converting .ipynb file: ${e}`);
@@ -637,10 +624,7 @@ class BingSerpConverter {
     const webpageText = `## A Bing search for '${query}' found the following results:
 ${results.join("\n\n")}`;
-    return {
-      title: doc.title,
-      text_content: webpageText
-    };
+    return { title: doc.title, markdown: webpageText, text_content: webpageText };
   }
   _decodeBase64Url(encodedUrl) {
     let u = encodedUrl.slice(2).trim() + "==";
@@ -670,11 +654,10 @@ class PdfConverter {
   }
   async _convert(pdfContent) {
     try {
-      const textContent = await pdfTs.pdfToText(pdfContent);
-      return {
-        title: null,
-        text_content: textContent
-      };
+      const parser = new pdfParse.PDFParse({ data: pdfContent });
+      const result = await parser.getText();
+      await parser.destroy();
+      return { title: null, markdown: result.text, text_content: result.text };
     } catch (error) {
       console.error("PDF Parsing Error:", error);
       return null;
@@ -730,12 +713,9 @@ class XlsxConverter extends HtmlConverter {
         mdContent += `## ${sheetName}
 `;
         let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
-        mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
+        mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
       }
-      return {
-        title: workbook?.Props?.Title || "Untitled",
-        text_content: mdContent
-      };
+      return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
     } catch (e) {
       console.error(e);
       return null;
@@ -825,10 +805,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
     } else {
       mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
   // TODO: Add speech to text
   async _transcribeAudio(_) {
@@ -895,10 +872,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
     } else {
       mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
 }
@@ -950,10 +924,7 @@ class ImageConverter extends MediaConverter {
 ${(await this._getLLMDescription(imageBuffer, options)).trim()}
 `;
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
   async _getLLMDescription(imageBuffer, options) {
     if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -989,6 +960,7 @@ class ZipConverter {
     if (!parentConverters) {
       return {
         title: null,
+        markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
         text_content: `[ERROR] No converters available to process zip contents from: ${source}`
       };
     }
@@ -1026,7 +998,7 @@ class ZipConverter {
               mdResults.push(`
 ## File: ${relativePath}
-${result.text_content}
+${result.markdown}
 `);
               break;
@@ -1050,19 +1022,18 @@ ${result.text_content}
         inputStream.pipe(parser);
       });
       mdContent += mdResults.join("");
-      return {
-        title: null,
-        text_content: mdContent.trim()
-      };
+      return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
     } catch (error) {
       if (error.message.includes("invalid signature")) {
         return {
           title: null,
+          markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
           text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
         };
       }
       return {
         title: null,
+        markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
         text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
       };
     }
@@ -1180,8 +1151,8 @@ class MarkItDown {
           error = e;
         }
         if (res != null) {
-          res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
-          res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
+          res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
+          res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
           return res;
         }
       }

package/dist/index.d.cts CHANGED Viewed

@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
 type ConverterResult = {
     title: string | null;
+    markdown: string;
+    /** @deprecated Use `markdown` instead. */
     text_content: string;
 } | null | undefined;
 type ConverterOptions = {

package/dist/index.d.mts CHANGED Viewed

@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
 type ConverterResult = {
     title: string | null;
+    markdown: string;
+    /** @deprecated Use `markdown` instead. */
     text_content: string;
 } | null | undefined;
 type ConverterOptions = {

package/dist/index.d.ts CHANGED Viewed

@@ -3,6 +3,8 @@ import mammoth from 'mammoth';
 type ConverterResult = {
     title: string | null;
+    markdown: string;
+    /** @deprecated Use `markdown` instead. */
     text_content: string;
 } | null | undefined;
 type ConverterOptions = {

package/dist/index.mjs CHANGED Viewed

@@ -8,7 +8,7 @@ import TurndownService from 'turndown';
 import turndownPluginGfm from '@joplin/turndown-plugin-gfm';
 import { DOMParser } from '@xmldom/xmldom';
 import { URL as URL$1 } from 'url';
-import { pdfToText } from 'pdf-ts';
+import { PDFParse } from 'pdf-parse';
 import Mammoth from 'mammoth';
 import * as XLSX from 'xlsx';
 import * as childProcess from 'child_process';
@@ -35,6 +35,7 @@ class PlainTextConverter {
     }
     return {
       title: null,
+      markdown: content,
       text_content: content
     };
   }
@@ -148,6 +149,7 @@ class HtmlConverter {
     }
     return {
       title: doc.title,
+      markdown: webpageText,
       text_content: webpageText
     };
   }
@@ -211,10 +213,7 @@ class RSSConverter {
           mdText += this._parseContent(entryContent);
         }
       }
-      return {
-        title,
-        text_content: mdText
-      };
+      return { title, markdown: mdText, text_content: mdText };
     } catch (error) {
       console.error("Atom Parsing Error:", error);
       return null;
@@ -262,10 +261,7 @@ class RSSConverter {
           mdText += this._parseContent(content);
         }
       }
-      return {
-        title: channelTitle,
-        text_content: mdText
-      };
+      return { title: channelTitle, markdown: mdText, text_content: mdText };
     } catch (error) {
       console.error("RSS Parsing Error:", error);
       return null;
@@ -335,10 +331,7 @@ class WikipediaConverter {
     } else {
       webpageText = new CustomTurnDown().convert_soup(doc);
     }
-    return {
-      title: mainTitle,
-      text_content: webpageText
-    };
+    return { title: mainTitle, markdown: webpageText, text_content: webpageText };
   }
 }
@@ -464,10 +457,7 @@ ${transcriptText}
       }
     }
     const finalTitle = title ? title : doc.title;
-    return {
-      title: finalTitle,
-      text_content: webpageText
-    };
+    return { title: finalTitle, markdown: webpageText, text_content: webpageText };
   }
   _get(metadata, keys, default_value) {
     for (const k of keys) {
@@ -545,10 +535,7 @@ ${sourceLines.join("")}
       }
       const mdText = mdOutput.join("\n\n");
       title = notebookContent.metadata?.title || title;
-      return {
-        title,
-        text_content: mdText
-      };
+      return { title, markdown: mdText, text_content: mdText };
     } catch (e) {
       console.error("Error converting .ipynb file:", e);
       throw new Error(`Error converting .ipynb file: ${e}`);
@@ -609,10 +596,7 @@ class BingSerpConverter {
     const webpageText = `## A Bing search for '${query}' found the following results:
 ${results.join("\n\n")}`;
-    return {
-      title: doc.title,
-      text_content: webpageText
-    };
+    return { title: doc.title, markdown: webpageText, text_content: webpageText };
   }
   _decodeBase64Url(encodedUrl) {
     let u = encodedUrl.slice(2).trim() + "==";
@@ -642,11 +626,10 @@ class PdfConverter {
   }
   async _convert(pdfContent) {
     try {
-      const textContent = await pdfToText(pdfContent);
-      return {
-        title: null,
-        text_content: textContent
-      };
+      const parser = new PDFParse({ data: pdfContent });
+      const result = await parser.getText();
+      await parser.destroy();
+      return { title: null, markdown: result.text, text_content: result.text };
     } catch (error) {
       console.error("PDF Parsing Error:", error);
       return null;
@@ -702,12 +685,9 @@ class XlsxConverter extends HtmlConverter {
         mdContent += `## ${sheetName}
 `;
         let htmlContent = XLSX.utils.sheet_to_html(workbook.Sheets[sheetName]);
-        mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
+        mdContent += (await this._convert(htmlContent))?.markdown.trim() + "\n\n";
       }
-      return {
-        title: workbook?.Props?.Title || "Untitled",
-        text_content: mdContent
-      };
+      return { title: workbook?.Props?.Title || "Untitled", markdown: mdContent, text_content: mdContent };
     } catch (e) {
       console.error(e);
       return null;
@@ -797,10 +777,7 @@ ${transcript === "" ? "[No speech detected]" : transcript}`;
     } else {
       mdContent += "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]";
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
   // TODO: Add speech to text
   async _transcribeAudio(_) {
@@ -867,10 +844,7 @@ ${transcript == "" ? "[No speech detected]" : transcript}`;
     } else {
       mdContent += "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]";
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
 }
@@ -922,10 +896,7 @@ class ImageConverter extends MediaConverter {
 ${(await this._getLLMDescription(imageBuffer, options)).trim()}
 `;
     }
-    return {
-      title: null,
-      text_content: mdContent.trim()
-    };
+    return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
   }
   async _getLLMDescription(imageBuffer, options) {
     if (!options.llmPrompt || options.llmPrompt.trim() === "") {
@@ -961,6 +932,7 @@ class ZipConverter {
     if (!parentConverters) {
       return {
         title: null,
+        markdown: `[ERROR] No converters available to process zip contents from: ${source}`,
         text_content: `[ERROR] No converters available to process zip contents from: ${source}`
       };
     }
@@ -998,7 +970,7 @@ class ZipConverter {
               mdResults.push(`
 ## File: ${relativePath}
-${result.text_content}
+${result.markdown}
 `);
               break;
@@ -1022,19 +994,18 @@ ${result.text_content}
         inputStream.pipe(parser);
       });
       mdContent += mdResults.join("");
-      return {
-        title: null,
-        text_content: mdContent.trim()
-      };
+      return { title: null, markdown: mdContent.trim(), text_content: mdContent.trim() };
     } catch (error) {
       if (error.message.includes("invalid signature")) {
         return {
           title: null,
+          markdown: `[ERROR] Invalid or corrupted zip file: ${source}`,
           text_content: `[ERROR] Invalid or corrupted zip file: ${source}`
         };
       }
       return {
         title: null,
+        markdown: `[ERROR] Failed to process zip file ${source}: ${String(error)}`,
         text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}`
       };
     }
@@ -1152,8 +1123,8 @@ class MarkItDown {
           error = e;
         }
         if (res != null) {
-          res.text_content = res.text_content.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
-          res.text_content = res.text_content.replace(/\n{3,}/g, "\n\n");
+          res.markdown = res.markdown.replace(/(?:\r\n|\r|\n)/g, "\n").trim();
+          res.markdown = res.markdown.replace(/\n{3,}/g, "\n\n");
           return res;
         }
       }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "markitdown-ts",
-  "version": "0.0.6",
+  "version": "0.0.8",
   "description": "",
   "keywords": [],
   "homepage": "https://github.com/dead8309/markitdown-ts#readme",
@@ -40,7 +40,8 @@
     "typescript": "^5.7.2",
     "unbuild": "^3.0.1",
     "vite": "^6.0.4",
-    "vitest": "^2.1.8"
+    "vitest": "^2.1.8",
+    "zod": "^4.1.8"
   },
   "dependencies": {
     "@joplin/turndown-plugin-gfm": "^1.0.60",
@@ -49,7 +50,7 @@
     "jsdom": "^25.0.1",
     "mammoth": "^1.8.0",
     "mime-types": "^2.1.35",
-    "pdf-ts": "^0.0.2",
+    "pdf-parse": "^2.4.5",
     "turndown": "^7.2.0",
     "xlsx": "^0.18.5"
   },