npm - markitdown-ts - Versions diffs - 0.0.2 → 0.0.4 - Mend

markitdown-ts 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -1,35 +1,113 @@
-# markitdown
+# markitdown-ts
-[![CI](https://github.com/dead8309/markitdown/actions/workflows/ci.yml/badge.svg)](https://github.com/dead8309/markitdown/actions/workflows/ci.yml)
+[![CI](https://github.com/dead8309/markitdown-ts/actions/workflows/ci.yml/badge.svg)](https://github.com/dead8309/markitdown-ts/actions/workflows/ci.yml)
+`markitdown-ts` is a TypeScript library designed for converting various file formats to Markdown. This makes it suitable for indexing, text analysis, and other applications that benefit from structured text. It is a TypeScript implementation of the original `markitdown` [Python library.](https://github.com/microsoft/markitdown)
-MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
 It supports:
-[x] PDF
-[ ] PowerPoint
-[x] Word
-[x] Excel
-[x] Images (EXIF metadata and OCR)
-[x] Audio (EXIF metadata and speech transcription)
-[x] HTML
-[x] Text-based formats (CSV, JSON, XML)
-[x] ZIP files (iterates over contents)
+- [x] PDF
+- [x] Word (.docx)
+- [x] Excel (.xlsx)
+- [x] Images (EXIF metadata extraction and optional LLM-based description)
+- [x] Audio (EXIF metadata extraction only)
+- [x] HTML
+- [x] Text-based formats (plain text, .csv, .xml, .rss, .atom)
+- [x] Jupyter Notebooks (.ipynb)
+- [x] Bing Search Result Pages (SERP)
+- [x] ZIP files (recursively iterates over contents)
+- [ ] PowerPoint
+> [!NOTE]
+>
+> Speech recognition for the audio converter has not been implemented yet. I'm happy to accept contributions for this feature.
-## Youtube Transcript
+## Installation
-To enable YouTube transcript functionality, you need to install the youtube-transcript package:
+Install `markitdown-ts` using your preferred package manager:
 ```bash
-npm install youtube-transcript
+pnpm add markitdown-ts
 ```
-## Installation
+## Usage
-```bash
-npm i markitdown
+```typescript
+import { MarkItDown } from "markitdown-ts";
+const markitdown = new MarkItDown();
+try {
+  const result = await markitdown.convert("path/to/your/file.pdf");
+  if (result) {
+    console.log(result.text_content);
+  }
+} catch (error) {
+  console.error("Conversion failed:", error);
+}
 ```
-## Usage
+Pass additional options as needed for specific functionality.
+## YouTube Transcript Support
+When converting YouTube URLs, you can pass the `enableYoutubeTranscript` and `youtubeTranscriptLanguage` options to control transcript extraction. If `youtubeTranscriptLanguage` is not provided, it defaults to `"en"`.
+```typescript
+const markitdown = new MarkItDown();
+const result = await markitdown.convert("https://www.youtube.com/watch?v=V2qZ_lgxTzg", {
+  enableYoutubeTranscript: true,
+  youtubeTranscriptLanguage: "en"
+});
+```
+## LLM Image Description Support
+To enable LLM functionality, you need to configure a model and client in the `options` for the image converter. You can use the `@ai-sdk/openai` package to get an LLM client.
+```typescript
+import { openai } from "@ai-sdk/openai";
+const markitdown = new MarkItDown();
+const result = await markitdown.convert("test.jpg", {
+  llmModel: openai("gpt-4o-mini"),
+  llmPrompt: "Write a detailed description of this image"
+});
+```
+## API
+The library uses a single function, `convert`, for all conversions. The options and the response type are defined as follows:
+```typescript
+export interface DocumentConverter {
+  convert(local_path: string, options: ConverterOptions): Promise<ConverterResult>;
+}
+export type ConverterResult =
+  | {
+      title: string | null;
+      text_content: string;
+    }
+  | null
+  | undefined;
+export type ConverterOptions = {
+  file_extension?: string;
+  url?: string;
+  fetch?: typeof fetch;
+  enableYoutubeTranscript?: boolean; // false by default
+  youtubeTranscriptLanguage?: string; // "en" by default
+  llmModel?: LanguageModel; // model object from the "ai" SDK, e.g. openai("gpt-4o-mini")
+  llmPrompt?: string;
+  styleMap?: string | Array<string>;
+  _parent_converters?: DocumentConverter[];
+  cleanup_extracted?: boolean;
+};
+```
+## Examples
+Check out the [examples](./examples) folder.
 ## License

package/dist/index.cjs CHANGED Viewed

@@ -16,7 +16,6 @@ const util = require('util');
 const fs$1 = require('fs/promises');
 const os = require('os');
 const ai = require('ai');
-const unzipper = require('unzipper');
 function _interopDefaultCompat (e) { return e && typeof e === 'object' && 'default' in e ? e.default : e; }
@@ -40,12 +39,11 @@ const fs__namespace = /*#__PURE__*/_interopNamespaceCompat(fs);
 const TurndownService__default = /*#__PURE__*/_interopDefaultCompat(TurndownService);
 const turndownPluginGfm__default = /*#__PURE__*/_interopDefaultCompat(turndownPluginGfm);
 const Mammoth__default = /*#__PURE__*/_interopDefaultCompat(Mammoth);
-const XLSX__default = /*#__PURE__*/_interopDefaultCompat(XLSX);
+const XLSX__namespace = /*#__PURE__*/_interopNamespaceCompat(XLSX);
 const childProcess__namespace = /*#__PURE__*/_interopNamespaceCompat(childProcess);
 const util__namespace = /*#__PURE__*/_interopNamespaceCompat(util);
 const fs__namespace$1 = /*#__PURE__*/_interopNamespaceCompat(fs$1);
 const os__namespace = /*#__PURE__*/_interopNamespaceCompat(os);
-const unzipper__namespace = /*#__PURE__*/_interopNamespaceCompat(unzipper);
 class PlainTextConverter {
   async convert(local_path, options = {}) {
@@ -707,12 +705,12 @@ class XlsxConverter extends HtmlConverter {
       if (!exists) {
         throw new Error("File does'nt exists");
       }
-      let workbook = XLSX__default.readFile(local_path);
+      let workbook = XLSX__namespace.readFile(local_path);
       let mdContent = "";
       for (const sheetName of workbook.SheetNames) {
         mdContent += `## ${sheetName}
 `;
-        let htmlContent = XLSX__default.utils.sheet_to_html(workbook.Sheets[sheetName]);
+        let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
         mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
       }
       return {
@@ -959,9 +957,18 @@ class ZipConverter {
         text_content: `[ERROR] Invalid zip file path: ${localPath}`
       };
     }
+    let unzipper;
+    try {
+      unzipper = await import('unzipper').then((mod) => mod.default);
+    } catch (error) {
+      console.error(
+        "Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
+      );
+      return null;
+    }
     try {
       await fs__namespace$1.mkdir(newFolder, { recursive: true });
-      const zip = await unzipper__namespace.Open.file(localPath);
+      const zip = await unzipper.Open.file(localPath);
       await zip.extract({ path: newFolder });
       const files = await this._walk(newFolder);
       for (const { root, name } of files) {

package/dist/index.mjs CHANGED Viewed

@@ -10,13 +10,12 @@ import { DOMParser } from '@xmldom/xmldom';
 import { URL as URL$1 } from 'url';
 import { pdfToText } from 'pdf-ts';
 import Mammoth from 'mammoth';
-import XLSX from 'xlsx';
+import * as XLSX from 'xlsx';
 import * as childProcess from 'child_process';
 import * as util from 'util';
 import * as fs$1 from 'fs/promises';
 import * as os from 'os';
 import { generateText } from 'ai';
-import * as unzipper from 'unzipper';
 class PlainTextConverter {
   async convert(local_path, options = {}) {
@@ -930,6 +929,15 @@ class ZipConverter {
         text_content: `[ERROR] Invalid zip file path: ${localPath}`
       };
     }
+    let unzipper;
+    try {
+      unzipper = await import('unzipper').then((mod) => mod.default);
+    } catch (error) {
+      console.error(
+        "Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
+      );
+      return null;
+    }
     try {
       await fs$1.mkdir(newFolder, { recursive: true });
       const zip = await unzipper.Open.file(localPath);

package/package.json CHANGED Viewed

@@ -1,15 +1,15 @@
 {
   "name": "markitdown-ts",
-  "version": "0.0.2",
+  "version": "0.0.4",
   "description": "",
   "keywords": [],
-  "homepage": "https://github.com/dead8309/markitdown#readme",
+  "homepage": "https://github.com/dead8309/markitdown-ts#readme",
   "bugs": {
-    "url": "https://github.com/dead8309/markitdown/issues"
+    "url": "https://github.com/dead8309/markitdown-ts/issues"
   },
   "repository": {
     "type": "git",
-    "url": "git+https://github.com/dead8309/markitdown.git"
+    "url": "git+https://github.com/dead8309/markitdown-ts.git"
   },
   "license": "MIT",
   "author": "Vaibhav Raj",
@@ -50,10 +50,10 @@
     "mime-types": "^2.1.35",
     "pdf-ts": "^0.0.2",
     "turndown": "^7.2.0",
-    "xlsx": "^0.18.5"
+    "xlsx": "^0.18.5",
+    "ai": "^4.0.22"
   },
   "peerDependencies": {
-    "ai": "^4.0.22",
     "youtube-transcript": "^1.2.1",
     "unzipper": "^0.12.3"
   },
@@ -61,9 +61,6 @@
     "youtube-transcript": {
       "optional": true
     },
-    "ai": {
-      "optional": true
-    },
     "unzipper": {
       "optional": true
     }