markitdown-ts 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -19
- package/dist/index.cjs +13 -6
- package/dist/index.mjs +10 -2
- package/package.json +6 -9
package/README.md
CHANGED
|
@@ -1,35 +1,113 @@
|
|
|
1
|
-
# markitdown
|
|
1
|
+
# markitdown-ts
|
|
2
2
|
|
|
3
|
-
[](https://github.com/dead8309/markitdown/actions/workflows/ci.yml)
|
|
3
|
+
[](https://github.com/dead8309/markitdown/actions/workflows/ci.yml)
|
|
4
|
+
|
|
5
|
+
`markitdown-ts` is a TypeScript library designed for converting various file formats to Markdown. This makes it suitable for indexing, text analysis, and other applications that benefit from structured text. It is a TypeScript implementation of the original `markitdown` [Python library](https://github.com/microsoft/markitdown).
|
|
4
6
|
|
|
5
|
-
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
|
6
7
|
It supports:
|
|
7
8
|
|
|
8
|
-
[x] PDF
|
|
9
|
-
[
|
|
10
|
-
[x]
|
|
11
|
-
[x]
|
|
12
|
-
[x]
|
|
13
|
-
[x]
|
|
14
|
-
[x]
|
|
15
|
-
[x]
|
|
16
|
-
[x]
|
|
9
|
+
- [x] PDF
|
|
10
|
+
- [x] Word (.docx)
|
|
11
|
+
- [x] Excel (.xlsx)
|
|
12
|
+
- [x] Images (EXIF metadata extraction and optional LLM-based description)
|
|
13
|
+
- [x] Audio (EXIF metadata extraction only)
|
|
14
|
+
- [x] HTML
|
|
15
|
+
- [x] Text-based formats (plain text, .csv, .xml, .rss, .atom)
|
|
16
|
+
- [x] Jupyter Notebooks (.ipynb)
|
|
17
|
+
- [x] Bing Search Result Pages (SERP)
|
|
18
|
+
- [x] ZIP files (recursively iterates over contents)
|
|
19
|
+
- [ ] PowerPoint
|
|
20
|
+
|
|
21
|
+
> [!NOTE]
|
|
22
|
+
>
|
|
23
|
+
> Speech recognition for the audio converter has not been implemented yet. I'm happy to accept contributions for this feature.
|
|
17
24
|
|
|
18
|
-
##
|
|
25
|
+
## Installation
|
|
19
26
|
|
|
20
|
-
|
|
27
|
+
Install `markitdown-ts` using your preferred package manager:
|
|
21
28
|
|
|
22
29
|
```bash
|
|
23
|
-
|
|
30
|
+
pnpm add markitdown-ts
|
|
24
31
|
```
|
|
25
32
|
|
|
26
|
-
##
|
|
33
|
+
## Usage
|
|
27
34
|
|
|
28
|
-
```
|
|
29
|
-
|
|
35
|
+
```typescript
|
|
36
|
+
import { MarkItDown } from "markitdown-ts";
|
|
37
|
+
|
|
38
|
+
const markitdown = new MarkItDown();
|
|
39
|
+
try {
|
|
40
|
+
const result = await markitdown.convert("path/to/your/file.pdf");
|
|
41
|
+
if (result) {
|
|
42
|
+
console.log(result.text_content);
|
|
43
|
+
}
|
|
44
|
+
} catch (error) {
|
|
45
|
+
console.error("Conversion failed:", error);
|
|
46
|
+
}
|
|
30
47
|
```
|
|
31
48
|
|
|
32
|
-
|
|
49
|
+
Pass additional options as needed for specific functionality.
|
|
50
|
+
|
|
51
|
+
## YouTube Transcript Support
|
|
52
|
+
|
|
53
|
+
When converting YouTube URLs, you can pass the `enableYoutubeTranscript` and `youtubeTranscriptLanguage` options to control transcript extraction. If `youtubeTranscriptLanguage` is not provided, it defaults to `"en"`.
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
const markitdown = new MarkItDown();
|
|
57
|
+
const result = await markitdown.convert("https://www.youtube.com/watch?v=V2qZ_lgxTzg", {
|
|
58
|
+
enableYoutubeTranscript: true,
|
|
59
|
+
youtubeTranscriptLanguage: "en"
|
|
60
|
+
});
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## LLM Image Description Support
|
|
64
|
+
|
|
65
|
+
To enable LLM functionality, you need to configure a model and client in the `options` for the image converter. You can use the `@ai-sdk/openai` package to get an LLM client.
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
import { openai } from "@ai-sdk/openai";
|
|
69
|
+
|
|
70
|
+
const markitdown = new MarkItDown();
|
|
71
|
+
const result = await markitdown.convert("test.jpg", {
|
|
72
|
+
llmModel: openai("gpt-4o-mini"),
|
|
73
|
+
llmPrompt: "Write a detailed description of this image"
|
|
74
|
+
});
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## API
|
|
78
|
+
|
|
79
|
+
The library uses a single function `convert` for all conversions, with the options and the response type defined as such:
|
|
80
|
+
|
|
81
|
+
```typescript
|
|
82
|
+
export interface DocumentConverter {
|
|
83
|
+
convert(local_path: string, options: ConverterOptions): Promise<ConverterResult>;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
export type ConverterResult =
|
|
87
|
+
| {
|
|
88
|
+
title: string | null;
|
|
89
|
+
text_content: string;
|
|
90
|
+
}
|
|
91
|
+
| null
|
|
92
|
+
| undefined;
|
|
93
|
+
|
|
94
|
+
export type ConverterOptions = {
|
|
95
|
+
file_extension?: string;
|
|
96
|
+
url?: string;
|
|
97
|
+
fetch?: typeof fetch;
|
|
98
|
+
enableYoutubeTranscript?: boolean; // false by default
|
|
99
|
+
youtubeTranscriptLanguage?: string; // "en" by default
|
|
100
|
+
llmModel: string;
|
|
101
|
+
llmPrompt?: string;
|
|
102
|
+
styleMap?: string | Array<string>;
|
|
103
|
+
_parent_converters?: DocumentConverter[];
|
|
104
|
+
cleanup_extracted?: boolean;
|
|
105
|
+
};
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Examples
|
|
109
|
+
|
|
110
|
+
Check out the [examples](./examples) folder.
|
|
33
111
|
|
|
34
112
|
## License
|
|
35
113
|
|
package/dist/index.cjs
CHANGED
|
@@ -16,7 +16,6 @@ const util = require('util');
|
|
|
16
16
|
const fs$1 = require('fs/promises');
|
|
17
17
|
const os = require('os');
|
|
18
18
|
const ai = require('ai');
|
|
19
|
-
const unzipper = require('unzipper');
|
|
20
19
|
|
|
21
20
|
function _interopDefaultCompat (e) { return e && typeof e === 'object' && 'default' in e ? e.default : e; }
|
|
22
21
|
|
|
@@ -40,12 +39,11 @@ const fs__namespace = /*#__PURE__*/_interopNamespaceCompat(fs);
|
|
|
40
39
|
const TurndownService__default = /*#__PURE__*/_interopDefaultCompat(TurndownService);
|
|
41
40
|
const turndownPluginGfm__default = /*#__PURE__*/_interopDefaultCompat(turndownPluginGfm);
|
|
42
41
|
const Mammoth__default = /*#__PURE__*/_interopDefaultCompat(Mammoth);
|
|
43
|
-
const
|
|
42
|
+
const XLSX__namespace = /*#__PURE__*/_interopNamespaceCompat(XLSX);
|
|
44
43
|
const childProcess__namespace = /*#__PURE__*/_interopNamespaceCompat(childProcess);
|
|
45
44
|
const util__namespace = /*#__PURE__*/_interopNamespaceCompat(util);
|
|
46
45
|
const fs__namespace$1 = /*#__PURE__*/_interopNamespaceCompat(fs$1);
|
|
47
46
|
const os__namespace = /*#__PURE__*/_interopNamespaceCompat(os);
|
|
48
|
-
const unzipper__namespace = /*#__PURE__*/_interopNamespaceCompat(unzipper);
|
|
49
47
|
|
|
50
48
|
class PlainTextConverter {
|
|
51
49
|
async convert(local_path, options = {}) {
|
|
@@ -707,12 +705,12 @@ class XlsxConverter extends HtmlConverter {
|
|
|
707
705
|
if (!exists) {
|
|
708
706
|
throw new Error("File does'nt exists");
|
|
709
707
|
}
|
|
710
|
-
let workbook =
|
|
708
|
+
let workbook = XLSX__namespace.readFile(local_path);
|
|
711
709
|
let mdContent = "";
|
|
712
710
|
for (const sheetName of workbook.SheetNames) {
|
|
713
711
|
mdContent += `## ${sheetName}
|
|
714
712
|
`;
|
|
715
|
-
let htmlContent =
|
|
713
|
+
let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
|
|
716
714
|
mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
|
|
717
715
|
}
|
|
718
716
|
return {
|
|
@@ -959,9 +957,18 @@ class ZipConverter {
|
|
|
959
957
|
text_content: `[ERROR] Invalid zip file path: ${localPath}`
|
|
960
958
|
};
|
|
961
959
|
}
|
|
960
|
+
let unzipper;
|
|
961
|
+
try {
|
|
962
|
+
unzipper = await import('unzipper').then((mod) => mod.default);
|
|
963
|
+
} catch (error) {
|
|
964
|
+
console.error(
|
|
965
|
+
"Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
|
|
966
|
+
);
|
|
967
|
+
return null;
|
|
968
|
+
}
|
|
962
969
|
try {
|
|
963
970
|
await fs__namespace$1.mkdir(newFolder, { recursive: true });
|
|
964
|
-
const zip = await
|
|
971
|
+
const zip = await unzipper.Open.file(localPath);
|
|
965
972
|
await zip.extract({ path: newFolder });
|
|
966
973
|
const files = await this._walk(newFolder);
|
|
967
974
|
for (const { root, name } of files) {
|
package/dist/index.mjs
CHANGED
|
@@ -10,13 +10,12 @@ import { DOMParser } from '@xmldom/xmldom';
|
|
|
10
10
|
import { URL as URL$1 } from 'url';
|
|
11
11
|
import { pdfToText } from 'pdf-ts';
|
|
12
12
|
import Mammoth from 'mammoth';
|
|
13
|
-
import XLSX from 'xlsx';
|
|
13
|
+
import * as XLSX from 'xlsx';
|
|
14
14
|
import * as childProcess from 'child_process';
|
|
15
15
|
import * as util from 'util';
|
|
16
16
|
import * as fs$1 from 'fs/promises';
|
|
17
17
|
import * as os from 'os';
|
|
18
18
|
import { generateText } from 'ai';
|
|
19
|
-
import * as unzipper from 'unzipper';
|
|
20
19
|
|
|
21
20
|
class PlainTextConverter {
|
|
22
21
|
async convert(local_path, options = {}) {
|
|
@@ -930,6 +929,15 @@ class ZipConverter {
|
|
|
930
929
|
text_content: `[ERROR] Invalid zip file path: ${localPath}`
|
|
931
930
|
};
|
|
932
931
|
}
|
|
932
|
+
let unzipper;
|
|
933
|
+
try {
|
|
934
|
+
unzipper = await import('unzipper').then((mod) => mod.default);
|
|
935
|
+
} catch (error) {
|
|
936
|
+
console.error(
|
|
937
|
+
"Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
|
|
938
|
+
);
|
|
939
|
+
return null;
|
|
940
|
+
}
|
|
933
941
|
try {
|
|
934
942
|
await fs$1.mkdir(newFolder, { recursive: true });
|
|
935
943
|
const zip = await unzipper.Open.file(localPath);
|
package/package.json
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markitdown-ts",
|
|
3
|
-
"version": "0.0.2",
|
|
3
|
+
"version": "0.0.4",
|
|
4
4
|
"description": "",
|
|
5
5
|
"keywords": [],
|
|
6
|
-
"homepage": "https://github.com/dead8309/markitdown#readme",
|
|
6
|
+
"homepage": "https://github.com/dead8309/markitdown-ts#readme",
|
|
7
7
|
"bugs": {
|
|
8
|
-
"url": "https://github.com/dead8309/markitdown/issues"
|
|
8
|
+
"url": "https://github.com/dead8309/markitdown-ts/issues"
|
|
9
9
|
},
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
12
|
-
"url": "git+https://github.com/dead8309/markitdown.git"
|
|
12
|
+
"url": "git+https://github.com/dead8309/markitdown-ts.git"
|
|
13
13
|
},
|
|
14
14
|
"license": "MIT",
|
|
15
15
|
"author": "Vaibhav Raj",
|
|
@@ -50,10 +50,10 @@
|
|
|
50
50
|
"mime-types": "^2.1.35",
|
|
51
51
|
"pdf-ts": "^0.0.2",
|
|
52
52
|
"turndown": "^7.2.0",
|
|
53
|
-
"xlsx": "^0.18.5"
|
|
53
|
+
"xlsx": "^0.18.5",
|
|
54
|
+
"ai": "^4.0.22"
|
|
54
55
|
},
|
|
55
56
|
"peerDependencies": {
|
|
56
|
-
"ai": "^4.0.22",
|
|
57
57
|
"youtube-transcript": "^1.2.1",
|
|
58
58
|
"unzipper": "^0.12.3"
|
|
59
59
|
},
|
|
@@ -61,9 +61,6 @@
|
|
|
61
61
|
"youtube-transcript": {
|
|
62
62
|
"optional": true
|
|
63
63
|
},
|
|
64
|
-
"ai": {
|
|
65
|
-
"optional": true
|
|
66
|
-
},
|
|
67
64
|
"unzipper": {
|
|
68
65
|
"optional": true
|
|
69
66
|
}
|