markitdown-ts 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -19
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -1,35 +1,113 @@
|
|
|
1
|
-
# markitdown
|
|
1
|
+
# markitdown-ts
|
|
2
2
|
|
|
3
|
-
[](https://github.com/dead8309/markitdown/actions/workflows/ci.yml)
|
|
3
|
+
[](https://github.com/dead8309/markitdown/actions/workflows/ci.yml)
|
|
4
|
+
|
|
5
|
+
`markitdown-ts` is a TypeScript library for converting various file formats to Markdown, making their content suitable for indexing, text analysis, and other applications that benefit from structured text. It is a TypeScript implementation of the original [`markitdown` Python library](https://github.com/microsoft/markitdown).
|
|
4
6
|
|
|
5
|
-
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
|
6
7
|
It supports:
|
|
7
8
|
|
|
8
|
-
[x] PDF
|
|
9
|
-
[
|
|
10
|
-
[x] Word
|
|
11
|
-
[x] Excel
|
|
12
|
-
[x] Images (EXIF metadata and
|
|
13
|
-
[x] Audio (EXIF metadata
|
|
14
|
-
[x] HTML
|
|
15
|
-
[x] Text-based formats (
|
|
16
|
-
[x]
|
|
9
|
+
- [x] PDF
|
|
10
|
+
- [x] PowerPoint
|
|
11
|
+
- [x] Word (.docx)
|
|
12
|
+
- [x] Excel (.xlsx)
|
|
13
|
+
- [x] Images (EXIF metadata extraction and optional LLM-based description)
|
|
14
|
+
- [x] Audio (EXIF metadata extraction only)
|
|
15
|
+
- [x] HTML
|
|
16
|
+
- [x] Text-based formats (plain text, .csv, .xml, .rss, .atom)
|
|
17
|
+
- [x] Jupyter Notebooks (.ipynb)
|
|
18
|
+
- [x] Bing Search Result Pages (SERP)
|
|
19
|
+
- [x] ZIP files (recursively iterates over contents)
|
|
20
|
+
|
|
21
|
+
> [!NOTE]
|
|
22
|
+
>
|
|
23
|
+
> Speech recognition for the audio converter has not been implemented yet. Contributions for this feature are welcome.
|
|
17
24
|
|
|
18
|
-
##
|
|
25
|
+
## Installation
|
|
19
26
|
|
|
20
|
-
|
|
27
|
+
Install `markitdown-ts` using your preferred package manager:
|
|
21
28
|
|
|
22
29
|
```bash
|
|
23
|
-
|
|
30
|
+
pnpm add markitdown-ts
|
|
24
31
|
```
|
|
25
32
|
|
|
26
|
-
##
|
|
33
|
+
## Usage
|
|
27
34
|
|
|
28
|
-
```
|
|
29
|
-
|
|
35
|
+
```typescript
|
|
36
|
+
import { MarkItDown } from "markitdown-ts";
|
|
37
|
+
|
|
38
|
+
const markitdown = new MarkItDown();
|
|
39
|
+
try {
|
|
40
|
+
const result = await markitdown.convert("path/to/your/file.pdf");
|
|
41
|
+
if (result) {
|
|
42
|
+
console.log(result.text_content);
|
|
43
|
+
}
|
|
44
|
+
} catch (error) {
|
|
45
|
+
console.error("Conversion failed:", error);
|
|
46
|
+
}
|
|
30
47
|
```
|
|
31
48
|
|
|
32
|
-
|
|
49
|
+
Pass additional options as needed for specific functionality.
|
|
50
|
+
|
|
51
|
+
## YouTube Transcript Support
|
|
52
|
+
|
|
53
|
+
When converting YouTube URLs, you can pass the `enableYoutubeTranscript` and `youtubeTranscriptLanguage` options to control transcript extraction. If `youtubeTranscriptLanguage` is not provided, it defaults to `"en"`.
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
const markitdown = new MarkItDown();
|
|
57
|
+
const result = await markitdown.convert("https://www.youtube.com/watch?v=V2qZ_lgxTzg", {
|
|
58
|
+
enableYoutubeTranscript: true,
|
|
59
|
+
youtubeTranscriptLanguage: "en"
|
|
60
|
+
});
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## LLM Image Description Support
|
|
64
|
+
|
|
65
|
+
To enable LLM functionality, you need to configure a model and client in the `options` for the image converter. You can use the `@ai-sdk/openai` package to get an LLM client.
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
import { openai } from "@ai-sdk/openai";
|
|
69
|
+
|
|
70
|
+
const markitdown = new MarkItDown();
|
|
71
|
+
const result = await markitdown.convert("test.jpg", {
|
|
72
|
+
llmModel: openai("gpt-4o-mini"),
|
|
73
|
+
llmPrompt: "Write a detailed description of this image"
|
|
74
|
+
});
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## API
|
|
78
|
+
|
|
79
|
+
The library exposes a single `convert` method for all conversions. Its options and result types are defined as follows:
|
|
80
|
+
|
|
81
|
+
```typescript
|
|
82
|
+
export interface DocumentConverter {
|
|
83
|
+
convert(local_path: string, options: ConverterOptions): Promise<ConverterResult>;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
export type ConverterResult =
|
|
87
|
+
| {
|
|
88
|
+
title: string | null;
|
|
89
|
+
text_content: string;
|
|
90
|
+
}
|
|
91
|
+
| null
|
|
92
|
+
| undefined;
|
|
93
|
+
|
|
94
|
+
export type ConverterOptions = {
|
|
95
|
+
file_extension?: string;
|
|
96
|
+
url?: string;
|
|
97
|
+
fetch?: typeof fetch;
|
|
98
|
+
enableYoutubeTranscript?: boolean; // false by default
|
|
99
|
+
youtubeTranscriptLanguage?: string; // "en" by default
|
|
100
|
+
llmModel: string;
|
|
101
|
+
llmPrompt?: string;
|
|
102
|
+
styleMap?: string | Array<string>;
|
|
103
|
+
_parent_converters?: DocumentConverter[];
|
|
104
|
+
cleanup_extracted?: boolean;
|
|
105
|
+
};
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Examples
|
|
109
|
+
|
|
110
|
+
Check out the [examples](./examples) folder.
|
|
33
111
|
|
|
34
112
|
## License
|
|
35
113
|
|
package/package.json
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markitdown-ts",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.3",
|
|
4
4
|
"description": "",
|
|
5
5
|
"keywords": [],
|
|
6
|
-
"homepage": "https://github.com/dead8309/markitdown#readme",
|
|
6
|
+
"homepage": "https://github.com/dead8309/markitdown-ts#readme",
|
|
7
7
|
"bugs": {
|
|
8
|
-
"url": "https://github.com/dead8309/markitdown/issues"
|
|
8
|
+
"url": "https://github.com/dead8309/markitdown-ts/issues"
|
|
9
9
|
},
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
12
|
-
"url": "git+https://github.com/dead8309/markitdown.git"
|
|
12
|
+
"url": "git+https://github.com/dead8309/markitdown-ts.git"
|
|
13
13
|
},
|
|
14
14
|
"license": "MIT",
|
|
15
15
|
"author": "Vaibhav Raj",
|