markitdown-ts 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.cjs +13 -6
- package/dist/index.mjs +10 -2
- package/package.json +3 -6
package/README.md
CHANGED
|
@@ -7,7 +7,6 @@
|
|
|
7
7
|
It supports:
|
|
8
8
|
|
|
9
9
|
- [x] PDF
|
|
10
|
-
- [x] PowerPoint
|
|
11
10
|
- [x] Word (.docx)
|
|
12
11
|
- [x] Excel (.xlsx)
|
|
13
12
|
- [x] Images (EXIF metadata extraction and optional LLM-based description)
|
|
@@ -17,6 +16,7 @@ It supports:
|
|
|
17
16
|
- [x] Jupyter Notebooks (.ipynb)
|
|
18
17
|
- [x] Bing Search Result Pages (SERP)
|
|
19
18
|
- [x] ZIP files (recursively iterates over contents)
|
|
19
|
+
- [ ] PowerPoint
|
|
20
20
|
|
|
21
21
|
> [!NOTE]
|
|
22
22
|
>
|
package/dist/index.cjs
CHANGED
|
@@ -16,7 +16,6 @@ const util = require('util');
|
|
|
16
16
|
const fs$1 = require('fs/promises');
|
|
17
17
|
const os = require('os');
|
|
18
18
|
const ai = require('ai');
|
|
19
|
-
const unzipper = require('unzipper');
|
|
20
19
|
|
|
21
20
|
function _interopDefaultCompat (e) { return e && typeof e === 'object' && 'default' in e ? e.default : e; }
|
|
22
21
|
|
|
@@ -40,12 +39,11 @@ const fs__namespace = /*#__PURE__*/_interopNamespaceCompat(fs);
|
|
|
40
39
|
const TurndownService__default = /*#__PURE__*/_interopDefaultCompat(TurndownService);
|
|
41
40
|
const turndownPluginGfm__default = /*#__PURE__*/_interopDefaultCompat(turndownPluginGfm);
|
|
42
41
|
const Mammoth__default = /*#__PURE__*/_interopDefaultCompat(Mammoth);
|
|
43
|
-
const XLSX__default = /*#__PURE__*/_interopDefaultCompat(XLSX);
|
|
42
|
+
const XLSX__namespace = /*#__PURE__*/_interopNamespaceCompat(XLSX);
|
|
44
43
|
const childProcess__namespace = /*#__PURE__*/_interopNamespaceCompat(childProcess);
|
|
45
44
|
const util__namespace = /*#__PURE__*/_interopNamespaceCompat(util);
|
|
46
45
|
const fs__namespace$1 = /*#__PURE__*/_interopNamespaceCompat(fs$1);
|
|
47
46
|
const os__namespace = /*#__PURE__*/_interopNamespaceCompat(os);
|
|
48
|
-
const unzipper__namespace = /*#__PURE__*/_interopNamespaceCompat(unzipper);
|
|
49
47
|
|
|
50
48
|
class PlainTextConverter {
|
|
51
49
|
async convert(local_path, options = {}) {
|
|
@@ -707,12 +705,12 @@ class XlsxConverter extends HtmlConverter {
|
|
|
707
705
|
if (!exists) {
|
|
708
706
|
throw new Error("File does'nt exists");
|
|
709
707
|
}
|
|
710
|
-
let workbook = XLSX__default.readFile(local_path);
|
|
708
|
+
let workbook = XLSX__namespace.readFile(local_path);
|
|
711
709
|
let mdContent = "";
|
|
712
710
|
for (const sheetName of workbook.SheetNames) {
|
|
713
711
|
mdContent += `## ${sheetName}
|
|
714
712
|
`;
|
|
715
|
-
let htmlContent = XLSX__default.utils.sheet_to_html(workbook.Sheets[sheetName]);
|
|
713
|
+
let htmlContent = XLSX__namespace.utils.sheet_to_html(workbook.Sheets[sheetName]);
|
|
716
714
|
mdContent += (await this._convert(htmlContent))?.text_content.trim() + "\n\n";
|
|
717
715
|
}
|
|
718
716
|
return {
|
|
@@ -959,9 +957,18 @@ class ZipConverter {
|
|
|
959
957
|
text_content: `[ERROR] Invalid zip file path: ${localPath}`
|
|
960
958
|
};
|
|
961
959
|
}
|
|
960
|
+
let unzipper;
|
|
961
|
+
try {
|
|
962
|
+
unzipper = await import('unzipper').then((mod) => mod.default);
|
|
963
|
+
} catch (error) {
|
|
964
|
+
console.error(
|
|
965
|
+
"Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
|
|
966
|
+
);
|
|
967
|
+
return null;
|
|
968
|
+
}
|
|
962
969
|
try {
|
|
963
970
|
await fs__namespace$1.mkdir(newFolder, { recursive: true });
|
|
964
|
-
const zip = await unzipper__namespace.Open.file(localPath);
|
|
971
|
+
const zip = await unzipper.Open.file(localPath);
|
|
965
972
|
await zip.extract({ path: newFolder });
|
|
966
973
|
const files = await this._walk(newFolder);
|
|
967
974
|
for (const { root, name } of files) {
|
package/dist/index.mjs
CHANGED
|
@@ -10,13 +10,12 @@ import { DOMParser } from '@xmldom/xmldom';
|
|
|
10
10
|
import { URL as URL$1 } from 'url';
|
|
11
11
|
import { pdfToText } from 'pdf-ts';
|
|
12
12
|
import Mammoth from 'mammoth';
|
|
13
|
-
import XLSX from 'xlsx';
|
|
13
|
+
import * as XLSX from 'xlsx';
|
|
14
14
|
import * as childProcess from 'child_process';
|
|
15
15
|
import * as util from 'util';
|
|
16
16
|
import * as fs$1 from 'fs/promises';
|
|
17
17
|
import * as os from 'os';
|
|
18
18
|
import { generateText } from 'ai';
|
|
19
|
-
import * as unzipper from 'unzipper';
|
|
20
19
|
|
|
21
20
|
class PlainTextConverter {
|
|
22
21
|
async convert(local_path, options = {}) {
|
|
@@ -930,6 +929,15 @@ class ZipConverter {
|
|
|
930
929
|
text_content: `[ERROR] Invalid zip file path: ${localPath}`
|
|
931
930
|
};
|
|
932
931
|
}
|
|
932
|
+
let unzipper;
|
|
933
|
+
try {
|
|
934
|
+
unzipper = await import('unzipper').then((mod) => mod.default);
|
|
935
|
+
} catch (error) {
|
|
936
|
+
console.error(
|
|
937
|
+
"Optional dependency 'unzipper' is not installed. Run `npm install unzipper` to enable this feature."
|
|
938
|
+
);
|
|
939
|
+
return null;
|
|
940
|
+
}
|
|
933
941
|
try {
|
|
934
942
|
await fs$1.mkdir(newFolder, { recursive: true });
|
|
935
943
|
const zip = await unzipper.Open.file(localPath);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "markitdown-ts",
|
|
3
|
-
"version": "0.0.3",
|
|
3
|
+
"version": "0.0.4",
|
|
4
4
|
"description": "",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"homepage": "https://github.com/dead8309/markitdown-ts#readme",
|
|
@@ -50,10 +50,10 @@
|
|
|
50
50
|
"mime-types": "^2.1.35",
|
|
51
51
|
"pdf-ts": "^0.0.2",
|
|
52
52
|
"turndown": "^7.2.0",
|
|
53
|
-
"xlsx": "^0.18.5"
|
|
53
|
+
"xlsx": "^0.18.5",
|
|
54
|
+
"ai": "^4.0.22"
|
|
54
55
|
},
|
|
55
56
|
"peerDependencies": {
|
|
56
|
-
"ai": "^4.0.22",
|
|
57
57
|
"youtube-transcript": "^1.2.1",
|
|
58
58
|
"unzipper": "^0.12.3"
|
|
59
59
|
},
|
|
@@ -61,9 +61,6 @@
|
|
|
61
61
|
"youtube-transcript": {
|
|
62
62
|
"optional": true
|
|
63
63
|
},
|
|
64
|
-
"ai": {
|
|
65
|
-
"optional": true
|
|
66
|
-
},
|
|
67
64
|
"unzipper": {
|
|
68
65
|
"optional": true
|
|
69
66
|
}
|