@hawon/nexus 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -3,4 +3,5 @@ export { chunkText } from "./chunker.js";
|
|
|
3
3
|
export { extractPdfText, isPdfSupported } from "./pdf.js";
|
|
4
4
|
export { extractDocxText } from "./docx.js";
|
|
5
5
|
export { extractPlainText } from "./text.js";
|
|
6
|
+
export { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
|
|
6
7
|
export type { ParsedDocument, DocumentChunk, ParseOptions, DocumentFormat } from "./types.js";
|
package/dist/docparser/index.js
CHANGED
|
@@ -3,3 +3,4 @@ export { chunkText } from "./chunker.js";
|
|
|
3
3
|
export { extractPdfText, isPdfSupported } from "./pdf.js";
|
|
4
4
|
export { extractDocxText } from "./docx.js";
|
|
5
5
|
export { extractPlainText } from "./text.js";
|
|
6
|
+
export { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export declare function isMarkItDownSupported(): boolean;
|
|
2
|
+
/**
|
|
3
|
+
* Convert any file to markdown using Microsoft MarkItDown.
|
|
4
|
+
* Supports: PDF, DOCX, PPTX, XLSX, HTML, images, audio, ZIP, etc.
|
|
5
|
+
*/
|
|
6
|
+
export declare function convertWithMarkItDown(filePath: string): string;
|
|
7
|
+
export declare function isMarkItDownFormat(ext: string): boolean;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { execFileSync, execSync } from "node:child_process";
|
|
2
|
+
/**
 * Cached outcome of the MarkItDown availability probe.
 * `null` means the probe has not run yet.
 * @type {boolean | null}
 */
let _supported = null;
/**
 * Report whether the `markitdown` CLI can be launched from this process.
 *
 * The probe runs `markitdown --help` at most once; its outcome is cached in
 * `_supported`, so later calls return immediately without spawning a process.
 *
 * @returns {boolean} `true` when the probe command completed successfully,
 *   `false` when it could not be started, exited non-zero, or timed out.
 */
export function isMarkItDownSupported() {
    if (_supported !== null)
        return _supported;
    try {
        // A fixed argv needs no shell. The timeout keeps a hung CLI from
        // blocking this synchronous call indefinitely.
        execFileSync("markitdown", ["--help"], { stdio: "ignore", timeout: 15_000 });
        _supported = true;
    }
    catch {
        // Any failure (binary missing, non-zero exit, timeout) means "not usable".
        _supported = false;
    }
    return _supported;
}
|
|
15
|
+
/**
 * Convert any file to markdown using Microsoft MarkItDown.
 * Supports: PDF, DOCX, PPTX, XLSX, HTML, images, audio, ZIP, etc.
 *
 * @param {string} filePath Path of the file handed to the `markitdown` CLI.
 * @returns {string} The CLI's stdout, decoded as UTF-8 and trimmed.
 * @throws {Error} When `markitdown` is not installed, or when the CLI fails
 *   (non-zero exit, exceeds the 60 s timeout, or overflows the 50 MiB buffer).
 */
export function convertWithMarkItDown(filePath) {
    if (!isMarkItDownSupported()) {
        throw new Error("markitdown not installed. Run: pip install markitdown");
    }
    // Pass the path as a discrete argv entry instead of interpolating it into
    // a shell command line: a file name containing quotes, `$(...)` or
    // backticks must never be interpreted by a shell.
    const output = execFileSync("markitdown", [filePath], {
        maxBuffer: 50 * 1024 * 1024,
        encoding: "utf-8",
        timeout: 60_000,
    });
    return output.trim();
}
|
|
30
|
+
/** Formats that MarkItDown supports beyond our built-in parsers. */
const MARKITDOWN_EXTENSIONS = new Set([
    ".pptx", ".ppt", ".xlsx", ".xls",
    ".html", ".htm", ".xml",
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
    ".mp3", ".wav", ".m4a",
    ".zip", ".tar", ".gz",
    ".json", ".yaml", ".yml",
    ".rst", ".org", ".rtf",
    ".epub",
]);
/**
 * Tell whether a file extension belongs to the MarkItDown-only format list.
 *
 * Matching is case-insensitive, and the leading dot is optional: both
 * `".PPTX"` and `"pptx"` are accepted.
 *
 * @param {string} ext File extension, with or without the leading dot.
 * @returns {boolean} `true` when the extension is in `MARKITDOWN_EXTENSIONS`;
 *   `false` otherwise, including for empty or non-string input.
 */
export function isMarkItDownFormat(ext) {
    // Guard first so a missing/undefined extension reports "no" instead of
    // throwing on `.toLowerCase()`.
    if (typeof ext !== "string" || ext === "")
        return false;
    const lowered = ext.toLowerCase();
    const dotted = lowered.startsWith(".") ? lowered : `.${lowered}`;
    return MARKITDOWN_EXTENSIONS.has(dotted);
}
|
|
@@ -4,6 +4,7 @@ import { chunkText } from "./chunker.js";
|
|
|
4
4
|
import { extractPdfText, isPdfSupported } from "./pdf.js";
|
|
5
5
|
import { extractDocxText } from "./docx.js";
|
|
6
6
|
import { extractPlainText } from "./text.js";
|
|
7
|
+
import { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
|
|
7
8
|
export function detectFormat(filePath) {
|
|
8
9
|
const ext = extname(filePath).toLowerCase();
|
|
9
10
|
switch (ext) {
|
|
@@ -16,33 +17,78 @@ export function detectFormat(filePath) {
|
|
|
16
17
|
case ".text":
|
|
17
18
|
case ".log":
|
|
18
19
|
case ".csv": return "txt";
|
|
19
|
-
default:
|
|
20
|
+
default:
|
|
21
|
+
// MarkItDown handles PPTX, XLSX, HTML, images, audio, etc.
|
|
22
|
+
if (isMarkItDownFormat(ext))
|
|
23
|
+
return "txt";
|
|
24
|
+
return null;
|
|
20
25
|
}
|
|
21
26
|
}
|
|
27
|
+
/**
 * Decide whether a file is routed to MarkItDown rather than a built-in parser.
 * The decision is based solely on the file's lowercased extension.
 */
function shouldUseMarkItDown(filePath) {
    return isMarkItDownFormat(extname(filePath).toLowerCase());
}
|
|
22
32
|
export function parseDocument(filePath, memory, options) {
|
|
23
33
|
if (!existsSync(filePath))
|
|
24
34
|
throw new Error(`File not found: ${filePath}`);
|
|
35
|
+
const ext = extname(filePath).toLowerCase();
|
|
36
|
+
const useMarkItDown = shouldUseMarkItDown(filePath);
|
|
25
37
|
const format = options?.format ?? detectFormat(filePath);
|
|
26
|
-
if (!format)
|
|
27
|
-
throw new Error(`Unsupported format: ${
|
|
38
|
+
if (!format && !useMarkItDown)
|
|
39
|
+
throw new Error(`Unsupported format: ${ext}`);
|
|
28
40
|
let text;
|
|
29
41
|
let pageCount;
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
42
|
+
if (useMarkItDown) {
|
|
43
|
+
// PPTX, XLSX, HTML, images, audio, etc. → MarkItDown
|
|
44
|
+
if (!isMarkItDownSupported())
|
|
45
|
+
throw new Error("Install markitdown for this format: pip install markitdown");
|
|
46
|
+
text = convertWithMarkItDown(filePath);
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
switch (format) {
|
|
50
|
+
case "pdf": {
|
|
51
|
+
// Try pymupdf first, fall back to markitdown
|
|
52
|
+
if (isPdfSupported()) {
|
|
53
|
+
const result = extractPdfText(filePath);
|
|
54
|
+
text = result.text;
|
|
55
|
+
pageCount = result.pageCount;
|
|
56
|
+
}
|
|
57
|
+
else if (isMarkItDownSupported()) {
|
|
58
|
+
text = convertWithMarkItDown(filePath);
|
|
59
|
+
}
|
|
60
|
+
else {
|
|
61
|
+
throw new Error("PDF requires python3 + pymupdf or markitdown");
|
|
62
|
+
}
|
|
63
|
+
break;
|
|
64
|
+
}
|
|
65
|
+
case "docx":
|
|
66
|
+
try {
|
|
67
|
+
text = extractDocxText(filePath);
|
|
68
|
+
}
|
|
69
|
+
catch {
|
|
70
|
+
// Fall back to markitdown if unzip fails
|
|
71
|
+
if (isMarkItDownSupported()) {
|
|
72
|
+
text = convertWithMarkItDown(filePath);
|
|
73
|
+
}
|
|
74
|
+
else {
|
|
75
|
+
throw new Error(`Failed to parse DOCX: ${filePath}`);
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
break;
|
|
79
|
+
case "markdown":
|
|
80
|
+
case "txt":
|
|
81
|
+
text = extractPlainText(filePath, format === "markdown");
|
|
82
|
+
break;
|
|
83
|
+
default:
|
|
84
|
+
if (isMarkItDownSupported()) {
|
|
85
|
+
text = convertWithMarkItDown(filePath);
|
|
86
|
+
}
|
|
87
|
+
else {
|
|
88
|
+
throw new Error(`Unsupported format: ${ext}`);
|
|
89
|
+
}
|
|
90
|
+
break;
|
|
38
91
|
}
|
|
39
|
-
case "docx":
|
|
40
|
-
text = extractDocxText(filePath);
|
|
41
|
-
break;
|
|
42
|
-
case "markdown":
|
|
43
|
-
case "txt":
|
|
44
|
-
text = extractPlainText(filePath, format === "markdown");
|
|
45
|
-
break;
|
|
46
92
|
}
|
|
47
93
|
// Truncate if needed
|
|
48
94
|
const maxChars = options?.maxChars ?? 500_000;
|
|
@@ -59,7 +105,7 @@ export function parseDocument(filePath, memory, options) {
|
|
|
59
105
|
memory.save();
|
|
60
106
|
return {
|
|
61
107
|
filePath,
|
|
62
|
-
format,
|
|
108
|
+
format: format ?? "txt",
|
|
63
109
|
title,
|
|
64
110
|
text,
|
|
65
111
|
chunks,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hawon/nexus",
|
|
3
|
-
"version": "0.4.0",
|
|
3
|
+
"version": "0.4.1",
|
|
4
4
|
"description": "The all-in-one AI developer framework — session intelligence, code review, prompt injection defense, infinite memory, self-evolving skills",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|