@hawon/nexus 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,4 +3,5 @@ export { chunkText } from "./chunker.js";
3
3
  export { extractPdfText, isPdfSupported } from "./pdf.js";
4
4
  export { extractDocxText } from "./docx.js";
5
5
  export { extractPlainText } from "./text.js";
6
+ export { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
6
7
  export type { ParsedDocument, DocumentChunk, ParseOptions, DocumentFormat } from "./types.js";
@@ -3,3 +3,4 @@ export { chunkText } from "./chunker.js";
3
3
  export { extractPdfText, isPdfSupported } from "./pdf.js";
4
4
  export { extractDocxText } from "./docx.js";
5
5
  export { extractPlainText } from "./text.js";
6
+ export { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
@@ -0,0 +1,7 @@
1
/** Report whether running `markitdown --help` succeeds; the implementation caches the first answer. */
+ export declare function isMarkItDownSupported(): boolean;
2
+ /**
3
+ * Convert any file to markdown using Microsoft MarkItDown.
4
+ * Supports: PDF, DOCX, PPTX, XLSX, HTML, images, audio, ZIP, etc.
5
+ */
6
+ export declare function convertWithMarkItDown(filePath: string): string;
7
/** Report whether `ext` (compared case-insensitively, leading dot included) is in the MarkItDown extension set. */
+ export declare function isMarkItDownFormat(ext: string): boolean;
@@ -0,0 +1,43 @@
1
import { execFileSync, execSync } from "node:child_process";
2
/** Cached probe result: `null` until the first check, then `true`/`false`. */
let _supported = null;
/**
 * Report whether the `markitdown` command can be executed.
 *
 * The probe (`markitdown --help`, output discarded) runs at most once per
 * process; later calls return the remembered answer.
 *
 * @returns `true` when the probe command exited successfully, else `false`.
 */
export function isMarkItDownSupported() {
    if (_supported === null) {
        let available = true;
        try {
            execSync("markitdown --help", { stdio: "ignore" });
        }
        catch {
            // Any failure to run the command counts as "not available".
            available = false;
        }
        _supported = available;
    }
    return _supported;
}
15
/**
 * Convert any file to markdown using Microsoft MarkItDown.
 * Supports: PDF, DOCX, PPTX, XLSX, HTML, images, audio, ZIP, etc.
 *
 * The CLI is spawned directly, without a shell, and `filePath` is passed as
 * one argument. Quotes, `$`, backticks or `;` in a file name therefore can
 * neither break the invocation nor be interpreted as shell syntax.
 *
 * @param filePath Path of the file to convert.
 * @returns The CLI's stdout, decoded as UTF-8 and trimmed.
 * @throws Error when markitdown is not installed, and whatever
 *   `execFileSync` throws when the process exits non-zero, runs longer than
 *   60 seconds, or writes more than 50 MiB of output.
 */
export function convertWithMarkItDown(filePath) {
    if (!isMarkItDownSupported()) {
        throw new Error("markitdown not installed. Run: pip install markitdown");
    }
    // Argument-vector form: no shell is involved, so no quoting is required.
    const result = execFileSync("markitdown", [filePath], {
        maxBuffer: 50 * 1024 * 1024,
        encoding: "utf-8",
        timeout: 60_000,
    });
    return result.trim();
}
30
/** Extensions routed to MarkItDown because the built-in parsers do not cover them. */
const MARKITDOWN_EXTENSIONS = new Set([
    // Presentations and spreadsheets
    ".pptx",
    ".ppt",
    ".xlsx",
    ".xls",
    // Web and XML markup
    ".html",
    ".htm",
    ".xml",
    // Images
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".webp",
    // Audio
    ".mp3",
    ".wav",
    ".m4a",
    // Archives
    ".zip",
    ".tar",
    ".gz",
    // Structured data
    ".json",
    ".yaml",
    ".yml",
    // Other text and rich-text formats
    ".rst",
    ".org",
    ".rtf",
    // E-books
    ".epub",
]);
/**
 * Check whether an extension is one of the MarkItDown-only formats.
 *
 * @param ext File extension including its leading dot (e.g. ".pptx");
 *   letter case is ignored.
 * @returns `true` when the lower-cased extension is in the set above.
 */
export function isMarkItDownFormat(ext) {
    const normalized = ext.toLowerCase();
    return MARKITDOWN_EXTENSIONS.has(normalized);
}
@@ -4,6 +4,7 @@ import { chunkText } from "./chunker.js";
4
4
  import { extractPdfText, isPdfSupported } from "./pdf.js";
5
5
  import { extractDocxText } from "./docx.js";
6
6
  import { extractPlainText } from "./text.js";
7
+ import { convertWithMarkItDown, isMarkItDownSupported, isMarkItDownFormat } from "./markitdown.js";
7
8
  export function detectFormat(filePath) {
8
9
  const ext = extname(filePath).toLowerCase();
9
10
  switch (ext) {
@@ -16,33 +17,78 @@ export function detectFormat(filePath) {
16
17
  case ".text":
17
18
  case ".log":
18
19
  case ".csv": return "txt";
19
- default: return null;
20
+ default:
21
+ // MarkItDown handles PPTX, XLSX, HTML, images, audio, etc.
22
+ if (isMarkItDownFormat(ext))
23
+ return "txt";
24
+ return null;
20
25
  }
21
26
  }
27
/**
 * Decide whether a file is routed to MarkItDown rather than a built-in parser.
 * The decision looks only at the file's lower-cased extension.
 */
function shouldUseMarkItDown(filePath) {
    return isMarkItDownFormat(extname(filePath).toLowerCase());
}
22
32
  export function parseDocument(filePath, memory, options) {
23
33
  if (!existsSync(filePath))
24
34
  throw new Error(`File not found: ${filePath}`);
35
+ const ext = extname(filePath).toLowerCase();
36
+ const useMarkItDown = shouldUseMarkItDown(filePath);
25
37
  const format = options?.format ?? detectFormat(filePath);
26
- if (!format)
27
- throw new Error(`Unsupported format: ${extname(filePath)}`);
38
+ if (!format && !useMarkItDown)
39
+ throw new Error(`Unsupported format: ${ext}`);
28
40
  let text;
29
41
  let pageCount;
30
- switch (format) {
31
- case "pdf": {
32
- if (!isPdfSupported())
33
- throw new Error("PDF requires python3 + pymupdf");
34
- const result = extractPdfText(filePath);
35
- text = result.text;
36
- pageCount = result.pageCount;
37
- break;
42
+ if (useMarkItDown) {
43
+ // PPTX, XLSX, HTML, images, audio, etc. → MarkItDown
44
+ if (!isMarkItDownSupported())
45
+ throw new Error("Install markitdown for this format: pip install markitdown");
46
+ text = convertWithMarkItDown(filePath);
47
+ }
48
+ else {
49
+ switch (format) {
50
+ case "pdf": {
51
+ // Try pymupdf first, fall back to markitdown
52
+ if (isPdfSupported()) {
53
+ const result = extractPdfText(filePath);
54
+ text = result.text;
55
+ pageCount = result.pageCount;
56
+ }
57
+ else if (isMarkItDownSupported()) {
58
+ text = convertWithMarkItDown(filePath);
59
+ }
60
+ else {
61
+ throw new Error("PDF requires python3 + pymupdf or markitdown");
62
+ }
63
+ break;
64
+ }
65
+ case "docx":
66
+ try {
67
+ text = extractDocxText(filePath);
68
+ }
69
+ catch {
70
+ // Fall back to markitdown if unzip fails
71
+ if (isMarkItDownSupported()) {
72
+ text = convertWithMarkItDown(filePath);
73
+ }
74
+ else {
75
+ throw new Error(`Failed to parse DOCX: ${filePath}`);
76
+ }
77
+ }
78
+ break;
79
+ case "markdown":
80
+ case "txt":
81
+ text = extractPlainText(filePath, format === "markdown");
82
+ break;
83
+ default:
84
+ if (isMarkItDownSupported()) {
85
+ text = convertWithMarkItDown(filePath);
86
+ }
87
+ else {
88
+ throw new Error(`Unsupported format: ${ext}`);
89
+ }
90
+ break;
38
91
  }
39
- case "docx":
40
- text = extractDocxText(filePath);
41
- break;
42
- case "markdown":
43
- case "txt":
44
- text = extractPlainText(filePath, format === "markdown");
45
- break;
46
92
  }
47
93
  // Truncate if needed
48
94
  const maxChars = options?.maxChars ?? 500_000;
@@ -59,7 +105,7 @@ export function parseDocument(filePath, memory, options) {
59
105
  memory.save();
60
106
  return {
61
107
  filePath,
62
- format,
108
+ format: format ?? "txt",
63
109
  title,
64
110
  text,
65
111
  chunks,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hawon/nexus",
3
- "version": "0.4.0",
3
+ "version": "0.4.1",
4
4
  "description": "The all-in-one AI developer framework — session intelligence, code review, prompt injection defense, infinite memory, self-evolving skills",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",