docsmith-mcp 0.0.1-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/test.yml +35 -0
- package/LICENSE +21 -0
- package/README.md +109 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +679 -0
- package/dist/index.js.map +1 -0
- package/dist/python/excel_handler.py +97 -0
- package/dist/python/pdf_handler.py +81 -0
- package/dist/python/text_handler.py +331 -0
- package/dist/python/word_handler.py +98 -0
- package/examples/sample_data.csv +6 -0
- package/examples/sample_data.json +9 -0
- package/examples/sample_document.pdf +80 -0
- package/examples/sample_report.docx +0 -0
- package/examples/sample_sales_data.xlsx +0 -0
- package/examples/sample_text.txt +10 -0
- package/package.json +36 -0
- package/python/excel_handler.py +97 -0
- package/python/pdf_handler.py +81 -0
- package/python/text_handler.py +331 -0
- package/python/word_handler.py +98 -0
- package/scripts/preload-packages.mjs +64 -0
- package/src/code-runner.ts +136 -0
- package/src/index.ts +496 -0
- package/src/utils.ts +45 -0
- package/tests/document-processing.test.ts +230 -0
- package/tsconfig.json +20 -0
- package/tsdown.config.ts +21 -0
- package/vitest.config.ts +15 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Word document handler - read/write DOCX files
|
|
3
|
+
"""
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
def read_word(file_path: str, page: int = None, page_size: int = 100):
    """Read a Word document, optionally paginating its non-empty paragraphs.

    Args:
        file_path: Path of the DOCX file to open.
        page: 1-based page number. ``None`` returns every non-empty paragraph.
        page_size: Number of paragraphs per page (only used when ``page`` is given).

    Returns:
        A dict with the keys ``paragraphs`` (text of the selected non-empty
        paragraphs), ``tables`` (one list of rows per table, each row a list of
        cell texts), ``total_paragraphs``, ``total_tables``, ``current_page``,
        ``page_size`` (``None`` when not paginating) and ``total_pages``.

    Raises:
        ValueError: If pagination is requested with ``page`` or ``page_size``
            smaller than 1.
    """
    from docx import Document

    if page is not None:
        # Reject values that would otherwise produce negative slice bounds
        # or a division by zero further down.
        if page < 1:
            raise ValueError(f"page must be >= 1, got {page}")
        if page_size < 1:
            raise ValueError(f"page_size must be >= 1, got {page_size}")

    doc = Document(file_path)

    # Blank paragraphs are dropped; all counts below refer to this filtered
    # list so that they agree with what pagination actually slices.
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    total_paragraphs = len(paragraphs)

    # Tables are always returned in full; pagination only covers paragraphs.
    tables = [
        [[cell.text for cell in row.cells] for row in table.rows]
        for table in doc.tables
    ]

    if page is not None:
        start = (page - 1) * page_size
        paragraphs = paragraphs[start:start + page_size]
        # Ceiling division over the same list that was sliced.
        total_pages = (total_paragraphs + page_size - 1) // page_size
    else:
        total_pages = 1

    return {
        "paragraphs": paragraphs,
        "tables": tables,
        "total_paragraphs": total_paragraphs,
        "total_tables": len(doc.tables),
        "current_page": page,
        "page_size": page_size if page is not None else None,
        "total_pages": total_pages
    }
|
|
44
|
+
|
|
45
|
+
def get_word_info(file_path: str):
    """Summarise a DOCX file.

    Returns a dict with the number of non-blank paragraphs ("paragraphs"),
    the number of tables ("tables") and the size of the file on disk in
    bytes ("file_size").
    """
    from docx import Document

    document = Document(file_path)

    # Only paragraphs that contain something other than whitespace count.
    non_blank = [para for para in document.paragraphs if para.text.strip()]

    info = {}
    info["paragraphs"] = len(non_blank)
    info["tables"] = len(document.tables)
    info["file_size"] = Path(file_path).stat().st_size
    return info
|
|
59
|
+
|
|
60
|
+
def write_word(file_path: str, paragraphs: list, tables: list = None):
    """Create a new Word document from paragraphs and optional tables.

    Args:
        file_path: Destination path passed to ``Document.save``.
        paragraphs: Iterable of paragraph texts, added in order.
        tables: Optional list of tables; each table is a list of rows and each
            row a list of cell values. Cell values are converted with ``str``.
            Rows may have different lengths; the table is made as wide as the
            longest row and missing cells are left empty. Tables without any
            cell are skipped.

    Returns:
        ``{"success": True, "file_path": file_path}`` once the file is saved.
    """
    from docx import Document

    doc = Document()

    # Add paragraphs
    for text in paragraphs:
        doc.add_paragraph(text)

    # Add tables if provided
    for table_data in tables or []:
        rows = [list(row) for row in table_data]
        # Size the table by its widest row so a longer later row cannot
        # index past the columns that were created.
        col_count = max((len(row) for row in rows), default=0)
        if col_count == 0:
            # Nothing to render for an empty table.
            continue

        table = doc.add_table(rows=len(rows), cols=col_count)
        for i, row in enumerate(rows):
            cells = table.rows[i].cells  # look the row up once, not per cell
            for j, cell_text in enumerate(row):
                cells[j].text = str(cell_text)

    doc.save(file_path)
    return {"success": True, "file_path": file_path}
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
    # Command-line entry point: <command> <file_path> [extra args...].
    # Whatever happens, exactly one JSON document is printed as the result;
    # argument problems are reported as {"error": ...} like unknown commands.
    if len(sys.argv) < 3:
        result = {"error": "Usage: <read|info|write> <file_path> [args...]"}
    else:
        command = sys.argv[1]
        file_path = sys.argv[2]

        if command == "read":
            # Optional positional arguments: page number, then page size.
            page = int(sys.argv[3]) if len(sys.argv) > 3 else None
            page_size = int(sys.argv[4]) if len(sys.argv) > 4 else 100
            result = read_word(file_path, page, page_size)
        elif command == "info":
            result = get_word_info(file_path)
        elif command == "write":
            if len(sys.argv) < 4:
                result = {"error": "write requires a JSON array of paragraphs"}
            else:
                # Paragraphs and tables arrive as JSON-encoded arguments.
                paragraphs = json.loads(sys.argv[3])
                tables = json.loads(sys.argv[4]) if len(sys.argv) > 4 else None
                result = write_word(file_path, paragraphs, tables)
        else:
            result = {"error": f"Unknown command: {command}"}

    # default=str keeps non-JSON-native values from aborting the dump.
    print(json.dumps(result, default=str))
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Preload Python packages during installation
|
|
4
|
+
* This script runs runPy to install packages into Pyodide's cache
|
|
5
|
+
*
|
|
6
|
+
* Note: Requires Node.js with --experimental-wasm-stack-switching flag
|
|
7
|
+
*/
|
|
8
|
+
import { runPy } from "@mcpc-tech/code-runner-mcp";
|
|
9
|
+
|
|
10
|
+
// Python package names that preloadPackages() embeds into the generated
// Python snippet and hands to micropip.install.
const PACKAGES = [
  "openpyxl",
  "python-docx",
  "PyPDF2",
];
|
|
15
|
+
|
|
16
|
+
/**
 * Run a small Python program through runPy that installs every entry of
 * PACKAGES with micropip, echoing the runner's output to stdout.
 *
 * Failures are reported but never rethrown, so the surrounding install step
 * is not aborted by a failed preload.
 */
async function preloadPackages() {
  console.log("📦 Preloading Python packages for Pyodide...\n");

  // Use async wrapper to allow await
  const code = `
import micropip
import asyncio

async def main():
    packages = ${JSON.stringify(PACKAGES)}

    print(f"Installing {len(packages)} packages...")
    for pkg in packages:
        print(f" - {pkg}")

    await micropip.install(packages)
    print("\\n✅ All packages installed successfully!")

asyncio.run(main())
`;

  try {
    // Build the name -> name mapping from PACKAGES so the two lists cannot
    // drift apart.
    const stream = await runPy(code, {
      packages: Object.fromEntries(PACKAGES.map((name) => [name, name])),
    });

    const reader = stream.getReader();
    const decoder = new TextDecoder();

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        process.stdout.write(decoder.decode(value, { stream: true }));
      }
      // Flush bytes the decoder may still hold from a split multi-byte char.
      const tail = decoder.decode();
      if (tail) process.stdout.write(tail);
    } finally {
      reader.releaseLock();
    }

    console.log("\n✨ Preload complete!");
  } catch (error) {
    // A thrown value is not guaranteed to be an Error instance.
    const message = error instanceof Error ? error.message : String(error);
    console.error("\n❌ Preload failed:", message);
    // Don't fail the install, just warn
    console.log("⚠️ Packages will be downloaded on first use");
  }
}

preloadPackages();
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code runner client - uses @mcpc-tech/code-runner-mcp npm package
|
|
3
|
+
*/
|
|
4
|
+
import { runPy, type RunPyOptions } from "@mcpc-tech/code-runner-mcp";
|
|
5
|
+
import { readFileSync } from "fs";
|
|
6
|
+
import { fileURLToPath } from "url";
|
|
7
|
+
import { dirname, join, resolve } from "path";
|
|
8
|
+
|
|
9
|
+
// This file is an ES module (it reads import.meta.url), so the CommonJS
// __filename/__dirname globals are rebuilt here. __dirname is used below to
// locate the bundled Python scripts and the default mount root.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
|
|
11
|
+
|
|
12
|
+
/**
 * Options for running Python script files with runPythonFile.
 */
export interface RunPythonFileOptions {
  /** Command line arguments to pass to the script; they follow the script path in sys.argv. */
  args?: string[];
  /** Package name mappings (import_name -> pypi_name), forwarded unchanged to runPy. */
  packages?: Record<string, string>;
  /**
   * Base directory for the script (default: "python"). It is joined onto the
   * parent of this module's directory.
   */
  baseDir?: string;
  /**
   * User file paths that need to be accessible (for file system mounting).
   * Only the first entry is consulted: its parent directory becomes the
   * mounted root. When empty, the parent of this module's directory is mounted.
   */
  filePaths?: string[];
}
|
|
25
|
+
|
|
26
|
+
/**
 * Work out which host directory has to be mounted for a given file.
 *
 * The file path is made absolute; the directory containing it is the mount
 * root (so the file's siblings are reachable too) and the absolute path
 * itself is reused unchanged as the path seen from Python.
 *
 * @param filePath - Path to the file (resolved against the current working directory if relative)
 * @returns Object with mountRoot (host directory) and virtualPath (absolute file path)
 */
function getFileSystemMapping(
  filePath: string,
): { mountRoot: string; virtualPath: string } {
  const virtualPath = resolve(filePath);
  return { mountRoot: dirname(virtualPath), virtualPath };
}
|
|
45
|
+
|
|
46
|
+
/**
 * Build the Python source handed to runPy: a prelude that sets sys.argv,
 * followed by the script body.
 */
function buildWrapperCode(
  scriptPath: string,
  args: string[],
  scriptContent: string,
): string {
  // Encode twice: the inner JSON is the argv list, the outer JSON turns that
  // text into a double-quoted literal whose escapes Python also accepts.
  // Quotes or backslashes in the path/arguments can therefore never break
  // out of the literal.
  const argvLiteral = JSON.stringify(JSON.stringify([scriptPath, ...args]));

  return `
import sys
import json

# Set command line arguments
sys.argv = json.loads(${argvLiteral})

# Execute the script
${scriptContent}
`;
}

/**
 * Interpret the script's stdout: the last line is expected to be JSON; if it
 * is not, the raw stdout/stderr text is returned instead.
 */
function parseScriptOutput(stdout: string, stderr: string): unknown {
  const lines = stdout.trim().split("\n");
  const lastLine = lines[lines.length - 1] ?? "";

  try {
    return JSON.parse(lastLine);
  } catch {
    return { stdout, stderr };
  }
}

/**
 * Run a Python script file using code-runner-mcp
 *
 * The script is read from `<module dir>/../<baseDir>/<scriptPath>`, prefixed
 * with code that sets `sys.argv`, and executed through runPy. The parent
 * directory of the first entry in `filePaths` (or the parent of this
 * module's directory when none is given) is passed as both the mount point
 * and the mounted host root.
 *
 * @param scriptPath - Path to the Python script (relative to baseDir)
 * @param options - Execution options
 * @returns The parsed JSON of the script's last stdout line; `{ error }` when
 *   the stream fails or an `[err]` chunk is seen; `{ stdout, stderr }` when
 *   the last line is not JSON
 * @throws If the script file cannot be read or runPy itself rejects
 */
export async function runPythonFile(
  scriptPath: string,
  options: RunPythonFileOptions = {},
): Promise<any> {
  const {
    args = [],
    packages = {},
    baseDir = "python",
    filePaths = [],
  } = options;

  // Read the Python script
  const fullPath = join(__dirname, "..", baseDir, scriptPath);
  const scriptContent = readFileSync(fullPath, "utf-8");
  const wrapperCode = buildWrapperCode(scriptPath, args, scriptContent);

  // Determine mount root from the first file path; default: project root
  const [firstFilePath] = filePaths;
  const mountRoot =
    firstFilePath !== undefined
      ? getFileSystemMapping(firstFilePath).mountRoot
      : join(__dirname, "..");

  // Mount point is the same as the mount root, so the paths passed to runPy
  // for the host side and the Python side are identical.
  const runPyOptions: RunPyOptions = {
    packages,
    nodeFSMountPoint: mountRoot,
    nodeFSRoot: mountRoot,
  };
  const stream = await runPy(wrapperCode, runPyOptions);

  // Read the stream output
  const reader = stream.getReader();
  const decoder = new TextDecoder();
  let stdout = "";
  let stderr = "";
  let error = "";

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // NOTE(review): routing looks only at the start of each chunk, which
      // presumably relies on runPy emitting every prefixed message as its
      // own chunk - confirm against the runner.
      const chunk = decoder.decode(value, { stream: true });
      if (chunk.startsWith("[stderr] ")) {
        stderr += chunk.slice("[stderr] ".length);
      } else if (chunk.startsWith("[err]")) {
        error += chunk;
      } else {
        stdout += chunk;
      }
    }
  } catch (streamError) {
    // Stream error means Python execution failed
    return { error: String(streamError) };
  } finally {
    reader.releaseLock();
  }

  // Check for errors
  if (error) {
    return { error: error.replace(/\[err\]\[py\]\s*/g, "").trim() };
  }

  // Parse the JSON output from the script (last line)
  return parseScriptOutput(stdout, stderr);
}
|