@youhaozhao/pdf2docx-mcp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,74 @@
1
+ # pdf2docx MCP Server
2
+
3
+ A [Model Context Protocol](https://modelcontextprotocol.io) server that converts PDF documents to editable DOCX format.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install -g @youhaozhao/pdf2docx-mcp
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### As an MCP Server
14
+
15
+ Add to your Claude Desktop configuration (`claude_desktop_config.json`):
16
+
17
+ ```json
18
+ {
19
+ "mcpServers": {
20
+ "pdf2docx": {
21
+ "command": "pdf2docx-mcp"
22
+ }
23
+ }
24
+ }
25
+ ```
26
+
27
+ Or with npx:
28
+
29
+ ```json
30
+ {
31
+ "mcpServers": {
32
+ "pdf2docx": {
33
+ "command": "npx",
34
+ "args": ["@youhaozhao/pdf2docx-mcp"]
35
+ }
36
+ }
37
+ }
38
+ ```
39
+
40
+ ### Available Tools
41
+
42
+ #### `convert`
43
+
44
+ Convert a PDF file to DOCX format.
45
+
46
+ | Parameter | Type | Required | Description |
47
+ |-----------|------|----------|-------------|
48
+ | `pdf_path` | string | Yes | Absolute path to input PDF |
49
+ | `output_path` | string | No | Output path (default: same as input with .docx) |
50
+ | `pages` | string | No | Pages to convert (0-indexed): "0,1,2" or "0-5" |
51
+ | `password` | string | No | Password for encrypted PDFs |
52
+
53
+ **Returns:** Conversion result with output path and file size
54
+
55
+ #### `get_info`
56
+
57
+ Get metadata about a PDF file.
58
+
59
+ | Parameter | Type | Required | Description |
60
+ |-----------|------|----------|-------------|
61
+ | `pdf_path` | string | Yes | Absolute path to PDF |
62
+
63
+ **Returns:** Page count, file size, encryption status, and metadata
64
+
65
+ ## Requirements
66
+
67
+ - Node.js >= 18.0.0
68
+ - Python >= 3.10
69
+
70
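+ Python dependencies are installed automatically during `npm install` when a Python interpreter is found; otherwise the launcher installs them the first time the server starts.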
71
+
72
+ ## License
73
+
74
+ MIT
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * pdf2docx MCP Server launcher.
5
+ * Detects Python, installs any missing dependencies, then starts the Python MCP server.
6
+ */
7
+
8
+ const { spawn } = require('child_process');
9
+ const path = require('path');
10
+ const fs = require('fs');
11
+
12
+ // Paths to the bundled Python sources
13
+ const PYTHON_SCRIPT = path.join(__dirname, '..', 'python', 'mcp_server.py');
14
+ const PYTHON_REQUIREMENTS = path.join(__dirname, '..', 'python', 'requirements.txt');
15
+
16
/**
 * Find a Python interpreter on PATH that meets the minimum version this
 * package documents (Python 3.10).
 *
 * Each candidate command is probed with `--version`. The reported version is
 * parsed instead of merely checking for the word "Python", so an older
 * interpreter (for example a system `python3` that is 3.8) is skipped and the
 * search continues with the remaining, more specific command names.
 *
 * @returns {Promise<string>} The first command that runs and reports Python >= 3.10.
 * @throws {Error} When no candidate runs or none is new enough.
 */
async function findPython() {
  const MIN_MAJOR = 3;
  const MIN_MINOR = 10;
  const pythonCommands = ['python3', 'python', 'python3.14', 'python3.13', 'python3.12', 'python3.11', 'python3.10'];
  const tooOld = [];

  for (const cmd of pythonCommands) {
    try {
      const result = await spawnAsync(cmd, ['--version']);
      // Look at both streams: some interpreters print the version banner on stderr.
      const banner = `${result.stdout || ''}${result.stderr || ''}`;
      const match = /Python\s+(\d+)\.(\d+)/.exec(banner);
      if (!match) {
        continue;
      }

      const major = Number(match[1]);
      const minor = Number(match[2]);
      if (major > MIN_MAJOR || (major === MIN_MAJOR && minor >= MIN_MINOR)) {
        return cmd;
      }
      tooOld.push(`${cmd} (${major}.${minor})`);
    } catch {
      // Command is missing or failed to run; try the next candidate.
    }
  }

  const hint = tooOld.length > 0
    ? `\nFound only unsupported versions: ${tooOld.join(', ')}`
    : '';
  throw new Error(
    'Python not found. Please install Python 3.10+ from https://python.org\n' +
    'After installation, restart your terminal and try again.' +
    hint
  );
}
36
+
37
/**
 * Make sure the Python packages the server needs are importable, installing
 * them from requirements.txt when they are not.
 *
 * Exits the process with status 1 when requirements.txt is missing or when
 * pip fails.
 *
 * @param {string} pythonCmd - Python command to run the check and pip with.
 * @returns {Promise<void>}
 */
async function ensureDependencies(pythonCmd) {
  const requirementsPath = PYTHON_REQUIREMENTS;

  if (!fs.existsSync(requirementsPath)) {
    console.error('Error: requirements.txt not found at', requirementsPath);
    process.exit(1);
  }

  // spawnAsync goes through a shell on Windows, where the arguments are joined
  // with spaces. Anything containing a space must be quoted there, otherwise
  // `-c import mcp` runs just `import` and paths with spaces are split.
  const usesShell = process.platform === 'win32';
  const shellArg = (value) => (usesShell ? `"${value}"` : value);

  // Probe every module the server imports, not only `mcp`. find_spec locates
  // the modules without importing them, which keeps the check cheap.
  const probe =
    "import importlib.util as u, sys; " +
    "sys.exit(0 if all(u.find_spec(m) for m in ('mcp', 'pdf2docx', 'fitz')) else 1)";

  try {
    await spawnAsync(pythonCmd, ['-c', shellArg(probe)]);
    return;
  } catch {
    // At least one module is missing (or the probe failed); install below.
  }

  console.error('Installing Python dependencies...');
  try {
    // This launcher's stdout is later inherited by the MCP server, so pip must
    // not write to it: send pip's stdout to our stderr (fd 2) and give it no stdin.
    await spawnAsync(pythonCmd, ['-m', 'pip', 'install', '-r', shellArg(requirementsPath)], {
      stdio: ['ignore', 2, 'inherit']
    });
    console.error('Python dependencies installed successfully\n');
  } catch (error) {
    console.error('\nFailed to install Python dependencies');
    console.error(`Reason: ${error.message}`);
    console.error('Please run manually:');
    console.error(` ${pythonCmd} -m pip install -r ${requirementsPath}`);
    process.exit(1);
  }
}
65
+
66
/**
 * Run a command to completion and collect its output.
 *
 * The child is started through a shell on Windows only. Caller-supplied
 * `options` are spread last, so they override the defaults (including `stdio`).
 * Output is captured only for streams that are piped.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [options] - Extra options forwarded to `child_process.spawn`.
 * @returns {Promise<{stdout: string, stderr: string, code: number}>}
 *   Resolves when the child exits with status 0.
 * @throws {Error} Rejects when the child cannot be started, exits non-zero, or
 *   is killed by a signal. Exit failures carry `stdout`, `stderr`, `code` and
 *   `signal` properties.
 */
function spawnAsync(command, args, options = {}) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: 'pipe',
      shell: process.platform === 'win32',
      ...options
    });

    let stdout = '';
    let stderr = '';

    if (child.stdout) {
      // Decode as a stream so multi-byte characters split across chunks stay intact.
      child.stdout.setEncoding('utf8');
      child.stdout.on('data', (chunk) => {
        stdout += chunk;
      });
    }

    if (child.stderr) {
      child.stderr.setEncoding('utf8');
      child.stderr.on('data', (chunk) => {
        stderr += chunk;
      });
    }

    child.on('close', (code, signal) => {
      if (code === 0) {
        resolve({ stdout, stderr, code });
        return;
      }

      // A child killed by a signal has no exit code; report the signal instead
      // of the misleading "exit code null".
      const reason = code === null && signal
        ? `Command terminated by signal ${signal}`
        : `Command failed with exit code ${code}`;
      const error = new Error(reason);
      error.stdout = stdout;
      error.stderr = stderr;
      error.code = code;
      error.signal = signal;
      reject(error);
    });

    // Emitted when the process could not be spawned at all (e.g. ENOENT).
    child.on('error', (error) => {
      reject(error);
    });
  });
}
109
+
110
/**
 * Launcher entry point: verify the bundled server script exists, locate
 * Python, make sure dependencies are installed, then run the Python MCP server
 * with this process's stdio and mirror its exit status.
 *
 * Any failure before the server starts is reported on stderr and ends the
 * process with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    if (!fs.existsSync(PYTHON_SCRIPT)) {
      console.error('Error: mcp_server.py not found at', PYTHON_SCRIPT);
      process.exit(1);
    }

    const pythonCmd = await findPython();
    await ensureDependencies(pythonCmd);

    // With a shell (Windows) the arguments are joined with spaces, so a script
    // path containing spaces has to be quoted to stay a single argument.
    const useShell = process.platform === 'win32';
    const scriptArg = useShell ? `"${PYTHON_SCRIPT}"` : PYTHON_SCRIPT;

    // Put the bundled sources first but keep whatever PYTHONPATH the caller set.
    const pythonDir = path.join(__dirname, '..', 'python');
    const pythonPath = [pythonDir, process.env.PYTHONPATH]
      .filter(Boolean)
      .join(path.delimiter);

    // Status messages go to stderr; stdout is handed to the server untouched.
    console.error('pdf2docx MCP Server started, waiting for connection...');
    const child = spawn(pythonCmd, [scriptArg], {
      stdio: 'inherit',
      shell: useShell,
      env: {
        ...process.env,
        PYTHONPATH: pythonPath
      }
    });

    // Pass termination requests on to the server so it is not left running
    // after the launcher is asked to stop; we exit once the child has exited.
    for (const signalName of ['SIGINT', 'SIGTERM']) {
      process.on(signalName, () => {
        child.kill(signalName);
      });
    }

    child.on('error', (error) => {
      console.error('Failed to start MCP Server:', error.message);
      process.exit(1);
    });

    child.on('exit', (code, signal) => {
      // `code` is null when the child was killed by a signal; that must not be
      // reported as a successful exit.
      process.exit(code ?? (signal ? 1 : 0));
    });

  } catch (error) {
    console.error('Error:', error.message);
    process.exit(1);
  }
}

main();
package/package.json ADDED
@@ -0,0 +1,50 @@
1
+ {
2
+ "name": "@youhaozhao/pdf2docx-mcp",
3
+ "version": "0.1.0",
4
+ "description": "MCP Server for converting PDF documents to editable DOCX format",
5
+ "keywords": [
6
+ "mcp",
7
+ "mcp-server",
8
+ "pdf",
9
+ "docx",
10
+ "pdf2docx",
11
+ "document-conversion"
12
+ ],
13
+ "homepage": "https://github.com/youhaozhao/pdf2docx-mcp#readme",
14
+ "repository": {
15
+ "type": "git",
16
+ "url": "git+https://github.com/youhaozhao/pdf2docx-mcp.git"
17
+ },
18
+ "bugs": {
19
+ "url": "https://github.com/youhaozhao/pdf2docx-mcp/issues"
20
+ },
21
+ "license": "MIT",
22
+ "bin": {
23
+ "pdf2docx-mcp": "bin/pdf2docx-mcp.js"
24
+ },
25
+ "scripts": {
26
+ "postinstall": "node scripts/install-python-deps.js",
27
+ "start": "node bin/pdf2docx-mcp.js",
28
+ "dev": "node bin/pdf2docx-mcp.js"
29
+ },
30
+ "engines": {
31
+ "node": ">=18.0.0"
32
+ },
33
+ "os": [
34
+ "darwin",
35
+ "linux",
36
+ "win32"
37
+ ],
38
+ "files": [
39
+ "bin/",
40
+ "python/",
41
+ "scripts/",
42
+ "README.md",
43
+ "LICENSE"
44
+ ],
45
+ "devDependencies": {
46
+ "@eslint/js": "^10.0.1",
47
+ "eslint": "^10.0.0",
48
+ "globals": "^17.3.0"
49
+ }
50
+ }
@@ -0,0 +1,121 @@
1
+ """pdf2docx MCP Server.
2
+
3
+ A Model Context Protocol server that converts PDF documents to editable DOCX format.
4
+ """
5
+
6
+ from mcp.server.fastmcp import FastMCP
7
+ from pdf2docx import Converter
8
+ import os
9
+
10
# Module-level FastMCP application object; the functions decorated with
# @mcp.tool() below are registered on it, and mcp.run() is called on it when
# this file is executed as a script.
mcp = FastMCP(
    name="pdf2docx",
    instructions="Convert PDF documents to editable DOCX format. Supports partial page conversion and encrypted PDFs with password."
)
14
+
15
+
16
def _parse_pages(pages: str) -> list:
    """Turn a page specification into a list of 0-indexed page numbers.

    Accepts comma-separated entries where each entry is a single page ("3") or
    an inclusive range ("0-5"); both forms may be mixed ("0,2-4").

    Raises:
        ValueError: If an entry is not an integer or range, a range runs
            backwards, or the specification selects no pages at all.
    """
    selected = []
    for entry in pages.split(","):
        entry = entry.strip()
        if not entry:
            # Tolerate stray commas such as a trailing one.
            continue
        if "-" in entry:
            first_text, last_text = entry.split("-", 1)
            first, last = int(first_text), int(last_text)
            if last < first:
                raise ValueError(f"Invalid page range: {entry}")
            selected.extend(range(first, last + 1))
        else:
            selected.append(int(entry))
    if not selected:
        raise ValueError(f"No pages specified in: {pages!r}")
    return selected


@mcp.tool()
def convert(pdf_path: str, output_path: str = None, pages: str = None, password: str = None) -> dict:
    """Convert PDF file to DOCX format.

    Args:
        pdf_path: Absolute path to the input PDF file
        output_path: Absolute path for the output DOCX file. If not provided, uses the same directory as pdf_path with .docx extension
        pages: Optional page numbers to convert (0-indexed), as comma-separated pages and/or inclusive ranges. Example: "0,1,2", "0-5" or "0,2-4"
        password: Optional password for encrypted PDFs; forwarded to pdf2docx

    Returns:
        dict: Conversion result with success status, input and output paths,
        output size in MB, and the converted pages (or "all")

    Raises:
        FileNotFoundError: If the input PDF file doesn't exist
        ValueError: If `pages` cannot be parsed

    Errors raised by pdf2docx itself (for example for an encrypted PDF with a
    missing or wrong password) propagate unchanged.
    """
    # Validate input file exists
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Generate default output path if not provided
    if output_path is None:
        output_path = f"{os.path.splitext(pdf_path)[0]}.docx"

    # Ensure output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse page numbers if provided
    pages_list = _parse_pages(pages) if pages else None

    # Hand the password to the converter; previously it was accepted but unused.
    cv = Converter(pdf_path, password=password)
    try:
        if pages_list:
            cv.convert(output_path, pages=pages_list)
        else:
            cv.convert(output_path)
    finally:
        # Release the underlying document even when conversion fails.
        cv.close()

    # Get output file size
    file_size_mb = os.path.getsize(output_path) / (1024 * 1024)

    return {
        "success": True,
        "input_path": pdf_path,
        "output_path": output_path,
        "size_mb": round(file_size_mb, 2),
        "pages": pages_list if pages_list else "all"
    }
77
+
78
+
79
@mcp.tool()
def get_info(pdf_path: str) -> dict:
    """Get metadata information about a PDF file.

    Args:
        pdf_path: Absolute path to the PDF file

    Returns:
        dict: The path, page count, file size in MB, whether the file needs a
        password, and the title/author/subject/creator/producer metadata
        (empty strings where a value is unavailable)

    Raises:
        FileNotFoundError: If the PDF file doesn't exist
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)

    # Open PDF to get page count
    import fitz  # PyMuPDF
    doc = fitz.open(pdf_path)
    try:
        page_count = doc.page_count
        # PyMuPDF exposes password protection as `needs_pass`.
        is_encrypted = bool(doc.needs_pass)
        # Metadata may be unavailable (None) for a document that is still locked.
        metadata = doc.metadata or {}
    finally:
        doc.close()

    return {
        "path": pdf_path,
        "page_count": page_count,
        "size_mb": round(file_size_mb, 2),
        "is_encrypted": is_encrypted,
        "metadata": {
            # `or ""` also covers keys that are present but set to None.
            "title": metadata.get("title") or "",
            "author": metadata.get("author") or "",
            "subject": metadata.get("subject") or "",
            "creator": metadata.get("creator") or "",
            "producer": metadata.get("producer") or "",
        }
    }
118
+
119
+
120
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    mcp.run()
@@ -0,0 +1,3 @@
1
+ mcp>=1.2.0
2
+ pdf2docx>=0.8.0
3
+ pymupdf>=1.26.7
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * postinstall script: installs the Python dependencies
5
+ */
6
+
7
+ const { spawn } = require('child_process');
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+
11
+ const PYTHON_REQUIREMENTS = path.join(__dirname, '..', 'python', 'requirements.txt');
12
+
13
/**
 * Find a Python interpreter on PATH that meets the minimum version this
 * package documents (Python 3.10).
 *
 * Each candidate is probed with `--version` and the reported version is
 * parsed, so an interpreter older than 3.10 is skipped instead of being used
 * for the dependency installation.
 *
 * @returns {Promise<string|null>} The first suitable command, or `null` when
 *   none is available (the caller then skips installation).
 */
async function findPython() {
  const MIN_MAJOR = 3;
  const MIN_MINOR = 10;
  const pythonCommands = ['python3', 'python', 'python3.14', 'python3.13', 'python3.12', 'python3.11', 'python3.10'];

  for (const cmd of pythonCommands) {
    try {
      const result = await spawnAsync(cmd, ['--version']);
      // Look at both streams: some interpreters print the version banner on stderr.
      const banner = `${result.stdout || ''}${result.stderr || ''}`;
      const match = /Python\s+(\d+)\.(\d+)/.exec(banner);
      if (!match) {
        continue;
      }

      const major = Number(match[1]);
      const minor = Number(match[2]);
      if (major > MIN_MAJOR || (major === MIN_MAJOR && minor >= MIN_MINOR)) {
        return cmd;
      }
    } catch {
      // Command is missing or failed to run; try the next candidate.
    }
  }

  return null;
}
30
+
31
/**
 * Run a command to completion and collect its output.
 *
 * The child is started through a shell on Windows only. Caller-supplied
 * `options` are spread last, so they override the defaults (including `stdio`).
 * Output is captured only for streams that are piped.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [options] - Extra options forwarded to `child_process.spawn`.
 * @returns {Promise<{stdout: string, stderr: string}>} Resolves when the child
 *   exits with status 0.
 * @throws {Error} Rejects when the child cannot be started or does not exit
 *   with status 0. Exit failures carry `stdout`, `stderr`, `code` and `signal`
 *   so callers can report why the command failed.
 */
function spawnAsync(command, args, options = {}) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      stdio: 'pipe',
      shell: process.platform === 'win32',
      ...options
    });

    let stdout = '';
    let stderr = '';

    if (child.stdout) {
      // Decode as a stream so multi-byte characters split across chunks stay intact.
      child.stdout.setEncoding('utf8');
      child.stdout.on('data', (chunk) => {
        stdout += chunk;
      });
    }

    if (child.stderr) {
      child.stderr.setEncoding('utf8');
      child.stderr.on('data', (chunk) => {
        stderr += chunk;
      });
    }

    child.on('close', (exitCode, signal) => {
      if (exitCode === 0) {
        resolve({ stdout, stderr });
        return;
      }

      const error = new Error(`Command failed: ${command} ${args.join(' ')}`);
      error.stdout = stdout;
      error.stderr = stderr;
      error.code = exitCode;
      error.signal = signal;
      reject(error);
    });

    // Emitted when the process could not be spawned at all (e.g. ENOENT).
    child.on('error', reject);
  });
}
65
+
66
/**
 * postinstall entry point: install the Python packages listed in
 * requirements.txt when they are not already available.
 *
 * This is best-effort. Every failure path prints a warning and returns, so a
 * missing interpreter or a pip error never fails `npm install`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (!fs.existsSync(PYTHON_REQUIREMENTS)) {
    console.warn('requirements.txt not found, skipping Python dependency installation');
    return;
  }

  const pythonCmd = await findPython();
  if (!pythonCmd) {
    console.warn('Python not found, skipping Python dependency installation');
    console.warn('Please install Python 3.10+ and run: pip install -r python/requirements.txt');
    return;
  }

  // spawnAsync goes through a shell on Windows, where the arguments are joined
  // with spaces. Anything containing a space must be quoted there, otherwise
  // `-c import mcp` runs just `import` and paths with spaces are split.
  const usesShell = process.platform === 'win32';
  const shellArg = (value) => (usesShell ? `"${value}"` : value);

  // Probe every module the server imports, not only `mcp`. find_spec locates
  // the modules without importing them.
  const probe =
    "import importlib.util as u, sys; " +
    "sys.exit(0 if all(u.find_spec(m) for m in ('mcp', 'pdf2docx', 'fitz')) else 1)";

  try {
    await spawnAsync(pythonCmd, ['-c', shellArg(probe)]);
    console.log('Python dependencies already installed');
    return;
  } catch {
    // At least one module is missing (or the probe failed); install below.
  }

  console.log('Installing Python dependencies...');
  try {
    await spawnAsync(pythonCmd, ['-m', 'pip', 'install', '-r', shellArg(PYTHON_REQUIREMENTS)], {
      stdio: 'inherit'
    });
    console.log('Python dependencies installed successfully');
  } catch (error) {
    console.warn('Failed to install Python dependencies during postinstall:', error.message);
    // Use the absolute path so the hint works from any working directory.
    console.warn(`Please run manually: ${pythonCmd} -m pip install -r "${PYTHON_REQUIREMENTS}"`);
  }
}

main().catch((error) => {
  // Never block npm install, but say why the step was skipped.
  console.warn('Skipping Python dependency installation:', error && error.message ? error.message : error);
});