mineru-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +233 -0
- package/package.json +53 -0
package/README.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# mineru-mcp
|
|
2
|
+
|
|
3
|
+
MCP server for [MinerU](https://mineru.net) document parsing API - optimized for Claude Code.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **4 optimized tools** with concise descriptions (~73% token reduction vs alternatives)
|
|
8
|
+
- **VLM model support** (90%+ accuracy) and pipeline mode (faster)
|
|
9
|
+
- **Page range selection** - parse specific pages only
|
|
10
|
+
- **Batch processing** - up to 200 documents at once
|
|
11
|
+
- **Pagination** - efficient handling of large batch results
|
|
12
|
+
- **OCR support** for 109 languages
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### Claude Code
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
claude mcp add mineru-mcp -e MINERU_API_KEY=your-api-key -- npx mineru-mcp
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Claude Desktop
|
|
23
|
+
|
|
24
|
+
Add to your Claude Desktop config:
|
|
25
|
+
|
|
26
|
+
```json
|
|
27
|
+
{
|
|
28
|
+
"mcpServers": {
|
|
29
|
+
"mineru": {
|
|
30
|
+
"command": "npx",
|
|
31
|
+
"args": ["-y", "mineru-mcp"],
|
|
32
|
+
"env": {
|
|
33
|
+
"MINERU_API_KEY": "your-api-key"
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration
|
|
41
|
+
|
|
42
|
+
| Environment Variable | Default | Description |
|
|
43
|
+
|---------------------|---------|-------------|
|
|
44
|
+
| `MINERU_API_KEY` | (required) | Your MinerU API Bearer token |
|
|
45
|
+
| `MINERU_BASE_URL` | `https://mineru.net/api/v4` | API base URL |
|
|
46
|
+
| `MINERU_DEFAULT_MODEL` | `pipeline` | Default model: `pipeline` or `vlm` |
|
|
47
|
+
|
|
48
|
+
Get your API key at [mineru.net](https://mineru.net)
|
|
49
|
+
|
|
50
|
+
## Tools
|
|
51
|
+
|
|
52
|
+
### `mineru_parse`
|
|
53
|
+
|
|
54
|
+
Parse a single document URL.
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
mineru_parse({
|
|
58
|
+
url: "https://example.com/document.pdf",
|
|
59
|
+
model: "vlm", // optional: "pipeline" (default) or "vlm" (90% accuracy)
|
|
60
|
+
pages: "1-10,15", // optional: page ranges
|
|
61
|
+
ocr: true, // optional: enable OCR (pipeline only)
|
|
62
|
+
formula: true, // optional: formula recognition
|
|
63
|
+
table: true, // optional: table recognition
|
|
64
|
+
language: "en", // optional: language code
|
|
65
|
+
formats: ["html"] // optional: extra export formats
|
|
66
|
+
})
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### `mineru_status`
|
|
70
|
+
|
|
71
|
+
Check task progress and get download URL.
|
|
72
|
+
|
|
73
|
+
```typescript
|
|
74
|
+
mineru_status({
|
|
75
|
+
task_id: "abc-123",
|
|
76
|
+
format: "concise" // optional: "concise" (default) or "detailed"
|
|
77
|
+
})
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Concise output**: `done | abc-123 | https://cdn-mineru.../result.zip`
|
|
81
|
+
|
|
82
|
+
### `mineru_batch`
|
|
83
|
+
|
|
84
|
+
Parse multiple documents in one batch (max 200).
|
|
85
|
+
|
|
86
|
+
```typescript
|
|
87
|
+
mineru_batch({
|
|
88
|
+
urls: ["https://example.com/doc1.pdf", "https://example.com/doc2.pdf"],
|
|
89
|
+
model: "vlm"
|
|
90
|
+
})
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### `mineru_batch_status`
|
|
94
|
+
|
|
95
|
+
Get batch results with pagination.
|
|
96
|
+
|
|
97
|
+
```typescript
|
|
98
|
+
mineru_batch_status({
|
|
99
|
+
batch_id: "batch-123",
|
|
100
|
+
limit: 10, // optional: max results (default: 10)
|
|
101
|
+
offset: 0, // optional: skip first N results
|
|
102
|
+
format: "concise" // optional: "concise" (default) or "detailed"
|
|
103
|
+
})
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Supported Formats
|
|
107
|
+
|
|
108
|
+
- PDF, DOC, DOCX, PPT, PPTX
|
|
109
|
+
- PNG, JPG, JPEG
|
|
110
|
+
|
|
111
|
+
## Limits
|
|
112
|
+
|
|
113
|
+
- Single file: 200MB max, 600 pages max
|
|
114
|
+
- Daily quota: 2000 pages at high priority
|
|
115
|
+
- Batch: max 200 URLs per request
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT
|
|
120
|
+
|
|
121
|
+
## Links
|
|
122
|
+
|
|
123
|
+
- [MinerU](https://mineru.net) - Document parsing service
|
|
124
|
+
- [MinerU GitHub](https://github.com/opendatalab/MinerU) - Open source version
|
|
125
|
+
- [MCP Specification](https://modelcontextprotocol.io) - Model Context Protocol
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
4
|
+
import { z } from "zod";
|
|
5
|
+
import axios, { AxiosError } from "axios";
|
|
6
|
+
// Configuration from environment
|
|
7
|
+
/**
 * Runtime configuration, read once from the environment at module load.
 *
 * - apiKey: MINERU_API_KEY, or "" when unset.
 * - baseUrl: MINERU_BASE_URL, or the default v4 endpoint. Trailing slashes are
 *   stripped because request paths are appended as "/path"; a base URL ending
 *   in "/" would otherwise yield "//" in the final URL.
 * - defaultModel: MINERU_DEFAULT_MODEL, or "pipeline".
 */
const config = {
    apiKey: process.env.MINERU_API_KEY || "",
    baseUrl: (process.env.MINERU_BASE_URL || "https://mineru.net/api/v4").replace(/\/+$/, ""),
    defaultModel: process.env.MINERU_DEFAULT_MODEL || "pipeline",
};
|
|
12
|
+
// Error codes with actionable messages
|
|
13
|
+
/**
 * Actionable messages keyed by MinerU error code (as a string).
 * Frozen so the shared table cannot be modified by accident at runtime.
 */
const ERROR_MESSAGES = Object.freeze({
    "A0202": "Token error. Check your API key.",
    "A0211": "Token expired. Get a new API key.",
    "-60002": "Invalid file format. Use: pdf, doc, docx, ppt, pptx, png, jpg, jpeg",
    "-60005": "File too large. Max 200MB.",
    "-60006": "Too many pages. Max 600 per file. Split the document.",
    "-60008": "URL timeout. Check the URL is accessible.",
    "-60009": "Queue full. Try again later.",
    "-60012": "Task not found. Check task_id is valid.",
    "-60013": "Access denied. You can only access your own tasks.",
});
|
|
24
|
+
// API client
|
|
25
|
+
/**
 * Call the MinerU REST API and unwrap its response envelope.
 *
 * @param {string} endpoint - Path appended to config.baseUrl (e.g. "/extract/task").
 * @param {string} [method="GET"] - HTTP method.
 * @param {object} [data] - Request body, if any.
 * @returns {Promise<*>} The `data` field of the response envelope.
 * @throws {Error} When the API key is missing, when the API reports a non-zero
 *   `code`, or when the HTTP request fails. For request failures the original
 *   AxiosError is attached as `cause`.
 */
async function mineruRequest(endpoint, method = "GET", data) {
    if (!config.apiKey) {
        throw new Error("MINERU_API_KEY not set. Add it to your environment.");
    }
    try {
        const response = await axios({
            method,
            url: `${config.baseUrl}${endpoint}`,
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${config.apiKey}`,
            },
            data,
        });
        const result = response.data;
        if (result.code !== 0) {
            const code = String(result.code);
            const msg = ERROR_MESSAGES[code] || result.msg || "Unknown error";
            throw new Error(`MinerU error ${code}: ${msg}`);
        }
        return result.data;
    }
    catch (error) {
        // Errors thrown above for a non-zero envelope code are not AxiosErrors
        // and fall through to the final rethrow unchanged.
        if (error instanceof AxiosError) {
            const body = error.response?.data;
            const code = body?.code;
            if (code) {
                // Same fallback chain as the success path, so the message never
                // ends in "undefined" when the body carries no `msg`.
                const msg = ERROR_MESSAGES[String(code)] || body?.msg || "Unknown error";
                throw new Error(`MinerU error ${code}: ${msg}`, { cause: error });
            }
            if (!error.response) {
                // No response was received at all, so there is no status to report.
                throw new Error(`Network error: ${error.message}`, { cause: error });
            }
            throw new Error(`HTTP ${error.response.status}: ${error.message}`, { cause: error });
        }
        throw error;
    }
}
|
|
59
|
+
// Format helpers
|
|
60
|
+
/**
 * Build the one-line "state | task_id [| detail]" summary for a task.
 *
 * The optional detail depends on the state:
 * - "done": the result archive URL, when present
 * - "running": "<extracted>/<total> pages", when progress is present
 * - "failed": the error message, when present
 *
 * @param {object} status - Task status object.
 * @returns {string} Fields joined with " | ".
 */
function formatConciseStatus(status) {
    let detail = null;
    switch (status.state) {
        case "done":
            if (status.full_zip_url) {
                detail = status.full_zip_url;
            }
            break;
        case "running":
            if (status.extract_progress) {
                const progress = status.extract_progress;
                detail = `${progress.extracted_pages}/${progress.total_pages} pages`;
            }
            break;
        case "failed":
            if (status.err_msg) {
                detail = status.err_msg;
            }
            break;
        default:
            break;
    }
    const fields = [status.state, status.task_id];
    if (detail !== null) {
        fields.push(detail);
    }
    return fields.join(" | ");
}
|
|
74
|
+
/**
 * Serialize the full status object as pretty-printed JSON.
 *
 * @param {object} status - Task status object.
 * @returns {string} JSON text indented with two spaces.
 */
function formatDetailedStatus(status) {
    const indentWidth = 2;
    const json = JSON.stringify(status, null, indentWidth);
    return json;
}
|
|
77
|
+
/**
 * Render a compact, paginated summary of a batch result.
 *
 * The first line is "Batch <id>: <done>/<total> done", followed by one line per
 * item in the requested window and, if items remain, a hint with the next offset.
 *
 * @param {object} batch - Batch object with `batch_id` and an `extract_result` array.
 * @param {number} limit - Maximum number of items to list. Values that are not
 *   a finite number >= 1 fall back to 10.
 * @param {number} offset - Number of items to skip. Values that are not a
 *   finite number >= 0 fall back to 0.
 * @returns {string} Lines joined with "\n".
 */
function formatConciseBatch(batch, limit, offset) {
    // Truncate to an integer and enforce a lower bound, so a negative or zero
    // value cannot turn into a from-the-end slice or a non-advancing page hint.
    const toCount = (value, fallback, min) => {
        const n = Math.trunc(Number(value));
        return Number.isFinite(n) && n >= min ? n : fallback;
    };
    // Guard against a response that carries no extract_result array.
    const all = Array.isArray(batch.extract_result) ? batch.extract_result : [];
    const start = toCount(offset, 0, 0);
    const end = start + toCount(limit, 10, 1);
    const total = all.length;
    const done = all.filter((r) => r.state === "done").length;
    const lines = [`Batch ${batch.batch_id}: ${done}/${total} done`];
    for (const r of all.slice(start, end)) {
        let line = `- ${r.file_name}: ${r.state}`;
        if (r.state === "done" && r.full_zip_url) {
            line += ` ${r.full_zip_url}`;
        }
        else if (r.state === "running" && r.extract_progress) {
            line += ` (${r.extract_progress.extracted_pages}/${r.extract_progress.total_pages})`;
        }
        else if (r.state === "failed" && r.err_msg) {
            // Surface the failure reason, as the single-task summary does.
            line += ` (${r.err_msg})`;
        }
        lines.push(line);
    }
    if (end < total) {
        lines.push(`[+${total - end} more, use offset=${end}]`);
    }
    return lines.join("\n");
}
|
|
97
|
+
// Create MCP server
|
|
98
|
+
// Name and version passed to the McpServer constructor.
const serverInfo = { name: "mineru", version: "1.0.0" };
// Single server instance; the tools below are registered on it.
const server = new McpServer(serverInfo);
|
|
102
|
+
// Tool 1: mineru_parse
|
|
103
|
+
// Input schema for the mineru_parse tool.
const parseToolSchema = {
    url: z.string().describe("Document URL (PDF, DOC, PPT, images)"),
    model: z
        .enum(["pipeline", "vlm"])
        .optional()
        .describe("pipeline=fast, vlm=90% accuracy"),
    pages: z.string().optional().describe("Page range: 1-10,15 or 2--2"),
    ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
    formula: z.boolean().optional().describe("Formula recognition"),
    table: z.boolean().optional().describe("Table recognition"),
    language: z.string().optional().describe("Language code: ch, en, etc"),
    formats: z
        .array(z.enum(["docx", "html", "latex"]))
        .optional()
        .describe("Extra export formats"),
};

/**
 * Handler for mineru_parse: POSTs an extraction task for one URL and returns
 * a text message containing the new task_id.
 *
 * Optional fields are added to the request only when the caller supplied them;
 * the model falls back to config.defaultModel.
 */
async function handleParseTool(params) {
    const requestData = {
        url: params.url,
        model_version: params.model || config.defaultModel,
        ...(params.pages ? { page_ranges: params.pages } : {}),
        ...(params.ocr !== undefined ? { is_ocr: params.ocr } : {}),
        ...(params.formula !== undefined ? { enable_formula: params.formula } : {}),
        ...(params.table !== undefined ? { enable_table: params.table } : {}),
        ...(params.language ? { language: params.language } : {}),
        ...(params.formats?.length ? { extra_formats: params.formats } : {}),
    };
    const created = await mineruRequest("/extract/task", "POST", requestData);
    const text = `Task created: ${created.task_id}\nUse mineru_status to check progress.`;
    return { content: [{ type: "text", text }] };
}

server.tool("mineru_parse", "Parse a document URL. Returns task_id to check status.", parseToolSchema, handleParseTool);
|
|
145
|
+
// Tool 2: mineru_status
|
|
146
|
+
/**
 * Tool: mineru_status. Fetches the status of one task and returns it as text,
 * either as the one-line concise summary (default) or as pretty-printed JSON.
 */
server.tool("mineru_status", "Check task progress. Returns download URL when done.", {
    task_id: z.string().describe("Task ID from mineru_parse"),
    format: z
        .enum(["concise", "detailed"])
        .optional()
        .default("concise")
        .describe("Output format"),
}, async (params) => {
    // task_id is caller-supplied; encode it so characters such as "/", "?" or
    // "#" cannot change which endpoint is requested.
    const taskId = encodeURIComponent(params.task_id);
    const status = await mineruRequest(`/extract/task/${taskId}`);
    const text = params.format === "detailed"
        ? formatDetailedStatus(status)
        : formatConciseStatus(status);
    return {
        content: [{ type: "text", text }],
    };
});
|
|
162
|
+
// Tool 3: mineru_batch
|
|
163
|
+
/**
 * Tool: mineru_batch. Submits 1 to 200 document URLs as one batch and returns
 * a text message containing the new batch_id.
 *
 * Optional fields are added to the request only when the caller supplied them;
 * the model falls back to config.defaultModel.
 */
server.tool("mineru_batch", "Parse multiple URLs in one batch (max 200).", {
    urls: z.array(z.string()).describe("Array of document URLs"),
    model: z
        .enum(["pipeline", "vlm"])
        .optional()
        .describe("pipeline=fast, vlm=90% accuracy"),
    ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
    formula: z.boolean().optional().describe("Formula recognition"),
    table: z.boolean().optional().describe("Table recognition"),
    language: z.string().optional().describe("Language code: ch, en, etc"),
    formats: z
        .array(z.enum(["docx", "html", "latex"]))
        .optional()
        .describe("Extra export formats"),
}, async (params) => {
    // Reject an empty batch locally instead of sending a request with no files.
    if (params.urls.length === 0) {
        throw new Error("No URLs provided. Pass at least one document URL.");
    }
    if (params.urls.length > 200) {
        throw new Error("Max 200 URLs per batch. Split into smaller batches.");
    }
    const requestData = {
        files: params.urls.map((url) => ({ url })),
        model_version: params.model || config.defaultModel,
    };
    if (params.ocr !== undefined) {
        requestData.is_ocr = params.ocr;
    }
    if (params.formula !== undefined) {
        requestData.enable_formula = params.formula;
    }
    if (params.table !== undefined) {
        requestData.enable_table = params.table;
    }
    if (params.language) {
        requestData.language = params.language;
    }
    if (params.formats?.length) {
        requestData.extra_formats = params.formats;
    }
    const result = await mineruRequest("/extract/task/batch", "POST", requestData);
    return {
        content: [
            {
                type: "text",
                text: `Batch created: ${result.batch_id}\n${params.urls.length} files queued.\nUse mineru_batch_status to check progress.`,
            },
        ],
    };
});
|
|
205
|
+
// Tool 4: mineru_batch_status
|
|
206
|
+
/**
 * Tool: mineru_batch_status. Fetches a batch's results and returns them as text.
 *
 * "concise" (default) lists the window selected by limit/offset; "detailed"
 * returns the whole batch object as pretty-printed JSON.
 */
server.tool("mineru_batch_status", "Get batch results. Supports pagination for large batches.", {
    batch_id: z.string().describe("Batch ID from mineru_batch"),
    limit: z.number().optional().default(10).describe("Max results to return"),
    offset: z.number().optional().default(0).describe("Skip first N results"),
    format: z
        .enum(["concise", "detailed"])
        .optional()
        .default("concise")
        .describe("Output format"),
}, async (params) => {
    // batch_id is caller-supplied; encode it so characters such as "/", "?" or
    // "#" cannot change which endpoint is requested.
    const batchId = encodeURIComponent(params.batch_id);
    const batch = await mineruRequest(`/extract-results/batch/${batchId}`);
    const text = params.format === "detailed"
        ? JSON.stringify(batch, null, 2)
        : formatConciseBatch(batch, params.limit ?? 10, params.offset ?? 0);
    return {
        content: [{ type: "text", text }],
    };
});
|
|
224
|
+
// Start server
|
|
225
|
+
/**
 * Connect the server to a stdio transport, then log a startup line.
 * The log goes to stderr via console.error.
 */
async function main() {
    await server.connect(new StdioServerTransport());
    console.error("MinerU MCP server running");
}
|
|
230
|
+
// Log a startup failure and exit with status 1.
function reportFatalAndExit(fatal) {
    console.error("Fatal error:", fatal);
    process.exit(1);
}

main().catch(reportFatalAndExit);
|
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "mineru-mcp",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "MCP server for MinerU document parsing API - optimized for Claude Code with 73% token reduction",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"mineru-mcp": "dist/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist",
|
|
12
|
+
"README.md"
|
|
13
|
+
],
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc",
|
|
16
|
+
"dev": "tsx src/index.ts",
|
|
17
|
+
"start": "node dist/index.js",
|
|
18
|
+
"prepublishOnly": "npm run build"
|
|
19
|
+
},
|
|
20
|
+
"keywords": [
|
|
21
|
+
"mcp",
|
|
22
|
+
"model-context-protocol",
|
|
23
|
+
"mineru",
|
|
24
|
+
"pdf",
|
|
25
|
+
"document-parsing",
|
|
26
|
+
"ocr",
|
|
27
|
+
"claude",
|
|
28
|
+
"claude-code",
|
|
29
|
+
"anthropic",
|
|
30
|
+
"ai",
|
|
31
|
+
"llm"
|
|
32
|
+
],
|
|
33
|
+
"author": "",
|
|
34
|
+
"license": "MIT",
|
|
35
|
+
"repository": {
|
|
36
|
+
"type": "git",
|
|
37
|
+
"url": "https://github.com/anthropics/claude-code"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://mineru.net",
|
|
40
|
+
"engines": {
|
|
41
|
+
"node": ">=18.0.0"
|
|
42
|
+
},
|
|
43
|
+
"dependencies": {
|
|
44
|
+
"@modelcontextprotocol/sdk": "^1.12.0",
|
|
45
|
+
"axios": "^1.7.0",
|
|
46
|
+
"zod": "^3.23.0"
|
|
47
|
+
},
|
|
48
|
+
"devDependencies": {
|
|
49
|
+
"@types/node": "^22.0.0",
|
|
50
|
+
"tsx": "^4.19.0",
|
|
51
|
+
"typescript": "^5.6.0"
|
|
52
|
+
}
|
|
53
|
+
}
|