mineru-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # mineru-mcp
2
+
3
+ MCP server for [MinerU](https://mineru.net) document parsing API - optimized for Claude Code.
4
+
5
+ ## Features
6
+
7
+ - **4 optimized tools** with concise descriptions (~73% token reduction vs alternatives)
8
+ - **VLM model support** (90%+ accuracy) and pipeline mode (faster)
9
+ - **Page range selection** - parse specific pages only
10
+ - **Batch processing** - up to 200 documents at once
11
+ - **Pagination** - efficient handling of large batch results
12
+ - **OCR support** for 109 languages
13
+
14
+ ## Installation
15
+
16
+ ### Claude Code
17
+
18
+ ```bash
19
+ claude mcp add mineru-mcp -e MINERU_API_KEY=your-api-key -- npx -y mineru-mcp
20
+ ```
21
+
22
+ ### Claude Desktop
23
+
24
+ Add to your Claude Desktop config:
25
+
26
+ ```json
27
+ {
28
+ "mcpServers": {
29
+ "mineru": {
30
+ "command": "npx",
31
+ "args": ["-y", "mineru-mcp"],
32
+ "env": {
33
+ "MINERU_API_KEY": "your-api-key"
34
+ }
35
+ }
36
+ }
37
+ }
38
+ ```
39
+
40
+ ## Configuration
41
+
42
+ | Environment Variable | Default | Description |
43
+ |---------------------|---------|-------------|
44
+ | `MINERU_API_KEY` | (required) | Your MinerU API Bearer token |
45
+ | `MINERU_BASE_URL` | `https://mineru.net/api/v4` | API base URL |
46
+ | `MINERU_DEFAULT_MODEL` | `pipeline` | Default model: `pipeline` or `vlm` |
47
+
48
+ Get your API key at [mineru.net](https://mineru.net)
49
+
50
+ ## Tools
51
+
52
+ ### `mineru_parse`
53
+
54
+ Parse a single document URL.
55
+
56
+ ```typescript
57
+ mineru_parse({
58
+ url: "https://example.com/document.pdf",
59
+ model: "vlm", // optional: "pipeline" (default) or "vlm" (90%+ accuracy)
60
+ pages: "1-10,15", // optional: page ranges
61
+ ocr: true, // optional: enable OCR (pipeline only)
62
+ formula: true, // optional: formula recognition
63
+ table: true, // optional: table recognition
64
+ language: "en", // optional: language code
65
+ formats: ["html"] // optional: extra export formats
66
+ })
67
+ ```
68
+
69
+ ### `mineru_status`
70
+
71
+ Check task progress and get the download URL once parsing is done.
72
+
73
+ ```typescript
74
+ mineru_status({
75
+ task_id: "abc-123",
76
+ format: "concise" // optional: "concise" (default) or "detailed"
77
+ })
78
+ ```
79
+
80
+ **Concise output**: `done | abc-123 | https://cdn-mineru.../result.zip`
81
+
82
+ ### `mineru_batch`
83
+
84
+ Parse multiple documents in one batch (max 200).
85
+
86
+ ```typescript
87
+ mineru_batch({
88
+ urls: ["https://example.com/doc1.pdf", "https://example.com/doc2.pdf"],
89
+ model: "vlm"
90
+ })
91
+ ```
92
+
93
+ ### `mineru_batch_status`
94
+
95
+ Get batch results with pagination.
96
+
97
+ ```typescript
98
+ mineru_batch_status({
99
+ batch_id: "batch-123",
100
+ limit: 10, // optional: max results (default: 10)
101
+ offset: 0, // optional: skip first N results
102
+ format: "concise" // optional: "concise" (default) or "detailed"
103
+ })
104
+ ```
105
+
106
+ ## Supported Formats
107
+
108
+ - PDF, DOC, DOCX, PPT, PPTX
109
+ - PNG, JPG, JPEG
110
+
111
+ ## Limits
112
+
113
+ - Single file: 200MB max, 600 pages max
114
+ - Daily quota: 2000 pages at high priority
115
+ - Batch: max 200 URLs per request
116
+
117
+ ## License
118
+
119
+ MIT
120
+
121
+ ## Links
122
+
123
+ - [MinerU](https://mineru.net) - Document parsing service
124
+ - [MinerU GitHub](https://github.com/opendatalab/MinerU) - Open source version
125
+ - [MCP Specification](https://modelcontextprotocol.io) - Model Context Protocol
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/index.js ADDED
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env node
2
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
3
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
+ import { z } from "zod";
5
+ import axios, { AxiosError } from "axios";
6
// Configuration from environment.
// Values are trimmed so stray whitespace (e.g. from a copied .env line) is not
// sent to the API, and blank values fall back to the defaults.
const readEnv = (name, fallback) => (process.env[name] ?? "").trim() || fallback;
const config = {
    // Bearer token; stays "" when unset so the missing key can be detected.
    apiKey: readEnv("MINERU_API_KEY", ""),
    // Trailing slashes are dropped because endpoints are appended as "/path";
    // keeping them would produce "//" in the request URL.
    baseUrl: readEnv("MINERU_BASE_URL", "https://mineru.net/api/v4").replace(/\/+$/, ""),
    // Model used when a tool call does not pass one explicitly.
    defaultModel: readEnv("MINERU_DEFAULT_MODEL", "pipeline"),
};
12
// Error codes with actionable messages.
// Keys are the API's `code` value converted to a string; values replace the
// API-provided message when an error is reported. Frozen because this is a
// shared lookup table that nothing should modify at runtime.
const ERROR_MESSAGES = Object.freeze({
    // Authentication
    "A0202": "Token error. Check your API key.",
    "A0211": "Token expired. Get a new API key.",
    // Input file problems
    "-60002": "Invalid file format. Use: pdf, doc, docx, ppt, pptx, png, jpg, jpeg",
    "-60005": "File too large. Max 200MB.",
    "-60006": "Too many pages. Max 600 per file. Split the document.",
    "-60008": "URL timeout. Check the URL is accessible.",
    // Service and task state
    "-60009": "Queue full. Try again later.",
    "-60012": "Task not found. Check task_id is valid.",
    "-60013": "Access denied. You can only access your own tasks.",
});
24
// API client
/**
 * Send an authenticated JSON request to the MinerU API and unwrap its envelope.
 *
 * The API replies with `{ code, msg, data }`; `code === 0` means success and
 * `data` is returned. Any other code is turned into an Error whose message
 * prefers the local ERROR_MESSAGES text over the API's own `msg`.
 *
 * @param {string} endpoint - Path appended to `config.baseUrl` (starts with "/").
 * @param {string} [method="GET"] - HTTP method.
 * @param {object} [data] - JSON request body, if any.
 * @returns {Promise<any>} The `data` field of a successful response.
 * @throws {Error} When the API key is missing, the request fails at the HTTP or
 *   network level, the body is not a JSON object, or the API reports a
 *   non-zero code.
 */
async function mineruRequest(endpoint, method = "GET", data) {
    if (!config.apiKey) {
        throw new Error("MINERU_API_KEY not set. Add it to your environment.");
    }
    let response;
    try {
        response = await axios({
            method,
            url: `${config.baseUrl}${endpoint}`,
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${config.apiKey}`,
            },
            data,
        });
    }
    catch (error) {
        if (error instanceof AxiosError) {
            const body = error.response?.data;
            const code = body?.code;
            if (code) {
                // Fall back to a fixed text so the message never ends in "undefined".
                const msg = ERROR_MESSAGES[String(code)] || body?.msg || "Unknown error";
                throw new Error(`MinerU error ${code}: ${msg}`, { cause: error });
            }
            // No response object means the request never got an HTTP reply
            // (DNS failure, refused connection, ...), so there is no status to show.
            const status = error.response?.status;
            const prefix = status !== undefined ? `HTTP ${status}` : "Network error";
            throw new Error(`${prefix}: ${error.message}`, { cause: error });
        }
        throw error;
    }
    const result = response.data;
    if (result === null || typeof result !== "object") {
        // e.g. an HTML error page from a proxy instead of the JSON envelope.
        throw new Error(`Unexpected response from MinerU (HTTP ${response.status}): expected a JSON object.`);
    }
    if (result.code !== 0) {
        const code = String(result.code);
        const msg = ERROR_MESSAGES[code] || result.msg || "Unknown error";
        throw new Error(`MinerU error ${code}: ${msg}`);
    }
    return result.data;
}
59
+ // Format helpers
60
/**
 * Render a task status as a single " | "-separated line.
 *
 * The line always starts with the state and task id. A third field is added
 * when one is available for the state: the result zip URL for "done", the
 * "extracted/total pages" progress for "running", or the error message for
 * "failed".
 *
 * @param {object} status - Task status object with `state` and `task_id`.
 * @returns {string} The concise one-line summary.
 */
function formatConciseStatus(status) {
    const { state, task_id: taskId } = status;
    let detail;
    switch (state) {
        case "done":
            detail = status.full_zip_url;
            break;
        case "running": {
            const progress = status.extract_progress;
            detail = progress
                ? `${progress.extracted_pages}/${progress.total_pages} pages`
                : undefined;
            break;
        }
        case "failed":
            detail = status.err_msg;
            break;
        default:
            detail = undefined;
    }
    // Only append the detail when it is present and non-empty.
    const fields = detail ? [state, taskId, detail] : [state, taskId];
    return fields.join(" | ");
}
74
/**
 * Render a task status as pretty-printed JSON (two-space indent).
 *
 * @param {object} status - Task status object to serialize.
 * @returns {string} The JSON text.
 */
function formatDetailedStatus(status) {
    const indentWidth = 2;
    return JSON.stringify(status, undefined, indentWidth);
}
77
/**
 * Coerce a paging argument to a non-negative integer.
 * Fractions are truncated, negatives become 0, and non-finite values use `fallback`.
 */
function toNonNegativeInt(value, fallback) {
    const n = Math.trunc(Number(value));
    return Number.isFinite(n) ? Math.max(0, n) : fallback;
}
/**
 * Render one page of a batch result as a short multi-line summary.
 *
 * Line 1 is "Batch <id>: <done>/<total> done", counted over the whole batch.
 * Each entry in the requested window gets "- <file_name>: <state>" plus the
 * zip URL (done), "(extracted/total)" progress (running) or the error message
 * (failed). A trailing "[+N more, use offset=M]" hint is added when entries
 * remain after the window.
 *
 * @param {object} batch - Batch result with `batch_id` and `extract_result`.
 * @param {number} limit - Maximum number of entries to list.
 * @param {number} offset - Number of entries to skip first.
 * @returns {string} The summary text.
 */
function formatConciseBatch(batch, limit, offset) {
    // A response without extract_result is treated as an empty batch instead of crashing.
    const entries = Array.isArray(batch.extract_result) ? batch.extract_result : [];
    // Normalize paging so the slice and the "more" hint always agree.
    const pageSize = toNonNegativeInt(limit, 10);
    const start = toNonNegativeInt(offset, 0);
    const next = start + pageSize;
    const total = entries.length;
    const done = entries.filter((r) => r.state === "done").length;
    const lines = [`Batch ${batch.batch_id}: ${done}/${total} done`];
    for (const r of entries.slice(start, next)) {
        let line = `- ${r.file_name}: ${r.state}`;
        if (r.state === "done" && r.full_zip_url) {
            line += ` ${r.full_zip_url}`;
        }
        else if (r.state === "running" && r.extract_progress) {
            line += ` (${r.extract_progress.extracted_pages}/${r.extract_progress.total_pages})`;
        }
        else if (r.state === "failed" && r.err_msg) {
            // Show the failure reason, matching what the single-task summary shows.
            line += ` (${r.err_msg})`;
        }
        lines.push(line);
    }
    if (next < total) {
        lines.push(`[+${total - next} more, use offset=${next}]`);
    }
    return lines.join("\n");
}
97
// Create MCP server
// Name and version identify this server; the tools are registered on `server`.
const serverInfo = { name: "mineru", version: "1.0.0" };
const server = new McpServer(serverInfo);
102
// Tool 1: mineru_parse
// Submits one document URL for parsing and replies with the task_id to poll.
// Only options the caller actually set are forwarded to the API.
server.tool("mineru_parse", "Parse a document URL. Returns task_id to check status.", {
    // .url() rejects strings that are not parseable URLs before any API call is made.
    url: z.string().url().describe("Document URL (PDF, DOC, PPT, images)"),
    model: z
        .enum(["pipeline", "vlm"])
        .optional()
        .describe("pipeline=fast, vlm=90% accuracy"),
    pages: z.string().optional().describe("Page range: 1-10,15 or 2--2"),
    ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
    formula: z.boolean().optional().describe("Formula recognition"),
    table: z.boolean().optional().describe("Table recognition"),
    language: z.string().optional().describe("Language code: ch, en, etc"),
    formats: z
        .array(z.enum(["docx", "html", "latex"]))
        .optional()
        .describe("Extra export formats"),
}, async (params) => {
    const requestData = {
        url: params.url,
        // Use the configured default model when the caller does not pick one.
        model_version: params.model || config.defaultModel,
    };
    if (params.pages) {
        requestData.page_ranges = params.pages;
    }
    // Booleans are compared against undefined so an explicit `false` is still sent.
    if (params.ocr !== undefined) {
        requestData.is_ocr = params.ocr;
    }
    if (params.formula !== undefined) {
        requestData.enable_formula = params.formula;
    }
    if (params.table !== undefined) {
        requestData.enable_table = params.table;
    }
    if (params.language) {
        requestData.language = params.language;
    }
    if (params.formats?.length) {
        requestData.extra_formats = params.formats;
    }
    const result = await mineruRequest("/extract/task", "POST", requestData);
    if (!result?.task_id) {
        // Avoid replying "Task created: undefined" when the payload is missing.
        throw new Error("MinerU accepted the request but returned no task_id.");
    }
    return {
        content: [
            {
                type: "text",
                text: `Task created: ${result.task_id}\nUse mineru_status to check progress.`,
            },
        ],
    };
});
145
// Tool 2: mineru_status
// Looks up one task and replies with either the concise one-line summary
// (default) or the full status as JSON.
server.tool("mineru_status", "Check task progress. Returns download URL when done.", {
    // An empty id would turn the request path into "/extract/task/".
    task_id: z.string().min(1).describe("Task ID from mineru_parse"),
    format: z
        .enum(["concise", "detailed"])
        .optional()
        .default("concise")
        .describe("Output format"),
}, async (params) => {
    // Encode the id so characters such as "/", "?" or "#" cannot alter the request path.
    const taskId = encodeURIComponent(params.task_id);
    const status = await mineruRequest(`/extract/task/${taskId}`);
    const text = params.format === "detailed"
        ? formatDetailedStatus(status)
        : formatConciseStatus(status);
    return {
        content: [{ type: "text", text }],
    };
});
162
// Tool 3: mineru_batch
// Submits up to 200 document URLs as one batch and replies with the batch_id
// to poll. Only options the caller actually set are forwarded to the API.
server.tool("mineru_batch", "Parse multiple URLs in one batch (max 200).", {
    // .url() rejects strings that are not parseable URLs before any API call is made.
    urls: z.array(z.string().url()).describe("Array of document URLs"),
    model: z
        .enum(["pipeline", "vlm"])
        .optional()
        .describe("pipeline=fast, vlm=90% accuracy"),
    ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
    formula: z.boolean().optional().describe("Formula recognition"),
    table: z.boolean().optional().describe("Table recognition"),
    language: z.string().optional().describe("Language code: ch, en, etc"),
    formats: z
        .array(z.enum(["docx", "html", "latex"]))
        .optional()
        .describe("Extra export formats"),
}, async (params) => {
    if (params.urls.length === 0) {
        // Nothing to queue; fail locally instead of creating an empty batch.
        throw new Error("No URLs provided. Pass at least one document URL.");
    }
    if (params.urls.length > 200) {
        throw new Error("Max 200 URLs per batch. Split into smaller batches.");
    }
    const requestData = {
        files: params.urls.map((url) => ({ url })),
        // Use the configured default model when the caller does not pick one.
        model_version: params.model || config.defaultModel,
    };
    // Booleans are compared against undefined so an explicit `false` is still sent.
    // NOTE(review): is_ocr is sent at the top level here, as for the single-task
    // endpoint; confirm the batch endpoint does not expect it per entry in `files`.
    if (params.ocr !== undefined) {
        requestData.is_ocr = params.ocr;
    }
    if (params.formula !== undefined) {
        requestData.enable_formula = params.formula;
    }
    if (params.table !== undefined) {
        requestData.enable_table = params.table;
    }
    if (params.language) {
        requestData.language = params.language;
    }
    if (params.formats?.length) {
        requestData.extra_formats = params.formats;
    }
    const result = await mineruRequest("/extract/task/batch", "POST", requestData);
    if (!result?.batch_id) {
        // Avoid replying "Batch created: undefined" when the payload is missing.
        throw new Error("MinerU accepted the request but returned no batch_id.");
    }
    return {
        content: [
            {
                type: "text",
                text: `Batch created: ${result.batch_id}\n${params.urls.length} files queued.\nUse mineru_batch_status to check progress.`,
            },
        ],
    };
});
205
// Tool 4: mineru_batch_status
// Fetches a batch result and replies with either a paged concise summary
// (default) or the complete result as JSON.
server.tool("mineru_batch_status", "Get batch results. Supports pagination for large batches.", {
    // An empty id would turn the request path into "/extract-results/batch/".
    batch_id: z.string().min(1).describe("Batch ID from mineru_batch"),
    // Paging values must be whole, non-negative numbers; fractions or negatives
    // would produce a meaningless window and "more" hint.
    limit: z.number().int().min(0).optional().default(10).describe("Max results to return"),
    offset: z.number().int().min(0).optional().default(0).describe("Skip first N results"),
    format: z
        .enum(["concise", "detailed"])
        .optional()
        .default("concise")
        .describe("Output format"),
}, async (params) => {
    // Encode the id so characters such as "/", "?" or "#" cannot alter the request path.
    const batchId = encodeURIComponent(params.batch_id);
    const batch = await mineruRequest(`/extract-results/batch/${batchId}`);
    // limit and offset only apply to the concise view; detailed returns everything.
    const text = params.format === "detailed"
        ? JSON.stringify(batch, null, 2)
        : formatConciseBatch(batch, params.limit ?? 10, params.offset ?? 0);
    return {
        content: [{ type: "text", text }],
    };
});
224
// Start server
// Connects the MCP server to a stdio transport, then logs a ready message
// through console.error.
async function main() {
    await server.connect(new StdioServerTransport());
    console.error("MinerU MCP server running");
}
// Any startup failure is logged and the process exits with status 1.
const exitOnFatal = (error) => {
    console.error("Fatal error:", error);
    process.exit(1);
};
main().catch(exitOnFatal);
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "mineru-mcp",
3
+ "version": "1.0.0",
4
+ "description": "MCP server for MinerU document parsing API - optimized for Claude Code with 73% token reduction",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "bin": {
8
+ "mineru-mcp": "dist/index.js"
9
+ },
10
+ "files": [
11
+ "dist",
12
+ "README.md"
13
+ ],
14
+ "scripts": {
15
+ "build": "tsc",
16
+ "dev": "tsx src/index.ts",
17
+ "start": "node dist/index.js",
18
+ "prepublishOnly": "npm run build"
19
+ },
20
+ "keywords": [
21
+ "mcp",
22
+ "model-context-protocol",
23
+ "mineru",
24
+ "pdf",
25
+ "document-parsing",
26
+ "ocr",
27
+ "claude",
28
+ "claude-code",
29
+ "anthropic",
30
+ "ai",
31
+ "llm"
32
+ ],
33
+ "author": "",
34
+ "license": "MIT",
35
+ "repository": {
36
+ "type": "git",
37
+ "url": "https://github.com/anthropics/claude-code"
38
+ },
39
+ "homepage": "https://mineru.net",
40
+ "engines": {
41
+ "node": ">=18.0.0"
42
+ },
43
+ "dependencies": {
44
+ "@modelcontextprotocol/sdk": "^1.12.0",
45
+ "axios": "^1.7.0",
46
+ "zod": "^3.23.0"
47
+ },
48
+ "devDependencies": {
49
+ "@types/node": "^22.0.0",
50
+ "tsx": "^4.19.0",
51
+ "typescript": "^5.6.0"
52
+ }
53
+ }