paddle-ocr-server 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: paddle-ocr-server
3
+ Version: 0.1.0
4
+ Summary: MCP server for PaddleOCR PDF-to-Markdown conversion
5
+ Requires-Python: >=3.13
6
+ Requires-Dist: fastmcp>=3.4.2
7
+ Requires-Dist: httpx>=0.28.0
8
+ Description-Content-Type: text/markdown
9
+
10
+ # PaddleOCR MCP Server
11
+
12
+ 通过 PaddleOCR 官方 API 将 PDF 或图片转换为 Markdown。
13
+
14
+ ## 功能
15
+
16
+ - **Tools**: `ocr_pdf` — 提交 PDF/图片到 PaddleOCR,返回合并后的 Markdown 文件路径
17
+
18
+ ## 环境要求
19
+
20
+ - Python >= 3.13
21
+ - [uv](https://docs.astral.sh/uv/)
22
+
23
+ ## 环境变量
24
+
25
+ | 变量 | 必需 | 说明 |
26
+ |------|------|------|
27
+ | `PADDLEOCR_ACCESS_TOKEN` | ✅ | AI Studio access token,从 https://aistudio.baidu.com/account/accessToken 获取 |
28
+
29
+ ## 安装
30
+
31
+ ```bash
32
+ cd servers/paddle-ocr
33
+ uv sync
34
+ ```
35
+
36
+ ## 运行
37
+
38
+ ```bash
39
+ PADDLEOCR_ACCESS_TOKEN=your-token uv run python server.py
40
+ ```
41
+
42
+ ## 测试
43
+
44
+ ```bash
45
+ uv run pytest -v
46
+ ```
47
+
48
+ ## 接入 MCP Host
49
+
50
+ 发布到 PyPI 后可用 `uvx`:
51
+
52
+ ```json
53
+ {
54
+ "mcpServers": {
55
+ "paddle-ocr": {
56
+ "command": "uvx",
57
+ "args": ["paddle-ocr-server"],
58
+ "env": {
59
+ "PADDLEOCR_ACCESS_TOKEN": "your-token"
60
+ }
61
+ }
62
+ }
63
+ }
64
+ ```
65
+
66
+ 本地开发时用 `uv`:
67
+
68
+ ```json
69
+ {
70
+ "mcpServers": {
71
+ "paddle-ocr": {
72
+ "command": "uv",
73
+ "args": ["--directory", "/path/to/servers/paddle-ocr", "run", "paddle-ocr-server"],
74
+ "env": {
75
+ "PADDLEOCR_ACCESS_TOKEN": "your-token"
76
+ }
77
+ }
78
+ }
79
+ }
80
+ ```
81
+
82
+
83
+ ## 使用方式
84
+
85
+ 调用 `ocr_pdf` tool,提供 `file_path`(本地文件)或 `file_url`(URL),二选一。
86
+
87
+ 结果默认保存到当前工作目录下的同名文件夹(可通过 `output_dir` 参数指定其他输出目录),结构如下:
88
+ - `full.md` — 合并所有页面的 Markdown
89
+ - `pages/page_N.md` — 每页单独的 Markdown
90
+ - `imgs/` — 图片文件
91
+
92
+ 返回值为 `full.md` 的绝对路径。
@@ -0,0 +1,5 @@
1
+ server.py,sha256=Gyj-OyZtsx0K2TlLehJV6W1asZuTMeUl8T_eqD6TSjU,6964
2
+ paddle_ocr_server-0.1.0.dist-info/METADATA,sha256=cAbwIDguFDUjyBEiG_4_nif4Uef_0F90ObfKP_xG1H4,1862
3
+ paddle_ocr_server-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
4
+ paddle_ocr_server-0.1.0.dist-info/entry_points.txt,sha256=SbUtF8LurvLyiCcMBe4Xg4Iotm3AX-SKpLNLXrh-Ds8,50
5
+ paddle_ocr_server-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ paddle-ocr-server = server:main
server.py ADDED
@@ -0,0 +1,220 @@
1
+ """PaddleOCR MCP Server — PDF/图片 → Markdown via PaddleOCR 官方 API.
2
+
3
+ 暴露一个 tool: ocr_pdf
4
+ - 输入本地文件路径或 URL
5
+ - 提交异步 OCR 任务,轮询直到完成
6
+ - 下载结果,保存 Markdown 和图片到当前工作目录下的同名文件夹(可指定 output_dir 覆盖)
7
+ - 返回合并后的 Markdown 文件路径
8
+
9
+ 环境变量:
10
+ PADDLEOCR_ACCESS_TOKEN — 必需,AI Studio access token
11
+ """
12
+
13
+ import asyncio
14
+ import json
15
+ import os
16
+ import re
17
+ import time
18
+ from pathlib import Path
19
+
20
+ import httpx
21
+ from fastmcp import FastMCP
22
+
23
# MCP server instance; the tool below is registered on it via ``@mcp.tool``
# and ``main()`` starts it with ``mcp.run()``.
mcp = FastMCP("PaddleOCR")

# Job endpoint: jobs are POSTed here and their status is read from
# ``{API_BASE}/{job_id}``.
API_BASE = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
# Model name sent in the ``model`` field of every submitted job.
DEFAULT_MODEL = "PaddleOCR-VL-1.6"
# Seconds slept between two status polls (passed to ``asyncio.sleep``).
POLL_INTERVAL = 5
# Seconds of polling after which a TimeoutError is raised.
POLL_TIMEOUT = 600
29
+
30
+
31
def _get_token() -> str:
    """Return the access token stored in ``PADDLEOCR_ACCESS_TOKEN``.

    Returns:
        The non-empty value of the environment variable.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    access_token = os.environ.get("PADDLEOCR_ACCESS_TOKEN", "")
    if access_token:
        return access_token
    raise ValueError("PADDLEOCR_ACCESS_TOKEN 环境变量未设置")
36
+
37
+
38
def _auth_headers(token: str) -> dict[str, str]:
    """Build the request headers that carry *token* in ``Authorization``.

    Args:
        token: Access token to embed after the ``bearer`` scheme.

    Returns:
        A new single-entry dict; callers may add further headers to it.
    """
    credential = f"bearer {token}"
    return {"Authorization": credential}
40
+
41
+
42
async def _submit_job(
    file_path: str | None,
    file_url: str | None,
    use_chart_recognition: bool = True,
) -> str:
    """Submit an OCR job and return its job id.

    When ``file_url`` is truthy the job is submitted as a JSON body that
    references the URL; otherwise the local file at ``file_path`` is uploaded
    as multipart form data.

    Args:
        file_path: Path of a local file to upload; used only when
            ``file_url`` is not given.
        file_url: URL of the file to process; takes precedence over
            ``file_path``.
        use_chart_recognition: Value sent as ``useChartRecognition`` in the
            optional payload.

    Returns:
        The ``jobId`` found under ``data`` in the JSON response.

    Raises:
        ValueError: If the access token is not configured, or if neither
            ``file_path`` nor ``file_url`` is provided.
        OSError: If the local file cannot be opened.
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    headers = _auth_headers(_get_token())

    # Fail with a clear message instead of letting ``open(None)`` raise an
    # opaque TypeError further down.
    if not file_url and not file_path:
        raise ValueError("必须提供 file_path 或 file_url 其中之一")

    optional_payload = {
        "useDocOrientationClassify": False,
        "useDocUnwarping": False,
        "useChartRecognition": use_chart_recognition,
    }

    async with httpx.AsyncClient(timeout=120) as client:
        if file_url:
            headers["Content-Type"] = "application/json"
            payload = {
                "fileUrl": file_url,
                "model": DEFAULT_MODEL,
                "optionalPayload": optional_payload,
            }
            resp = await client.post(API_BASE, json=payload, headers=headers)
        else:
            # In the multipart form the optional payload travels as a JSON
            # string rather than as a nested object.
            data = {
                "model": DEFAULT_MODEL,
                "optionalPayload": json.dumps(optional_payload),
            }
            # The handle has to stay open for the duration of the upload and
            # is closed afterwards even when the request raises.
            with open(file_path, "rb") as upload:
                resp = await client.post(
                    API_BASE,
                    headers=headers,
                    data=data,
                    files={"file": upload},
                )

    resp.raise_for_status()
    return resp.json()["data"]["jobId"]
76
+
77
+
78
+ async def _poll_until_done(job_id: str) -> str:
79
+ """轮询任务状态直到完成,返回 jsonUrl。"""
80
+ token = _get_token()
81
+ headers = _auth_headers(token)
82
+ deadline = time.monotonic() + POLL_TIMEOUT
83
+
84
+ async with httpx.AsyncClient(timeout=60) as client:
85
+ while True:
86
+ resp = await client.get(f"{API_BASE}/{job_id}", headers=headers)
87
+ resp.raise_for_status()
88
+ body = resp.json()["data"]
89
+ state = body["state"]
90
+
91
+ if state == "done":
92
+ return body["resultUrl"]["jsonUrl"]
93
+ if state == "failed":
94
+ raise RuntimeError(f"OCR 任务失败: {body.get('errorMsg', '未知错误')}")
95
+
96
+ if time.monotonic() > deadline:
97
+ raise TimeoutError(f"OCR 任务轮询超时 ({POLL_TIMEOUT}s)")
98
+
99
+ await asyncio.sleep(POLL_INTERVAL)
100
+
101
+
102
def _re_process(text: str) -> str:
    """Post-process Markdown: tidy LaTeX delimiters and collapse blank lines.

    Three passes run in order:

    1. Inline formulas lose the whitespace just inside their single ``$``
       delimiters (``$ x $`` becomes ``$x$``).
    2. Every ``$$...$$`` formula is rewritten so each ``$$`` sits on its own
       line, with the formula body stripped of surrounding whitespace.
    3. Runs of three or more newlines are reduced to exactly two.

    Args:
        text: Markdown text to clean up.

    Returns:
        The cleaned Markdown text.
    """

    def _as_display_block(found: re.Match) -> str:
        formula = found.group(1).strip()
        return f"\n$$\n{formula}\n$$\n"

    # The look-arounds keep a single ``$`` that belongs to a ``$$`` pair from
    # being treated as an inline delimiter.
    inline_tightened = re.sub(
        r"(?<!\$)\$\s*([^$]+?)\s*\$(?!\$)", r"$\1$", text
    )
    blocks_normalised = re.sub(
        r"\$\$([\s\S]*?)\$\$", _as_display_block, inline_tightened
    )
    return re.sub(r"\n{3,}", "\n\n", blocks_normalised)
115
+
116
+
117
async def _download_and_save(jsonl_url: str, output_dir: Path) -> Path:
    """Download the JSONL result and write Markdown and images to ``output_dir``.

    Each non-empty line of the response is parsed as JSON, and every entry of
    its ``result["layoutParsingResults"]`` list is handled as one page:

    * the post-processed Markdown is written to ``pages/page_<n>.md``, where
      ``n`` starts at 0 and keeps counting across all lines;
    * every image listed in the page's ``markdown["images"]`` mapping
      (relative path -> URL) is downloaded to that path under ``output_dir``.

    Finally all pages are joined with a ``---`` separator into ``full.md``.

    Args:
        jsonl_url: URL of the JSONL result file.
        output_dir: Directory that receives the output; created if missing.

    Returns:
        Path of the merged ``full.md`` file.

    Raises:
        ValueError: If an image path from the response would land outside
            ``output_dir``.
        httpx.HTTPStatusError: If the result or an image download returns an
            error status.
    """
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.get(jsonl_url)
        resp.raise_for_status()

    output_dir.mkdir(parents=True, exist_ok=True)
    pages_dir = output_dir / "pages"
    pages_dir.mkdir(parents=True, exist_ok=True)
    # Resolved once so every image target can be checked against it.
    output_root = output_dir.resolve()
    pages_md: list[str] = []
    page_num = 0

    for line in resp.text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        result = json.loads(line)["result"]

        for res in result["layoutParsingResults"]:
            md_text = _re_process(res["markdown"]["text"])
            pages_md.append(md_text)

            # Save the Markdown of this single page.
            page_file = pages_dir / f"page_{page_num}.md"
            page_file.write_text(md_text, encoding="utf-8")

            # Download the images referenced by the page's Markdown.
            for img_path, img_url in res["markdown"].get("images", {}).items():
                full_img_path = output_dir / img_path
                # The relative path comes from the remote response; refuse
                # absolute paths or ``..`` segments that would write outside
                # the output directory.
                if not full_img_path.resolve().is_relative_to(output_root):
                    raise ValueError(f"图片路径超出输出目录: {img_path}")
                full_img_path.parent.mkdir(parents=True, exist_ok=True)
                img_bytes = await _fetch_bytes(img_url)
                full_img_path.write_bytes(img_bytes)

            page_num += 1

    # Merge all pages into a single Markdown file.
    merged = output_dir / "full.md"
    merged.write_text("\n\n---\n\n".join(pages_md), encoding="utf-8")
    return merged
156
+
157
+
158
async def _fetch_bytes(url: str) -> bytes:
    """GET *url* and return the raw response body.

    A fresh ``httpx.AsyncClient`` with a 60 second timeout is used for the
    request.

    Args:
        url: Address to download.

    Returns:
        The response content as bytes.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    async with httpx.AsyncClient(timeout=60) as http:
        reply = await http.get(url)
        reply.raise_for_status()
        body = reply.content
    return body
163
+
164
+
165
@mcp.tool
async def ocr_pdf(
    file_path: str | None = None,
    file_url: str | None = None,
    output_dir: str | None = None,
    use_chart_recognition: bool = True,
) -> str:
    """将 PDF 或图片文件通过 PaddleOCR 转换为 Markdown。

    提供 file_path(本地文件)或 file_url(URL),二选一。
    结果保存到指定目录,返回合并后的 Markdown 文件路径。

    Args:
        file_path: 本地文件的绝对路径(PDF/图片)。
        file_url: 文件的 URL 地址。
        output_dir: 输出目录的绝对路径。未指定时默认保存到当前工作目录下的同名文件夹。
        use_chart_recognition: 是否启用图表识别,默认 True。
    """
    # NOTE: the docstring above is kept verbatim because FastMCP publishes a
    # tool's docstring as its description, so its text is runtime output.

    # Exactly one of the two input sources has to be supplied.
    has_path = bool(file_path)
    has_url = bool(file_url)
    if not (has_path or has_url):
        raise ValueError("必须提供 file_path 或 file_url 其中之一")
    if has_path and has_url:
        raise ValueError("file_path 和 file_url 只能提供一个")

    local_file = Path(file_path) if has_path else None
    if local_file is not None and not local_file.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    # An explicit output directory wins; otherwise use a folder in the current
    # working directory named after the input's stem.
    if output_dir:
        target_dir = Path(output_dir)
    else:
        if local_file is not None:
            folder_name = local_file.stem
        else:
            from urllib.parse import urlparse

            remote_path = urlparse(file_url).path
            # A URL whose path has no usable file name gets a fixed name.
            folder_name = Path(remote_path).stem or "ocr_output"
        target_dir = Path.cwd() / folder_name

    # Submit the job, wait for it to finish, then fetch and store the result.
    submitted_id = await _submit_job(file_path, file_url, use_chart_recognition)
    result_url = await _poll_until_done(submitted_id)
    merged_file = await _download_and_save(result_url, target_dir)

    return str(merged_file)
212
+
213
+
214
+
215
def main() -> None:
    """Start the MCP server by calling ``mcp.run()`` with its defaults.

    This is the target of the ``paddle-ocr-server`` console script.
    """
    mcp.run()


# Also allow running the module directly as a script.
if __name__ == "__main__":
    main()