paddle-ocr-server 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paddle-ocr-server
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server for PaddleOCR PDF-to-Markdown conversion
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Requires-Dist: fastmcp>=3.4.2
|
|
7
|
+
Requires-Dist: httpx>=0.28.0
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# PaddleOCR MCP Server
|
|
11
|
+
|
|
12
|
+
通过 PaddleOCR 官方 API 将 PDF 或图片转换为 Markdown。
|
|
13
|
+
|
|
14
|
+
## 功能
|
|
15
|
+
|
|
16
|
+
- **Tools**: `ocr_pdf` — 提交 PDF/图片到 PaddleOCR,返回合并后的 Markdown 文件路径
|
|
17
|
+
|
|
18
|
+
## 环境要求
|
|
19
|
+
|
|
20
|
+
- Python >= 3.13
|
|
21
|
+
- [uv](https://docs.astral.sh/uv/)
|
|
22
|
+
|
|
23
|
+
## 环境变量
|
|
24
|
+
|
|
25
|
+
| 变量 | 必需 | 说明 |
|
|
26
|
+
|------|------|------|
|
|
27
|
+
| `PADDLEOCR_ACCESS_TOKEN` | ✅ | AI Studio access token,从 https://aistudio.baidu.com/account/accessToken 获取 |
|
|
28
|
+
|
|
29
|
+
## 安装
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
cd servers/paddle-ocr
|
|
33
|
+
uv sync
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## 运行
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
PADDLEOCR_ACCESS_TOKEN=your-token uv run python server.py
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## 测试
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uv run pytest -v
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## 接入 MCP Host
|
|
49
|
+
|
|
50
|
+
发布到 PyPI 后可用 `uvx`:
|
|
51
|
+
|
|
52
|
+
```json
|
|
53
|
+
{
|
|
54
|
+
"mcpServers": {
|
|
55
|
+
"paddle-ocr": {
|
|
56
|
+
"command": "uvx",
|
|
57
|
+
"args": ["paddle-ocr-server"],
|
|
58
|
+
"env": {
|
|
59
|
+
"PADDLEOCR_ACCESS_TOKEN": "your-token"
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
本地开发时用 `uv`:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"mcpServers": {
|
|
71
|
+
"paddle-ocr": {
|
|
72
|
+
"command": "uv",
|
|
73
|
+
"args": ["--directory", "/path/to/servers/paddle-ocr", "run", "paddle-ocr-server"],
|
|
74
|
+
"env": {
|
|
75
|
+
"PADDLEOCR_ACCESS_TOKEN": "your-token"
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
```
|
|
82
|
+
|
|
83
|
+
## 使用方式
|
|
84
|
+
|
|
85
|
+
调用 `ocr_pdf` tool,提供 `file_path`(本地文件)或 `file_url`(URL),二选一。
|
|
86
|
+
|
|
87
|
+
结果保存到当前工作目录下的同名文件夹,结构如下:
|
|
88
|
+
- `full.md` — 合并所有页面的 Markdown
|
|
89
|
+
- `pages/page_N.md` — 每页单独的 Markdown
|
|
90
|
+
- `imgs/` — 图片文件
|
|
91
|
+
|
|
92
|
+
返回值为 `full.md` 的绝对路径。
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
server.py,sha256=Gyj-OyZtsx0K2TlLehJV6W1asZuTMeUl8T_eqD6TSjU,6964
|
|
2
|
+
paddle_ocr_server-0.1.0.dist-info/METADATA,sha256=cAbwIDguFDUjyBEiG_4_nif4Uef_0F90ObfKP_xG1H4,1862
|
|
3
|
+
paddle_ocr_server-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
4
|
+
paddle_ocr_server-0.1.0.dist-info/entry_points.txt,sha256=SbUtF8LurvLyiCcMBe4Xg4Iotm3AX-SKpLNLXrh-Ds8,50
|
|
5
|
+
paddle_ocr_server-0.1.0.dist-info/RECORD,,
|
server.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""PaddleOCR MCP Server — PDF/图片 → Markdown via PaddleOCR 官方 API.
|
|
2
|
+
|
|
3
|
+
暴露一个 tool: ocr_pdf
|
|
4
|
+
- 输入本地文件路径或 URL
|
|
5
|
+
- 提交异步 OCR 任务,轮询直到完成
|
|
6
|
+
- 下载结果,保存 Markdown 和图片到当前工作目录下的同名文件夹(可指定 output_dir 覆盖)
|
|
7
|
+
- 返回合并后的 Markdown 文件路径
|
|
8
|
+
|
|
9
|
+
环境变量:
|
|
10
|
+
PADDLEOCR_ACCESS_TOKEN — 必需,AI Studio access token
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import httpx
|
|
21
|
+
from fastmcp import FastMCP
|
|
22
|
+
|
|
23
|
+
# Shared FastMCP application object; the tool below registers on it via @mcp.tool.
mcp = FastMCP("PaddleOCR")

# Job endpoint: jobs are POSTed here and polled at "<API_BASE>/<jobId>".
API_BASE = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
# Model name sent with every job submission.
DEFAULT_MODEL = "PaddleOCR-VL-1.6"
# Seconds slept between two consecutive status polls.
POLL_INTERVAL = 5
# Total polling budget in seconds (ten minutes) before a job is abandoned.
POLL_TIMEOUT = 10 * 60
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_token() -> str:
    """Return the access token stored in ``PADDLEOCR_ACCESS_TOKEN``.

    Raises:
        ValueError: If the environment variable is unset or empty.
    """
    access_token = os.getenv("PADDLEOCR_ACCESS_TOKEN")
    if access_token:
        return access_token
    # Both "unset" and "set to an empty string" end up here.
    raise ValueError("PADDLEOCR_ACCESS_TOKEN 环境变量未设置")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _auth_headers(token: str) -> dict[str, str]:
    """Build a fresh header dict carrying *token* in ``Authorization``.

    A new dict is returned on every call, so callers may add further
    headers to it without affecting anyone else.
    """
    credential = f"bearer {token}"
    headers: dict[str, str] = {}
    headers["Authorization"] = credential
    return headers
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
async def _submit_job(
    file_path: str | None,
    file_url: str | None,
    use_chart_recognition: bool = True,
) -> str:
    """Submit an OCR job and return the ``jobId`` reported by the API.

    When ``file_url`` is truthy the job is created with a JSON body that
    references the URL; otherwise the file at ``file_path`` is uploaded as
    multipart form data.

    Args:
        file_path: Path of a local file to upload (used when ``file_url`` is
            empty).
        file_url: URL of the document to process; takes precedence over
            ``file_path``.
        use_chart_recognition: Value forwarded as ``useChartRecognition`` in
            the job's optional payload.

    Returns:
        The ``data.jobId`` field of the JSON response.

    Raises:
        ValueError: If neither source is given, or the access token is missing.
        OSError: If the local file cannot be opened.
        httpx.HTTPStatusError: If the API answers with an error status.
    """
    # Fail with a clear message instead of the TypeError open(None) would raise.
    if not file_url and not file_path:
        raise ValueError("必须提供 file_path 或 file_url 其中之一")

    headers = _auth_headers(_get_token())

    optional_payload = {
        "useDocOrientationClassify": False,
        "useDocUnwarping": False,
        "useChartRecognition": use_chart_recognition,
    }

    async with httpx.AsyncClient(timeout=120) as client:
        if file_url:
            headers["Content-Type"] = "application/json"
            payload = {
                "fileUrl": file_url,
                "model": DEFAULT_MODEL,
                "optionalPayload": optional_payload,
            }
            resp = await client.post(API_BASE, json=payload, headers=headers)
        else:
            # Form fields are plain strings, so the nested options are sent
            # as a JSON-encoded string in the multipart variant.
            data = {
                "model": DEFAULT_MODEL,
                "optionalPayload": json.dumps(optional_payload),
            }
            # Keep the handle open for the duration of the upload only; the
            # context manager closes it even when the request fails.
            with open(file_path, "rb") as upload:
                resp = await client.post(
                    API_BASE,
                    headers=headers,
                    data=data,
                    files={"file": upload},
                )

        resp.raise_for_status()
        return resp.json()["data"]["jobId"]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def _poll_until_done(job_id: str) -> str:
|
|
79
|
+
"""轮询任务状态直到完成,返回 jsonUrl。"""
|
|
80
|
+
token = _get_token()
|
|
81
|
+
headers = _auth_headers(token)
|
|
82
|
+
deadline = time.monotonic() + POLL_TIMEOUT
|
|
83
|
+
|
|
84
|
+
async with httpx.AsyncClient(timeout=60) as client:
|
|
85
|
+
while True:
|
|
86
|
+
resp = await client.get(f"{API_BASE}/{job_id}", headers=headers)
|
|
87
|
+
resp.raise_for_status()
|
|
88
|
+
body = resp.json()["data"]
|
|
89
|
+
state = body["state"]
|
|
90
|
+
|
|
91
|
+
if state == "done":
|
|
92
|
+
return body["resultUrl"]["jsonUrl"]
|
|
93
|
+
if state == "failed":
|
|
94
|
+
raise RuntimeError(f"OCR 任务失败: {body.get('errorMsg', '未知错误')}")
|
|
95
|
+
|
|
96
|
+
if time.monotonic() > deadline:
|
|
97
|
+
raise TimeoutError(f"OCR 任务轮询超时 ({POLL_TIMEOUT}s)")
|
|
98
|
+
|
|
99
|
+
await asyncio.sleep(POLL_INTERVAL)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _re_process(text: str) -> str:
    """Post-process OCR Markdown: tidy LaTeX delimiters and blank lines.

    Three substitutions run in order:

    1. Inline math ``$ ... $`` loses the whitespace just inside its
       delimiters.
    2. Display math ``$$...$$`` is rewritten so each ``$$`` sits on its own
       line with the stripped formula between them.
    3. Runs of three or more newlines collapse to exactly two.
    """

    def _tighten_inline(found: re.Match[str]) -> str:
        # Re-wrap the captured formula without the surrounding whitespace.
        return "$" + found.group(1) + "$"

    def _isolate_display(found: re.Match[str]) -> str:
        # Put the delimiters and the trimmed formula on separate lines.
        return "\n$$\n" + found.group(1).strip() + "\n$$\n"

    passes = (
        (r"(?<!\$)\$\s*([^$]+?)\s*\$(?!\$)", _tighten_inline),
        (r"\$\$([\s\S]*?)\$\$", _isolate_display),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)
    return text
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def _download_and_save(jsonl_url: str, output_dir: Path) -> Path:
    """Download the JSONL result and write Markdown and images to disk.

    Every non-empty line of the downloaded text is parsed as JSON; each
    entry of its ``result.layoutParsingResults`` list is treated as one page.
    The page text is post-processed with ``_re_process`` and written to
    ``pages/page_<n>.md`` (``n`` starts at 0 and counts across all lines).
    Images listed in the page's ``markdown.images`` mapping (relative path
    -> URL) are downloaded below ``output_dir``. Finally all pages are
    joined with a horizontal rule into ``full.md``.

    Args:
        jsonl_url: URL of the JSONL result file.
        output_dir: Directory receiving the output; created if missing.

    Returns:
        Path of the merged ``full.md`` inside ``output_dir``.

    Raises:
        ValueError: If an image path from the result would resolve to a
            location outside ``output_dir``.
        httpx.HTTPStatusError: If a download returns an error status.
    """
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.get(jsonl_url)
        resp.raise_for_status()

    output_dir.mkdir(parents=True, exist_ok=True)
    pages_dir = output_dir / "pages"
    pages_dir.mkdir(parents=True, exist_ok=True)
    # Resolved once; used only to verify that image targets stay inside.
    output_root = output_dir.resolve()
    pages_md: list[str] = []
    page_num = 0

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may appear unescaped inside JSON strings.
    for line in resp.text.strip().split("\n"):
        line = line.strip()
        if not line:
            continue
        result = json.loads(line)["result"]

        for res in result["layoutParsingResults"]:
            markdown = res["markdown"]
            md_text = _re_process(markdown["text"])
            pages_md.append(md_text)

            # Save the single-page Markdown.
            page_file = pages_dir / f"page_{page_num}.md"
            page_file.write_text(md_text, encoding="utf-8")

            # Download the images referenced by this page. A missing or
            # null "images" entry is treated as "no images".
            for img_path, img_url in (markdown.get("images") or {}).items():
                full_img_path = output_dir / img_path
                # The relative path comes from the remote response: refuse
                # absolute paths or ".." segments that escape output_dir.
                if not full_img_path.resolve().is_relative_to(output_root):
                    raise ValueError(f"图片路径超出输出目录: {img_path}")
                full_img_path.parent.mkdir(parents=True, exist_ok=True)
                img_bytes = await _fetch_bytes(img_url)
                full_img_path.write_bytes(img_bytes)

            page_num += 1

    # Merge all pages into a single Markdown file.
    merged = output_dir / "full.md"
    merged.write_text("\n\n---\n\n".join(pages_md), encoding="utf-8")
    return merged
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
async def _fetch_bytes(url: str) -> bytes:
    """GET *url* with a short-lived client and return the raw response body.

    Raises:
        httpx.HTTPStatusError: If the response carries an error status.
    """
    session = httpx.AsyncClient(timeout=60)
    async with session:
        reply = await session.get(url)
        reply.raise_for_status()
        body = reply.content
    return body
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@mcp.tool
async def ocr_pdf(
    file_path: str | None = None,
    file_url: str | None = None,
    output_dir: str | None = None,
    use_chart_recognition: bool = True,
) -> str:
    """将 PDF 或图片文件通过 PaddleOCR 转换为 Markdown。

    提供 file_path(本地文件)或 file_url(URL),二选一。
    结果保存到指定目录,返回合并后的 Markdown 文件路径。

    Args:
        file_path: 本地文件的绝对路径(PDF/图片)。
        file_url: 文件的 URL 地址。
        output_dir: 输出目录的绝对路径。未指定时默认保存到当前工作目录下的同名文件夹。
        use_chart_recognition: 是否启用图表识别,默认 True。
    """
    # NOTE: the docstring above is kept verbatim on purpose. Per the FastMCP
    # tool documentation the decorator publishes it as the tool description,
    # so its text is visible to MCP clients at runtime.
    #
    # Summary: convert a PDF or image to Markdown through PaddleOCR. Exactly
    # one of file_path (local file) or file_url must be given. Output goes to
    # output_dir, or to "<cwd>/<source stem>" when output_dir is omitted. The
    # return value is the path of the merged Markdown file as a string.
    if not file_path and not file_url:
        raise ValueError("必须提供 file_path 或 file_url 其中之一")
    if file_path and file_url:
        raise ValueError("file_path 和 file_url 只能提供一个")

    src = None
    if file_path:
        src = Path(file_path)
        # is_file() also rejects directories, which exists() let through and
        # which would only fail later when the upload tries to open them.
        if not src.is_file():
            raise FileNotFoundError(f"文件不存在: {file_path}")

    # Decide where the results are written.
    if output_dir:
        out = Path(output_dir)
    elif src is not None:
        out = Path.cwd() / src.stem
    else:
        from urllib.parse import urlparse

        stem = Path(urlparse(file_url).path).stem
        # An empty stem (URL without a file name) or ".." (which would point
        # at the parent of the working directory) falls back to a fixed name.
        if stem in ("", ".."):
            stem = "ocr_output"
        out = Path.cwd() / stem

    # Submit -> poll -> fetch results.
    job_id = await _submit_job(file_path, file_url, use_chart_recognition)
    jsonl_url = await _poll_until_done(job_id)
    merged_md = await _download_and_save(jsonl_url, out)

    return str(merged_md)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def main() -> None:
    """Entry point: run the module-level FastMCP app with its default settings."""
    mcp.run()


# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|