paddleocr-mcp 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddleocr_mcp-0.3.0/PKG-INFO +16 -0
- paddleocr_mcp-0.3.0/README.md +5 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp/__init__.py +15 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp/__main__.py +191 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp/pipelines.py +926 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp/py.typed +0 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/PKG-INFO +16 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/SOURCES.txt +12 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/dependency_links.txt +1 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/entry_points.txt +2 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/requires.txt +14 -0
- paddleocr_mcp-0.3.0/paddleocr_mcp.egg-info/top_level.txt +1 -0
- paddleocr_mcp-0.3.0/pyproject.toml +30 -0
- paddleocr_mcp-0.3.0/setup.cfg +4 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paddleocr_mcp
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Requires-Python: >=3.10
|
|
5
|
+
Requires-Dist: mcp>=1.5.0
|
|
6
|
+
Requires-Dist: fastmcp>=2.0.0
|
|
7
|
+
Requires-Dist: httpx>=0.24.0
|
|
8
|
+
Requires-Dist: numpy>=1.24.0
|
|
9
|
+
Requires-Dist: pillow>=9.0.0
|
|
10
|
+
Requires-Dist: puremagic>=1.30.0
|
|
11
|
+
Requires-Dist: typing-extensions>=4.0.0
|
|
12
|
+
Provides-Extra: local
|
|
13
|
+
Requires-Dist: paddleocr[doc-parser]>=3.2; extra == "local"
|
|
14
|
+
Provides-Extra: local-cpu
|
|
15
|
+
Requires-Dist: paddleocr[doc-parser]>=3.2; extra == "local-cpu"
|
|
16
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "local-cpu"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# Keep in sync with the distribution version declared in the package metadata.
__version__ = "0.3.0"
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
# you may not use this file except in compliance with the License.
|
|
7
|
+
# You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
# See the License for the specific language governing permissions and
|
|
15
|
+
# limitations under the License.
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import asyncio
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
|
|
22
|
+
from fastmcp import FastMCP
|
|
23
|
+
|
|
24
|
+
from .pipelines import create_pipeline_handler
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command line arguments.

    Several options take their default value from ``PADDLEOCR_MCP_*``
    environment variables. Values coming from the environment are validated
    the same way as values given on the command line: an invalid value makes
    the parser report a usage error and exit with status 2.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        The parsed arguments.
    """
    pipeline_choices = ["OCR", "PP-StructureV3", "PaddleOCR-VL"]
    source_choices = ["local", "aistudio", "self_hosted"]

    parser = argparse.ArgumentParser(
        description="PaddleOCR MCP server - Supports local library, AI Studio service, and self-hosted servers."
    )

    parser.add_argument(
        "--pipeline",
        choices=pipeline_choices,
        default=os.getenv("PADDLEOCR_MCP_PIPELINE", "OCR"),
        help="Pipeline name.",
    )
    parser.add_argument(
        "--ppocr_source",
        choices=source_choices,
        default=os.getenv("PADDLEOCR_MCP_PPOCR_SOURCE", "local"),
        help="Source of PaddleOCR functionality: local (local library), aistudio (AI Studio service), self_hosted (self-hosted server).",
    )
    parser.add_argument(
        "--http",
        action="store_true",
        help="Use HTTP transport instead of STDIO (suitable for remote deployment and multiple clients).",
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Host address for HTTP mode (default: 127.0.0.1).",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port for HTTP mode (default: 8000).",
    )
    parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging for debugging."
    )

    # Local mode configuration
    parser.add_argument(
        "--pipeline_config",
        default=os.getenv("PADDLEOCR_MCP_PIPELINE_CONFIG"),
        help="PaddleOCR pipeline configuration file path (for local mode).",
    )
    parser.add_argument(
        "--device",
        default=os.getenv("PADDLEOCR_MCP_DEVICE"),
        help="Device to run inference on.",
    )

    # Service mode configuration
    parser.add_argument(
        "--server_url",
        default=os.getenv("PADDLEOCR_MCP_SERVER_URL"),
        help="Base URL of the underlying server (required in service mode).",
    )
    parser.add_argument(
        "--aistudio_access_token",
        default=os.getenv("PADDLEOCR_MCP_AISTUDIO_ACCESS_TOKEN"),
        help="AI Studio access token (required for AI Studio).",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        # Pass the raw string: argparse runs string defaults through `type`
        # and reports a regular usage error for a non-integer value, instead
        # of an uncaught ValueError while the parser is being built.
        default=os.getenv("PADDLEOCR_MCP_TIMEOUT", "60"),
        help="HTTP read timeout in seconds for API requests to the underlying server.",
    )

    args = parser.parse_args(argv)

    # argparse checks `choices` only for values given on the command line, so
    # defaults that came from the environment are validated explicitly here.
    if args.pipeline not in pipeline_choices:
        parser.error(
            f"argument --pipeline: invalid choice: {args.pipeline!r} "
            f"(choose from {', '.join(map(repr, pipeline_choices))})"
        )
    if args.ppocr_source not in source_choices:
        parser.error(
            f"argument --ppocr_source: invalid choice: {args.ppocr_source!r} "
            f"(choose from {', '.join(map(repr, source_choices))})"
        )

    return args
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _validate_args(args: argparse.Namespace) -> None:
    """Check cross-option constraints that argparse cannot express.

    On the first violated constraint the explanation is written to stderr
    and the process exits with status 2.

    Args:
        args: Parsed command line arguments.
    """

    def _abort(*lines: str) -> None:
        # Emit every line on stderr, then terminate with the usage-error code.
        for line in lines:
            print(line, file=sys.stderr)
        sys.exit(2)

    # Host/port only make sense for the HTTP transport; changing either one
    # from its default without `--http` is rejected.
    if not args.http:
        if args.host != "127.0.0.1" or args.port != 8000:
            _abort(
                "Host and port arguments are only valid when using HTTP transport (see: `--http`)."
            )

    # The remaining checks only concern the service-backed sources.
    if args.ppocr_source not in ["aistudio", "self_hosted"]:
        return

    if not args.server_url:
        _abort(
            "Error: The server base URL is required.",
            "Please either set `--server_url` or set the environment variable "
            "`PADDLEOCR_MCP_SERVER_URL`.",
        )

    if args.ppocr_source == "aistudio" and not args.aistudio_access_token:
        _abort(
            "Error: The AI Studio access token is required.",
            "Please either set `--aistudio_access_token` or set the environment variable "
            "`PADDLEOCR_MCP_AISTUDIO_ACCESS_TOKEN`.",
        )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
async def async_main() -> None:
    """Asynchronous main entry point.

    Parses and validates the command line, creates the pipeline handler,
    starts it, registers its tools on a `FastMCP` server and runs that
    server over streamable HTTP (with `--http`) or over the default
    transport otherwise.

    Failures while creating the handler or while starting/running the server
    are reported on stderr (with a traceback when `--verbose` is set) and
    end the process with exit status 1.
    """
    args = _parse_args()

    _validate_args(args)

    try:
        pipeline_handler = create_pipeline_handler(
            args.pipeline,
            args.ppocr_source,
            pipeline_config=args.pipeline_config,
            device=args.device,
            server_url=args.server_url,
            aistudio_access_token=args.aistudio_access_token,
            timeout=args.timeout,
        )
    except Exception as e:
        print(f"Failed to create the pipeline handler: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc(file=sys.stderr)
        sys.exit(1)

    # Tracks whether `start()` completed. Calling `stop()` on a handler that
    # never started raises, which would replace the exit below with an
    # unrelated error.
    started = False
    try:
        await pipeline_handler.start()
        started = True

        server_name = f"PaddleOCR {args.pipeline} MCP server"
        mcp = FastMCP(
            name=server_name,
            log_level="INFO" if args.verbose else "WARNING",
            mask_error_details=True,
        )

        pipeline_handler.register_tools(mcp)

        if args.http:
            await mcp.run_async(
                transport="streamable-http",
                host=args.host,
                port=args.port,
            )
        else:
            await mcp.run_async()

    except Exception as e:
        print(f"Failed to start the server: {e}", file=sys.stderr)
        if args.verbose:
            import traceback

            traceback.print_exc(file=sys.stderr)
        sys.exit(1)

    finally:
        if started:
            await pipeline_handler.stop()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def main():
    """Synchronous entry point: run `async_main` to completion with `asyncio.run`."""
    entry_coroutine = async_main()
    asyncio.run(entry_coroutine)


# Allow running the package with `python -m`.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,926 @@
|
|
|
1
|
+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# TODO:
|
|
16
|
+
# 1. Reuse `httpx` client.
|
|
17
|
+
# 2. Use `contextvars` to manage MCP context objects.
|
|
18
|
+
# 3. Implement structured logging, log stack traces, and log operation timing.
|
|
19
|
+
# 4. Report progress for long-running operations.
|
|
20
|
+
|
|
21
|
+
import abc
|
|
22
|
+
import asyncio
|
|
23
|
+
import base64
|
|
24
|
+
import io
|
|
25
|
+
import json
|
|
26
|
+
import re
|
|
27
|
+
from pathlib import PurePath
|
|
28
|
+
from queue import Queue
|
|
29
|
+
from threading import Thread
|
|
30
|
+
from typing import Any, Callable, Dict, List, NoReturn, Optional, Type, Union
|
|
31
|
+
from urllib.parse import urlparse
|
|
32
|
+
|
|
33
|
+
import httpx
|
|
34
|
+
import numpy as np
|
|
35
|
+
import puremagic
|
|
36
|
+
from fastmcp import Context, FastMCP
|
|
37
|
+
from mcp.types import ImageContent, TextContent
|
|
38
|
+
from PIL import Image as PILImage
|
|
39
|
+
from typing_extensions import Literal, Self, assert_never
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
from paddleocr import PaddleOCR, PaddleOCRVL, PPStructureV3
|
|
43
|
+
|
|
44
|
+
LOCAL_OCR_AVAILABLE = True
|
|
45
|
+
except ImportError:
|
|
46
|
+
LOCAL_OCR_AVAILABLE = False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
OutputMode = Literal["simple", "detailed"]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _is_file_path(s: str) -> bool:
    """Return whether *s* could be a file system path.

    This is a purely syntactic check; the file does not have to exist.
    Constructing a `PurePath` from a string never fails, so on its own it
    accepts everything. Empty or whitespace-only strings and strings
    containing a NUL character are therefore rejected explicitly.

    Args:
        s: Candidate string.

    Returns:
        True if *s* is a plausible path, False otherwise (including when *s*
        is not a string).
    """
    try:
        if not s.strip() or "\x00" in s:
            return False
        PurePath(s)
    except (AttributeError, TypeError, ValueError):
        return False
    return True
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_base64(s: str) -> bool:
    """Return whether *s* looks like Base64-encoded data.

    Accepts either a bare Base64 string or a ``data:`` URI whose header ends
    in ``;base64`` (for example ``data:image/png;base64,....``); for a data
    URI only the part after the first comma is checked.

    The payload must consist of the standard Base64 alphabet followed by at
    most two ``=`` padding characters, and its length must be a multiple of
    four. Other lengths cannot be decoded by `base64.b64decode`, and
    rejecting them also avoids mistaking short path-like strings for Base64.

    Args:
        s: Candidate string.

    Returns:
        True if *s* is plausibly Base64 data, False otherwise.
    """
    payload = s
    if s.startswith("data:"):
        header, sep, payload = s.partition(",")
        if not sep or not header.endswith(";base64"):
            return False
    if len(payload) % 4 != 0:
        return False
    return bool(re.fullmatch(r"[A-Za-z0-9+/]+={0,2}", payload))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _is_url(s: str) -> bool:
    """Return whether *s* is an ``http://`` or ``https://`` URL with a host.

    Args:
        s: Candidate string.

    Returns:
        True if *s* starts with an HTTP(S) scheme and parses to a URL with a
        non-empty network location, False otherwise. Strings that `urlparse`
        rejects (for example an unbalanced IPv6 bracket) yield False rather
        than raising.
    """
    if not s.startswith(("http://", "https://")):
        return False
    try:
        parts = urlparse(s)
    except ValueError:
        # A predicate should answer "no" for malformed input, not raise.
        return False
    return parts.scheme in ("http", "https") and bool(parts.netloc)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _infer_file_type_from_bytes(data: bytes) -> Optional[str]:
    """Classify raw file content as an image or a PDF.

    The MIME type is detected with `puremagic` from the content itself.

    Args:
        data: Raw file content.

    Returns:
        ``"image"`` for any ``image/*`` MIME type, ``"pdf"`` for
        ``application/pdf``, and None when the content is empty, cannot be
        identified, or is of any other type.
    """
    try:
        mime = puremagic.from_string(data, mime=True)
    except (puremagic.PureError, ValueError):
        # puremagic signals "could not identify" / "empty input" by raising;
        # map both to the documented "unknown type" result so callers can
        # produce their own unsupported-file-type message.
        return None
    if mime.startswith("image/"):
        return "image"
    if mime == "application/pdf":
        return "pdf"
    return None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_str_with_max_len(obj: object, max_len: int) -> str:
    """Return ``str(obj)``, cut to *max_len* characters plus ``"..."`` if longer.

    Args:
        obj: Object to render.
        max_len: Number of leading characters kept when truncating.

    Returns:
        The full string when it has at most *max_len* characters, otherwise
        its first *max_len* characters followed by ``"..."``.
    """
    text = str(obj)
    return text if len(text) <= max_len else text[:max_len] + "..."
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class _EngineWrapper:
    """Run calls against an engine on one dedicated worker thread.

    Calls submitted through `call` are queued and executed one at a time, in
    submission order, on a background thread. Their outcome is handed back to
    the event loop that was running when the wrapper was created. The wrapper
    must therefore be constructed from inside a running event loop.
    """

    def __init__(self, engine: Any) -> None:
        """Start the worker thread.

        Args:
            engine: The engine object exposed through the `engine` property.
        """
        self._engine = engine
        self._queue: Queue = Queue()
        self._closed = False
        self._loop = asyncio.get_running_loop()
        self._thread = Thread(target=self._worker, daemon=False)
        self._thread.start()

    @property
    def engine(self) -> Any:
        """The wrapped engine object."""
        return self._engine

    async def call(self, func: Callable, *args: Any, **kwargs: Any) -> Any:
        """Execute ``func(*args, **kwargs)`` on the worker thread.

        Args:
            func: Callable to execute.
            *args: Positional arguments for *func*.
            **kwargs: Keyword arguments for *func*.

        Returns:
            Whatever *func* returns.

        Raises:
            RuntimeError: If the wrapper has been closed.
            Exception: Any exception raised by *func* is re-raised here.
        """
        if self._closed:
            raise RuntimeError("Engine wrapper has already been closed")
        fut = self._loop.create_future()
        self._queue.put((func, args, kwargs, fut))
        return await fut

    async def close(self) -> None:
        """Stop accepting calls, finish queued work and join the worker.

        Calling `close` more than once is allowed; later calls return
        immediately.
        """
        if self._closed:
            return
        # Flip the flag BEFORE queuing the sentinel: otherwise a `call()`
        # issued while we wait for the join could enqueue work behind the
        # sentinel, and its future would never be resolved.
        self._closed = True
        self._queue.put(None)
        await self._loop.run_in_executor(None, self._thread.join)

    @staticmethod
    def _set_result(fut: Any, result: Any) -> None:
        # The awaiting task may have been cancelled in the meantime; setting
        # a result on a finished future would raise InvalidStateError.
        if not fut.done():
            fut.set_result(result)

    @staticmethod
    def _set_exception(fut: Any, exc: BaseException) -> None:
        # See `_set_result` for why the state is checked first.
        if not fut.done():
            fut.set_exception(exc)

    def _worker(self) -> None:
        """Worker loop: process queued calls until the ``None`` sentinel."""
        # Everything queued ahead of the sentinel is processed, so no pending
        # future is left unresolved on shutdown.
        while True:
            item = self._queue.get()
            try:
                if item is None:
                    break
                func, args, kwargs, fut = item
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    self._loop.call_soon_threadsafe(self._set_exception, fut, e)
                else:
                    # Futures are not thread-safe; resolve them on the loop.
                    self._loop.call_soon_threadsafe(self._set_result, fut, result)
            finally:
                self._queue.task_done()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class PipelineHandler(abc.ABC):
    """Base class shared by every pipeline handler.

    Stores the configuration common to all pipelines, creates the local
    engine when running in local mode, and implements the
    ``initialized -> started -> stopped`` lifecycle (also usable through
    ``async with``). Pipeline-specific behaviour is supplied by subclasses
    through the abstract methods declared here.
    """

    def __init__(
        self,
        pipeline: str,
        ppocr_source: str,
        pipeline_config: Optional[str],
        device: Optional[str],
        server_url: Optional[str],
        aistudio_access_token: Optional[str],
        timeout: Optional[int],
    ) -> None:
        """Store the configuration and, in local mode, build the engine.

        Args:
            pipeline: Pipeline name.
            ppocr_source: Source of PaddleOCR functionality: ``"local"``,
                ``"aistudio"`` or ``"self_hosted"``.
            pipeline_config: Path to pipeline configuration.
            device: Device to run inference on.
            server_url: Base URL for service mode.
            aistudio_access_token: AI Studio access token.
            timeout: Read timeout in seconds for HTTP requests; a falsy
                value selects 60.

        Raises:
            ValueError: If *ppocr_source* is not one of the known sources.
            RuntimeError: In local mode, if PaddleOCR could not be imported
                or the engine could not be created.
        """
        self._pipeline = pipeline

        # "local" runs the library in-process; both remote sources share the
        # same "service" code path.
        service_sources = ("aistudio", "self_hosted")
        if ppocr_source != "local" and ppocr_source not in service_sources:
            raise ValueError(f"Unknown PaddleOCR source {repr(ppocr_source)}")
        self._mode = "local" if ppocr_source == "local" else "service"

        self._ppocr_source = ppocr_source
        self._pipeline_config = pipeline_config
        self._device = device
        self._server_url = server_url
        self._aistudio_access_token = aistudio_access_token
        self._timeout = timeout if timeout else 60

        if self._mode == "local":
            if not LOCAL_OCR_AVAILABLE:
                raise RuntimeError("PaddleOCR is not locally available")
            try:
                self._engine = self._create_local_engine()
            except Exception as exc:
                raise RuntimeError(
                    f"Failed to create PaddleOCR engine: {str(exc)}"
                ) from exc

        self._status: Literal["initialized", "started", "stopped"] = "initialized"

    async def start(self) -> None:
        """Move the handler to the ``started`` state.

        In local mode this wraps the engine in an `_EngineWrapper`. Starting
        an already started handler does nothing.

        Raises:
            RuntimeError: If the handler has already been stopped.
        """
        status = self._status
        if status == "started":
            return
        if status == "stopped":
            raise RuntimeError("Pipeline handler has already been stopped")
        if status != "initialized":
            assert_never(status)

        if self._mode == "local":
            self._engine_wrapper = _EngineWrapper(self._engine)
        self._status = "started"

    async def stop(self) -> None:
        """Move the handler to the ``stopped`` state.

        In local mode this closes the engine wrapper. Stopping an already
        stopped handler does nothing.

        Raises:
            RuntimeError: If the handler has not been started.
        """
        status = self._status
        if status == "stopped":
            return
        if status == "initialized":
            raise RuntimeError("Pipeline handler has not been started")
        if status != "started":
            assert_never(status)

        if self._mode == "local":
            await self._engine_wrapper.close()
        self._status = "stopped"

    async def __aenter__(self) -> Self:
        """Start the handler and return it."""
        await self.start()
        return self

    async def __aexit__(
        self,
        exc_type: Any,
        exc_val: Any,
        exc_tb: Any,
    ) -> None:
        """Stop the handler when the ``async with`` block exits."""
        await self.stop()

    @abc.abstractmethod
    def register_tools(self, mcp: FastMCP) -> None:
        """Register this pipeline's tools on an MCP server.

        Args:
            mcp: The `FastMCP` instance.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _create_local_engine(self) -> Any:
        """Build the local OCR engine.

        Returns:
            The OCR engine instance.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _get_service_endpoint(self) -> str:
        """Return the endpoint used in service mode.

        Returns:
            Service endpoint path.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _transform_local_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Adapt keyword arguments for local execution.

        Args:
            kwargs: Keyword arguments.

        Returns:
            Transformed keyword arguments.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _transform_service_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Adapt keyword arguments for service execution.

        Args:
            kwargs: Keyword arguments.

        Returns:
            Transformed keyword arguments.
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def _parse_local_result(
        self, local_result: Dict, ctx: Context
    ) -> Dict[str, Any]:
        """Convert the local engine's raw result into the unified format.

        Args:
            local_result: Raw result from local engine.
            ctx: MCP context.

        Returns:
            Parsed result in unified format.
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def _parse_service_result(
        self, service_result: Dict[str, Any], ctx: Context
    ) -> Dict[str, Any]:
        """Convert the service's raw result into the unified format.

        Args:
            service_result: Raw result from the service.
            ctx: MCP context.

        Returns:
            Parsed result in unified format.
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def _log_completion_stats(self, result: Dict[str, Any], ctx: Context) -> None:
        """Log statistics once processing has completed.

        Args:
            result: Processing result.
            ctx: MCP context.
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def _format_output(
        self,
        result: Dict[str, Any],
        detailed: bool,
        ctx: Context,
        **kwargs: Any,
    ) -> Union[str, List[Union[TextContent, ImageContent]]]:
        """Render a result in simple or detailed form.

        Args:
            result: Processing result.
            detailed: Whether to use detailed format.
            ctx: MCP context.
            **kwargs: Additional arguments.

        Returns:
            Formatted output in requested format.
        """
        raise NotImplementedError

    async def _predict_with_local_engine(
        self, processed_input: Union[str, np.ndarray], ctx: Context, **kwargs: Any
    ) -> Dict:
        """Run the engine's ``predict`` through the engine wrapper.

        Args:
            processed_input: Input forwarded to ``predict``.
            ctx: MCP context (not used by this method).
            **kwargs: Keyword arguments forwarded to ``predict``.

        Returns:
            Whatever the engine's ``predict`` returns.

        Raises:
            RuntimeError: If no engine wrapper exists yet (it is created by
                `start` in local mode).
        """
        if not hasattr(self, "_engine_wrapper"):
            raise RuntimeError("Engine wrapper has not been initialized")
        wrapper = self._engine_wrapper
        predict = wrapper.engine.predict
        return await wrapper.call(predict, processed_input, **kwargs)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class SimpleInferencePipelineHandler(PipelineHandler):
    """Base class for simple inference pipeline handlers.

    Implements the common request flow: normalize the input, run inference
    (locally or through the remote service), parse the raw result, log
    statistics and format the output.
    """

    async def process(
        self,
        input_data: str,
        output_mode: OutputMode,
        ctx: Context,
        file_type: Optional[str] = None,
        infer_kwargs: Optional[Dict[str, Any]] = None,
        format_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Union[str, List[Union[TextContent, ImageContent]]]:
        """Process input data through the pipeline.

        Args:
            input_data: Input data (file path, URL, or Base64).
            output_mode: Output mode ("simple" or "detailed").
            ctx: MCP context.
            file_type: File type for URLs ("image", "pdf", or None for auto-detection).
            infer_kwargs: Additional arguments for performing pipeline inference.
            format_kwargs: Additional arguments for formatting the output.

        Returns:
            Processed result in the requested output format.

        Raises:
            Exception: Any failure is reported through ``ctx.error`` and then
                re-raised by `_handle_error`.
        """
        infer_kwargs = infer_kwargs or {}
        format_kwargs = format_kwargs or {}
        try:
            await ctx.info(
                f"Starting {self._pipeline} processing (source: {self._ppocr_source})"
            )

            if self._mode == "local":
                processed_input = self._process_input_for_local(input_data, file_type)
                infer_kwargs = self._transform_local_kwargs(infer_kwargs)
                raw_result = await self._predict_with_local_engine(
                    processed_input, ctx, **infer_kwargs
                )
                result = await self._parse_local_result(raw_result, ctx)
            else:
                processed_input, inferred_file_type = self._process_input_for_service(
                    input_data, file_type
                )
                infer_kwargs = self._transform_service_kwargs(infer_kwargs)
                raw_result = await self._call_service(
                    processed_input, inferred_file_type, ctx, **infer_kwargs
                )
                result = await self._parse_service_result(raw_result, ctx)

            await self._log_completion_stats(result, ctx)
            return await self._format_output(
                result, output_mode == "detailed", ctx, **format_kwargs
            )

        except Exception as e:
            await ctx.error(f"{self._pipeline} processing failed: {str(e)}")
            self._handle_error(e, output_mode)

    @staticmethod
    def _strip_data_uri_prefix(input_data: str) -> str:
        """Return the payload of a ``data:`` URI, or *input_data* unchanged."""
        if input_data.startswith("data:") and "," in input_data:
            return input_data.split(",", 1)[1]
        return input_data

    def _process_input_for_local(
        self, input_data: str, file_type: Optional[str]
    ) -> Union[str, np.ndarray]:
        """Turn the tool input into something the local engine accepts.

        Args:
            input_data: File path, URL, Base64 string or Base64 ``data:`` URI.
            file_type: Currently unused in local mode.

        Returns:
            The decoded image as a contiguous array with the channel axis
            reversed (RGB -> BGR) for Base64 input; the unchanged string for
            file paths and URLs.

        Raises:
            ValueError: If the input format is not recognized, the Base64
                data cannot be decoded, or it does not contain an image.
        """
        # TODO: Use `file_type` to handle more cases.
        # Strip a possible `data:` header first so data URIs reach the Base64
        # branch instead of being treated as a file path.
        base64_data = self._strip_data_uri_prefix(input_data)
        if _is_base64(base64_data):
            try:
                image_bytes = base64.b64decode(base64_data)
                detected_type = _infer_file_type_from_bytes(image_bytes)
            except Exception as e:
                raise ValueError(f"Failed to decode Base64 image: {str(e)}") from e
            # Raised outside the `try` so the message is not re-wrapped as a
            # decoding failure.
            if detected_type != "image":
                raise ValueError("Currently, only images can be passed via Base64.")
            try:
                image_pil = PILImage.open(io.BytesIO(image_bytes))
                image_arr = np.array(image_pil.convert("RGB"))
            except Exception as e:
                raise ValueError(f"Failed to decode Base64 image: {str(e)}") from e
            return np.ascontiguousarray(image_arr[..., ::-1])
        elif _is_file_path(input_data) or _is_url(input_data):
            return input_data
        else:
            raise ValueError("Invalid input data format")

    def _process_input_for_service(
        self, input_data: str, file_type: Optional[str]
    ) -> tuple[str, Optional[str]]:
        """Turn the tool input into the ``file`` value sent to the service.

        Args:
            input_data: URL, Base64 string, Base64 ``data:`` URI or file path.
            file_type: Caller-provided file type; only used for URLs.

        Returns:
            A ``(file, file_type)`` pair. URLs are passed through together
            with the normalized caller-provided type (or None). Base64 input
            is returned without any ``data:`` header, and local files are
            read and Base64-encoded; for both, the type is detected from the
            content.

        Raises:
            ValueError: If the input format is not recognized, the data
                cannot be decoded or read, or its type is unsupported.
        """
        unsupported_hint = "Only images (JPEG, PNG, etc.) and PDF documents are supported."

        if _is_url(input_data):
            norm_ft = None
            if isinstance(file_type, str):
                lowered = file_type.lower()
                # Placeholder spellings some clients send for "no type".
                if lowered not in ("none", "null", "unknown", ""):
                    norm_ft = lowered
            return input_data, norm_ft

        base64_data = self._strip_data_uri_prefix(input_data)
        if _is_base64(base64_data):
            try:
                bytes_ = base64.b64decode(base64_data)
                file_type_str = _infer_file_type_from_bytes(bytes_)
            except Exception as e:
                raise ValueError(f"Failed to decode Base64 data: {str(e)}") from e
            if file_type_str is None:
                raise ValueError(
                    "Unsupported file type in Base64 data. " + unsupported_hint
                )
            # Send the bare payload, not the `data:` URI.
            return base64_data, file_type_str

        if _is_file_path(input_data):
            try:
                with open(input_data, "rb") as f:
                    bytes_ = f.read()
                file_type_str = _infer_file_type_from_bytes(bytes_)
            except Exception as e:
                raise ValueError(f"Failed to read file: {str(e)}") from e
            if file_type_str is None:
                # `input_data` is still the path here, so the message names
                # the file rather than embedding its encoded content.
                raise ValueError(
                    f"Unsupported file type for '{input_data}'. " + unsupported_hint
                )
            return base64.b64encode(bytes_).decode("ascii"), file_type_str

        raise ValueError("Invalid input data format")

    async def _call_service(
        self,
        processed_input: str,
        file_type: Optional[str],
        ctx: Context,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """POST the request to the configured service and return its JSON.

        Args:
            processed_input: Value for the payload's ``file`` field.
            file_type: "image", "pdf" or None.
            ctx: MCP context (not used by this method).
            **kwargs: Extra payload fields.

        Returns:
            The decoded JSON response body.

        Raises:
            RuntimeError: If the server URL or (for AI Studio) the access
                token is missing, the HTTP request fails, or the response is
                not valid JSON.
        """
        if not self._server_url:
            raise RuntimeError("Server URL not configured")

        endpoint = self._get_service_endpoint()
        url = f"{self._server_url.rstrip('/')}/{endpoint.lstrip('/')}"

        payload = self._prepare_service_payload(processed_input, file_type, **kwargs)
        headers = {"Content-Type": "application/json"}

        if self._ppocr_source == "aistudio":
            if not self._aistudio_access_token:
                raise RuntimeError("Missing AI Studio access token")
            headers["Authorization"] = f"token {self._aistudio_access_token}"

        try:
            # Only the read timeout is configurable; inference can take long.
            timeout = httpx.Timeout(
                connect=30.0, read=self._timeout, write=30.0, pool=30.0
            )
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.post(url, json=payload, headers=headers)
                response.raise_for_status()
                return response.json()
        except httpx.HTTPError as e:
            raise RuntimeError(
                f"HTTP request failed: {type(e).__name__}: {str(e)}"
            ) from e
        except json.JSONDecodeError as e:
            raise RuntimeError(f"Invalid service response: {str(e)}") from e

    def _prepare_service_payload(
        self, processed_input: str, file_type: Optional[str], **kwargs: Any
    ) -> Dict[str, Any]:
        """Build the JSON payload for the service request.

        Args:
            processed_input: Value for the ``file`` field.
            file_type: "image", "pdf" or anything else for "unspecified".
            **kwargs: Extra payload fields.

        Returns:
            The payload, with ``fileType`` set to 1 for images, 0 for PDFs
            and None otherwise.
        """
        payload: Dict[str, Any] = {"file": processed_input, **kwargs}
        payload["fileType"] = {"image": 1, "pdf": 0}.get(file_type)
        return payload

    def _handle_error(self, exc: Exception, output_mode: OutputMode) -> NoReturn:
        """Re-raise *exc*; the hook through which `process` reports failures."""
        raise exc
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
class OCRHandler(SimpleInferencePipelineHandler):
|
|
516
|
+
def register_tools(self, mcp: FastMCP) -> None:
|
|
517
|
+
@mcp.tool("ocr")
|
|
518
|
+
async def _ocr(
|
|
519
|
+
input_data: str,
|
|
520
|
+
output_mode: OutputMode = "simple",
|
|
521
|
+
file_type: Optional[str] = None,
|
|
522
|
+
*,
|
|
523
|
+
ctx: Context,
|
|
524
|
+
) -> Union[str, List[Union[TextContent, ImageContent]]]:
|
|
525
|
+
"""Extracts text from images and PDFs. Accepts file path, URL, or Base64.
|
|
526
|
+
|
|
527
|
+
Args:
|
|
528
|
+
input_data: The file to process (file path, URL, or Base64 string).
|
|
529
|
+
output_mode: The desired output format.
|
|
530
|
+
- "simple": (Default) Clean, readable text suitable for most use cases.
|
|
531
|
+
- "detailed": A JSON output including text, confidence, and precise bounding box coordinates. Only use this when coordinates are specifically required.
|
|
532
|
+
file_type: File type. This parameter is REQUIRED when `input_data` is a URL and should be omitted for other types.
|
|
533
|
+
- "image": For image files
|
|
534
|
+
- "pdf": For PDF documents
|
|
535
|
+
- None: For unknown file types
|
|
536
|
+
"""
|
|
537
|
+
await ctx.info(
|
|
538
|
+
f"--- OCR tool received `input_data`: {get_str_with_max_len(input_data, 50)} ---"
|
|
539
|
+
)
|
|
540
|
+
return await self.process(input_data, output_mode, ctx, file_type)
|
|
541
|
+
|
|
542
|
+
def _create_local_engine(self) -> Any:
|
|
543
|
+
return PaddleOCR(
|
|
544
|
+
paddlex_config=self._pipeline_config,
|
|
545
|
+
device=self._device,
|
|
546
|
+
)
|
|
547
|
+
|
|
548
|
+
def _get_service_endpoint(self) -> str:
|
|
549
|
+
return "ocr"
|
|
550
|
+
|
|
551
|
+
def _transform_local_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
|
|
552
|
+
return {
|
|
553
|
+
"use_doc_unwarping": False,
|
|
554
|
+
"use_doc_orientation_classify": False,
|
|
555
|
+
}
|
|
556
|
+
|
|
557
|
+
def _transform_service_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
|
|
558
|
+
return {
|
|
559
|
+
"useDocUnwarping": False,
|
|
560
|
+
"useDocOrientationClassify": False,
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
async def _parse_local_result(self, local_result: Dict, ctx: Context) -> Dict:
    """Flatten local OCR results into plain text plus per-line details.

    Args:
        local_result: Iterable of per-input results; each item is indexed by
            ``"rec_texts"``, ``"rec_scores"`` and ``"rec_boxes"``.
        ctx: Request context (not used here).

    Returns:
        A dict with ``"text"`` (kept lines joined by newlines),
        ``"confidence"`` (mean score of the kept lines, ``0`` if none) and
        ``"text_lines"`` (one ``text``/``confidence``/``bbox`` dict per line).
    """
    kept_texts = []
    kept_scores = []
    line_entries = []

    for page in local_result:
        page_texts = page["rec_texts"]
        page_scores = page["rec_scores"]
        page_boxes = page["rec_boxes"]

        for idx, raw_text in enumerate(page_texts):
            # Empty and whitespace-only recognitions are dropped entirely.
            if not raw_text or not raw_text.strip():
                continue

            stripped = raw_text.strip()
            # A text without a matching score is recorded with confidence 0.
            score = page_scores[idx] if idx < len(page_scores) else 0
            kept_texts.append(stripped)
            kept_scores.append(score)
            line_entries.append(
                {
                    "text": stripped,
                    "confidence": round(score, 3),
                    # `.tolist()` implies each box is array-like
                    # (presumably a NumPy row -- confirm against the engine).
                    "bbox": page_boxes[idx].tolist(),
                }
            )

    mean_confidence = sum(kept_scores) / len(kept_scores) if kept_scores else 0
    return {
        "text": "\n".join(kept_texts),
        "confidence": mean_confidence,
        "text_lines": line_entries,
    }
|
|
588
|
+
|
|
589
|
+
async def _parse_service_result(self, service_result: Dict, ctx: Context) -> Dict:
    """Flatten a service OCR response into plain text plus per-line details.

    Args:
        service_result: Decoded service response. The payload is read from
            its ``"result"`` entry when present, otherwise from the response
            itself.
        ctx: Request context (not used here).

    Returns:
        A dict with ``"text"`` (kept lines joined by newlines),
        ``"confidence"`` (mean score of the kept lines, ``0`` if none) and
        ``"text_lines"`` (one ``text``/``confidence``/``bbox`` dict per line).
        A response with a missing or null ``"ocrResults"`` yields the empty
        result instead of raising.
    """
    result_data = service_result.get("result", service_result)
    # `.get` returns None when the key is absent (or the value is null);
    # iterating None would raise TypeError, so fall back to "no results".
    ocr_results = result_data.get("ocrResults") or []

    all_texts, all_confidences, text_lines = [], [], []

    for ocr_result in ocr_results:
        pruned = ocr_result["prunedResult"]

        texts = pruned["rec_texts"]
        scores = pruned["rec_scores"]
        boxes = pruned["rec_boxes"]

        for i, text in enumerate(texts):
            # Empty and whitespace-only recognitions are dropped entirely.
            if not text or not text.strip():
                continue

            stripped = text.strip()
            # A text without a matching score is recorded with confidence 0.
            conf = scores[i] if i < len(scores) else 0
            all_texts.append(stripped)
            all_confidences.append(conf)
            text_lines.append(
                {
                    "text": stripped,
                    "confidence": round(conf, 3),
                    # Boxes are passed through unchanged from the response.
                    "bbox": boxes[i],
                }
            )

    return {
        "text": "\n".join(all_texts),
        "confidence": (
            sum(all_confidences) / len(all_confidences) if all_confidences else 0
        ),
        "text_lines": text_lines,
    }
|
|
621
|
+
|
|
622
|
+
async def _log_completion_stats(self, result: Dict, ctx: Context) -> None:
    """Report the size of a parsed OCR result through ``ctx.info``.

    Args:
        result: Parsed result; ``"text"`` and ``"text_lines"`` are read.
        ctx: Request context that receives the info message.
    """
    char_count = len(result["text"])
    line_count = len(result["text_lines"])
    summary = f"OCR completed: {char_count} characters, {line_count} text lines"
    await ctx.info(summary)
|
|
628
|
+
|
|
629
|
+
async def _format_output(
    self,
    result: Dict,
    detailed: bool,
    ctx: Context,
    **kwargs: Any,
) -> Union[str, List[Union[TextContent, ImageContent]]]:
    """Render a parsed OCR result as the tool's return value.

    Args:
        result: Parsed result with ``"text"``, ``"confidence"`` and
            ``"text_lines"`` entries.
        detailed: When true the output is JSON; otherwise readable text.
        ctx: Request context (not used here).
        **kwargs: Accepted but not used by this handler.

    Returns:
        A string: the JSON dump of ``result`` (or a JSON error object) in
        detailed mode, otherwise the recognized text followed by a
        confidence footer when the mean confidence is positive.
    """
    has_text = bool(result["text"].strip())

    if not has_text:
        if detailed:
            return json.dumps({"error": "No text detected"}, ensure_ascii=False)
        return "❌ No text detected"

    if detailed:
        return json.dumps(result, ensure_ascii=False, indent=2)

    mean_confidence = result["confidence"]
    line_total = len(result["text_lines"])

    rendered = result["text"]
    # The footer is omitted when no confidence information is available.
    if mean_confidence > 0:
        rendered += f"\n\n📊 Confidence: {(mean_confidence * 100):.1f}% | {line_total} text lines"

    return rendered
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
class _LayoutParsingHandler(SimpleInferencePipelineHandler):
    """Common logic for layout-parsing pipelines that produce markdown.

    Subclasses supply tool registration and the local engine; this class
    converts local and service results into one dict shape (``"markdown"``,
    ``"pages"``, ``"images_mapping"``, ``"detailed_results"``) and renders it
    as MCP content.
    """

    def _get_service_endpoint(self) -> str:
        """Return the service endpoint name (``"layout-parsing"``)."""
        return "layout-parsing"

    def _transform_local_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the fixed keyword arguments for local inference.

        The incoming ``kwargs`` are not consulted.
        """
        return {
            "use_doc_unwarping": False,
            "use_doc_orientation_classify": False,
        }

    def _transform_service_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Return the fixed request options for service-based inference.

        The incoming ``kwargs`` are not consulted.
        """
        return {
            "useDocUnwarping": False,
            "useDocOrientationClassify": False,
        }

    async def _parse_local_result(self, local_result: Dict, ctx: Context) -> Dict:
        """Collect markdown text and images from local pipeline results.

        Args:
            local_result: Sized iterable of per-page results, each exposing a
                ``markdown`` mapping with ``"markdown_texts"`` and
                ``"markdown_images"``.
            ctx: Request context (not used here).

        Returns:
            Dict with the joined markdown, the page count, a mapping from
            image key to Base64 JPEG text, and the raw per-page results.
        """
        markdown_parts = []
        all_images_mapping = {}
        detailed_results = []

        for result in local_result:
            markdown = result.markdown
            markdown_parts.append(markdown["markdown_texts"])

            for img_key, img_data in markdown["markdown_images"].items():
                # `img_data` is assumed to be a PIL-style image (it is only
                # used through `.save(buffer, format=...)`) -- TODO confirm.
                with io.BytesIO() as buffer:
                    img_data.save(buffer, format="JPEG")
                    encoded = base64.b64encode(buffer.getvalue())
                # `b64encode` returns bytes; store text so the mapping has the
                # same `str` values as the service path and as the
                # `Dict[str, str]` expected by `_parse_markdown_with_images`.
                all_images_mapping[img_key] = encoded.decode("ascii")

            detailed_results.append(result)

        return {
            # TODO: Page concatenation can be done better via `pipeline.concatenate_markdown_pages`
            "markdown": "\n".join(markdown_parts),
            "pages": len(local_result),
            "images_mapping": all_images_mapping,
            "detailed_results": detailed_results,
        }

    async def _parse_service_result(self, service_result: Dict, ctx: Context) -> Dict:
        """Collect markdown text and images from a service response.

        Args:
            service_result: Decoded service response. The payload is read
                from its ``"result"`` entry when present, otherwise from the
                response itself.
            ctx: Request context, forwarded to image processing for logging.

        Returns:
            Dict with the joined markdown, the page count, a mapping from
            image key to image data, and each page's ``"prunedResult"``. An
            empty result is returned when ``"layoutParsingResults"`` is
            missing or empty.
        """
        result_data = service_result.get("result", service_result)
        layout_results = result_data.get("layoutParsingResults")

        if not layout_results:
            return {
                "markdown": "",
                "pages": 0,
                "images_mapping": {},
                "detailed_results": [],
            }

        markdown_parts = []
        all_images_mapping = {}
        detailed_results = []

        for res in layout_results:
            markdown_parts.append(res["markdown"]["text"])
            for img_key, img_data in res["markdown"]["images"].items():
                all_images_mapping[img_key] = await self._process_image_data(
                    img_data, ctx
                )
            detailed_results.append(res["prunedResult"])

        return {
            "markdown": "\n".join(markdown_parts),
            "pages": len(layout_results),
            "images_mapping": all_images_mapping,
            "detailed_results": detailed_results,
        }

    async def _process_image_data(self, img_data: str, ctx: Context) -> str:
        """Normalize one image reference from a service response.

        URLs are downloaded and Base64-encoded; values that ``_is_base64``
        accepts are returned unchanged. On a failed download or an
        unrecognized format the error is reported through ``ctx.error`` and
        the original value is returned (best effort, never raises).
        """
        if _is_url(img_data):
            try:
                timeout = httpx.Timeout(connect=30.0, read=30.0, write=30.0, pool=30.0)
                async with httpx.AsyncClient(timeout=timeout) as client:
                    response = await client.get(img_data)
                    response.raise_for_status()
                    img_bytes = response.content
                    return base64.b64encode(img_bytes).decode("ascii")
            except Exception as e:
                # Deliberately broad: a single unreachable image must not
                # fail the whole document.
                await ctx.error(
                    f"Failed to download image from URL {img_data}: {str(e)}"
                )
                return img_data
        elif _is_base64(img_data):
            return img_data
        else:
            await ctx.error(
                f"Unknown image data format: {get_str_with_max_len(img_data, 50)}"
            )
            return img_data

    async def _log_completion_stats(self, result: Dict, ctx: Context) -> None:
        """Report the number of parsed pages through ``ctx.info``."""
        page_count = result["pages"]
        await ctx.info(f"Layout parsing completed: {page_count} pages")

    async def _format_output(
        self,
        result: Dict,
        detailed: bool,
        ctx: Context,
        **kwargs: Any,
    ) -> Union[str, List[Union[TextContent, ImageContent]]]:
        """Render a parsed layout result as the tool's return value.

        Args:
            result: Parsed result as produced by the ``_parse_*`` methods.
            detailed: When true, one JSON text item per entry of
                ``"detailed_results"`` is appended after the markdown.
            ctx: Request context (not used here).
            **kwargs: ``return_images`` (truthy) interleaves extracted images
                with the markdown text.

        Returns:
            An error string when there is no markdown content, otherwise a
            list of text/image content items.
        """
        if not result["markdown"].strip():
            return (
                "❌ No document content detected"
                if not detailed
                else json.dumps({"error": "No content detected"}, ensure_ascii=False)
            )

        markdown_text = result["markdown"]
        images_mapping = result.get("images_mapping", {})

        if kwargs.get("return_images"):
            content_list = self._parse_markdown_with_images(
                markdown_text, images_mapping
            )
        else:
            content_list = [TextContent(type="text", text=markdown_text)]

        if detailed:
            for detailed_result in result.get("detailed_results") or []:
                content_list.append(
                    TextContent(
                        type="text",
                        text=json.dumps(
                            detailed_result,
                            ensure_ascii=False,
                            indent=2,
                            # Local results are engine objects rather than
                            # plain JSON types; stringify what json cannot
                            # encode.
                            default=str,
                        ),
                    )
                )

        return content_list

    def _parse_markdown_with_images(
        self, markdown_text: str, images_mapping: Dict[str, str]
    ) -> List[Union[TextContent, ImageContent]]:
        """Split markdown at ``<img>`` tags into text and image items.

        Each ``<img ... src="...">`` tag whose ``src`` is a key of
        ``images_mapping`` becomes an ``ImageContent``; tags with an unknown
        ``src`` are dropped. Whitespace-only text segments are skipped. If
        nothing is produced, the whole markdown is returned as one text item.
        """
        if not images_mapping:
            return [TextContent(type="text", text=markdown_text)]

        content_list = []
        img_pattern = r'<img[^>]+src="([^"]+)"[^>]*>'
        last_pos = 0

        for match in re.finditer(img_pattern, markdown_text):
            text_before = markdown_text[last_pos : match.start()]
            if text_before.strip():
                content_list.append(TextContent(type="text", text=text_before))

            img_src = match.group(1)
            if img_src in images_mapping:
                content_list.append(
                    ImageContent(
                        type="image",
                        data=images_mapping[img_src],
                        # NOTE(review): always declared as JPEG; images that
                        # come from the service may use another format --
                        # confirm.
                        mimeType="image/jpeg",
                    )
                )

            last_pos = match.end()

        remaining_text = markdown_text[last_pos:]
        if remaining_text.strip():
            content_list.append(TextContent(type="text", text=remaining_text))

        return content_list or [TextContent(type="text", text=markdown_text)]
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
class PPStructureV3Handler(_LayoutParsingHandler):
    """Layout-parsing handler that exposes the ``pp_structurev3`` tool.

    Local inference uses a ``PPStructureV3`` engine.
    """

    def register_tools(self, mcp: FastMCP) -> None:
        """Register the ``pp_structurev3`` tool on ``mcp``."""

        @mcp.tool("pp_structurev3")
        async def _pp_structurev3(
            input_data: str,
            output_mode: OutputMode = "simple",
            file_type: Optional[str] = None,
            return_images: bool = True,
            *,
            ctx: Context,
        ) -> Union[str, List[Union[TextContent, ImageContent]]]:
            """Extracts structured markdown from complex documents (images/PDFs), including tables, formulas, etc. Accepts file path, URL, or Base64.

            Args:
                input_data: The file to process (file path, URL, or Base64 string).
                output_mode: The desired output format.
                    - "simple": (Default) Clean, readable markdown with embedded images. Best for most use cases.
                    - "detailed": JSON data about document structure, plus markdown. Only use this when coordinates are specifically required.
                file_type: File type. This parameter is REQUIRED when `input_data` is a URL and should be omitted for other types.
                    - "image": For image files
                    - "pdf": For PDF documents
                    - None: For unknown file types
                return_images: Whether to return the images extracted from the document.
            """
            # NOTE(review): the docstring above is presumably what the MCP
            # client sees as the tool description -- keep its wording stable.
            format_options = {"return_images": return_images}
            return await self.process(
                input_data,
                output_mode,
                ctx,
                file_type,
                format_kwargs=format_options,
            )

    def _create_local_engine(self) -> Any:
        """Instantiate ``PPStructureV3`` from this handler's config and device."""
        engine_options = {
            "paddlex_config": self._pipeline_config,
            "device": self._device,
        }
        return PPStructureV3(**engine_options)
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
class PaddleOCRVLHandler(_LayoutParsingHandler):
    """Layout-parsing handler that exposes the ``paddleocr_vl`` tool.

    Local inference uses a ``PaddleOCRVL`` engine.
    """

    def register_tools(self, mcp: FastMCP) -> None:
        """Register the ``paddleocr_vl`` tool on ``mcp``."""

        @mcp.tool("paddleocr_vl")
        async def _paddleocr_vl(
            input_data: str,
            output_mode: OutputMode = "simple",
            file_type: Optional[str] = None,
            return_images: bool = True,
            *,
            ctx: Context,
        ) -> Union[str, List[Union[TextContent, ImageContent]]]:
            """Extracts structured markdown from complex documents (images/PDFs) using a VLM-based approach. The extracted elements include tables, formulas, etc. Accepts file path, URL, or Base64.

            Args:
                input_data: The file to process (file path, URL, or Base64 string).
                output_mode: The desired output format.
                    - "simple": (Default) Clean, readable markdown with embedded images. Best for most use cases.
                    - "detailed": JSON data about document structure, plus markdown. Only use this when coordinates are specifically required.
                file_type: File type. This parameter is REQUIRED when `input_data` is a URL and should be omitted for other types.
                    - "image": For image files
                    - "pdf": For PDF documents
                    - None: For unknown file types
                return_images: Whether to return the images extracted from the document.
            """
            # NOTE(review): the docstring above is presumably what the MCP
            # client sees as the tool description -- keep its wording stable.
            format_options = {"return_images": return_images}
            return await self.process(
                input_data,
                output_mode,
                ctx,
                file_type,
                format_kwargs=format_options,
            )

    def _create_local_engine(self) -> Any:
        """Instantiate ``PaddleOCRVL`` from this handler's config and device."""
        engine_options = {
            "paddlex_config": self._pipeline_config,
            "device": self._device,
        }
        return PaddleOCRVL(**engine_options)
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
# Registry mapping a pipeline name to the handler class that serves it.
# `create_pipeline_handler` looks names up here; a name that is not a key of
# this dict is rejected there with a ValueError.
_PIPELINE_HANDLERS: Dict[str, Type[PipelineHandler]] = {
    "OCR": OCRHandler,
    "PP-StructureV3": PPStructureV3Handler,
    "PaddleOCR-VL": PaddleOCRVLHandler,
}
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def create_pipeline_handler(
    pipeline: str, /, *args: Any, **kwargs: Any
) -> PipelineHandler:
    """Create the handler registered for ``pipeline``.

    Args:
        pipeline: Pipeline name (positional-only); must be a key of
            ``_PIPELINE_HANDLERS``.
        *args: Extra positional arguments forwarded to the handler class.
        **kwargs: Extra keyword arguments forwarded to the handler class.

    Returns:
        A new handler instance, constructed as ``cls(pipeline, *args, **kwargs)``.

    Raises:
        ValueError: If ``pipeline`` is not a registered name.
    """
    if pipeline not in _PIPELINE_HANDLERS:
        raise ValueError(f"Unknown pipeline {repr(pipeline)}")

    handler_cls = _PIPELINE_HANDLERS[pipeline]
    return handler_cls(pipeline, *args, **kwargs)
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: paddleocr_mcp
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Requires-Python: >=3.10
|
|
5
|
+
Requires-Dist: mcp>=1.5.0
|
|
6
|
+
Requires-Dist: fastmcp>=2.0.0
|
|
7
|
+
Requires-Dist: httpx>=0.24.0
|
|
8
|
+
Requires-Dist: numpy>=1.24.0
|
|
9
|
+
Requires-Dist: pillow>=9.0.0
|
|
10
|
+
Requires-Dist: puremagic>=1.30.0
|
|
11
|
+
Requires-Dist: typing-extensions>=4.0.0
|
|
12
|
+
Provides-Extra: local
|
|
13
|
+
Requires-Dist: paddleocr[doc-parser]>=3.2; extra == "local"
|
|
14
|
+
Provides-Extra: local-cpu
|
|
15
|
+
Requires-Dist: paddleocr[doc-parser]>=3.2; extra == "local-cpu"
|
|
16
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "local-cpu"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
paddleocr_mcp/__init__.py
|
|
4
|
+
paddleocr_mcp/__main__.py
|
|
5
|
+
paddleocr_mcp/pipelines.py
|
|
6
|
+
paddleocr_mcp/py.typed
|
|
7
|
+
paddleocr_mcp.egg-info/PKG-INFO
|
|
8
|
+
paddleocr_mcp.egg-info/SOURCES.txt
|
|
9
|
+
paddleocr_mcp.egg-info/dependency_links.txt
|
|
10
|
+
paddleocr_mcp.egg-info/entry_points.txt
|
|
11
|
+
paddleocr_mcp.egg-info/requires.txt
|
|
12
|
+
paddleocr_mcp.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
paddleocr_mcp
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "paddleocr_mcp"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
requires-python = ">=3.10"
|
|
9
|
+
dependencies = [
|
|
10
|
+
"mcp>=1.5.0",
|
|
11
|
+
"fastmcp>=2.0.0",
|
|
12
|
+
"httpx>=0.24.0",
|
|
13
|
+
"numpy>=1.24.0",
|
|
14
|
+
"pillow>=9.0.0",
|
|
15
|
+
"puremagic>=1.30.0",
|
|
16
|
+
"typing-extensions>=4.0.0",
|
|
17
|
+
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
local = [
|
|
22
|
+
"paddleocr[doc-parser]>=3.2",
|
|
23
|
+
]
|
|
24
|
+
local-cpu = [
|
|
25
|
+
"paddleocr[doc-parser]>=3.2",
|
|
26
|
+
"paddlepaddle>=3.0.0",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
paddleocr_mcp = "paddleocr_mcp.__main__:main"
|