fast-paddleocr-mcp 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_paddleocr_mcp-0.3.9.dist-info/METADATA +235 -0
- fast_paddleocr_mcp-0.3.9.dist-info/RECORD +8 -0
- fast_paddleocr_mcp-0.3.9.dist-info/WHEEL +4 -0
- fast_paddleocr_mcp-0.3.9.dist-info/entry_points.txt +2 -0
- fast_paddleocr_mcp-0.3.9.dist-info/licenses/LICENSE +21 -0
- paddleocr_cli/__init__.py +3 -0
- paddleocr_cli/__main__.py +6 -0
- paddleocr_cli/mcp_server.py +275 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fast-paddleocr-mcp
|
|
3
|
+
Version: 0.3.9
|
|
4
|
+
Summary: Fast PaddleOCR MCP server - Extract text from images using PaddleOCR with optimized performance
|
|
5
|
+
Project-URL: Homepage, https://github.com/yourusername/PaddleOCR-MCP
|
|
6
|
+
Project-URL: Documentation, https://github.com/yourusername/PaddleOCR-MCP#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/yourusername/PaddleOCR-MCP
|
|
8
|
+
Project-URL: Issues, https://github.com/yourusername/PaddleOCR-MCP/issues
|
|
9
|
+
Author: PaddleOCR-MCP Contributors
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: image-processing,mcp,model-context-protocol,ocr,paddleocr,text-recognition
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Requires-Dist: mcp>=1.0.0
|
|
26
|
+
Requires-Dist: paddleocr>=2.7.0
|
|
27
|
+
Requires-Dist: paddlepaddle>=2.5.0
|
|
28
|
+
Requires-Dist: pillow>=10.0.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# PaddleOCR-MCP
|
|
36
|
+
|
|
37
|
+
PaddleOCR MCP (Model Context Protocol) server and CLI tool that extracts text from images and outputs results in markdown format. Optimized for fast inference with GPU auto-detection.
|
|
38
|
+
|
|
39
|
+
## MCP Server Configuration
|
|
40
|
+
|
|
41
|
+
The MCP (Model Context Protocol) server allows integration with MCP clients like Cursor, Claude Desktop, etc.
|
|
42
|
+
|
|
43
|
+
**Use `uvx` directly (no installation required, automatically downloads from PyPI):**
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"mcpServers": {
|
|
48
|
+
"fast-paddleocr-mcp": {
|
|
49
|
+
"command": "uvx",
|
|
50
|
+
"args": ["fast-paddleocr-mcp"]
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
#### MCP Tool: `ocr_image`
|
|
57
|
+
|
|
58
|
+
The server provides a single tool called `ocr_image` that:
|
|
59
|
+
- **Input**: `image_path` (string) - Path to the input image file
|
|
60
|
+
- **Output**: Returns the path to the generated markdown file containing OCR results
|
|
61
|
+
- **Automatic optimizations**: All performance optimizations are applied automatically with intelligent fallback
|
|
62
|
+
- **Default language**: Uses 'ch' (Chinese and English) by default for maximum compatibility
|
|
63
|
+
|
|
64
|
+
Example: When called with `image_path: "photo.png"`, it returns `"photo.png.md"` containing the recognized text.
|
|
65
|
+
|
|
66
|
+
**Note**: The server automatically applies all optimizations (HPI, GPU acceleration, image preprocessing, etc.) and falls back to simpler configurations if needed. No configuration required from the caller.
|
|
67
|
+
|
|
68
|
+
See [MCP_README.md](MCP_README.md) for detailed MCP server documentation.
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
### Basic Usage
|
|
73
|
+
|
|
74
|
+
The tool is optimized for speed by default with the following settings:
|
|
75
|
+
- **Fast mode enabled** (disables textline orientation classification for maximum speed)
|
|
76
|
+
- **PP-OCRv4** (faster mobile models)
|
|
77
|
+
- **640px image size limit** (faster processing)
|
|
78
|
+
- **Auto GPU detection** (uses GPU if available, falls back to CPU)
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Output will be saved as <image_name>.png.md
|
|
82
|
+
# Uses: fast mode + PP-OCRv4 + 640px + auto GPU detection
|
|
83
|
+
uvx --from . paddleocr-md image.png
|
|
84
|
+
|
|
85
|
+
# Specify custom output path
|
|
86
|
+
uvx --from . paddleocr-md image.png -o result.md
|
|
87
|
+
|
|
88
|
+
# Force CPU mode
|
|
89
|
+
uvx --from . paddleocr-md image.png --cpu
|
|
90
|
+
|
|
91
|
+
# Disable fast mode for better accuracy on rotated text
|
|
92
|
+
uvx --from . paddleocr-md image.png --no-fast
|
|
93
|
+
|
|
94
|
+
# Use PP-OCRv5 for better accuracy (slower)
|
|
95
|
+
uvx --from . paddleocr-md image.png --ocr-version PP-OCRv5
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Default Optimization Settings
|
|
99
|
+
|
|
100
|
+
The MCP server is optimized for **low latency** by default with these settings:
|
|
101
|
+
|
|
102
|
+
- ✅ **Fast mode enabled**: Disables textline orientation classification (skips one model)
|
|
103
|
+
- ✅ **PP-OCRv4**: Uses faster mobile models (PP-OCRv4_mobile_det, PP-OCRv4_mobile_rec)
|
|
104
|
+
- ✅ **High-Performance Inference (HPI)**: Automatically selects optimal inference backend
|
|
105
|
+
- Can reduce latency by **40-73%** (e.g., 73.1% reduction on PP-OCRv5_mobile_rec)
|
|
106
|
+
- Supports Paddle Inference, OpenVINO, ONNX Runtime, TensorRT
|
|
107
|
+
- ✅ **Multi-threaded CPU**: Uses all available CPU cores for parallel processing
|
|
108
|
+
- ✅ **MKL-DNN enabled**: Intel CPU optimization for faster inference
|
|
109
|
+
- ✅ **Single image batch**: `rec_batch_num=1` for lowest latency per image
|
|
110
|
+
- ✅ **Auto GPU detection**: Automatically uses GPU if available, falls back to CPU
|
|
111
|
+
- **GPU device selection**: Uses first available GPU (gpu_id=0)
|
|
112
|
+
- **TensorRT support**: Automatically enabled via HPI if TensorRT is installed
|
|
113
|
+
- **GPU memory**: Uses default allocation (can be customized if needed)
|
|
114
|
+
- ✅ **Automatic image preprocessing**: Optimizes images before OCR for better performance
|
|
115
|
+
- **Automatic downsampling**: Resizes large images to maximum 1920px (maintains aspect ratio)
|
|
116
|
+
- Reduces processing time for large images significantly
|
|
117
|
+
- Uses high-quality LANCZOS resampling to preserve text quality
|
|
118
|
+
- **Image sharpening**: Enhances text edges for improved OCR accuracy
|
|
119
|
+
- Uses unsharp mask filter (radius=1, percent=150, threshold=3)
|
|
120
|
+
- Additional sharpening enhancement (factor=1.2)
|
|
121
|
+
- Makes text characters more distinct and easier to recognize
|
|
122
|
+
- **Format conversion**: Automatically converts RGBA, LA, P modes to RGB with white background
|
|
123
|
+
- **Temporary file management**: Automatically cleans up preprocessed images after OCR
|
|
124
|
+
- ✅ **Logging disabled**: Reduces overhead by disabling verbose logging
|
|
125
|
+
|
|
126
|
+
**GPU Performance:**
|
|
127
|
+
- When GPU is available, HPI automatically selects TensorRT backend for maximum performance
|
|
128
|
+
- TensorRT can provide 2-3x speedup compared to standard GPU inference
|
|
129
|
+
- First run with HPI may take longer to build the inference engine, but subsequent runs will be much faster
|
|
130
|
+
|
|
131
|
+
**Requirements**:
|
|
132
|
+
- PaddleOCR >= 2.7.0 with all latest features supported (HPI, MKL-DNN, etc.)
|
|
133
|
+
- No backward compatibility - requires latest PaddleOCR version
|
|
134
|
+
- For maximum GPU performance: NVIDIA GPU with CUDA support and TensorRT (optional)
|
|
135
|
+
- Sufficient GPU memory (typically 1-2GB for mobile models)
|
|
136
|
+
|
|
137
|
+
#### Customization Options
|
|
138
|
+
|
|
139
|
+
1. **`--no-fast`**: Disable fast mode for better accuracy
|
|
140
|
+
- Enables textline orientation classification
|
|
141
|
+
- Better accuracy on rotated text, but slower
|
|
142
|
+
|
|
143
|
+
2. **`--cpu`**: Force CPU mode
|
|
144
|
+
- Overrides auto GPU detection
|
|
145
|
+
- Explicitly use CPU
|
|
146
|
+
|
|
147
|
+
3. **`--gpu`**: Force GPU mode
|
|
148
|
+
- Will fail if GPU not available
|
|
149
|
+
- Use when you want to ensure GPU usage
|
|
150
|
+
|
|
151
|
+
4. **`--ocr-version PP-OCRv5`**: Use better accuracy version
|
|
152
|
+
- PP-OCRv5 has better accuracy but is slower than PP-OCRv4 (default)
|
|
153
|
+
- Uses server models
|
|
154
|
+
|
|
155
|
+
5. **`--max-size <pixels>`**: Adjust image processing size
|
|
156
|
+
- Default: 640px
|
|
157
|
+
- Larger values (e.g., 960, 1280) = better accuracy, slower
|
|
158
|
+
- Smaller values (e.g., 480) = faster, may reduce accuracy
|
|
159
|
+
|
|
160
|
+
6. **`--hpi`**: High-Performance Inference
|
|
161
|
+
- Automatically selects best inference backend (Paddle Inference, OpenVINO, ONNX Runtime, TensorRT)
|
|
162
|
+
- Requires HPI dependencies: `paddleocr install_hpi_deps cpu/gpu`
|
|
163
|
+
- Best performance but requires additional setup
|
|
164
|
+
|
|
165
|
+
### Examples
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Basic usage (uses all optimizations by default: fast + PP-OCRv4 + 640px + auto GPU)
|
|
169
|
+
uvx --from . paddleocr-md photo.jpg
|
|
170
|
+
|
|
171
|
+
# Process with custom output
|
|
172
|
+
uvx --from . paddleocr-md document.png -o extracted_text.md
|
|
173
|
+
|
|
174
|
+
# Better accuracy (slower) - disable fast mode and use PP-OCRv5
|
|
175
|
+
uvx --from . paddleocr-md image.png --no-fast --ocr-version PP-OCRv5 --max-size 960
|
|
176
|
+
|
|
177
|
+
# Force CPU mode
|
|
178
|
+
uvx --from . paddleocr-md image.png --cpu
|
|
179
|
+
|
|
180
|
+
# Use High-Performance Inference (requires HPI dependencies)
|
|
181
|
+
uvx --from . paddleocr-md image.png --hpi
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## Output Format
|
|
185
|
+
|
|
186
|
+
The tool generates a markdown file containing:
|
|
187
|
+
- Source image path
|
|
188
|
+
- List of detected text (one per line)
|
|
189
|
+
|
|
190
|
+
Example output (`test_image.png.md`):
|
|
191
|
+
```markdown
|
|
192
|
+
# OCR Result
|
|
193
|
+
|
|
194
|
+
**Source Image:** `test_image.png`
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
- HelloPaddleOcR
|
|
199
|
+
- 10000C
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Testing
|
|
203
|
+
|
|
204
|
+
Run tests using pytest:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
# Install development dependencies
|
|
208
|
+
pip install -e ".[dev]"
|
|
209
|
+
|
|
210
|
+
# Run all tests
|
|
211
|
+
pytest
|
|
212
|
+
|
|
213
|
+
# Run tests with coverage
|
|
214
|
+
pytest --cov=paddleocr_cli --cov-report=html
|
|
215
|
+
|
|
216
|
+
# Run specific test file
|
|
217
|
+
pytest tests/test_mcp_server.py
|
|
218
|
+
|
|
219
|
+
# Run specific test class or function
|
|
220
|
+
pytest tests/test_mcp_server.py::TestGetOCR
|
|
221
|
+
pytest tests/test_mcp_server.py::TestGetOCR::test_get_ocr_default_language
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
The test suite includes:
|
|
225
|
+
- OCR instance initialization and caching
|
|
226
|
+
- Tool listing and definition
|
|
227
|
+
- OCR tool calls with various parameters
|
|
228
|
+
- Language parameter handling
|
|
229
|
+
- File validation and error handling
|
|
230
|
+
- Markdown output generation
|
|
231
|
+
- Edge cases and error scenarios
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
paddleocr_cli/__init__.py,sha256=9d0cQnw9I6dU0JgGO_zoLUAWnjE2wI6S7zhGn5YHn-c,79
|
|
2
|
+
paddleocr_cli/__main__.py,sha256=U6N_qETotH9_4MKQRO-H6pDI8nuzSeE7DQ7fvTqIyW8,128
|
|
3
|
+
paddleocr_cli/mcp_server.py,sha256=DLDQ7okwuVudUnnf09Tfrt_DkGpJF5PPETMpGF3rwuo,10394
|
|
4
|
+
fast_paddleocr_mcp-0.3.9.dist-info/METADATA,sha256=u3b_Xq8BVw3DDryPLmwCFZgl8VaBqWESJB2gMLvGXQc,8816
|
|
5
|
+
fast_paddleocr_mcp-0.3.9.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
fast_paddleocr_mcp-0.3.9.dist-info/entry_points.txt,sha256=hFFJt1b5a_MXARyZ6Qg8yVtR5VcgZnaRZ2s2aKa52hs,69
|
|
7
|
+
fast_paddleocr_mcp-0.3.9.dist-info/licenses/LICENSE,sha256=UMu98eNUpnO26Nv7JguuhZtFeh5HEE-X00IrJ5uH04A,1104
|
|
8
|
+
fast_paddleocr_mcp-0.3.9.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 PaddleOCR-MCP Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""MCP server for PaddleOCR - accepts image path and outputs image path + .md"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
from mcp.server import NotificationOptions, Server
|
|
12
|
+
from mcp.server.models import InitializationOptions
|
|
13
|
+
import mcp.server.stdio
|
|
14
|
+
import mcp.types as types
|
|
15
|
+
except ImportError:
|
|
16
|
+
print("Error: mcp package is not installed. Please install it with: pip install mcp", file=sys.stderr)
|
|
17
|
+
sys.exit(1)
|
|
18
|
+
|
|
19
|
+
try:
|
|
20
|
+
from paddleocr import PaddleOCR
|
|
21
|
+
except ImportError:
|
|
22
|
+
print("Error: paddleocr is not installed. Please install it with: pip install paddleocr", file=sys.stderr)
|
|
23
|
+
sys.exit(1)
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from PIL import Image, ImageEnhance, ImageFilter
|
|
27
|
+
except ImportError:
|
|
28
|
+
print("Error: pillow is not installed. Please install it with: pip install pillow", file=sys.stderr)
|
|
29
|
+
sys.exit(1)
|
|
30
|
+
|
|
31
|
+
# Initialize MCP server
|
|
32
|
+
server = Server("fast-paddleocr-mcp")
|
|
33
|
+
|
|
34
|
+
# Cache of PaddleOCR instances keyed by language code (populated lazily by get_ocr)
|
|
35
|
+
ocr_cache: dict[str, PaddleOCR] = {}
|
|
36
|
+
|
|
37
|
+
# Image preprocessing parameters
|
|
38
|
+
MAX_IMAGE_SIZE = 1920 # Maximum dimension (width or height) for automatic downsampling
|
|
39
|
+
SHARPEN_FACTOR = 1.2 # Sharpening factor (1.0 = no sharpening, higher = more sharpening)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def preprocess_image(image_path: str) -> str:
    """Preprocess an image for OCR and return the path of a temporary JPEG copy.

    The image is flattened to RGB (transparent pixels are composited onto a
    white background), downsampled so that neither side exceeds
    ``MAX_IMAGE_SIZE`` while keeping the aspect ratio, and then sharpened.

    Args:
        image_path: Path to the input image file.

    Returns:
        Path to the preprocessed image. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: Any error raised by Pillow while reading or writing the
            image is propagated unchanged. If writing fails, the temporary
            file is removed before the error is re-raised.
    """
    white = (255, 255, 255)

    # Keep the source file open only while its pixels are being read, so the
    # file handle is released even when decoding or filtering fails.
    with Image.open(image_path) as source:
        has_alpha = source.mode in ('RGBA', 'LA') or (
            source.mode == 'P' and 'transparency' in source.info
        )
        if source.mode == 'RGB':
            img = source
        elif has_alpha:
            # Composite onto white using the alpha channel as the paste mask,
            # so transparent regions do not turn black after flattening.
            rgba = source.convert('RGBA')
            img = Image.new('RGB', rgba.size, white)
            img.paste(rgba, mask=rgba.split()[3])
        else:
            # Opaque non-RGB modes (L, P without transparency, CMYK, ...).
            img = source.convert('RGB')

        # Downsample large images: they slow OCR down significantly.
        width, height = img.size
        if width > MAX_IMAGE_SIZE or height > MAX_IMAGE_SIZE:
            # Pin the longer side to the limit and scale the other one.
            # max(1, ...) guards against a 0-pixel side on extreme aspect
            # ratios, which would make resize() fail.
            if width > height:
                new_size = (
                    MAX_IMAGE_SIZE,
                    max(1, int(height * (MAX_IMAGE_SIZE / width))),
                )
            else:
                new_size = (
                    max(1, int(width * (MAX_IMAGE_SIZE / height))),
                    MAX_IMAGE_SIZE,
                )
            # LANCZOS resampling preserves text edges better than the default.
            img = img.resize(new_size, Image.Resampling.LANCZOS)

        # Two-stage sharpening: an unsharp mask for edges, then a mild global
        # sharpness boost. filter() returns a new image, so ``img`` no longer
        # depends on the source file once this block exits.
        img = img.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
        img = ImageEnhance.Sharpness(img).enhance(SHARPEN_FACTOR)

    # Reserve a unique path; the handle is closed immediately so Pillow can
    # reopen the file by name on every platform.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.jpg', prefix='preprocessed_'
    ) as temp_file:
        temp_path = temp_file.name

    try:
        # High JPEG quality keeps text strokes crisp.
        img.save(temp_path, 'JPEG', quality=95, optimize=True)
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        try:
            os.unlink(temp_path)
        except OSError:
            pass
        raise

    return temp_path
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def get_ocr() -> PaddleOCR:
    """Return the shared PaddleOCR engine, constructing it on first use.

    The engine is stored in the module-level ``ocr_cache`` under the language
    key ``'ch'``, so the constructor runs at most once per process.
    It is created with textline orientation classification disabled and a
    text recognition batch size of 1.

    Returns:
        The cached PaddleOCR instance for language ``'ch'``.
    """
    # Only one language is served; it doubles as the cache key.
    lang_key = 'ch'

    try:
        return ocr_cache[lang_key]
    except KeyError:
        pass

    # First call: build the engine and remember it for later requests.
    engine = PaddleOCR(
        lang=lang_key,
        use_textline_orientation=False,
        text_recognition_batch_size=1,
    )
    ocr_cache[lang_key] = engine
    return engine
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """Advertise the tools this server exposes (only ``ocr_image``).

    Returns:
        A one-element list describing the ``ocr_image`` tool, whose single
        required argument is the string ``image_path``.
    """
    image_path_property = {
        "type": "string",
        "description": "Path to the input image file",
    }
    input_schema = {
        "type": "object",
        "properties": {"image_path": image_path_property},
        "required": ["image_path"],
    }
    ocr_tool = types.Tool(
        name="ocr_image",
        description="Extract text from an image using PaddleOCR with automatic optimizations. Returns the path to the generated markdown file (image_path + .md). All optimizations are applied automatically.",
        inputSchema=input_schema,
    )
    return [ocr_tool]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _extract_texts(result: Any) -> list[str]:
    """Collect the non-empty recognised strings from the first page of an OCR result.

    Two result shapes are handled:

    * mapping-style pages that expose the recognised strings under the
      ``"rec_texts"`` key (presumably what newer PaddleOCR pipeline releases
      return -- confirm against the installed version);
    * the legacy list-of-lines shape ``[[box_coords], (text, confidence)]``.

    Iterating a mapping-style page with the legacy logic would walk its keys
    and emit single characters of the key names, hence the explicit check.
    """
    if not result:
        return []
    page = result[0]
    if not page:
        return []

    try:
        rec_texts = page["rec_texts"]
    except KeyError:
        # Mapping-style page without recognised text.
        return []
    except (TypeError, IndexError):
        # Sequence-style page: string indexing is not supported -> legacy shape.
        rec_texts = None

    if rec_texts is not None:
        return [text for text in rec_texts if text]

    texts = []
    for line in page:
        if line and len(line) >= 2:
            text = line[1][0]  # line is [[box_coords], (text, confidence)]
            if text:
                texts.append(text)
    return texts


@server.call_tool()
async def handle_call_tool(name: str, arguments: Optional[dict[str, Any]]) -> list[types.TextContent]:
    """Run OCR on an image and write the recognised text to ``<image_path>.md``.

    Args:
        name: Tool name; only ``"ocr_image"`` is accepted.
        arguments: Tool arguments; must contain the string ``image_path``.

    Returns:
        A single ``TextContent`` item whose text is the path of the generated
        markdown file.

    Raises:
        ValueError: If the tool name is unknown, or ``image_path`` is missing
            or not a string.
        RuntimeError: If anything fails while validating, preprocessing,
            recognising, or writing the output; the original exception is
            chained as the cause.
    """
    if name != "ocr_image":
        raise ValueError(f"Unknown tool: {name}")

    if not arguments or "image_path" not in arguments:
        raise ValueError("Missing required argument: image_path")

    image_path = arguments["image_path"]
    if not isinstance(image_path, str):
        raise ValueError("image_path must be a string")

    try:
        # Validate that the input exists and is a regular file.
        image_path_obj = Path(image_path)
        if not image_path_obj.exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        if not image_path_obj.is_file():
            raise ValueError(f"Path is not a file: {image_path}")

        # Downsample/sharpen into a temporary JPEG that is fed to the engine.
        preprocessed_path = preprocess_image(str(image_path_obj))

        try:
            ocr_instance = get_ocr()
            result = ocr_instance.ocr(preprocessed_path)
        finally:
            # Best-effort removal of the temporary image: a failed cleanup
            # must not mask the OCR result or the original error.
            try:
                os.unlink(preprocessed_path)
            except OSError:
                pass

        # Output path is the input path plus ".md" (image.png -> image.png.md).
        output_path = Path(str(image_path_obj) + '.md')

        detected_texts = _extract_texts(result)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# OCR Result\n\n")
            f.write(f"**Source Image:** `{image_path}`\n\n")
            f.write("---\n\n")

            if detected_texts:
                for text in detected_texts:
                    f.write(f"- {text}\n")
            else:
                f.write("- No text detected\n")

        return [types.TextContent(type="text", text=str(output_path))]

    except Exception as e:
        # stdout carries the MCP protocol stream, so diagnostics go to stderr.
        error_msg = f"Error processing image {image_path}: {str(e)}"
        print(error_msg, file=sys.stderr)
        raise RuntimeError(error_msg) from e
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
async def main_async():
    """Serve MCP requests over stdio until the stream pair is closed.

    Opens the stdio transport, builds the initialization options
    (server name, version and advertised capabilities) and hands both
    streams to ``server.run``.
    """
    async with mcp.server.stdio.stdio_server() as streams:
        read_stream, write_stream = streams

        capabilities = server.get_capabilities(
            notification_options=NotificationOptions(),
            experimental_capabilities={},
        )
        init_options = InitializationOptions(
            server_name="fast-paddleocr-mcp",
            server_version="0.3.9",
            capabilities=capabilities,
        )

        await server.run(read_stream, write_stream, init_options)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def main():
    """Synchronous entry point: run ``main_async`` on a fresh event loop."""
    server_coroutine = main_async()
    asyncio.run(server_coroutine)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
if __name__ == "__main__":
|
|
275
|
+
main()
|