markitdown-paddleocr 0.1.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/PKG-INFO +88 -1
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/README.md +87 -0
- markitdown_paddleocr-0.2.2/src/markitdown_paddleocr/__about__.py +1 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_config.py +18 -0
- markitdown_paddleocr-0.2.2/src/markitdown_paddleocr/_converter.py +570 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_plugin.py +22 -7
- markitdown_paddleocr-0.1.0/src/markitdown_paddleocr/__about__.py +0 -1
- markitdown_paddleocr-0.1.0/src/markitdown_paddleocr/_converter.py +0 -304
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/.gitignore +0 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/pyproject.toml +0 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/__init__.py +0 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_dual_converter.py +0 -0
- {markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_paddle_client.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: markitdown-paddleocr
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
|
|
5
5
|
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
@@ -178,6 +178,93 @@ PaddleOcrConverter.convert()
|
|
|
178
178
|
- `Pillow>=9.0.0` - 图像处理
|
|
179
179
|
- `requests>=2.28.0` - HTTP 请求
|
|
180
180
|
|
|
181
|
+
## 发布到 PyPI
|
|
182
|
+
|
|
183
|
+
### 前置条件
|
|
184
|
+
|
|
185
|
+
1. 安装构建工具:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
pip install build twine hatch
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
192
|
+
|
|
193
|
+
```powershell
|
|
194
|
+
# PowerShell 设置用户环境变量
|
|
195
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
或在 Bash/Zsh 中:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### 快速发布(推荐)
|
|
205
|
+
|
|
206
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
207
|
+
|
|
208
|
+
**Bash / Git Bash:**
|
|
209
|
+
```bash
|
|
210
|
+
# 构建两个插件
|
|
211
|
+
cd packages/markitdown-glmocr && hatch build
|
|
212
|
+
|
|
213
|
+
cd ../markitdown-paddleocr && hatch build
|
|
214
|
+
|
|
215
|
+
# 上传(自动上传所有构建的版本)
|
|
216
|
+
cd ../..
|
|
217
|
+
./scripts/pypi-upload.sh
|
|
218
|
+
|
|
219
|
+
# 或指定版本号
|
|
220
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
**PowerShell:**
|
|
224
|
+
```powershell
|
|
225
|
+
# 构建两个插件
|
|
226
|
+
cd packages/markitdown-glmocr; hatch build
|
|
227
|
+
cd ../markitdown-paddleocr; hatch build
|
|
228
|
+
|
|
229
|
+
# 上传
|
|
230
|
+
cd ../..
|
|
231
|
+
.\scripts\pypi-upload.ps1
|
|
232
|
+
|
|
233
|
+
# 或指定版本号
|
|
234
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### 手动发布
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# 1. 进入项目目录
|
|
241
|
+
cd packages/markitdown-paddleocr
|
|
242
|
+
|
|
243
|
+
# 2. 构建
|
|
244
|
+
hatch build
|
|
245
|
+
|
|
246
|
+
# 3. 检查
|
|
247
|
+
twine check dist/*
|
|
248
|
+
|
|
249
|
+
# 4. 上传
|
|
250
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### 发布到 TestPyPI(测试)
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
257
|
+
|
|
258
|
+
# 从 TestPyPI 安装验证
|
|
259
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ markitdown-paddleocr
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### 注意事项
|
|
263
|
+
|
|
264
|
+
- 发布前确保 `src/markitdown_paddleocr/__about__.py` 中的版本号已更新
|
|
265
|
+
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
266
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
267
|
+
|
|
181
268
|
## 许可证
|
|
182
269
|
|
|
183
270
|
MIT
|
|
@@ -152,6 +152,93 @@ PaddleOcrConverter.convert()
|
|
|
152
152
|
- `Pillow>=9.0.0` - 图像处理
|
|
153
153
|
- `requests>=2.28.0` - HTTP 请求
|
|
154
154
|
|
|
155
|
+
## 发布到 PyPI
|
|
156
|
+
|
|
157
|
+
### 前置条件
|
|
158
|
+
|
|
159
|
+
1. 安装构建工具:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
pip install build twine hatch
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
166
|
+
|
|
167
|
+
```powershell
|
|
168
|
+
# PowerShell 设置用户环境变量
|
|
169
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
或在 Bash/Zsh 中:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### 快速发布(推荐)
|
|
179
|
+
|
|
180
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
181
|
+
|
|
182
|
+
**Bash / Git Bash:**
|
|
183
|
+
```bash
|
|
184
|
+
# 构建两个插件
|
|
185
|
+
cd packages/markitdown-glmocr && hatch build
|
|
186
|
+
|
|
187
|
+
cd ../markitdown-paddleocr && hatch build
|
|
188
|
+
|
|
189
|
+
# 上传(自动上传所有构建的版本)
|
|
190
|
+
cd ../..
|
|
191
|
+
./scripts/pypi-upload.sh
|
|
192
|
+
|
|
193
|
+
# 或指定版本号
|
|
194
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**PowerShell:**
|
|
198
|
+
```powershell
|
|
199
|
+
# 构建两个插件
|
|
200
|
+
cd packages/markitdown-glmocr; hatch build
|
|
201
|
+
cd ../markitdown-paddleocr; hatch build
|
|
202
|
+
|
|
203
|
+
# 上传
|
|
204
|
+
cd ../..
|
|
205
|
+
.\scripts\pypi-upload.ps1
|
|
206
|
+
|
|
207
|
+
# 或指定版本号
|
|
208
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### 手动发布
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
# 1. 进入项目目录
|
|
215
|
+
cd packages/markitdown-paddleocr
|
|
216
|
+
|
|
217
|
+
# 2. 构建
|
|
218
|
+
hatch build
|
|
219
|
+
|
|
220
|
+
# 3. 检查
|
|
221
|
+
twine check dist/*
|
|
222
|
+
|
|
223
|
+
# 4. 上传
|
|
224
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### 发布到 TestPyPI(测试)
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
231
|
+
|
|
232
|
+
# 从 TestPyPI 安装验证
|
|
233
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ markitdown-paddleocr
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### 注意事项
|
|
237
|
+
|
|
238
|
+
- 发布前确保 `src/markitdown_paddleocr/__about__.py` 中的版本号已更新
|
|
239
|
+
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
240
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
241
|
+
|
|
155
242
|
## 许可证
|
|
156
243
|
|
|
157
244
|
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
{markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_config.py
RENAMED
|
@@ -2,6 +2,19 @@
|
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ScanDetectionMode(str, Enum):
    """Strategy used to decide whether a whole PDF is treated as scanned.

    Members subclass ``str``, so each one compares equal to its raw value.

    - PAGE_BY_PAGE: analyse every page individually.
    - FIRST_PAGE_HINT: if the first page is a scan, OCR the whole document.
    - SAMPLING: sample the first N pages; if most are scans, OCR everything.
    """

    PAGE_BY_PAGE = "page_by_page"
    FIRST_PAGE_HINT = "first_page_hint"
    SAMPLING = "sampling"
|
|
5
18
|
|
|
6
19
|
|
|
7
20
|
@dataclass
|
|
@@ -35,6 +48,11 @@ class PaddleOcrConfig:
|
|
|
35
48
|
# Processing strategy
|
|
36
49
|
force_ai: bool = False
|
|
37
50
|
|
|
51
|
+
# Scan detection mode for optimization
|
|
52
|
+
scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING
|
|
53
|
+
scan_sample_pages: int = 3 # Number of pages to sample in SAMPLING mode
|
|
54
|
+
scan_text_threshold: int = 50 # Min text length to consider page as non-scanned
|
|
55
|
+
|
|
38
56
|
@classmethod
|
|
39
57
|
def from_env(cls, **overrides) -> "PaddleOcrConfig":
|
|
40
58
|
"""Create config from environment variables with optional overrides."""
|
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
"""PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any, BinaryIO, Optional
|
|
7
|
+
|
|
8
|
+
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
9
|
+
from markitdown._exceptions import (
|
|
10
|
+
MISSING_DEPENDENCY_MESSAGE,
|
|
11
|
+
MissingDependencyException,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from ._config import PaddleOcrConfig, ScanDetectionMode
|
|
15
|
+
from ._paddle_client import PaddleClient
|
|
16
|
+
|
|
17
|
+
# Import PDF dependencies
|
|
18
|
+
_dependency_exc_info = None
|
|
19
|
+
try:
|
|
20
|
+
import pdfminer
|
|
21
|
+
import pdfminer.high_level
|
|
22
|
+
import pdfplumber
|
|
23
|
+
except ImportError:
|
|
24
|
+
_dependency_exc_info = sys.exc_info()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
28
|
+
"application/pdf",
|
|
29
|
+
"application/x-pdf",
|
|
30
|
+
"image/jpeg",
|
|
31
|
+
"image/png",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PaddleOcrConverter(DocumentConverter):
|
|
41
|
+
"""Intelligent PDF/Image converter using PaddleOCR cloud API.
|
|
42
|
+
|
|
43
|
+
Features:
|
|
44
|
+
- Auto-detect page content type (plain text vs images/tables)
|
|
45
|
+
- Plain text pages use pdfplumber/pdfminer (fast, free)
|
|
46
|
+
- Complex pages use PaddleOCR API for AI-powered OCR
|
|
47
|
+
- Image files (PNG, JPG) use PaddleOCR API directly
|
|
48
|
+
- Asynchronous job model: submit → poll → fetch result
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
def __init__(
    self,
    token: Optional[str] = None,
    model: str = "PaddleOCR-VL-1.5",
    poll_interval: float = 2.0,
    poll_timeout: float = 300.0,
    force_ai: bool = False,
    use_doc_orientation_classify: bool = False,
    use_doc_unwarping: bool = False,
    use_chart_recognition: bool = False,
    scan_detection_mode: Optional[ScanDetectionMode] = None,
    scan_sample_pages: Optional[int] = None,
    scan_text_threshold: Optional[int] = None,
    config: Optional[PaddleOcrConfig] = None,
):
    """Store the converter settings; the API client is created lazily.

    Args:
        token: Baidu PaddleOCR token.
        model: OCR model name (default: PaddleOCR-VL-1.5).
        poll_interval: Seconds between status polls (default: 2.0).
        poll_timeout: Max seconds to wait for job completion (default: 300.0).
        force_ai: Force all pages to use OCR (default: False).
        use_doc_orientation_classify: Enable document orientation classification.
        use_doc_unwarping: Enable document unwarping.
        use_chart_recognition: Enable chart recognition.
        scan_detection_mode: How to decide that a whole PDF is scanned
            (falls back to the config value, or SAMPLING without a config).
        scan_sample_pages: Number of pages sampled in SAMPLING mode
            (falls back to the config value, or 3 without a config).
        scan_text_threshold: Minimum extracted text length for a page to
            count as non-scanned (falls back to the config value, or 50).
        config: Optional PaddleOcrConfig supplying fallback values.
    """
    if not config:
        # No config object: explicit arguments are taken verbatim.
        self.token = token
        self.model = model
        self.poll_interval = poll_interval
        self.poll_timeout = poll_timeout
        self.force_ai = force_ai
        self.use_doc_orientation_classify = use_doc_orientation_classify
        self.use_doc_unwarping = use_doc_unwarping
        self.use_chart_recognition = use_chart_recognition
        if scan_detection_mode is None:
            scan_detection_mode = ScanDetectionMode.SAMPLING
        if scan_sample_pages is None:
            scan_sample_pages = 3
        if scan_text_threshold is None:
            scan_text_threshold = 50
    else:
        self.token = token or config.token
        # An argument still equal to its signature default defers to the
        # config; any other value is treated as an explicit override.
        self.model = config.model if model == "PaddleOCR-VL-1.5" else model
        self.poll_interval = (
            config.poll_interval if poll_interval == 2.0 else poll_interval
        )
        self.poll_timeout = (
            config.poll_timeout if poll_timeout == 300.0 else poll_timeout
        )
        # Boolean switches are enabled when either source enables them.
        self.force_ai = force_ai or config.force_ai
        self.use_doc_orientation_classify = (
            use_doc_orientation_classify or config.use_doc_orientation_classify
        )
        self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
        self.use_chart_recognition = (
            use_chart_recognition or config.use_chart_recognition
        )
        if scan_detection_mode is None:
            scan_detection_mode = config.scan_detection_mode
        if scan_sample_pages is None:
            scan_sample_pages = config.scan_sample_pages
        if scan_text_threshold is None:
            scan_text_threshold = config.scan_text_threshold

    self.scan_detection_mode = scan_detection_mode
    self.scan_sample_pages = scan_sample_pages
    self.scan_text_threshold = scan_text_threshold

    # The client is built on first use by _get_client().
    self._client: Optional[PaddleClient] = None
|
|
136
|
+
|
|
137
|
+
def _get_client(self) -> PaddleClient:
    """Return the cached PaddleClient, building it on first use."""
    if self._client is not None:
        return self._client

    # NOTE(review): an unset token is passed as "" — presumably the client or
    # config resolves BAIDU_PADDLE_TOKEN itself; confirm against PaddleClient.
    settings = PaddleOcrConfig(
        token=self.token or "",
        model=self.model,
        poll_interval=self.poll_interval,
        poll_timeout=self.poll_timeout,
        force_ai=self.force_ai,
        use_doc_orientation_classify=self.use_doc_orientation_classify,
        use_doc_unwarping=self.use_doc_unwarping,
        use_chart_recognition=self.use_chart_recognition,
    )
    self._client = PaddleClient(config=settings)
    return self._client
|
|
152
|
+
|
|
153
|
+
def _has_token(self) -> bool:
|
|
154
|
+
"""Check if a valid token is available."""
|
|
155
|
+
if self.token:
|
|
156
|
+
return True
|
|
157
|
+
import os
|
|
158
|
+
|
|
159
|
+
return bool(os.environ.get("BAIDU_PADDLE_TOKEN", ""))
|
|
160
|
+
|
|
161
|
+
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Decide whether this converter should handle the given stream.

    Returns False when no token is available; otherwise True when the
    extension is in ACCEPTED_FILE_EXTENSIONS or the MIME type starts with
    one of ACCEPTED_MIME_TYPE_PREFIXES.
    """
    # Without a token, PaddleOCR API cannot work — decline so other
    # converters (e.g. GlmOcrConverter) get a chance.
    if not self._has_token():
        return False

    ext = (stream_info.extension or "").lower()
    mime = (stream_info.mimetype or "").lower()

    return ext in ACCEPTED_FILE_EXTENSIONS or any(
        mime.startswith(known) for known in ACCEPTED_MIME_TYPE_PREFIXES
    )
|
|
183
|
+
|
|
184
|
+
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert an accepted stream (PDF, JPEG or PNG) to Markdown.

    Images are sent straight to the PaddleOCR API. Everything else is
    treated as a PDF and handled by the hybrid pdfplumber/OCR pipeline.

    Args:
        file_stream: Binary stream positioned at the start of the file.
        stream_info: Stream metadata; ``extension`` and ``mimetype`` are used.
        **kwargs: Ignored.

    Returns:
        The conversion result holding the generated Markdown.

    Raises:
        MissingDependencyException: For PDF input when pdfminer/pdfplumber
            could not be imported.
    """
    extension = (stream_info.extension or "").lower()
    mimetype = (stream_info.mimetype or "").lower()

    logger.info("PaddleOcrConverter: 开始转换, 文件类型=%s", extension or mimetype)

    # accepts() admits streams by extension OR MIME type, so the image check
    # must consider both; otherwise an extension-less image is parsed as PDF.
    image_extension = None
    if extension in (".jpg", ".jpeg", ".png"):
        image_extension = extension
    elif extension != ".pdf":
        if mimetype.startswith("image/jpeg"):
            image_extension = ".jpg"
        elif mimetype.startswith("image/png"):
            image_extension = ".png"

    # Image files: use PaddleOCR directly (no PDF libraries involved).
    if image_extension is not None:
        return self._convert_image(file_stream, image_extension)

    # Only the PDF path needs pdfminer/pdfplumber.
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])

    # PDF files: use hybrid approach
    return self._convert_pdf(file_stream)
|
|
209
|
+
|
|
210
|
+
def _convert_image(
    self, file_stream: BinaryIO, extension: str = ".png"
) -> DocumentConverterResult:
    """Run OCR on a single image stream and wrap the Markdown result.

    Any exception raised by the client is logged and re-raised unchanged.
    """
    payload = file_stream.read()

    logger.info("PaddleOcrConverter: 开始 OCR 识别图片, 格式=%s", extension)
    try:
        # The upload is named after the extension, e.g. "image.png".
        result_md = self._get_client().ocr(
            file_bytes=payload, filename=f"image{extension}"
        )
    except Exception as exc:
        logger.error(
            "PaddleOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, exc
        )
        raise

    logger.info("PaddleOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(result_md))
    return DocumentConverterResult(markdown=result_md)
|
|
228
|
+
|
|
229
|
+
def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
    """Convert a PDF with the hybrid pdfplumber / PaddleOCR pipeline.

    Steps:
      1. Unless ``force_ai`` is set, detect whether the whole document looks
         scanned; if so, try a single batch OCR call on the raw PDF.
      2. Otherwise (or when the batch call fails or returns nothing), process
         page by page: OCR for forced/scanned/complex pages, pdfplumber for
         plain-text pages, with pdfplumber as the fallback when OCR fails.
      3. If the result is still empty, or pdfplumber processing raised,
         extract text with pdfminer.

    Args:
        file_stream: Binary stream containing the PDF.

    Returns:
        The conversion result holding the generated Markdown.

    Raises:
        RuntimeError: When an OCR call failed and every fallback produced
            empty output, so the framework can try another converter.
    """
    pdf_bytes = file_stream.read()  # raw bytes are reused for the batch upload
    pdf_stream = io.BytesIO(pdf_bytes)
    markdown = ""
    ocr_failed = False

    try:
        with pdfplumber.open(pdf_stream) as pdf:
            total_pages = len(pdf.pages)
            logger.info("PaddleOcrConverter: 开始处理 PDF, 总页数=%d", total_pages)

            # force_ai already routes every page to OCR, so scan detection
            # would be wasted work, and a failure there must not derail it.
            all_scanned = False if self.force_ai else self._detect_all_scanned(pdf)

            if all_scanned:
                # Batch mode: upload entire PDF to OCR API (single API call)
                logger.info(
                    "PaddleOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d",
                    total_pages,
                )
                try:
                    batch_markdown = self._convert_pdf_batch(pdf_bytes)
                except Exception as e:
                    logger.warning(
                        "PaddleOcrConverter: 批量OCR失败, 降级为逐页处理, 错误=%s",
                        e,
                    )
                    ocr_failed = True
                else:
                    if batch_markdown.strip():
                        logger.info(
                            "PaddleOcrConverter: 批量OCR完成, 输出长度=%d",
                            len(batch_markdown),
                        )
                        return DocumentConverterResult(markdown=batch_markdown)
                # Empty or failed batch result: fall through to per-page mode.

            markdown_parts = []
            for page_num, page in enumerate(pdf.pages):
                # Decide whether this page goes to OCR.
                use_ocr = self.force_ai or all_scanned
                if use_ocr:
                    logger.info(
                        "PaddleOcrConverter: 第 %d/%d 页, 使用 PaddleOCR",
                        page_num + 1,
                        total_pages,
                    )
                else:
                    page_type = self._analyze_page(page)
                    use_ocr = page_type != "plain_text"
                    if use_ocr:
                        logger.info(
                            "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 PaddleOCR",
                            page_num + 1,
                            total_pages,
                            page_type,
                        )
                    else:
                        logger.info(
                            "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber",
                            page_num + 1,
                            total_pages,
                            page_type,
                        )

                if use_ocr:
                    try:
                        page_markdown = self._convert_with_paddleocr(page, page_num)
                    except Exception as e:
                        logger.warning(
                            "PaddleOcrConverter: 第 %d/%d 页 OCR 失败, 降级为 pdfplumber, 错误=%s",
                            page_num + 1,
                            total_pages,
                            e,
                        )
                        ocr_failed = True
                        page_markdown = self._extract_text_with_tables(page)
                else:
                    page_markdown = self._extract_text_with_tables(page)

                if page_markdown.strip():
                    markdown_parts.append(
                        f"## Page {page_num + 1}\n\n{page_markdown}"
                    )

                page.close()

            markdown = "\n\n".join(markdown_parts).strip()

    except Exception as e:
        logger.error(
            "PaddleOcrConverter: PDF 处理异常, 降级为 pdfminer, 错误=%s", e
        )
        # Discard partial output; the pdfminer pass below takes over.
        markdown = ""

    # Single pdfminer pass, shared by the error path and the empty-result path.
    if not markdown:
        pdf_stream.seek(0)
        markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

    # If OCR failed and result is empty, raise so the framework can try
    # the next converter (e.g. GlmOcrConverter) instead of returning empty.
    if ocr_failed and not markdown.strip():
        logger.error("PaddleOcrConverter: OCR 失败且所有兜底结果为空, 抛出异常")
        raise RuntimeError(
            "PaddleOcrConverter: OCR failed and all fallbacks returned empty"
        )

    logger.info("PaddleOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown))
    return DocumentConverterResult(markdown=markdown)
|
|
348
|
+
|
|
349
|
+
def _convert_pdf_batch(self, pdf_bytes: bytes) -> str:
    """Send the whole PDF to the OCR client in one call.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        Whatever the client's ``ocr`` call returns for the document.
    """
    size = len(pdf_bytes)
    logger.info("PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", size)
    client = self._get_client()
    return client.ocr(file_bytes=pdf_bytes, filename="document.pdf")
|
|
366
|
+
|
|
367
|
+
def _analyze_page(self, page: Any) -> str:
|
|
368
|
+
"""Analyze page content type."""
|
|
369
|
+
# Check for images
|
|
370
|
+
if hasattr(page, "images") and page.images:
|
|
371
|
+
return "complex"
|
|
372
|
+
|
|
373
|
+
# Check for tables
|
|
374
|
+
tables = page.find_tables()
|
|
375
|
+
if tables:
|
|
376
|
+
return "complex"
|
|
377
|
+
|
|
378
|
+
# Check for graphics/curves
|
|
379
|
+
if hasattr(page, "curves") and page.curves:
|
|
380
|
+
return "complex"
|
|
381
|
+
|
|
382
|
+
return "plain_text"
|
|
383
|
+
|
|
384
|
+
def _is_scanned_page(self, page: Any) -> bool:
|
|
385
|
+
"""Check if a page is likely a scanned image.
|
|
386
|
+
|
|
387
|
+
A page is considered scanned if:
|
|
388
|
+
1. It contains images, AND
|
|
389
|
+
2. It has very little extractable text (below threshold)
|
|
390
|
+
|
|
391
|
+
Args:
|
|
392
|
+
page: pdfplumber page object
|
|
393
|
+
|
|
394
|
+
Returns:
|
|
395
|
+
True if the page appears to be a scanned image
|
|
396
|
+
"""
|
|
397
|
+
# Must have images to be a scan
|
|
398
|
+
has_images = hasattr(page, "images") and bool(page.images)
|
|
399
|
+
if not has_images:
|
|
400
|
+
return False
|
|
401
|
+
|
|
402
|
+
# Check extractable text length
|
|
403
|
+
try:
|
|
404
|
+
text = page.extract_text() or ""
|
|
405
|
+
text_len = len(text.strip())
|
|
406
|
+
# If there's substantial text, it might be a mixed page or
|
|
407
|
+
# a digital PDF with embedded images
|
|
408
|
+
if text_len >= self.scan_text_threshold:
|
|
409
|
+
return False
|
|
410
|
+
except Exception:
|
|
411
|
+
# If text extraction fails, assume it's a scan
|
|
412
|
+
return True
|
|
413
|
+
|
|
414
|
+
return True
|
|
415
|
+
|
|
416
|
+
def _detect_all_scanned(self, pdf: Any) -> bool:
    """Decide, per ``self.scan_detection_mode``, whether the PDF is all scans.

    - PAGE_BY_PAGE: never treats the document as fully scanned.
    - FIRST_PAGE_HINT: uses the verdict for the first page only.
    - SAMPLING: inspects the first ``self.scan_sample_pages`` pages and
      returns True when a strict majority of them look scanned.

    Args:
        pdf: pdfplumber PDF object.

    Returns:
        True if the entire PDF should be treated as scanned.
    """
    mode = self.scan_detection_mode
    if mode == ScanDetectionMode.PAGE_BY_PAGE:
        return False

    total_pages = len(pdf.pages)
    if total_pages == 0:
        return False

    if mode == ScanDetectionMode.FIRST_PAGE_HINT:
        first_page = pdf.pages[0]
        try:
            is_scanned = self._is_scanned_page(first_page)
        finally:
            first_page.close()
        if is_scanned:
            logger.info(
                "PaddleOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR"
            )
        return is_scanned

    if mode == ScanDetectionMode.SAMPLING:
        sample_count = min(self.scan_sample_pages, total_pages)
        # A zero or negative sample size gives no evidence. Without this
        # guard a negative value makes the majority threshold 0, which would
        # classify every document as scanned.
        if sample_count <= 0:
            return False

        scanned_count = 0
        for i in range(sample_count):
            page = pdf.pages[i]
            try:
                if self._is_scanned_page(page):
                    scanned_count += 1
            finally:
                # Close sampled pages, as FIRST_PAGE_HINT does for page one.
                page.close()

        # Strict majority of the sampled pages must look scanned.
        majority_threshold = sample_count // 2 + 1
        all_scanned = scanned_count >= majority_threshold

        if all_scanned:
            logger.info(
                "PaddleOcrConverter: 抽样检测 %d/%d 页为扫描件, 模式=SAMPLING, 全文档使用OCR",
                scanned_count,
                sample_count,
            )

        return all_scanned

    return False
|
|
470
|
+
|
|
471
|
+
def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
    """Render one page to PNG and run it through the OCR client.

    Args:
        page: pdfplumber page object.
        page_num: Zero-based page index; logs and the upload name are 1-based.

    Returns:
        The client's ``ocr`` result for the page. Client exceptions are
        logged and re-raised unchanged.
    """
    human_page = page_num + 1

    # Rasterise the page into an in-memory PNG.
    png_buffer = io.BytesIO()
    page.to_image(resolution=150).save(png_buffer, format="PNG")

    logger.info("PaddleOcrConverter: PaddleOCR API 开始识别第 %d 页", human_page)
    try:
        page_markdown = self._get_client().ocr(
            file_bytes=png_buffer.getvalue(),
            filename=f"page_{human_page}.png",
        )
    except Exception as exc:
        logger.error(
            "PaddleOcrConverter: PaddleOCR API 第 %d 页识别异常, 错误=%s",
            human_page,
            exc,
        )
        raise

    logger.info(
        "PaddleOcrConverter: PaddleOCR API 第 %d 页识别完成, 输出长度=%d",
        human_page,
        len(page_markdown),
    )
    return page_markdown
|
|
498
|
+
|
|
499
|
+
def _extract_text_with_tables(self, page: Any) -> str:
|
|
500
|
+
"""Extract text and tables from page."""
|
|
501
|
+
parts = []
|
|
502
|
+
|
|
503
|
+
# Extract text
|
|
504
|
+
text = page.extract_text() or ""
|
|
505
|
+
if text.strip():
|
|
506
|
+
parts.append(text.strip())
|
|
507
|
+
|
|
508
|
+
# Extract tables
|
|
509
|
+
try:
|
|
510
|
+
tables = page.extract_tables()
|
|
511
|
+
if tables:
|
|
512
|
+
for table in tables:
|
|
513
|
+
if table:
|
|
514
|
+
md_table = self._table_to_markdown(table)
|
|
515
|
+
if md_table.strip():
|
|
516
|
+
parts.append(md_table)
|
|
517
|
+
except Exception:
|
|
518
|
+
pass
|
|
519
|
+
|
|
520
|
+
return "\n\n".join(parts)
|
|
521
|
+
|
|
522
|
+
def _table_to_markdown(self, table: list[list[str]]) -> str:
|
|
523
|
+
"""Convert table to Markdown."""
|
|
524
|
+
if not table:
|
|
525
|
+
return ""
|
|
526
|
+
|
|
527
|
+
# Filter None values
|
|
528
|
+
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
|
529
|
+
|
|
530
|
+
# Filter empty rows
|
|
531
|
+
table = [row for row in table if any(cell.strip() for cell in row)]
|
|
532
|
+
|
|
533
|
+
if not table:
|
|
534
|
+
return ""
|
|
535
|
+
|
|
536
|
+
# Calculate column widths
|
|
537
|
+
col_widths = [
|
|
538
|
+
max(len(str(row[i])) if i < len(row) else 0 for row in table)
|
|
539
|
+
for i in range(max(len(row) for row in table))
|
|
540
|
+
]
|
|
541
|
+
|
|
542
|
+
# Format table
|
|
543
|
+
lines = []
|
|
544
|
+
for row_idx, row in enumerate(table):
|
|
545
|
+
padded_row = row + [""] * (len(col_widths) - len(row))
|
|
546
|
+
line = (
|
|
547
|
+
"| "
|
|
548
|
+
+ " | ".join(
|
|
549
|
+
str(cell).ljust(width)
|
|
550
|
+
for cell, width in zip(padded_row, col_widths)
|
|
551
|
+
)
|
|
552
|
+
+ " |"
|
|
553
|
+
)
|
|
554
|
+
lines.append(line)
|
|
555
|
+
|
|
556
|
+
if row_idx == 0:
|
|
557
|
+
sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
|
|
558
|
+
lines.append(sep)
|
|
559
|
+
|
|
560
|
+
return "\n".join(lines)
|
|
561
|
+
|
|
562
|
+
def close(self):
    """Drop the cached client; the next _get_client() call builds a new one."""
    self._client = None
|
|
565
|
+
|
|
566
|
+
def __enter__(self):
|
|
567
|
+
return self
|
|
568
|
+
|
|
569
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
570
|
+
self.close()
|
{markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_plugin.py
RENAMED
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
"""Plugin registration for markitdown-paddleocr."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Any
|
|
5
|
+
|
|
4
6
|
from markitdown import MarkItDown
|
|
5
7
|
|
|
6
8
|
from ._converter import PaddleOcrConverter
|
|
7
9
|
|
|
8
|
-
|
|
9
10
|
__plugin_interface_version__ = 1
|
|
10
11
|
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
|
|
13
16
|
"""Register markitdown-paddleocr converter.
|
|
@@ -17,19 +20,31 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
|
|
|
17
20
|
2. Environment variables (BAIDU_PADDLE_TOKEN)
|
|
18
21
|
3. Built-in defaults
|
|
19
22
|
"""
|
|
23
|
+
logger.info("markitdown-paddleocr: 开始注册插件")
|
|
24
|
+
|
|
20
25
|
# Register converter with higher priority than default PDF converter
|
|
21
26
|
PRIORITY_PADDLEOCR = -1.0
|
|
22
27
|
|
|
23
|
-
|
|
24
|
-
PaddleOcrConverter(
|
|
28
|
+
try:
|
|
29
|
+
converter = PaddleOcrConverter(
|
|
25
30
|
token=kwargs.get("token"),
|
|
26
31
|
model=kwargs.get("model", "PaddleOCR-VL-1.5"),
|
|
27
32
|
poll_interval=kwargs.get("poll_interval", 2.0),
|
|
28
33
|
poll_timeout=kwargs.get("poll_timeout", 300.0),
|
|
29
34
|
force_ai=kwargs.get("force_ai", False),
|
|
30
|
-
use_doc_orientation_classify=kwargs.get(
|
|
35
|
+
use_doc_orientation_classify=kwargs.get(
|
|
36
|
+
"use_doc_orientation_classify", False
|
|
37
|
+
),
|
|
31
38
|
use_doc_unwarping=kwargs.get("use_doc_unwarping", False),
|
|
32
39
|
use_chart_recognition=kwargs.get("use_chart_recognition", False),
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
|
|
40
|
+
)
|
|
41
|
+
markitdown.register_converter(
|
|
42
|
+
converter,
|
|
43
|
+
priority=PRIORITY_PADDLEOCR,
|
|
44
|
+
)
|
|
45
|
+
logger.info(
|
|
46
|
+
"markitdown-paddleocr: 插件注册成功, priority=%.1f", PRIORITY_PADDLEOCR
|
|
47
|
+
)
|
|
48
|
+
except Exception as e:
|
|
49
|
+
logger.error("markitdown-paddleocr: 插件注册失败, 错误=%s", e)
|
|
50
|
+
raise
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.1.0"
|
|
@@ -1,304 +0,0 @@
|
|
|
1
|
-
"""PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
|
|
2
|
-
|
|
3
|
-
import io
|
|
4
|
-
import sys
|
|
5
|
-
from typing import Any, BinaryIO, Optional
|
|
6
|
-
|
|
7
|
-
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
8
|
-
from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
9
|
-
|
|
10
|
-
from ._config import PaddleOcrConfig
|
|
11
|
-
from ._paddle_client import PaddleClient
|
|
12
|
-
|
|
13
|
-
# Import PDF dependencies
|
|
14
|
-
_dependency_exc_info = None
|
|
15
|
-
try:
|
|
16
|
-
import pdfminer
|
|
17
|
-
import pdfminer.high_level
|
|
18
|
-
import pdfplumber
|
|
19
|
-
except ImportError:
|
|
20
|
-
_dependency_exc_info = sys.exc_info()
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
24
|
-
"application/pdf",
|
|
25
|
-
"application/x-pdf",
|
|
26
|
-
"image/jpeg",
|
|
27
|
-
"image/png",
|
|
28
|
-
]
|
|
29
|
-
|
|
30
|
-
ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class PaddleOcrConverter(DocumentConverter):
|
|
34
|
-
"""Intelligent PDF/Image converter using PaddleOCR cloud API.
|
|
35
|
-
|
|
36
|
-
Features:
|
|
37
|
-
- Auto-detect page content type (plain text vs images/tables)
|
|
38
|
-
- Plain text pages use pdfplumber/pdfminer (fast, free)
|
|
39
|
-
- Complex pages use PaddleOCR API for AI-powered OCR
|
|
40
|
-
- Image files (PNG, JPG) use PaddleOCR API directly
|
|
41
|
-
- Asynchronous job model: submit → poll → fetch result
|
|
42
|
-
"""
|
|
43
|
-
|
|
44
|
-
def __init__(
|
|
45
|
-
self,
|
|
46
|
-
token: Optional[str] = None,
|
|
47
|
-
model: str = "PaddleOCR-VL-1.5",
|
|
48
|
-
poll_interval: float = 2.0,
|
|
49
|
-
poll_timeout: float = 300.0,
|
|
50
|
-
force_ai: bool = False,
|
|
51
|
-
use_doc_orientation_classify: bool = False,
|
|
52
|
-
use_doc_unwarping: bool = False,
|
|
53
|
-
use_chart_recognition: bool = False,
|
|
54
|
-
config: Optional[PaddleOcrConfig] = None,
|
|
55
|
-
):
|
|
56
|
-
"""Initialize converter.
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided)
|
|
60
|
-
model: OCR model name (default: PaddleOCR-VL-1.5)
|
|
61
|
-
poll_interval: Seconds between status polls (default: 2.0)
|
|
62
|
-
poll_timeout: Max seconds to wait for job completion (default: 300.0)
|
|
63
|
-
force_ai: Force all pages to use OCR (default: False)
|
|
64
|
-
use_doc_orientation_classify: Enable document orientation classification
|
|
65
|
-
use_doc_unwarping: Enable document unwarping
|
|
66
|
-
use_chart_recognition: Enable chart recognition
|
|
67
|
-
config: Optional PaddleOcrConfig instance
|
|
68
|
-
"""
|
|
69
|
-
# Build config from explicit params or provided config
|
|
70
|
-
if config:
|
|
71
|
-
self.token = token or config.token
|
|
72
|
-
self.model = model if model != "PaddleOCR-VL-1.5" else config.model
|
|
73
|
-
self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval
|
|
74
|
-
self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout
|
|
75
|
-
self.force_ai = force_ai or config.force_ai
|
|
76
|
-
self.use_doc_orientation_classify = use_doc_orientation_classify or config.use_doc_orientation_classify
|
|
77
|
-
self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
|
|
78
|
-
self.use_chart_recognition = use_chart_recognition or config.use_chart_recognition
|
|
79
|
-
else:
|
|
80
|
-
self.token = token
|
|
81
|
-
self.model = model
|
|
82
|
-
self.poll_interval = poll_interval
|
|
83
|
-
self.poll_timeout = poll_timeout
|
|
84
|
-
self.force_ai = force_ai
|
|
85
|
-
self.use_doc_orientation_classify = use_doc_orientation_classify
|
|
86
|
-
self.use_doc_unwarping = use_doc_unwarping
|
|
87
|
-
self.use_chart_recognition = use_chart_recognition
|
|
88
|
-
|
|
89
|
-
# Lazy init client
|
|
90
|
-
self._client: Optional[PaddleClient] = None
|
|
91
|
-
|
|
92
|
-
def _get_client(self) -> PaddleClient:
|
|
93
|
-
"""Get or create PaddleClient instance."""
|
|
94
|
-
if self._client is None:
|
|
95
|
-
config = PaddleOcrConfig(
|
|
96
|
-
token=self.token or "",
|
|
97
|
-
model=self.model,
|
|
98
|
-
poll_interval=self.poll_interval,
|
|
99
|
-
poll_timeout=self.poll_timeout,
|
|
100
|
-
force_ai=self.force_ai,
|
|
101
|
-
use_doc_orientation_classify=self.use_doc_orientation_classify,
|
|
102
|
-
use_doc_unwarping=self.use_doc_unwarping,
|
|
103
|
-
use_chart_recognition=self.use_chart_recognition,
|
|
104
|
-
)
|
|
105
|
-
self._client = PaddleClient(config=config)
|
|
106
|
-
return self._client
|
|
107
|
-
|
|
108
|
-
def accepts(
|
|
109
|
-
self,
|
|
110
|
-
file_stream: BinaryIO,
|
|
111
|
-
stream_info: StreamInfo,
|
|
112
|
-
**kwargs: Any,
|
|
113
|
-
) -> bool:
|
|
114
|
-
mimetype = (stream_info.mimetype or "").lower()
|
|
115
|
-
extension = (stream_info.extension or "").lower()
|
|
116
|
-
|
|
117
|
-
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
118
|
-
return True
|
|
119
|
-
|
|
120
|
-
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
121
|
-
if mimetype.startswith(prefix):
|
|
122
|
-
return True
|
|
123
|
-
|
|
124
|
-
return False
|
|
125
|
-
|
|
126
|
-
def convert(
|
|
127
|
-
self,
|
|
128
|
-
file_stream: BinaryIO,
|
|
129
|
-
stream_info: StreamInfo,
|
|
130
|
-
**kwargs: Any,
|
|
131
|
-
) -> DocumentConverterResult:
|
|
132
|
-
if _dependency_exc_info is not None:
|
|
133
|
-
raise MissingDependencyException(
|
|
134
|
-
MISSING_DEPENDENCY_MESSAGE.format(
|
|
135
|
-
converter=type(self).__name__,
|
|
136
|
-
extension=".pdf",
|
|
137
|
-
feature="pdf",
|
|
138
|
-
)
|
|
139
|
-
) from _dependency_exc_info[1].with_traceback(
|
|
140
|
-
_dependency_exc_info[2]
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
extension = (stream_info.extension or "").lower()
|
|
144
|
-
|
|
145
|
-
# Image files: use PaddleOCR directly
|
|
146
|
-
if extension in (".jpg", ".jpeg", ".png"):
|
|
147
|
-
return self._convert_image(file_stream, extension)
|
|
148
|
-
|
|
149
|
-
# PDF files: use hybrid approach
|
|
150
|
-
return self._convert_pdf(file_stream)
|
|
151
|
-
|
|
152
|
-
def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
|
|
153
|
-
"""Convert image file using PaddleOCR API."""
|
|
154
|
-
img_bytes = file_stream.read()
|
|
155
|
-
filename = f"image{extension}"
|
|
156
|
-
|
|
157
|
-
try:
|
|
158
|
-
markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename)
|
|
159
|
-
return DocumentConverterResult(markdown=markdown)
|
|
160
|
-
except Exception as e:
|
|
161
|
-
return DocumentConverterResult(
|
|
162
|
-
markdown=f"<!-- Error converting image with PaddleOCR: {e} -->"
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
|
|
166
|
-
"""Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages)."""
|
|
167
|
-
pdf_stream = io.BytesIO(file_stream.read())
|
|
168
|
-
markdown_parts = []
|
|
169
|
-
|
|
170
|
-
try:
|
|
171
|
-
with pdfplumber.open(pdf_stream) as pdf:
|
|
172
|
-
for page_num, page in enumerate(pdf.pages):
|
|
173
|
-
# Analyze page type
|
|
174
|
-
page_type = self._analyze_page(page)
|
|
175
|
-
|
|
176
|
-
# Choose processing method
|
|
177
|
-
if self.force_ai or page_type != "plain_text":
|
|
178
|
-
# Complex content: use PaddleOCR
|
|
179
|
-
markdown = self._convert_with_paddleocr(page, page_num)
|
|
180
|
-
else:
|
|
181
|
-
# Plain text: use pdfplumber
|
|
182
|
-
markdown = self._extract_text_with_tables(page)
|
|
183
|
-
|
|
184
|
-
if markdown.strip():
|
|
185
|
-
markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}")
|
|
186
|
-
|
|
187
|
-
page.close()
|
|
188
|
-
|
|
189
|
-
markdown = "\n\n".join(markdown_parts).strip()
|
|
190
|
-
|
|
191
|
-
except Exception:
|
|
192
|
-
# Fallback to pdfminer
|
|
193
|
-
pdf_stream.seek(0)
|
|
194
|
-
markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
|
|
195
|
-
|
|
196
|
-
# Final fallback
|
|
197
|
-
if not markdown:
|
|
198
|
-
pdf_stream.seek(0)
|
|
199
|
-
markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
|
|
200
|
-
|
|
201
|
-
return DocumentConverterResult(markdown=markdown)
|
|
202
|
-
|
|
203
|
-
def _analyze_page(self, page: Any) -> str:
|
|
204
|
-
"""Analyze page content type."""
|
|
205
|
-
# Check for images
|
|
206
|
-
if hasattr(page, "images") and page.images:
|
|
207
|
-
return "complex"
|
|
208
|
-
|
|
209
|
-
# Check for tables
|
|
210
|
-
tables = page.find_tables()
|
|
211
|
-
if tables:
|
|
212
|
-
return "complex"
|
|
213
|
-
|
|
214
|
-
# Check for graphics/curves
|
|
215
|
-
if hasattr(page, "curves") and page.curves:
|
|
216
|
-
return "complex"
|
|
217
|
-
|
|
218
|
-
return "plain_text"
|
|
219
|
-
|
|
220
|
-
def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
|
|
221
|
-
"""Convert page using PaddleOCR API."""
|
|
222
|
-
try:
|
|
223
|
-
# Render page to image
|
|
224
|
-
img = page.to_image(resolution=150)
|
|
225
|
-
img_bytes = io.BytesIO()
|
|
226
|
-
img.save(img_bytes, format="PNG")
|
|
227
|
-
|
|
228
|
-
markdown = self._get_client().ocr(
|
|
229
|
-
file_bytes=img_bytes.getvalue(),
|
|
230
|
-
filename=f"page_{page_num + 1}.png",
|
|
231
|
-
)
|
|
232
|
-
return markdown
|
|
233
|
-
|
|
234
|
-
except Exception:
|
|
235
|
-
# Fallback to pdfplumber text extraction
|
|
236
|
-
return self._extract_text_with_tables(page)
|
|
237
|
-
|
|
238
|
-
def _extract_text_with_tables(self, page: Any) -> str:
|
|
239
|
-
"""Extract text and tables from page."""
|
|
240
|
-
parts = []
|
|
241
|
-
|
|
242
|
-
# Extract text
|
|
243
|
-
text = page.extract_text() or ""
|
|
244
|
-
if text.strip():
|
|
245
|
-
parts.append(text.strip())
|
|
246
|
-
|
|
247
|
-
# Extract tables
|
|
248
|
-
try:
|
|
249
|
-
tables = page.extract_tables()
|
|
250
|
-
if tables:
|
|
251
|
-
for table in tables:
|
|
252
|
-
if table:
|
|
253
|
-
md_table = self._table_to_markdown(table)
|
|
254
|
-
if md_table.strip():
|
|
255
|
-
parts.append(md_table)
|
|
256
|
-
except Exception:
|
|
257
|
-
pass
|
|
258
|
-
|
|
259
|
-
return "\n\n".join(parts)
|
|
260
|
-
|
|
261
|
-
def _table_to_markdown(self, table: list[list[str]]) -> str:
|
|
262
|
-
"""Convert table to Markdown."""
|
|
263
|
-
if not table:
|
|
264
|
-
return ""
|
|
265
|
-
|
|
266
|
-
# Filter None values
|
|
267
|
-
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
|
268
|
-
|
|
269
|
-
# Filter empty rows
|
|
270
|
-
table = [row for row in table if any(cell.strip() for cell in row)]
|
|
271
|
-
|
|
272
|
-
if not table:
|
|
273
|
-
return ""
|
|
274
|
-
|
|
275
|
-
# Calculate column widths
|
|
276
|
-
col_widths = [
|
|
277
|
-
max(len(str(row[i])) if i < len(row) else 0 for row in table)
|
|
278
|
-
for i in range(max(len(row) for row in table))
|
|
279
|
-
]
|
|
280
|
-
|
|
281
|
-
# Format table
|
|
282
|
-
lines = []
|
|
283
|
-
for row_idx, row in enumerate(table):
|
|
284
|
-
padded_row = row + [""] * (len(col_widths) - len(row))
|
|
285
|
-
line = "| " + " | ".join(
|
|
286
|
-
str(cell).ljust(width) for cell, width in zip(padded_row, col_widths)
|
|
287
|
-
) + " |"
|
|
288
|
-
lines.append(line)
|
|
289
|
-
|
|
290
|
-
if row_idx == 0:
|
|
291
|
-
sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
|
|
292
|
-
lines.append(sep)
|
|
293
|
-
|
|
294
|
-
return "\n".join(lines)
|
|
295
|
-
|
|
296
|
-
def close(self):
|
|
297
|
-
"""Close the client."""
|
|
298
|
-
self._client = None
|
|
299
|
-
|
|
300
|
-
def __enter__(self):
|
|
301
|
-
return self
|
|
302
|
-
|
|
303
|
-
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
304
|
-
self.close()
|
|
File without changes
|
|
File without changes
|
{markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{markitdown_paddleocr-0.1.0 → markitdown_paddleocr-0.2.2}/src/markitdown_paddleocr/_paddle_client.py
RENAMED
|
File without changes
|