markitdown-paddleocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitdown_paddleocr/__about__.py +1 -0
- markitdown_paddleocr/__init__.py +16 -0
- markitdown_paddleocr/_config.py +46 -0
- markitdown_paddleocr/_converter.py +304 -0
- markitdown_paddleocr/_dual_converter.py +160 -0
- markitdown_paddleocr/_paddle_client.py +189 -0
- markitdown_paddleocr/_plugin.py +35 -0
- markitdown_paddleocr-0.1.0.dist-info/METADATA +183 -0
- markitdown_paddleocr-0.1.0.dist-info/RECORD +11 -0
- markitdown_paddleocr-0.1.0.dist-info/WHEEL +4 -0
- markitdown_paddleocr-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""markitdown-paddleocr: PDF/Image to Markdown converter using PaddleOCR cloud API."""
|
|
2
|
+
|
|
3
|
+
from ._plugin import register_converters
|
|
4
|
+
from ._config import PaddleOcrConfig
|
|
5
|
+
from ._converter import PaddleOcrConverter
|
|
6
|
+
from ._paddle_client import PaddleClient
|
|
7
|
+
from ._dual_converter import DualOcrConverter
|
|
8
|
+
|
|
9
|
+
__plugin_interface_version__ = 1
|
|
10
|
+
__all__ = [
|
|
11
|
+
"register_converters",
|
|
12
|
+
"PaddleOcrConfig",
|
|
13
|
+
"PaddleOcrConverter",
|
|
14
|
+
"PaddleClient",
|
|
15
|
+
"DualOcrConverter",
|
|
16
|
+
]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Configuration for markitdown-paddleocr."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class PaddleOcrConfig:
    """Settings container for markitdown-paddleocr.

    Resolution order, highest priority first:
        1. Keyword arguments (to the constructor or to ``from_env``)
        2. Environment variables (consulted by ``from_env``)
        3. The field defaults declared below
    """

    # API credentials; ``from_env`` fills this from BAIDU_PADDLE_TOKEN.
    token: str = ""

    # Name of the OCR model to request.
    model: str = "PaddleOCR-VL-1.5"

    # Endpoint of the OCR job API.
    job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"

    # Polling behaviour (both values are in seconds).
    poll_interval: float = 2.0  # pause between two status polls
    poll_timeout: float = 300.0  # longest wait for a job to complete

    # Optional OCR features, all disabled by default.
    use_doc_orientation_classify: bool = False
    use_doc_unwarping: bool = False
    use_chart_recognition: bool = False

    # Processing strategy flag.
    force_ai: bool = False

    @classmethod
    def from_env(cls, **overrides) -> "PaddleOcrConfig":
        """Build a config from the environment, letting ``overrides`` win.

        Reads ``BAIDU_PADDLE_TOKEN`` (token) and ``PADDLE_OCR_MODEL`` (model);
        every other field keeps its declared default unless it is overridden.
        """
        environment = os.environ
        settings = {
            "token": environment.get("BAIDU_PADDLE_TOKEN", ""),
            "model": environment.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.5"),
            # Unpacked last so explicit keyword arguments take precedence.
            **overrides,
        }
        return cls(**settings)
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any, BinaryIO, Optional
|
|
6
|
+
|
|
7
|
+
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
8
|
+
from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
9
|
+
|
|
10
|
+
from ._config import PaddleOcrConfig
|
|
11
|
+
from ._paddle_client import PaddleClient
|
|
12
|
+
|
|
13
|
+
# Import PDF dependencies
|
|
14
|
+
_dependency_exc_info = None
|
|
15
|
+
try:
|
|
16
|
+
import pdfminer
|
|
17
|
+
import pdfminer.high_level
|
|
18
|
+
import pdfplumber
|
|
19
|
+
except ImportError:
|
|
20
|
+
_dependency_exc_info = sys.exc_info()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# MIME type prefixes this module accepts: PDF documents plus JPEG/PNG images.
ACCEPTED_MIME_TYPE_PREFIXES = ["application/pdf", "application/x-pdf", "image/jpeg", "image/png"]

# File extensions (lower-case, leading dot) this module accepts.
ACCEPTED_FILE_EXTENSIONS = [
    ".pdf",
    ".jpg",
    ".jpeg",
    ".png",
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class PaddleOcrConverter(DocumentConverter):
    """Intelligent PDF/Image converter using PaddleOCR cloud API.

    Features:
    - Auto-detect page content type (plain text vs images/tables)
    - Plain text pages use pdfplumber/pdfminer (fast, free)
    - Complex pages use PaddleOCR API for AI-powered OCR
    - Image files (PNG, JPG) use PaddleOCR API directly
    - Asynchronous job model: submit → poll → fetch result
    """

    # Inputs that go straight to the OCR service instead of the PDF path.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")
    # Upload extension to use when only the MIME type identifies an image.
    _IMAGE_MIME_TO_EXTENSION = (("image/jpeg", ".jpg"), ("image/png", ".png"))

    def __init__(
        self,
        token: Optional[str] = None,
        model: str = "PaddleOCR-VL-1.5",
        poll_interval: float = 2.0,
        poll_timeout: float = 300.0,
        force_ai: bool = False,
        use_doc_orientation_classify: bool = False,
        use_doc_unwarping: bool = False,
        use_chart_recognition: bool = False,
        config: Optional[PaddleOcrConfig] = None,
    ):
        """Initialize converter.

        Args:
            token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided)
            model: OCR model name (default: PaddleOCR-VL-1.5)
            poll_interval: Seconds between status polls (default: 2.0)
            poll_timeout: Max seconds to wait for job completion (default: 300.0)
            force_ai: Force all pages to use OCR (default: False)
            use_doc_orientation_classify: Enable document orientation classification
            use_doc_unwarping: Enable document unwarping
            use_chart_recognition: Enable chart recognition
            config: Optional PaddleOcrConfig instance. An explicit argument
                that differs from its default wins over the matching config
                field; boolean flags are OR-ed with the config.
        """
        if config:
            self.token = token or config.token
            # A value equal to the built-in default is treated as "not given",
            # so the config value is used in that case.
            self.model = model if model != "PaddleOCR-VL-1.5" else config.model
            self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval
            self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout
            self.force_ai = force_ai or config.force_ai
            self.use_doc_orientation_classify = (
                use_doc_orientation_classify or config.use_doc_orientation_classify
            )
            self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
            self.use_chart_recognition = use_chart_recognition or config.use_chart_recognition
            # Keep a custom endpoint; previously it was lost when the client
            # config was rebuilt in _get_client().
            self._job_url: Optional[str] = config.job_url
        else:
            self.token = token
            self.model = model
            self.poll_interval = poll_interval
            self.poll_timeout = poll_timeout
            self.force_ai = force_ai
            self.use_doc_orientation_classify = use_doc_orientation_classify
            self.use_doc_unwarping = use_doc_unwarping
            self.use_chart_recognition = use_chart_recognition
            self._job_url = None

        # The API client is created on first use.
        self._client: Optional[PaddleClient] = None

    def _get_client(self) -> PaddleClient:
        """Return the cached PaddleClient, creating it on first call."""
        if self._client is None:
            settings = {
                "token": self.token or "",
                "model": self.model,
                "poll_interval": self.poll_interval,
                "poll_timeout": self.poll_timeout,
                "force_ai": self.force_ai,
                "use_doc_orientation_classify": self.use_doc_orientation_classify,
                "use_doc_unwarping": self.use_doc_unwarping,
                "use_chart_recognition": self.use_chart_recognition,
            }
            # getattr guards subclasses whose __init__ never set _job_url.
            job_url = getattr(self, "_job_url", None)
            if job_url:
                settings["job_url"] = job_url
            self._client = PaddleClient(config=PaddleOcrConfig(**settings))
        return self._client

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for PDF/JPEG/PNG inputs, judged by extension or MIME type.

        The stream itself is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def _image_extension(self, stream_info: StreamInfo) -> Optional[str]:
        """Return the upload extension for an image input, or None for the PDF path.

        The extension decides when it is a known image type or ".pdf";
        otherwise the MIME type is consulted, so an image accepted only through
        its MIME type is still handled as an image.
        """
        extension = (stream_info.extension or "").lower()
        if extension in self._IMAGE_EXTENSIONS:
            return extension
        if extension == ".pdf":
            return None

        mimetype = (stream_info.mimetype or "").lower()
        for prefix, mapped_extension in self._IMAGE_MIME_TO_EXTENSION:
            if mimetype.startswith(prefix):
                return mapped_extension
        return None

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an accepted stream to Markdown.

        Images are sent directly to PaddleOCR; anything else is treated as a
        PDF and goes through the hybrid text/OCR path.

        Raises:
            MissingDependencyException: If a PDF is converted while the PDF
                libraries failed to import. Images do not need them.
        """
        image_extension = self._image_extension(stream_info)
        if image_extension is not None:
            return self._convert_image(file_stream, image_extension)

        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )

        return self._convert_pdf(file_stream)

    def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
        """Convert an image with the PaddleOCR API.

        Failures are not raised: the result then carries an HTML comment
        describing the error.
        """
        img_bytes = file_stream.read()
        filename = f"image{extension}"

        try:
            markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename)
            return DocumentConverterResult(markdown=markdown)
        except Exception as e:
            return DocumentConverterResult(
                markdown=f"<!-- Error converting image with PaddleOCR: {e} -->"
            )

    def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
        """Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages).

        If the hybrid pass raises or yields nothing, the whole document is
        extracted once with pdfminer as a fallback.
        """
        pdf_stream = io.BytesIO(file_stream.read())
        markdown = ""

        try:
            markdown_parts = []
            with pdfplumber.open(pdf_stream) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    page_type = self._analyze_page(page)

                    if self.force_ai or page_type != "plain_text":
                        # Complex content: use PaddleOCR
                        page_markdown = self._convert_with_paddleocr(page, page_num)
                    else:
                        # Plain text: use pdfplumber
                        page_markdown = self._extract_text_with_tables(page)

                    if page_markdown.strip():
                        markdown_parts.append(f"## Page {page_num + 1}\n\n{page_markdown}")

                    page.close()

            markdown = "\n\n".join(markdown_parts).strip()
        except Exception:
            # Best-effort: any failure in the hybrid pass falls through to pdfminer.
            markdown = ""

        if not markdown:
            # Single fallback pass; the original could run pdfminer twice.
            pdf_stream.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

        return DocumentConverterResult(markdown=markdown)

    def _analyze_page(self, page: Any) -> str:
        """Classify a page as "complex" (images, tables or curves) or "plain_text"."""
        # Check for images
        if hasattr(page, "images") and page.images:
            return "complex"

        # Check for tables
        if page.find_tables():
            return "complex"

        # Check for graphics/curves
        if hasattr(page, "curves") and page.curves:
            return "complex"

        return "plain_text"

    def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
        """Render a page to PNG and OCR it, falling back to text extraction on error.

        Args:
            page: Page object supporting ``to_image`` (presumably a pdfplumber page).
            page_num: Zero-based page index, used to name the upload.
        """
        try:
            img = page.to_image(resolution=150)
            img_bytes = io.BytesIO()
            img.save(img_bytes, format="PNG")

            return self._get_client().ocr(
                file_bytes=img_bytes.getvalue(),
                filename=f"page_{page_num + 1}.png",
            )
        except Exception:
            # Fallback to pdfplumber text extraction
            return self._extract_text_with_tables(page)

    def _extract_text_with_tables(self, page: Any) -> str:
        """Return the page text followed by its tables rendered as Markdown."""
        parts = []

        text = page.extract_text() or ""
        if text.strip():
            parts.append(text.strip())

        try:
            for table in page.extract_tables() or []:
                if table:
                    md_table = self._table_to_markdown(table)
                    if md_table.strip():
                        parts.append(md_table)
        except Exception:
            # Table extraction is best-effort; the plain text is still returned.
            pass

        return "\n\n".join(parts)

    @staticmethod
    def _clean_cell(cell: Any) -> str:
        """Turn one table cell into text that is safe inside a Markdown table row."""
        if cell is None:
            return ""
        text = str(cell).replace("\r\n", "\n").replace("\r", "\n")
        # A raw pipe would start a new column and a raw newline would end the row.
        return text.replace("|", "\\|").replace("\n", "<br>")

    def _table_to_markdown(self, table: list[list[Optional[str]]]) -> str:
        """Render a table (list of rows of cells) as a Markdown table.

        ``None`` cells become empty, rows with no content are dropped, short
        rows are padded, and the first remaining row is used as the header.
        Pipes and line breaks inside cells are escaped.

        Returns:
            The Markdown table, or "" when there is nothing to render.
        """
        if not table:
            return ""

        rows = [[self._clean_cell(cell) for cell in row] for row in table]
        rows = [row for row in rows if any(cell.strip() for cell in row)]
        if not rows:
            return ""

        column_count = max(len(row) for row in rows)
        rows = [row + [""] * (column_count - len(row)) for row in rows]
        col_widths = [max(len(row[i]) for row in rows) for i in range(column_count)]

        lines = []
        for row_idx, row in enumerate(rows):
            cells = " | ".join(cell.ljust(width) for cell, width in zip(row, col_widths))
            lines.append("| " + cells + " |")

            if row_idx == 0:
                # Header separator, sized to match the padded header cells.
                lines.append("|" + "|".join("-" * (w + 2) for w in col_widths) + "|")

        return "\n".join(lines)

    def close(self):
        """Drop the cached client; a later conversion creates a new one."""
        self._client = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
|
|
7
|
+
from typing import BinaryIO, Any
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DualOcrConverter(DocumentConverter):
    """Dual OCR converter with automatic fallback: glmocr → paddleocr.

    Usage:
        converter = DualOcrConverter()
        md = MarkItDown(enable_plugins=False)
        md.register_converter(converter, priority=-1.0)
        result = md.convert("document.pdf")
    """

    def __init__(
        self,
        # glmocr kwargs
        glmocr_api_key: Optional[str] = None,
        glmocr_timeout: int = 1800,
        glmocr_enable_layout: bool = False,
        glmocr_force_ai: bool = False,
        # paddleocr kwargs
        paddleocr_token: Optional[str] = None,
        paddleocr_model: str = "PaddleOCR-VL-1.5",
        paddleocr_poll_interval: float = 2.0,
        paddleocr_poll_timeout: float = 300.0,
        paddleocr_force_ai: bool = False,
        paddleocr_use_doc_orientation_classify: bool = False,
        paddleocr_use_doc_unwarping: bool = False,
        paddleocr_use_chart_recognition: bool = False,
    ):
        """Store the per-engine options and build both converters immediately.

        ``glmocr_*`` arguments go to the primary converter and ``paddleocr_*``
        arguments to the fallback, with the prefix removed. Options left as
        ``None`` are omitted so the engine's own default applies.
        """
        self.glmocr_kwargs = {
            "api_key": glmocr_api_key,
            "timeout": glmocr_timeout,
            "enable_layout": glmocr_enable_layout,
            "force_ai": glmocr_force_ai,
        }
        self.paddleocr_kwargs = {
            "token": paddleocr_token,
            "model": paddleocr_model,
            "poll_interval": paddleocr_poll_interval,
            "poll_timeout": paddleocr_poll_timeout,
            "force_ai": paddleocr_force_ai,
            "use_doc_orientation_classify": paddleocr_use_doc_orientation_classify,
            "use_doc_unwarping": paddleocr_use_doc_unwarping,
            "use_chart_recognition": paddleocr_use_chart_recognition,
        }

        self._primary = None
        self._fallback = None
        self._init_converters()

    def _init_converters(self):
        """Create both converters, tolerating the failure of either one.

        Called from ``__init__``. A converter that cannot be imported or
        constructed is left as ``None`` and a warning is logged.
        """
        try:
            from markitdown_glmocr import GlmOcrConverter

            # Filter out None values
            kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None}
            self._primary = GlmOcrConverter(**kwargs)
            logger.info("glmocr converter initialized (primary)")
        except Exception as e:
            logger.warning("glmocr init failed: %s", e)
            self._primary = None

        try:
            # Relative import: this module is part of the package itself.
            from ._converter import PaddleOcrConverter

            kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None}
            self._fallback = PaddleOcrConverter(**kwargs)
            logger.info("paddleocr converter initialized (fallback)")
        except Exception as e:
            logger.warning("paddleocr init failed: %s", e)
            self._fallback = None

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept if either converter accepts.

        Each delegate sees the stream rewound to its start. The caller's
        stream position is restored before returning. A delegate that raises
        is treated as not accepting.
        """
        try:
            origin = file_stream.tell()
        except (OSError, ValueError, AttributeError):
            # Without a usable position the delegates cannot be rewound.
            return False

        try:
            for label, converter in (("glmocr", self._primary), ("paddleocr", self._fallback)):
                if not converter:
                    continue
                try:
                    file_stream.seek(0)
                    if converter.accepts(file_stream, stream_info, **kwargs):
                        return True
                except Exception as exc:
                    logger.debug("%s accepts() raised: %s", label, exc)
            return False
        finally:
            file_stream.seek(origin)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert with primary, fallback on failure.

        The input is read once and each engine gets its own in-memory copy.
        An engine counts as failed when it raises or returns blank Markdown.
        If both fail, the result is an HTML comment saying so; no exception
        is raised.
        """
        data = file_stream.read()

        # Try primary (glmocr)
        if self._primary:
            try:
                result = self._primary.convert(io_bytes(data), stream_info, **kwargs)
                if result.markdown and result.markdown.strip():
                    logger.info("✓ glmocr succeeded")
                    return result
                logger.warning("glmocr returned empty result, falling back")
            except Exception as e:
                logger.warning("glmocr failed: %s, falling back to paddleocr", e)

        # Fallback (paddleocr)
        if self._fallback:
            try:
                result = self._fallback.convert(io_bytes(data), stream_info, **kwargs)
                if result.markdown and result.markdown.strip():
                    logger.info("✓ paddleocr succeeded (fallback)")
                    return result
                logger.warning("paddleocr returned empty result")
            except Exception as e:
                logger.error("paddleocr also failed: %s", e)

        # Both failed
        return DocumentConverterResult(
            markdown="<!-- Both OCR engines (glmocr, paddleocr) failed to convert this file -->"
        )

    def close(self):
        """Close whichever underlying converters exist and expose ``close``."""
        if self._primary and hasattr(self._primary, "close"):
            self._primary.close()
        if self._fallback and hasattr(self._fallback, "close"):
            self._fallback.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def io_bytes(data: bytes):
    """Wrap ``data`` in a seekable in-memory binary stream positioned at offset 0."""
    from io import BytesIO

    stream = BytesIO(data)
    stream.seek(0)
    return stream
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""PaddleOCR API Client - handles job submission, polling, and result fetching."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import time
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
from ._config import PaddleOcrConfig
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PaddleOcrError(Exception):
    """Error raised for PaddleOCR API failures."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PaddleClient:
    """Client for PaddleOCR cloud API.

    Workflow: submit job → poll status → fetch JSONL result → extract markdown.
    """

    # Timeout in seconds applied to every HTTP request. Without one, a stalled
    # connection would block the caller forever.
    request_timeout: float = 60.0

    def __init__(self, config: "Optional[PaddleOcrConfig]" = None, **kwargs):
        """Create a client.

        Args:
            config: Settings to use. When omitted, a ``PaddleOcrConfig`` is
                built from ``kwargs``.
            **kwargs: ``PaddleOcrConfig`` fields, used only without ``config``.
        """
        if config is None:
            config = PaddleOcrConfig(**kwargs)
        self.config = config

        # Token from config or env
        self.token = config.token
        if not self.token:
            import os

            self.token = os.environ.get("BAIDU_PADDLE_TOKEN", "")

    def _headers(self) -> dict:
        """Build authorization headers."""
        return {"Authorization": f"bearer {self.token}"}

    def _optional_payload(self) -> dict:
        """Build optional payload flags."""
        return {
            "useDocOrientationClassify": self.config.use_doc_orientation_classify,
            "useDocUnwarping": self.config.use_doc_unwarping,
            "useChartRecognition": self.config.use_chart_recognition,
        }

    def ocr(
        self,
        file_bytes: Optional[bytes] = None,
        filename: Optional[str] = None,
        file_url: Optional[str] = None,
    ) -> str:
        """Run OCR on a file or URL, return concatenated markdown.

        Args:
            file_bytes: File content bytes (for local file upload).
            filename: Filename for multipart upload (e.g. "page.png").
            file_url: File URL (for URL mode, alternative to file_bytes).

        Returns:
            Markdown text extracted from all pages.

        Raises:
            PaddleOcrError: On API errors or timeout.
        """
        # 1. Submit job
        job_id = self._submit(file_bytes=file_bytes, filename=filename, file_url=file_url)
        logger.info("Job submitted: %s", job_id)

        # 2. Poll until done
        result_url = self._poll(job_id)
        logger.info("Job completed, result URL obtained")

        # 3. Fetch and parse results
        return self._fetch_markdown(result_url)

    def _submit(
        self,
        file_bytes: Optional[bytes] = None,
        filename: Optional[str] = None,
        file_url: Optional[str] = None,
    ) -> str:
        """Submit an OCR job, return job ID.

        ``file_url`` takes precedence over ``file_bytes`` when both are given.

        Raises:
            PaddleOcrError: If neither input is given, the HTTP status is not
                200, or the response carries no job id.
        """
        headers = self._headers()

        if file_url:
            # URL mode
            headers["Content-Type"] = "application/json"
            payload = {
                "fileUrl": file_url,
                "model": self.config.model,
                "optionalPayload": self._optional_payload(),
            }
            resp = requests.post(
                self.config.job_url,
                json=payload,
                headers=headers,
                timeout=self.request_timeout,
            )
        elif file_bytes is not None:
            # Local file mode - multipart upload
            data = {
                "model": self.config.model,
                "optionalPayload": json.dumps(self._optional_payload()),
            }
            fname = filename or "document"
            files = {"file": (fname, file_bytes)}
            resp = requests.post(
                self.config.job_url,
                headers=headers,
                data=data,
                files=files,
                timeout=self.request_timeout,
            )
        else:
            raise PaddleOcrError("Either file_bytes or file_url must be provided")

        if resp.status_code != 200:
            raise PaddleOcrError(f"Submit failed (HTTP {resp.status_code}): {resp.text}")

        result = resp.json()
        # "data" may be missing or null; either way there is no job id.
        job_id = (result.get("data") or {}).get("jobId")
        if not job_id:
            raise PaddleOcrError(f"No jobId in response: {result}")

        return job_id

    def _poll(self, job_id: str) -> str:
        """Poll job status until done, return JSONL result URL.

        Raises:
            PaddleOcrError: On a non-200 status, a "failed" job, a finished job
                without a result URL, or when ``poll_timeout`` is exceeded.
        """
        headers = self._headers()
        url = f"{self.config.job_url}/{job_id}"
        # Monotonic clock: wall-clock adjustments must not move the deadline.
        started = time.monotonic()

        while True:
            resp = requests.get(url, headers=headers, timeout=self.request_timeout)
            if resp.status_code != 200:
                raise PaddleOcrError(f"Poll failed (HTTP {resp.status_code}): {resp.text}")

            data = resp.json().get("data") or {}
            state = data.get("state", "")

            if state == "done":
                result_url = (data.get("resultUrl") or {}).get("jsonUrl", "")
                if not result_url:
                    raise PaddleOcrError("Job done but no resultUrl in response")
                return result_url

            if state == "failed":
                error_msg = data.get("errorMsg") or "Unknown error"
                raise PaddleOcrError(f"Job failed: {error_msg}")

            # Still pending or running
            if state == "running":
                progress = data.get("extractProgress") or {}
                total = progress.get("totalPages", "?")
                extracted = progress.get("extractedPages", "?")
                logger.debug("Running: %s/%s pages", extracted, total)
            else:
                logger.debug("State: %s", state)

            # Check timeout
            elapsed = time.monotonic() - started
            if elapsed > self.config.poll_timeout:
                raise PaddleOcrError(
                    f"Job polling timed out after {self.config.poll_timeout}s (state={state})"
                )

            time.sleep(self.config.poll_interval)

    def _fetch_markdown(self, jsonl_url: str) -> str:
        """Fetch JSONL result and extract markdown from all pages.

        Each non-empty line is parsed as JSON; the Markdown text of every
        entry in ``result.layoutParsingResults`` is collected and the pieces
        are joined with blank lines. Malformed lines are skipped.
        """
        resp = requests.get(jsonl_url, timeout=self.request_timeout)
        resp.raise_for_status()
        # JSON is UTF-8; do not rely on the charset guessed from headers.
        resp.encoding = "utf-8"

        markdown_parts = []
        # Split on "\n" only: other line separators may occur inside JSON strings.
        for line in resp.text.strip().split("\n"):
            line = line.strip()
            if not line:
                continue

            try:
                page_data = json.loads(line)
            except json.JSONDecodeError:
                logger.warning("Skipping invalid JSONL line")
                continue

            if not isinstance(page_data, dict):
                logger.warning("Skipping invalid JSONL line")
                continue

            result = page_data.get("result") or {}
            for layout in result.get("layoutParsingResults") or []:
                md_text = (layout.get("markdown") or {}).get("text") or ""
                if md_text.strip():
                    markdown_parts.append(md_text.strip())

        return "\n\n".join(markdown_parts)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Plugin registration for markitdown-paddleocr."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
from markitdown import MarkItDown
|
|
5
|
+
|
|
6
|
+
from ._converter import PaddleOcrConverter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
__plugin_interface_version__ = 1
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """Register the PaddleOCR converter on a MarkItDown instance.

    Recognised options are taken from ``kwargs``; any option that is absent
    falls back to the built-in default listed below. Other keyword arguments
    are ignored. ``token`` defaults to ``None``, which leaves token lookup
    (e.g. BAIDU_PADDLE_TOKEN) to the converter.
    """
    option_defaults = {
        "token": None,
        "model": "PaddleOCR-VL-1.5",
        "poll_interval": 2.0,
        "poll_timeout": 300.0,
        "force_ai": False,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "use_chart_recognition": False,
    }
    options = {
        name: kwargs.get(name, fallback) for name, fallback in option_defaults.items()
    }

    # Priority chosen so this converter is tried before the default PDF converter.
    paddleocr_priority = -1.0
    converter = PaddleOcrConverter(**options)
    markitdown.register_converter(converter, priority=paddleocr_priority)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markitdown-paddleocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
|
|
5
|
+
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
7
|
+
Project-URL: Source, https://github.com/microsoft/markitdown
|
|
8
|
+
Author-email: Contributors <noreply@github.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: baidu,markitdown,ocr,paddleocr,pdf,vision
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: markitdown>=0.1.0
|
|
19
|
+
Requires-Dist: pdfminer-six>=20251230
|
|
20
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
21
|
+
Requires-Dist: pillow>=9.0.0
|
|
22
|
+
Requires-Dist: requests>=2.28.0
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# markitdown-paddleocr
|
|
28
|
+
|
|
29
|
+
智能 PDF/图片转 Markdown 插件,使用百度 PaddleOCR 云端 API 驱动的 OCR 识别。
|
|
30
|
+
|
|
31
|
+
## 特性
|
|
32
|
+
|
|
33
|
+
- 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格)
|
|
34
|
+
- 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低
|
|
35
|
+
- 🤖 **AI 增强**:复杂页面(图片、表格)使用 PaddleOCR API 转换为 Markdown
|
|
36
|
+
- 🔄 **异步 Job 模型**:提交 OCR 任务 → 轮询状态 → 获取结果
|
|
37
|
+
- 📊 **结构化输出**:返回 Markdown(含表格、公式、图表等)
|
|
38
|
+
|
|
39
|
+
## 安装
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install markitdown-paddleocr
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## 配置
|
|
46
|
+
|
|
47
|
+
### 环境变量(推荐)
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# 必需:百度 PaddleOCR Token
|
|
51
|
+
export BAIDU_PADDLE_TOKEN="your-paddle-token"
|
|
52
|
+
|
|
53
|
+
# 可选
|
|
54
|
+
export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 配置优先级
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
构造函数参数 > 环境变量 > 内置默认值
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## 使用方法
|
|
64
|
+
|
|
65
|
+
### 命令行(推荐)
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# 1. 设置 Token
|
|
69
|
+
export BAIDU_PADDLE_TOKEN="your-token"
|
|
70
|
+
|
|
71
|
+
# 2. 查看已安装插件
|
|
72
|
+
markitdown --list-plugins
|
|
73
|
+
|
|
74
|
+
# 3. 使用插件转换 PDF
|
|
75
|
+
markitdown -p document.pdf
|
|
76
|
+
|
|
77
|
+
# 4. 保存到文件
|
|
78
|
+
markitdown -p document.pdf -o output.md
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Python API
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from markitdown import MarkItDown
|
|
85
|
+
from markitdown_paddleocr import PaddleOcrConverter
|
|
86
|
+
|
|
87
|
+
# 方式1:自动从环境变量读取 BAIDU_PADDLE_TOKEN
|
|
88
|
+
converter = PaddleOcrConverter()
|
|
89
|
+
md = MarkItDown(enable_plugins=False)
|
|
90
|
+
md.register_converter(converter, priority=-1.0)
|
|
91
|
+
result = md.convert("document.pdf")
|
|
92
|
+
print(result.markdown)
|
|
93
|
+
|
|
94
|
+
# 方式2:手动传入 Token
|
|
95
|
+
converter = PaddleOcrConverter(token="your-token")
|
|
96
|
+
md = MarkItDown(enable_plugins=False)
|
|
97
|
+
md.register_converter(converter, priority=-1.0)
|
|
98
|
+
result = md.convert("document.pdf")
|
|
99
|
+
print(result.markdown)
|
|
100
|
+
|
|
101
|
+
# 方式3:强制所有页面使用 OCR
|
|
102
|
+
converter = PaddleOcrConverter(token="your-token", force_ai=True)
|
|
103
|
+
md = MarkItDown(enable_plugins=False)
|
|
104
|
+
md.register_converter(converter, priority=-1.0)
|
|
105
|
+
result = md.convert("document.pdf")
|
|
106
|
+
print(result.markdown)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 直接使用 PaddleClient
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from markitdown_paddleocr import PaddleClient
|
|
113
|
+
|
|
114
|
+
client = PaddleClient(token="your-token")
|
|
115
|
+
|
|
116
|
+
# 本地文件
|
|
117
|
+
markdown = client.ocr(file_bytes=open("image.png", "rb").read(), filename="image.png")
|
|
118
|
+
print(markdown)
|
|
119
|
+
|
|
120
|
+
# URL 模式
|
|
121
|
+
markdown = client.ocr(file_url="https://example.com/document.pdf")
|
|
122
|
+
print(markdown)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## 配置选项
|
|
126
|
+
|
|
127
|
+
### PaddleOcrConverter 参数
|
|
128
|
+
|
|
129
|
+
| 参数 | 类型 | 默认值 | 说明 |
|
|
130
|
+
|------|------|--------|------|
|
|
131
|
+
| `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
|
|
132
|
+
| `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 |
|
|
133
|
+
| `poll_interval` | float | 2.0 | 轮询间隔(秒) |
|
|
134
|
+
| `poll_timeout` | float | 300.0 | 轮询超时(秒) |
|
|
135
|
+
| `force_ai` | bool | False | 强制所有页面使用 OCR |
|
|
136
|
+
| `use_doc_orientation_classify` | bool | False | 文档方向分类 |
|
|
137
|
+
| `use_doc_unwarping` | bool | False | 文档去扭曲 |
|
|
138
|
+
| `use_chart_recognition` | bool | False | 图表识别 |
|
|
139
|
+
|
|
140
|
+
### 环境变量
|
|
141
|
+
|
|
142
|
+
| 变量 | 说明 | 示例 |
|
|
143
|
+
|------|------|------|
|
|
144
|
+
| `BAIDU_PADDLE_TOKEN` | Token(必需) | `your-paddle-token` |
|
|
145
|
+
| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` |
|
|
146
|
+
|
|
147
|
+
## 工作原理
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
PDF/图片 输入
|
|
151
|
+
│
|
|
152
|
+
▼
|
|
153
|
+
PaddleOcrConverter.convert()
|
|
154
|
+
│
|
|
155
|
+
├─ 图片文件 ──► PaddleClient.ocr() ──► markdown
|
|
156
|
+
│
|
|
157
|
+
└─ PDF 文件 ──► 逐页分析内容类型
|
|
158
|
+
│
|
|
159
|
+
├─ 纯文本页 ──► pdfplumber 提取文本
|
|
160
|
+
│
|
|
161
|
+
└─ 复杂页(图片/表格)
|
|
162
|
+
│
|
|
163
|
+
└─► 渲染为图片 ──► PaddleClient.ocr()
|
|
164
|
+
│
|
|
165
|
+
├─ POST /api/v2/ocr/jobs (提交 Job)
|
|
166
|
+
├─ GET /api/v2/ocr/jobs/{id} (轮询状态)
|
|
167
|
+
└─ GET jsonUrl (获取 JSONL 结果)
|
|
168
|
+
│
|
|
169
|
+
▼
|
|
170
|
+
合并输出完整 Markdown
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## 依赖
|
|
174
|
+
|
|
175
|
+
- `markitdown>=0.1.0` - 基础框架
|
|
176
|
+
- `pdfplumber>=0.11.9` - PDF 解析与页面渲染(将页面转为图片)
|
|
177
|
+
- `pdfminer.six>=20251230` - 文本提取备用
|
|
178
|
+
- `Pillow>=9.0.0` - 图像处理
|
|
179
|
+
- `requests>=2.28.0` - HTTP 请求
|
|
180
|
+
|
|
181
|
+
## 许可证
|
|
182
|
+
|
|
183
|
+
MIT
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
markitdown_paddleocr/__about__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
markitdown_paddleocr/__init__.py,sha256=Os7IXewhl6lmRnpAYYBKKwNaRbx8zceIxCXJz3m16_g,464
|
|
3
|
+
markitdown_paddleocr/_config.py,sha256=Y7CrOtyf53WnQWFrjD9bjea51Wpn0ysq_kRH75CfHMY,1292
|
|
4
|
+
markitdown_paddleocr/_converter.py,sha256=hffXCH0KycApXIAyrZFkJ_wqAkkNja2UTv1h_Vr91U0,10686
|
|
5
|
+
markitdown_paddleocr/_dual_converter.py,sha256=gJnnpozquqAtowomjygq0BlI6S66LOY0p7eclm_35rc,5580
|
|
6
|
+
markitdown_paddleocr/_paddle_client.py,sha256=CQYushgLtep-J6X0y8V5soVX5ElxoXW2EaWrsaaZqvc,6357
|
|
7
|
+
markitdown_paddleocr/_plugin.py,sha256=1ZFxQTlywQid5H0YMrdx73lodp848TY4A8bT8VzayDo,1194
|
|
8
|
+
markitdown_paddleocr-0.1.0.dist-info/METADATA,sha256=Xlrj7VV7lcsMhI3sZLFzJUeKzuB66sdcuWPKsYC9_og,5225
|
|
9
|
+
markitdown_paddleocr-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
markitdown_paddleocr-0.1.0.dist-info/entry_points.txt,sha256=MPWJijvtpHf_degXBwV7i91MRBfbOiQeIqNUurhPd18,64
|
|
11
|
+
markitdown_paddleocr-0.1.0.dist-info/RECORD,,
|