markitdown-paddleocr 0.1.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: markitdown-paddleocr
3
- Version: 0.1.0
3
+ Version: 0.2.2
4
4
  Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
5
5
  Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
6
6
  Project-URL: Issues, https://github.com/microsoft/markitdown/issues
@@ -178,6 +178,93 @@ PaddleOcrConverter.convert()
178
178
  - `Pillow>=9.0.0` - 图像处理
179
179
  - `requests>=2.28.0` - HTTP 请求
180
180
 
181
+ ## 发布到 PyPI
182
+
183
+ ### 前置条件
184
+
185
+ 1. 安装构建工具:
186
+
187
+ ```bash
188
+ pip install build twine hatch
189
+ ```
190
+
191
+ 2. 配置 PyPI API Token(Windows 用户环境变量):
192
+
193
+ ```powershell
194
+ # PowerShell 设置用户环境变量
195
+ [System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
196
+ ```
197
+
198
+ 或在 Bash/Zsh 中:
199
+
200
+ ```bash
201
+ export PYPI_API_TOKEN="pypi-..."
202
+ ```
203
+
204
+ ### 快速发布(推荐)
205
+
206
+ 项目根目录提供了上传脚本,可一键发布两个插件:
207
+
208
+ **Bash / Git Bash:**
209
+ ```bash
210
+ # 构建两个插件
211
+ cd packages/markitdown-glmocr && hatch build
212
+
213
+ cd ../markitdown-paddleocr && hatch build
214
+
215
+ # 上传(自动上传所有构建的版本)
216
+ cd ../..
217
+ ./scripts/pypi-upload.sh
218
+
219
+ # 或指定版本号
220
+ ./scripts/pypi-upload.sh 0.2.0
221
+ ```
222
+
223
+ **PowerShell:**
224
+ ```powershell
225
+ # 构建两个插件
226
+ cd packages/markitdown-glmocr; hatch build
227
+ cd ../markitdown-paddleocr; hatch build
228
+
229
+ # 上传
230
+ cd ../..
231
+ .\scripts\pypi-upload.ps1
232
+
233
+ # 或指定版本号
234
+ .\scripts\pypi-upload.ps1 -Version "0.2.0"
235
+ ```
236
+
237
+ ### 手动发布
238
+
239
+ ```bash
240
+ # 1. 进入项目目录
241
+ cd packages/markitdown-paddleocr
242
+
243
+ # 2. 构建
244
+ hatch build
245
+
246
+ # 3. 检查
247
+ twine check dist/*
248
+
249
+ # 4. 上传
250
+ twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
251
+ ```
252
+
253
+ ### 发布到 TestPyPI(测试)
254
+
255
+ ```bash
256
+ twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*  # 注意:TestPyPI 与正式 PyPI 的账号和 API Token 相互独立,此处 PYPI_API_TOKEN 必须是在 test.pypi.org 上创建的 Token
257
+
258
+ # 从 TestPyPI 安装验证(依赖包通常不在 TestPyPI 上;若依赖解析失败,可追加 --extra-index-url https://pypi.org/simple/ 从正式 PyPI 获取依赖)
259
+ pip install --index-url https://test.pypi.org/simple/ markitdown-paddleocr
260
+ ```
261
+
262
+ ### 注意事项
263
+
264
+ - 发布前确保 `src/markitdown_paddleocr/__about__.py` 中的版本号已更新
265
+ - 同一版本号不能重复上传,如需修正必须 bump 版本号
266
+ - `PYPI_API_TOKEN` 切勿提交到代码仓库
267
+
181
268
  ## 许可证
182
269
 
183
270
  MIT
@@ -152,6 +152,93 @@ PaddleOcrConverter.convert()
152
152
  - `Pillow>=9.0.0` - 图像处理
153
153
  - `requests>=2.28.0` - HTTP 请求
154
154
 
155
+ ## 发布到 PyPI
156
+
157
+ ### 前置条件
158
+
159
+ 1. 安装构建工具:
160
+
161
+ ```bash
162
+ pip install build twine hatch
163
+ ```
164
+
165
+ 2. 配置 PyPI API Token(Windows 用户环境变量):
166
+
167
+ ```powershell
168
+ # PowerShell 设置用户环境变量
169
+ [System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
170
+ ```
171
+
172
+ 或在 Bash/Zsh 中:
173
+
174
+ ```bash
175
+ export PYPI_API_TOKEN="pypi-..."
176
+ ```
177
+
178
+ ### 快速发布(推荐)
179
+
180
+ 项目根目录提供了上传脚本,可一键发布两个插件:
181
+
182
+ **Bash / Git Bash:**
183
+ ```bash
184
+ # 构建两个插件
185
+ cd packages/markitdown-glmocr && hatch build
186
+
187
+ cd ../markitdown-paddleocr && hatch build
188
+
189
+ # 上传(自动上传所有构建的版本)
190
+ cd ../..
191
+ ./scripts/pypi-upload.sh
192
+
193
+ # 或指定版本号
194
+ ./scripts/pypi-upload.sh 0.2.0
195
+ ```
196
+
197
+ **PowerShell:**
198
+ ```powershell
199
+ # 构建两个插件
200
+ cd packages/markitdown-glmocr; hatch build
201
+ cd ../markitdown-paddleocr; hatch build
202
+
203
+ # 上传
204
+ cd ../..
205
+ .\scripts\pypi-upload.ps1
206
+
207
+ # 或指定版本号
208
+ .\scripts\pypi-upload.ps1 -Version "0.2.0"
209
+ ```
210
+
211
+ ### 手动发布
212
+
213
+ ```bash
214
+ # 1. 进入项目目录
215
+ cd packages/markitdown-paddleocr
216
+
217
+ # 2. 构建
218
+ hatch build
219
+
220
+ # 3. 检查
221
+ twine check dist/*
222
+
223
+ # 4. 上传
224
+ twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
225
+ ```
226
+
227
+ ### 发布到 TestPyPI(测试)
228
+
229
+ ```bash
230
+ twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*  # 注意:TestPyPI 与正式 PyPI 的账号和 API Token 相互独立,此处 PYPI_API_TOKEN 必须是在 test.pypi.org 上创建的 Token
231
+
232
+ # 从 TestPyPI 安装验证(依赖包通常不在 TestPyPI 上;若依赖解析失败,可追加 --extra-index-url https://pypi.org/simple/ 从正式 PyPI 获取依赖)
233
+ pip install --index-url https://test.pypi.org/simple/ markitdown-paddleocr
234
+ ```
235
+
236
+ ### 注意事项
237
+
238
+ - 发布前确保 `src/markitdown_paddleocr/__about__.py` 中的版本号已更新
239
+ - 同一版本号不能重复上传,如需修正必须 bump 版本号
240
+ - `PYPI_API_TOKEN` 切勿提交到代码仓库
241
+
155
242
  ## 许可证
156
243
 
157
244
  MIT
@@ -0,0 +1 @@
1
+ __version__ = "0.2.2"
@@ -2,6 +2,19 @@
2
2
 
3
3
  import os
4
4
  from dataclasses import dataclass
5
+ from enum import Enum
6
+
7
+
8
class ScanDetectionMode(str, Enum):
    """Strategy used to decide whether a whole PDF is treated as a scan.

    - PAGE_BY_PAGE: analyse every page on its own.
    - FIRST_PAGE_HINT: if the first page is a scan, OCR the whole document.
    - SAMPLING: sample the first N pages; if most of them are scans,
      OCR the whole document.
    """

    # The values are plain strings (the enum mixes in ``str``).
    PAGE_BY_PAGE = "page_by_page"
    FIRST_PAGE_HINT = "first_page_hint"
    SAMPLING = "sampling"
5
18
 
6
19
 
7
20
  @dataclass
@@ -35,6 +48,11 @@ class PaddleOcrConfig:
35
48
  # Processing strategy
36
49
  force_ai: bool = False
37
50
 
51
+ # Scan detection mode for optimization
52
+ scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING
53
+ scan_sample_pages: int = 3 # Number of pages to sample in SAMPLING mode
54
+ scan_text_threshold: int = 50 # Min text length to consider page as non-scanned
55
+
38
56
  @classmethod
39
57
  def from_env(cls, **overrides) -> "PaddleOcrConfig":
40
58
  """Create config from environment variables with optional overrides."""
@@ -0,0 +1,570 @@
1
+ """PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
2
+
3
+ import io
4
+ import logging
5
+ import sys
6
+ from typing import Any, BinaryIO, Optional
7
+
8
+ from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
9
+ from markitdown._exceptions import (
10
+ MISSING_DEPENDENCY_MESSAGE,
11
+ MissingDependencyException,
12
+ )
13
+
14
+ from ._config import PaddleOcrConfig, ScanDetectionMode
15
+ from ._paddle_client import PaddleClient
16
+
17
+ # Import PDF dependencies
18
+ _dependency_exc_info = None
19
+ try:
20
+ import pdfminer
21
+ import pdfminer.high_level
22
+ import pdfplumber
23
+ except ImportError:
24
+ _dependency_exc_info = sys.exc_info()
25
+
26
+
27
+ ACCEPTED_MIME_TYPE_PREFIXES = [
28
+ "application/pdf",
29
+ "application/x-pdf",
30
+ "image/jpeg",
31
+ "image/png",
32
+ ]
33
+
34
+ ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
35
+
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
class PaddleOcrConverter(DocumentConverter):
    """Convert PDF and JPEG/PNG streams to Markdown via the PaddleOCR cloud API.

    PDF handling is hybrid:

    - Pages without images, tables or curves are extracted locally with
      pdfplumber.
    - All other pages are rendered to PNG and sent to the OCR API; if that
      call fails the page falls back to pdfplumber extraction.
    - When the document is judged to be a scan (see ``scan_detection_mode``)
      the whole PDF is uploaded in a single OCR call first, and per-page
      processing is only used if that call fails or returns nothing.

    JPEG/PNG inputs are always sent straight to the OCR API.
    """

    # Extensions that are routed to the image path.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    # Mimetype prefix -> extension used to name the upload when the stream
    # carries no usable file extension.
    _IMAGE_MIME_EXTENSIONS = (
        ("image/jpeg", ".jpg"),
        ("image/png", ".png"),
    )

    def __init__(
        self,
        token: Optional[str] = None,
        model: str = "PaddleOCR-VL-1.5",
        poll_interval: float = 2.0,
        poll_timeout: float = 300.0,
        force_ai: bool = False,
        use_doc_orientation_classify: bool = False,
        use_doc_unwarping: bool = False,
        use_chart_recognition: bool = False,
        scan_detection_mode: Optional[ScanDetectionMode] = None,
        scan_sample_pages: Optional[int] = None,
        scan_text_threshold: Optional[int] = None,
        config: Optional[PaddleOcrConfig] = None,
    ):
        """Initialize the converter.

        When ``config`` is given, each explicit argument overrides the matching
        config field only if it differs from its signature default (booleans
        are OR-ed with the config value; ``scan_*`` arguments override when
        they are not ``None``).

        Args:
            token: PaddleOCR API token. When empty, ``BAIDU_PADDLE_TOKEN`` from
                the environment is what ``accepts`` checks for.
            model: OCR model name.
            poll_interval: Seconds between job status polls.
            poll_timeout: Maximum seconds to wait for an OCR job.
            force_ai: Send every PDF page to OCR, skipping page analysis.
            use_doc_orientation_classify: Enable orientation classification.
            use_doc_unwarping: Enable document unwarping.
            use_chart_recognition: Enable chart recognition.
            scan_detection_mode: How to decide that a whole PDF is a scan
                (defaults to ``ScanDetectionMode.SAMPLING``).
            scan_sample_pages: Pages inspected in SAMPLING mode (default 3).
            scan_text_threshold: Minimum extracted text length for a page with
                images to count as non-scanned (default 50).
            config: Optional ``PaddleOcrConfig`` supplying fallback values.
        """
        if config is not None:
            # NOTE: an argument passed explicitly with its default value is
            # indistinguishable from "not passed", so the config value wins.
            self.token = token or config.token
            self.model = model if model != "PaddleOCR-VL-1.5" else config.model
            self.poll_interval = (
                poll_interval if poll_interval != 2.0 else config.poll_interval
            )
            self.poll_timeout = (
                poll_timeout if poll_timeout != 300.0 else config.poll_timeout
            )
            self.force_ai = force_ai or config.force_ai
            self.use_doc_orientation_classify = (
                use_doc_orientation_classify or config.use_doc_orientation_classify
            )
            self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
            self.use_chart_recognition = (
                use_chart_recognition or config.use_chart_recognition
            )
            self.scan_detection_mode = (
                scan_detection_mode
                if scan_detection_mode is not None
                else config.scan_detection_mode
            )
            self.scan_sample_pages = (
                scan_sample_pages
                if scan_sample_pages is not None
                else config.scan_sample_pages
            )
            self.scan_text_threshold = (
                scan_text_threshold
                if scan_text_threshold is not None
                else config.scan_text_threshold
            )
        else:
            self.token = token
            self.model = model
            self.poll_interval = poll_interval
            self.poll_timeout = poll_timeout
            self.force_ai = force_ai
            self.use_doc_orientation_classify = use_doc_orientation_classify
            self.use_doc_unwarping = use_doc_unwarping
            self.use_chart_recognition = use_chart_recognition
            self.scan_detection_mode = (
                scan_detection_mode
                if scan_detection_mode is not None
                else ScanDetectionMode.SAMPLING
            )
            self.scan_sample_pages = (
                scan_sample_pages if scan_sample_pages is not None else 3
            )
            self.scan_text_threshold = (
                scan_text_threshold if scan_text_threshold is not None else 50
            )

        # The client is created lazily on first OCR call.
        self._client: Optional[PaddleClient] = None

    def _get_client(self) -> PaddleClient:
        """Return the cached ``PaddleClient``, creating it on first use."""
        if self._client is None:
            config = PaddleOcrConfig(
                token=self.token or "",
                model=self.model,
                poll_interval=self.poll_interval,
                poll_timeout=self.poll_timeout,
                force_ai=self.force_ai,
                use_doc_orientation_classify=self.use_doc_orientation_classify,
                use_doc_unwarping=self.use_doc_unwarping,
                use_chart_recognition=self.use_chart_recognition,
                # Keep the client's config in sync with the converter settings.
                scan_detection_mode=self.scan_detection_mode,
                scan_sample_pages=self.scan_sample_pages,
                scan_text_threshold=self.scan_text_threshold,
            )
            self._client = PaddleClient(config=config)
        return self._client

    def _has_token(self) -> bool:
        """Return True if a token was passed in or is set in the environment."""
        if self.token:
            return True
        import os

        return bool(os.environ.get("BAIDU_PADDLE_TOKEN", ""))

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for PDF/JPEG/PNG streams when a token is available."""
        # Without a token the OCR API cannot work, so decline and let other
        # converters (e.g. GlmOcrConverter) get a chance.
        if not self._has_token():
            return False

        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert the stream, routing images to OCR and PDFs to the hybrid path.

        Raises:
            MissingDependencyException: for PDF input when pdfplumber/pdfminer
                could not be imported.
        """
        extension = (stream_info.extension or "").lower()
        mimetype = (stream_info.mimetype or "").lower()

        logger.info("PaddleOcrConverter: 开始转换, 文件类型=%s", extension)

        # Images go straight to the OCR API and never touch the PDF libraries,
        # so the dependency check below must not block them.
        image_extension = self._resolve_image_extension(extension, mimetype)
        if image_extension is not None:
            return self._convert_image(file_stream, image_extension)

        self._require_pdf_dependencies()
        return self._convert_pdf(file_stream)

    @classmethod
    def _resolve_image_extension(cls, extension: str, mimetype: str) -> Optional[str]:
        """Return the image extension to use, or None if the input is not an image.

        The file extension wins when it is recognised; the mimetype is only
        consulted otherwise, so streams accepted purely by an ``image/*``
        mimetype are not mistaken for PDFs.
        """
        if extension in cls._IMAGE_EXTENSIONS:
            return extension
        if extension == ".pdf":
            return None
        for prefix, mapped_extension in cls._IMAGE_MIME_EXTENSIONS:
            if mimetype.startswith(prefix):
                return mapped_extension
        return None

    def _require_pdf_dependencies(self) -> None:
        """Raise ``MissingDependencyException`` if the PDF libraries failed to import."""
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])

    def _convert_image(
        self, file_stream: BinaryIO, extension: str = ".png"
    ) -> DocumentConverterResult:
        """OCR an image stream; OCR errors are logged and re-raised."""
        img_bytes = file_stream.read()
        # The extension is only used to build the upload filename.
        filename = f"image{extension}"

        logger.info("PaddleOcrConverter: 开始 OCR 识别图片, 格式=%s", extension)
        try:
            markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename)
        except Exception as e:
            logger.error(
                "PaddleOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, e
            )
            raise

        logger.info("PaddleOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(markdown))
        return DocumentConverterResult(markdown=markdown)

    def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
        """Convert a PDF with the hybrid pdfplumber / OCR strategy.

        Order of attempts: batch OCR of the whole file (scanned documents
        only), per-page processing, then pdfminer text extraction if
        pdfplumber processing raised or produced nothing.

        Raises:
            RuntimeError: if an OCR call failed and every fallback produced
                empty output, so the framework can try the next converter.
        """
        pdf_bytes = file_stream.read()  # Kept for the batch OCR upload.
        pdf_stream = io.BytesIO(pdf_bytes)
        markdown = ""
        ocr_failed = False
        pdfminer_tried = False

        try:
            with pdfplumber.open(pdf_stream) as pdf:
                total_pages = len(pdf.pages)
                logger.info("PaddleOcrConverter: 开始处理 PDF, 总页数=%d", total_pages)

                # With force_ai every page is OCR-ed individually anyway, so
                # scan detection would only waste text extraction work.
                all_scanned = (not self.force_ai) and self._detect_all_scanned(pdf)

                if all_scanned:
                    # Batch mode: one API call for the whole document.
                    logger.info(
                        "PaddleOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d",
                        total_pages,
                    )
                    try:
                        batch_markdown = self._convert_pdf_batch(pdf_bytes)
                        if batch_markdown.strip():
                            logger.info(
                                "PaddleOcrConverter: 批量OCR完成, 输出长度=%d",
                                len(batch_markdown),
                            )
                            return DocumentConverterResult(markdown=batch_markdown)
                    except Exception as e:
                        logger.warning(
                            "PaddleOcrConverter: 批量OCR失败, 降级为逐页处理, 错误=%s",
                            e,
                        )
                        ocr_failed = True
                    # Empty or failed batch result: fall through to per-page.

                ocr_every_page = self.force_ai or all_scanned
                markdown_parts = []

                for page_num, page in enumerate(pdf.pages):
                    if ocr_every_page:
                        logger.info(
                            "PaddleOcrConverter: 第 %d/%d 页, 使用 PaddleOCR",
                            page_num + 1,
                            total_pages,
                        )
                        use_ocr = True
                    else:
                        page_type = self._analyze_page(page)
                        use_ocr = page_type != "plain_text"
                        if use_ocr:
                            logger.info(
                                "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 PaddleOCR",
                                page_num + 1,
                                total_pages,
                                page_type,
                            )
                        else:
                            logger.info(
                                "PaddleOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber",
                                page_num + 1,
                                total_pages,
                                page_type,
                            )

                    if use_ocr:
                        try:
                            page_markdown = self._convert_with_paddleocr(page, page_num)
                        except Exception as e:
                            logger.warning(
                                "PaddleOcrConverter: 第 %d/%d 页 OCR 失败, 降级为 pdfplumber, 错误=%s",
                                page_num + 1,
                                total_pages,
                                e,
                            )
                            ocr_failed = True
                            page_markdown = self._extract_text_with_tables(page)
                    else:
                        page_markdown = self._extract_text_with_tables(page)

                    if page_markdown.strip():
                        markdown_parts.append(
                            f"## Page {page_num + 1}\n\n{page_markdown}"
                        )

                    page.close()

                markdown = "\n\n".join(markdown_parts).strip()

        except Exception as e:
            logger.error(
                "PaddleOcrConverter: PDF 处理异常, 降级为 pdfminer, 错误=%s", e
            )
            pdf_stream.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
            pdfminer_tried = True

        # Final fallback: pdfplumber/OCR produced nothing. Skipped when
        # pdfminer has already been run on the same bytes above.
        if not markdown and not pdfminer_tried:
            pdf_stream.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

        # If OCR failed and the result is empty, raise so the framework can
        # try the next converter instead of returning an empty document.
        if ocr_failed and not markdown.strip():
            logger.error("PaddleOcrConverter: OCR 失败且所有兜底结果为空, 抛出异常")
            raise RuntimeError(
                "PaddleOcrConverter: OCR failed and all fallbacks returned empty"
            )

        logger.info("PaddleOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown))
        return DocumentConverterResult(markdown=markdown)

    def _convert_pdf_batch(self, pdf_bytes: bytes) -> str:
        """Upload the whole PDF in one OCR call and return the resulting Markdown.

        Args:
            pdf_bytes: Raw PDF file content.

        Returns:
            Whatever Markdown the client returns for the document.
        """
        logger.info("PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes))
        return self._get_client().ocr(
            file_bytes=pdf_bytes,
            filename="document.pdf",
        )

    def _analyze_page(self, page: Any) -> str:
        """Classify a page as ``"complex"`` or ``"plain_text"``.

        A page is complex if it has images, detected tables, or curves.
        """
        if hasattr(page, "images") and page.images:
            return "complex"

        if page.find_tables():
            return "complex"

        if hasattr(page, "curves") and page.curves:
            return "complex"

        return "plain_text"

    def _is_scanned_page(self, page: Any) -> bool:
        """Return True if the page looks like a scanned image.

        A page counts as scanned when it contains at least one image and its
        extractable text is shorter than ``scan_text_threshold`` (or text
        extraction fails).

        Args:
            page: pdfplumber page object.
        """
        has_images = hasattr(page, "images") and bool(page.images)
        if not has_images:
            return False

        try:
            text = page.extract_text() or ""
        except Exception:
            # Best effort: a page whose text cannot be extracted is treated
            # as a scan so that OCR gets a chance.
            return True

        # Substantial text means a digital page with embedded images.
        return len(text.strip()) < self.scan_text_threshold

    def _detect_all_scanned(self, pdf: Any) -> bool:
        """Decide whether the whole PDF should be treated as scanned.

        Args:
            pdf: pdfplumber PDF object.

        Returns:
            False in PAGE_BY_PAGE mode or for empty documents; the first
            page's verdict in FIRST_PAGE_HINT mode; in SAMPLING mode, True
            when a strict majority of the first ``scan_sample_pages`` pages
            are scans.
        """
        if self.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE:
            return False

        total_pages = len(pdf.pages)
        if total_pages == 0:
            return False

        if self.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT:
            first_page = pdf.pages[0]
            is_scanned = self._is_scanned_page(first_page)
            first_page.close()
            if is_scanned:
                logger.info(
                    "PaddleOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR"
                )
            return is_scanned

        if self.scan_detection_mode == ScanDetectionMode.SAMPLING:
            sample_count = min(self.scan_sample_pages, total_pages)
            # A zero or negative sample size inspects no pages; without this
            # guard a negative value made the majority threshold <= 0 and the
            # document was declared scanned without looking at it.
            if sample_count <= 0:
                return False

            scanned_count = sum(
                1
                for index in range(sample_count)
                if self._is_scanned_page(pdf.pages[index])
            )

            majority_threshold = sample_count // 2 + 1
            all_scanned = scanned_count >= majority_threshold

            if all_scanned:
                logger.info(
                    "PaddleOcrConverter: 抽样检测 %d/%d 页为扫描件, 模式=SAMPLING, 全文档使用OCR",
                    scanned_count,
                    sample_count,
                )

            return all_scanned

        return False

    def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
        """Render one page to PNG and OCR it; errors are logged and re-raised.

        Args:
            page: pdfplumber page object.
            page_num: Zero-based page index (logged and named one-based).
        """
        img = page.to_image(resolution=150)
        img_bytes = io.BytesIO()
        img.save(img_bytes, format="PNG")

        logger.info("PaddleOcrConverter: PaddleOCR API 开始识别第 %d 页", page_num + 1)
        try:
            markdown = self._get_client().ocr(
                file_bytes=img_bytes.getvalue(),
                filename=f"page_{page_num + 1}.png",
            )
        except Exception as e:
            logger.error(
                "PaddleOcrConverter: PaddleOCR API 第 %d 页识别异常, 错误=%s",
                page_num + 1,
                e,
            )
            raise

        logger.info(
            "PaddleOcrConverter: PaddleOCR API 第 %d 页识别完成, 输出长度=%d",
            page_num + 1,
            len(markdown),
        )
        return markdown

    def _extract_text_with_tables(self, page: Any) -> str:
        """Return the page text followed by its tables rendered as Markdown.

        Table extraction is best effort: a failure is logged and the text
        extracted so far is still returned.
        """
        parts = []

        text = page.extract_text() or ""
        if text.strip():
            parts.append(text.strip())

        try:
            for table in page.extract_tables() or []:
                if not table:
                    continue
                md_table = self._table_to_markdown(table)
                if md_table.strip():
                    parts.append(md_table)
        except Exception as e:
            logger.warning(
                "PaddleOcrConverter: table extraction failed, keeping text only: %s", e
            )

        return "\n\n".join(parts)

    @staticmethod
    def _format_table_cell(cell: Any) -> str:
        """Return ``cell`` as text that is safe inside one Markdown table cell.

        ``None`` becomes an empty string, line breaks become spaces and pipes
        are escaped, since either would otherwise split the row.
        """
        if cell is None:
            return ""
        return " ".join(str(cell).splitlines()).replace("|", "\\|")

    def _table_to_markdown(self, table: list[list[str]]) -> str:
        """Render a row-major table as a padded Markdown table.

        The first non-empty row is used as the header. Rows whose cells are
        all blank are dropped; short rows are padded with empty cells.

        Returns:
            The Markdown table, or ``""`` if no non-empty row remains.
        """
        if not table:
            return ""

        rows = [[self._format_table_cell(cell) for cell in row] for row in table]
        rows = [row for row in rows if any(cell.strip() for cell in row)]
        if not rows:
            return ""

        column_count = max(len(row) for row in rows)
        col_widths = [
            max(len(row[i]) if i < len(row) else 0 for row in rows)
            for i in range(column_count)
        ]

        lines = []
        for row_idx, row in enumerate(rows):
            padded_row = row + [""] * (column_count - len(row))
            cells = " | ".join(
                cell.ljust(width) for cell, width in zip(padded_row, col_widths)
            )
            lines.append("| " + cells + " |")

            if row_idx == 0:
                # Header separator directly after the first row.
                lines.append("|" + "|".join("-" * (w + 2) for w in col_widths) + "|")

        return "\n".join(lines)

    def close(self):
        """Drop the cached client; a new one is created on the next OCR call."""
        # NOTE(review): PaddleClient may hold network resources - confirm
        # whether it exposes a close() that should be called here.
        self._client = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
@@ -1,13 +1,16 @@
1
1
  """Plugin registration for markitdown-paddleocr."""
2
2
 
3
+ import logging
3
4
  from typing import Any
5
+
4
6
  from markitdown import MarkItDown
5
7
 
6
8
  from ._converter import PaddleOcrConverter
7
9
 
8
-
9
10
  __plugin_interface_version__ = 1
10
11
 
12
+ logger = logging.getLogger(__name__)
13
+
11
14
 
12
15
  def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
13
16
  """Register markitdown-paddleocr converter.
@@ -17,19 +20,31 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
17
20
  2. Environment variables (BAIDU_PADDLE_TOKEN)
18
21
  3. Built-in defaults
19
22
  """
23
+ logger.info("markitdown-paddleocr: 开始注册插件")
24
+
20
25
  # Register converter with higher priority than default PDF converter
21
26
  PRIORITY_PADDLEOCR = -1.0
22
27
 
23
- markitdown.register_converter(
24
- PaddleOcrConverter(
28
+ try:
29
+ converter = PaddleOcrConverter(
25
30
  token=kwargs.get("token"),
26
31
  model=kwargs.get("model", "PaddleOCR-VL-1.5"),
27
32
  poll_interval=kwargs.get("poll_interval", 2.0),
28
33
  poll_timeout=kwargs.get("poll_timeout", 300.0),
29
34
  force_ai=kwargs.get("force_ai", False),
30
- use_doc_orientation_classify=kwargs.get("use_doc_orientation_classify", False),
35
+ use_doc_orientation_classify=kwargs.get(
36
+ "use_doc_orientation_classify", False
37
+ ),
31
38
  use_doc_unwarping=kwargs.get("use_doc_unwarping", False),
32
39
  use_chart_recognition=kwargs.get("use_chart_recognition", False),
33
- ),
34
- priority=PRIORITY_PADDLEOCR,
35
- )
40
+ )
41
+ markitdown.register_converter(
42
+ converter,
43
+ priority=PRIORITY_PADDLEOCR,
44
+ )
45
+ logger.info(
46
+ "markitdown-paddleocr: 插件注册成功, priority=%.1f", PRIORITY_PADDLEOCR
47
+ )
48
+ except Exception as e:
49
+ logger.error("markitdown-paddleocr: 插件注册失败, 错误=%s", e)
50
+ raise
@@ -1 +0,0 @@
1
- __version__ = "0.1.0"
@@ -1,304 +0,0 @@
1
- """PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
2
-
3
- import io
4
- import sys
5
- from typing import Any, BinaryIO, Optional
6
-
7
- from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
8
- from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
9
-
10
- from ._config import PaddleOcrConfig
11
- from ._paddle_client import PaddleClient
12
-
13
- # Import PDF dependencies
14
- _dependency_exc_info = None
15
- try:
16
- import pdfminer
17
- import pdfminer.high_level
18
- import pdfplumber
19
- except ImportError:
20
- _dependency_exc_info = sys.exc_info()
21
-
22
-
23
- ACCEPTED_MIME_TYPE_PREFIXES = [
24
- "application/pdf",
25
- "application/x-pdf",
26
- "image/jpeg",
27
- "image/png",
28
- ]
29
-
30
- ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
31
-
32
-
33
- class PaddleOcrConverter(DocumentConverter):
34
- """Intelligent PDF/Image converter using PaddleOCR cloud API.
35
-
36
- Features:
37
- - Auto-detect page content type (plain text vs images/tables)
38
- - Plain text pages use pdfplumber/pdfminer (fast, free)
39
- - Complex pages use PaddleOCR API for AI-powered OCR
40
- - Image files (PNG, JPG) use PaddleOCR API directly
41
- - Asynchronous job model: submit → poll → fetch result
42
- """
43
-
44
- def __init__(
45
- self,
46
- token: Optional[str] = None,
47
- model: str = "PaddleOCR-VL-1.5",
48
- poll_interval: float = 2.0,
49
- poll_timeout: float = 300.0,
50
- force_ai: bool = False,
51
- use_doc_orientation_classify: bool = False,
52
- use_doc_unwarping: bool = False,
53
- use_chart_recognition: bool = False,
54
- config: Optional[PaddleOcrConfig] = None,
55
- ):
56
- """Initialize converter.
57
-
58
- Args:
59
- token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided)
60
- model: OCR model name (default: PaddleOCR-VL-1.5)
61
- poll_interval: Seconds between status polls (default: 2.0)
62
- poll_timeout: Max seconds to wait for job completion (default: 300.0)
63
- force_ai: Force all pages to use OCR (default: False)
64
- use_doc_orientation_classify: Enable document orientation classification
65
- use_doc_unwarping: Enable document unwarping
66
- use_chart_recognition: Enable chart recognition
67
- config: Optional PaddleOcrConfig instance
68
- """
69
- # Build config from explicit params or provided config
70
- if config:
71
- self.token = token or config.token
72
- self.model = model if model != "PaddleOCR-VL-1.5" else config.model
73
- self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval
74
- self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout
75
- self.force_ai = force_ai or config.force_ai
76
- self.use_doc_orientation_classify = use_doc_orientation_classify or config.use_doc_orientation_classify
77
- self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
78
- self.use_chart_recognition = use_chart_recognition or config.use_chart_recognition
79
- else:
80
- self.token = token
81
- self.model = model
82
- self.poll_interval = poll_interval
83
- self.poll_timeout = poll_timeout
84
- self.force_ai = force_ai
85
- self.use_doc_orientation_classify = use_doc_orientation_classify
86
- self.use_doc_unwarping = use_doc_unwarping
87
- self.use_chart_recognition = use_chart_recognition
88
-
89
- # Lazy init client
90
- self._client: Optional[PaddleClient] = None
91
-
92
- def _get_client(self) -> PaddleClient:
93
- """Get or create PaddleClient instance."""
94
- if self._client is None:
95
- config = PaddleOcrConfig(
96
- token=self.token or "",
97
- model=self.model,
98
- poll_interval=self.poll_interval,
99
- poll_timeout=self.poll_timeout,
100
- force_ai=self.force_ai,
101
- use_doc_orientation_classify=self.use_doc_orientation_classify,
102
- use_doc_unwarping=self.use_doc_unwarping,
103
- use_chart_recognition=self.use_chart_recognition,
104
- )
105
- self._client = PaddleClient(config=config)
106
- return self._client
107
-
108
- def accepts(
109
- self,
110
- file_stream: BinaryIO,
111
- stream_info: StreamInfo,
112
- **kwargs: Any,
113
- ) -> bool:
114
- mimetype = (stream_info.mimetype or "").lower()
115
- extension = (stream_info.extension or "").lower()
116
-
117
- if extension in ACCEPTED_FILE_EXTENSIONS:
118
- return True
119
-
120
- for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
121
- if mimetype.startswith(prefix):
122
- return True
123
-
124
- return False
125
-
126
- def convert(
127
- self,
128
- file_stream: BinaryIO,
129
- stream_info: StreamInfo,
130
- **kwargs: Any,
131
- ) -> DocumentConverterResult:
132
- if _dependency_exc_info is not None:
133
- raise MissingDependencyException(
134
- MISSING_DEPENDENCY_MESSAGE.format(
135
- converter=type(self).__name__,
136
- extension=".pdf",
137
- feature="pdf",
138
- )
139
- ) from _dependency_exc_info[1].with_traceback(
140
- _dependency_exc_info[2]
141
- )
142
-
143
- extension = (stream_info.extension or "").lower()
144
-
145
- # Image files: use PaddleOCR directly
146
- if extension in (".jpg", ".jpeg", ".png"):
147
- return self._convert_image(file_stream, extension)
148
-
149
- # PDF files: use hybrid approach
150
- return self._convert_pdf(file_stream)
151
-
152
- def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
153
- """Convert image file using PaddleOCR API."""
154
- img_bytes = file_stream.read()
155
- filename = f"image{extension}"
156
-
157
- try:
158
- markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename)
159
- return DocumentConverterResult(markdown=markdown)
160
- except Exception as e:
161
- return DocumentConverterResult(
162
- markdown=f"<!-- Error converting image with PaddleOCR: {e} -->"
163
- )
164
-
165
- def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
166
- """Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages)."""
167
- pdf_stream = io.BytesIO(file_stream.read())
168
- markdown_parts = []
169
-
170
- try:
171
- with pdfplumber.open(pdf_stream) as pdf:
172
- for page_num, page in enumerate(pdf.pages):
173
- # Analyze page type
174
- page_type = self._analyze_page(page)
175
-
176
- # Choose processing method
177
- if self.force_ai or page_type != "plain_text":
178
- # Complex content: use PaddleOCR
179
- markdown = self._convert_with_paddleocr(page, page_num)
180
- else:
181
- # Plain text: use pdfplumber
182
- markdown = self._extract_text_with_tables(page)
183
-
184
- if markdown.strip():
185
- markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}")
186
-
187
- page.close()
188
-
189
- markdown = "\n\n".join(markdown_parts).strip()
190
-
191
- except Exception:
192
- # Fallback to pdfminer
193
- pdf_stream.seek(0)
194
- markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
195
-
196
- # Final fallback
197
- if not markdown:
198
- pdf_stream.seek(0)
199
- markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
200
-
201
- return DocumentConverterResult(markdown=markdown)
202
-
203
- def _analyze_page(self, page: Any) -> str:
204
- """Analyze page content type."""
205
- # Check for images
206
- if hasattr(page, "images") and page.images:
207
- return "complex"
208
-
209
- # Check for tables
210
- tables = page.find_tables()
211
- if tables:
212
- return "complex"
213
-
214
- # Check for graphics/curves
215
- if hasattr(page, "curves") and page.curves:
216
- return "complex"
217
-
218
- return "plain_text"
219
-
220
- def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
221
- """Convert page using PaddleOCR API."""
222
- try:
223
- # Render page to image
224
- img = page.to_image(resolution=150)
225
- img_bytes = io.BytesIO()
226
- img.save(img_bytes, format="PNG")
227
-
228
- markdown = self._get_client().ocr(
229
- file_bytes=img_bytes.getvalue(),
230
- filename=f"page_{page_num + 1}.png",
231
- )
232
- return markdown
233
-
234
- except Exception:
235
- # Fallback to pdfplumber text extraction
236
- return self._extract_text_with_tables(page)
237
-
238
- def _extract_text_with_tables(self, page: Any) -> str:
239
- """Extract text and tables from page."""
240
- parts = []
241
-
242
- # Extract text
243
- text = page.extract_text() or ""
244
- if text.strip():
245
- parts.append(text.strip())
246
-
247
- # Extract tables
248
- try:
249
- tables = page.extract_tables()
250
- if tables:
251
- for table in tables:
252
- if table:
253
- md_table = self._table_to_markdown(table)
254
- if md_table.strip():
255
- parts.append(md_table)
256
- except Exception:
257
- pass
258
-
259
- return "\n\n".join(parts)
260
-
261
- def _table_to_markdown(self, table: list[list[str]]) -> str:
262
- """Convert table to Markdown."""
263
- if not table:
264
- return ""
265
-
266
- # Filter None values
267
- table = [[cell if cell is not None else "" for cell in row] for row in table]
268
-
269
- # Filter empty rows
270
- table = [row for row in table if any(cell.strip() for cell in row)]
271
-
272
- if not table:
273
- return ""
274
-
275
- # Calculate column widths
276
- col_widths = [
277
- max(len(str(row[i])) if i < len(row) else 0 for row in table)
278
- for i in range(max(len(row) for row in table))
279
- ]
280
-
281
- # Format table
282
- lines = []
283
- for row_idx, row in enumerate(table):
284
- padded_row = row + [""] * (len(col_widths) - len(row))
285
- line = "| " + " | ".join(
286
- str(cell).ljust(width) for cell, width in zip(padded_row, col_widths)
287
- ) + " |"
288
- lines.append(line)
289
-
290
- if row_idx == 0:
291
- sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
292
- lines.append(sep)
293
-
294
- return "\n".join(lines)
295
-
296
- def close(self):
297
- """Close the client."""
298
- self._client = None
299
-
300
- def __enter__(self):
301
- return self
302
-
303
- def __exit__(self, exc_type, exc_val, exc_tb):
304
- self.close()