markitdown-paddleocr 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/PKG-INFO +4 -4
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/README.md +3 -3
- markitdown_paddleocr-0.2.3/src/markitdown_paddleocr/__about__.py +1 -0
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_config.py +3 -2
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_converter.py +9 -5
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_dual_converter.py +11 -4
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_plugin.py +1 -1
- markitdown_paddleocr-0.2.2/src/markitdown_paddleocr/__about__.py +0 -1
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/.gitignore +0 -0
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/pyproject.toml +0 -0
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/__init__.py +0 -0
- {markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_paddle_client.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: markitdown-paddleocr
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
|
|
5
5
|
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
@@ -51,7 +51,7 @@ pip install markitdown-paddleocr
|
|
|
51
51
|
export BAIDU_PADDLE_TOKEN="your-paddle-token"
|
|
52
52
|
|
|
53
53
|
# 可选
|
|
54
|
-
export PADDLE_OCR_MODEL="PaddleOCR-VL-1.
|
|
54
|
+
export PADDLE_OCR_MODEL="PaddleOCR-VL-1.6" # 模型名称
|
|
55
55
|
```
|
|
56
56
|
|
|
57
57
|
### 配置优先级
|
|
@@ -129,7 +129,7 @@ print(markdown)
|
|
|
129
129
|
| 参数 | 类型 | 默认值 | 说明 |
|
|
130
130
|
|------|------|--------|------|
|
|
131
131
|
| `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
|
|
132
|
-
| `model` | str | `PaddleOCR-VL-1.
|
|
132
|
+
| `model` | str | `PaddleOCR-VL-1.6` | OCR 模型名称 |
|
|
133
133
|
| `poll_interval` | float | 2.0 | 轮询间隔(秒) |
|
|
134
134
|
| `poll_timeout` | float | 300.0 | 轮询超时(秒) |
|
|
135
135
|
| `force_ai` | bool | False | 强制所有页面使用 OCR |
|
|
@@ -142,7 +142,7 @@ print(markdown)
|
|
|
142
142
|
| 变量 | 说明 | 示例 |
|
|
143
143
|
|------|------|------|
|
|
144
144
|
| `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
|
|
145
|
-
| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.
|
|
145
|
+
| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.6` |
|
|
146
146
|
|
|
147
147
|
## 工作原理
|
|
148
148
|
|
|
@@ -25,7 +25,7 @@ pip install markitdown-paddleocr
|
|
|
25
25
|
export BAIDU_PADDLE_TOKEN="your-paddle-token"
|
|
26
26
|
|
|
27
27
|
# 可选
|
|
28
|
-
export PADDLE_OCR_MODEL="PaddleOCR-VL-1.
|
|
28
|
+
export PADDLE_OCR_MODEL="PaddleOCR-VL-1.6" # 模型名称
|
|
29
29
|
```
|
|
30
30
|
|
|
31
31
|
### 配置优先级
|
|
@@ -103,7 +103,7 @@ print(markdown)
|
|
|
103
103
|
| 参数 | 类型 | 默认值 | 说明 |
|
|
104
104
|
|------|------|--------|------|
|
|
105
105
|
| `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
|
|
106
|
-
| `model` | str | `PaddleOCR-VL-1.
|
|
106
|
+
| `model` | str | `PaddleOCR-VL-1.6` | OCR 模型名称 |
|
|
107
107
|
| `poll_interval` | float | 2.0 | 轮询间隔(秒) |
|
|
108
108
|
| `poll_timeout` | float | 300.0 | 轮询超时(秒) |
|
|
109
109
|
| `force_ai` | bool | False | 强制所有页面使用 OCR |
|
|
@@ -116,7 +116,7 @@ print(markdown)
|
|
|
116
116
|
| 变量 | 说明 | 示例 |
|
|
117
117
|
|------|------|------|
|
|
118
118
|
| `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
|
|
119
|
-
| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.
|
|
119
|
+
| `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.6` |
|
|
120
120
|
|
|
121
121
|
## 工作原理
|
|
122
122
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.3"
|
{markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_config.py
RENAMED
|
@@ -12,6 +12,7 @@ class ScanDetectionMode(str, Enum):
|
|
|
12
12
|
- FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR
|
|
13
13
|
- SAMPLING: 抽样前N页,多数是扫描件则全部OCR
|
|
14
14
|
"""
|
|
15
|
+
|
|
15
16
|
PAGE_BY_PAGE = "page_by_page"
|
|
16
17
|
FIRST_PAGE_HINT = "first_page_hint"
|
|
17
18
|
SAMPLING = "sampling"
|
|
@@ -31,7 +32,7 @@ class PaddleOcrConfig:
|
|
|
31
32
|
token: str = "" # Reads from BAIDU_PADDLE_TOKEN by default
|
|
32
33
|
|
|
33
34
|
# OCR model
|
|
34
|
-
model: str = "PaddleOCR-VL-1.
|
|
35
|
+
model: str = "PaddleOCR-VL-1.6"
|
|
35
36
|
|
|
36
37
|
# API endpoint
|
|
37
38
|
job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
|
|
@@ -58,7 +59,7 @@ class PaddleOcrConfig:
|
|
|
58
59
|
"""Create config from environment variables with optional overrides."""
|
|
59
60
|
defaults = {
|
|
60
61
|
"token": os.environ.get("BAIDU_PADDLE_TOKEN", ""),
|
|
61
|
-
"model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.
|
|
62
|
+
"model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.6"),
|
|
62
63
|
}
|
|
63
64
|
defaults.update(overrides)
|
|
64
65
|
return cls(**defaults)
|
{markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_converter.py
RENAMED
|
@@ -51,7 +51,7 @@ class PaddleOcrConverter(DocumentConverter):
|
|
|
51
51
|
def __init__(
|
|
52
52
|
self,
|
|
53
53
|
token: Optional[str] = None,
|
|
54
|
-
model: str = "PaddleOCR-VL-1.
|
|
54
|
+
model: str = "PaddleOCR-VL-1.6",
|
|
55
55
|
poll_interval: float = 2.0,
|
|
56
56
|
poll_timeout: float = 300.0,
|
|
57
57
|
force_ai: bool = False,
|
|
@@ -67,7 +67,7 @@ class PaddleOcrConverter(DocumentConverter):
|
|
|
67
67
|
|
|
68
68
|
Args:
|
|
69
69
|
token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided)
|
|
70
|
-
model: OCR model name (default: PaddleOCR-VL-1.
|
|
70
|
+
model: OCR model name (default: PaddleOCR-VL-1.6)
|
|
71
71
|
poll_interval: Seconds between status polls (default: 2.0)
|
|
72
72
|
poll_timeout: Max seconds to wait for job completion (default: 300.0)
|
|
73
73
|
force_ai: Force all pages to use OCR (default: False)
|
|
@@ -82,7 +82,7 @@ class PaddleOcrConverter(DocumentConverter):
|
|
|
82
82
|
# Build config from explicit params or provided config
|
|
83
83
|
if config:
|
|
84
84
|
self.token = token or config.token
|
|
85
|
-
self.model = model if model != "PaddleOCR-VL-1.
|
|
85
|
+
self.model = model if model != "PaddleOCR-VL-1.6" else config.model
|
|
86
86
|
self.poll_interval = (
|
|
87
87
|
poll_interval if poll_interval != 2.0 else config.poll_interval
|
|
88
88
|
)
|
|
@@ -126,7 +126,9 @@ class PaddleOcrConverter(DocumentConverter):
|
|
|
126
126
|
if scan_detection_mode is not None
|
|
127
127
|
else ScanDetectionMode.SAMPLING
|
|
128
128
|
)
|
|
129
|
-
self.scan_sample_pages =
|
|
129
|
+
self.scan_sample_pages = (
|
|
130
|
+
scan_sample_pages if scan_sample_pages is not None else 3
|
|
131
|
+
)
|
|
130
132
|
self.scan_text_threshold = (
|
|
131
133
|
scan_text_threshold if scan_text_threshold is not None else 50
|
|
132
134
|
)
|
|
@@ -357,7 +359,9 @@ class PaddleOcrConverter(DocumentConverter):
|
|
|
357
359
|
Returns:
|
|
358
360
|
Markdown text from all pages.
|
|
359
361
|
"""
|
|
360
|
-
logger.info(
|
|
362
|
+
logger.info(
|
|
363
|
+
"PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes)
|
|
364
|
+
)
|
|
361
365
|
markdown = self._get_client().ocr(
|
|
362
366
|
file_bytes=pdf_bytes,
|
|
363
367
|
filename="document.pdf",
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
"""DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Any, BinaryIO, Optional
|
|
5
5
|
|
|
6
|
-
from markitdown import
|
|
7
|
-
|
|
6
|
+
from markitdown import (
|
|
7
|
+
DocumentConverter,
|
|
8
|
+
DocumentConverterResult,
|
|
9
|
+
MarkItDown,
|
|
10
|
+
StreamInfo,
|
|
11
|
+
)
|
|
8
12
|
|
|
9
13
|
logger = logging.getLogger(__name__)
|
|
10
14
|
|
|
@@ -28,7 +32,7 @@ class DualOcrConverter(DocumentConverter):
|
|
|
28
32
|
glmocr_force_ai: bool = False,
|
|
29
33
|
# paddleocr kwargs
|
|
30
34
|
paddleocr_token: Optional[str] = None,
|
|
31
|
-
paddleocr_model: str = "PaddleOCR-VL-1.
|
|
35
|
+
paddleocr_model: str = "PaddleOCR-VL-1.6",
|
|
32
36
|
paddleocr_poll_interval: float = 2.0,
|
|
33
37
|
paddleocr_poll_timeout: float = 300.0,
|
|
34
38
|
paddleocr_force_ai: bool = False,
|
|
@@ -61,6 +65,7 @@ class DualOcrConverter(DocumentConverter):
|
|
|
61
65
|
"""Lazily init both converters."""
|
|
62
66
|
try:
|
|
63
67
|
from markitdown_glmocr import GlmOcrConverter
|
|
68
|
+
|
|
64
69
|
# Filter out None values
|
|
65
70
|
kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None}
|
|
66
71
|
self._primary = GlmOcrConverter(**kwargs)
|
|
@@ -71,6 +76,7 @@ class DualOcrConverter(DocumentConverter):
|
|
|
71
76
|
|
|
72
77
|
try:
|
|
73
78
|
from markitdown_paddleocr import PaddleOcrConverter
|
|
79
|
+
|
|
74
80
|
kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None}
|
|
75
81
|
self._fallback = PaddleOcrConverter(**kwargs)
|
|
76
82
|
logger.info("paddleocr converter initialized (fallback)")
|
|
@@ -155,6 +161,7 @@ class DualOcrConverter(DocumentConverter):
|
|
|
155
161
|
def io_bytes(data: bytes):
|
|
156
162
|
"""Create a seekable BytesIO from bytes."""
|
|
157
163
|
import io
|
|
164
|
+
|
|
158
165
|
buf = io.BytesIO(data)
|
|
159
166
|
buf.seek(0)
|
|
160
167
|
return buf
|
{markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_plugin.py
RENAMED
|
@@ -28,7 +28,7 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
|
|
|
28
28
|
try:
|
|
29
29
|
converter = PaddleOcrConverter(
|
|
30
30
|
token=kwargs.get("token"),
|
|
31
|
-
model=kwargs.get("model", "PaddleOCR-VL-1.
|
|
31
|
+
model=kwargs.get("model", "PaddleOCR-VL-1.6"),
|
|
32
32
|
poll_interval=kwargs.get("poll_interval", 2.0),
|
|
33
33
|
poll_timeout=kwargs.get("poll_timeout", 300.0),
|
|
34
34
|
force_ai=kwargs.get("force_ai", False),
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.2"
|
|
File without changes
|
|
File without changes
|
{markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/__init__.py
RENAMED
|
File without changes
|
{markitdown_paddleocr-0.2.2 → markitdown_paddleocr-0.2.3}/src/markitdown_paddleocr/_paddle_client.py
RENAMED
|
File without changes
|