markitdown-paddleocr 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: markitdown-paddleocr
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
5
5
  Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
6
6
  Project-URL: Issues, https://github.com/microsoft/markitdown/issues
@@ -51,7 +51,7 @@ pip install markitdown-paddleocr
51
51
  export BAIDU_PADDLE_TOKEN="your-paddle-token"
52
52
 
53
53
  # 可选
54
- export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称
54
+ export PADDLE_OCR_MODEL="PaddleOCR-VL-1.6" # 模型名称
55
55
  ```
56
56
 
57
57
  ### 配置优先级
@@ -129,7 +129,7 @@ print(markdown)
129
129
  | 参数 | 类型 | 默认值 | 说明 |
130
130
  |------|------|--------|------|
131
131
  | `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
132
- | `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 |
132
+ | `model` | str | `PaddleOCR-VL-1.6` | OCR 模型名称 |
133
133
  | `poll_interval` | float | 2.0 | 轮询间隔(秒) |
134
134
  | `poll_timeout` | float | 300.0 | 轮询超时(秒) |
135
135
  | `force_ai` | bool | False | 强制所有页面使用 OCR |
@@ -142,7 +142,7 @@ print(markdown)
142
142
  | 变量 | 说明 | 示例 |
143
143
  |------|------|------|
144
144
  | `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
145
- | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` |
145
+ | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.6` |
146
146
 
147
147
  ## 工作原理
148
148
 
@@ -25,7 +25,7 @@ pip install markitdown-paddleocr
25
25
  export BAIDU_PADDLE_TOKEN="your-paddle-token"
26
26
 
27
27
  # 可选
28
- export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称
28
+ export PADDLE_OCR_MODEL="PaddleOCR-VL-1.6" # 模型名称
29
29
  ```
30
30
 
31
31
  ### 配置优先级
@@ -103,7 +103,7 @@ print(markdown)
103
103
  | 参数 | 类型 | 默认值 | 说明 |
104
104
  |------|------|--------|------|
105
105
  | `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
106
- | `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 |
106
+ | `model` | str | `PaddleOCR-VL-1.6` | OCR 模型名称 |
107
107
  | `poll_interval` | float | 2.0 | 轮询间隔(秒) |
108
108
  | `poll_timeout` | float | 300.0 | 轮询超时(秒) |
109
109
  | `force_ai` | bool | False | 强制所有页面使用 OCR |
@@ -116,7 +116,7 @@ print(markdown)
116
116
  | 变量 | 说明 | 示例 |
117
117
  |------|------|------|
118
118
  | `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
119
- | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` |
119
+ | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.6` |
120
120
 
121
121
  ## 工作原理
122
122
 
@@ -0,0 +1 @@
1
+ __version__ = "0.2.3"
@@ -12,6 +12,7 @@ class ScanDetectionMode(str, Enum):
12
12
  - FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR
13
13
  - SAMPLING: 抽样前N页,多数是扫描件则全部OCR
14
14
  """
15
+
15
16
  PAGE_BY_PAGE = "page_by_page"
16
17
  FIRST_PAGE_HINT = "first_page_hint"
17
18
  SAMPLING = "sampling"
@@ -31,7 +32,7 @@ class PaddleOcrConfig:
31
32
  token: str = "" # Reads from BAIDU_PADDLE_TOKEN by default
32
33
 
33
34
  # OCR model
34
- model: str = "PaddleOCR-VL-1.5"
35
+ model: str = "PaddleOCR-VL-1.6"
35
36
 
36
37
  # API endpoint
37
38
  job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"
@@ -58,7 +59,7 @@ class PaddleOcrConfig:
58
59
  """Create config from environment variables with optional overrides."""
59
60
  defaults = {
60
61
  "token": os.environ.get("BAIDU_PADDLE_TOKEN", ""),
61
- "model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.5"),
62
+ "model": os.environ.get("PADDLE_OCR_MODEL", "PaddleOCR-VL-1.6"),
62
63
  }
63
64
  defaults.update(overrides)
64
65
  return cls(**defaults)
@@ -51,7 +51,7 @@ class PaddleOcrConverter(DocumentConverter):
51
51
  def __init__(
52
52
  self,
53
53
  token: Optional[str] = None,
54
- model: str = "PaddleOCR-VL-1.5",
54
+ model: str = "PaddleOCR-VL-1.6",
55
55
  poll_interval: float = 2.0,
56
56
  poll_timeout: float = 300.0,
57
57
  force_ai: bool = False,
@@ -67,7 +67,7 @@ class PaddleOcrConverter(DocumentConverter):
67
67
 
68
68
  Args:
69
69
  token: Baidu PaddleOCR token (reads from BAIDU_PADDLE_TOKEN env var if not provided)
70
- model: OCR model name (default: PaddleOCR-VL-1.5)
70
+ model: OCR model name (default: PaddleOCR-VL-1.6)
71
71
  poll_interval: Seconds between status polls (default: 2.0)
72
72
  poll_timeout: Max seconds to wait for job completion (default: 300.0)
73
73
  force_ai: Force all pages to use OCR (default: False)
@@ -82,7 +82,7 @@ class PaddleOcrConverter(DocumentConverter):
82
82
  # Build config from explicit params or provided config
83
83
  if config:
84
84
  self.token = token or config.token
85
- self.model = model if model != "PaddleOCR-VL-1.5" else config.model
85
+ self.model = model if model != "PaddleOCR-VL-1.6" else config.model
86
86
  self.poll_interval = (
87
87
  poll_interval if poll_interval != 2.0 else config.poll_interval
88
88
  )
@@ -126,7 +126,9 @@ class PaddleOcrConverter(DocumentConverter):
126
126
  if scan_detection_mode is not None
127
127
  else ScanDetectionMode.SAMPLING
128
128
  )
129
- self.scan_sample_pages = scan_sample_pages if scan_sample_pages is not None else 3
129
+ self.scan_sample_pages = (
130
+ scan_sample_pages if scan_sample_pages is not None else 3
131
+ )
130
132
  self.scan_text_threshold = (
131
133
  scan_text_threshold if scan_text_threshold is not None else 50
132
134
  )
@@ -357,7 +359,9 @@ class PaddleOcrConverter(DocumentConverter):
357
359
  Returns:
358
360
  Markdown text from all pages.
359
361
  """
360
- logger.info("PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes))
362
+ logger.info(
363
+ "PaddleOcrConverter: 批量上传PDF到OCR API, 大小=%d bytes", len(pdf_bytes)
364
+ )
361
365
  markdown = self._get_client().ocr(
362
366
  file_bytes=pdf_bytes,
363
367
  filename="document.pdf",
@@ -1,10 +1,14 @@
1
1
  """DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation."""
2
2
 
3
3
  import logging
4
- from typing import Optional
4
+ from typing import Any, BinaryIO, Optional
5
5
 
6
- from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
7
- from typing import BinaryIO, Any
6
+ from markitdown import (
7
+ DocumentConverter,
8
+ DocumentConverterResult,
9
+ MarkItDown,
10
+ StreamInfo,
11
+ )
8
12
 
9
13
  logger = logging.getLogger(__name__)
10
14
 
@@ -28,7 +32,7 @@ class DualOcrConverter(DocumentConverter):
28
32
  glmocr_force_ai: bool = False,
29
33
  # paddleocr kwargs
30
34
  paddleocr_token: Optional[str] = None,
31
- paddleocr_model: str = "PaddleOCR-VL-1.5",
35
+ paddleocr_model: str = "PaddleOCR-VL-1.6",
32
36
  paddleocr_poll_interval: float = 2.0,
33
37
  paddleocr_poll_timeout: float = 300.0,
34
38
  paddleocr_force_ai: bool = False,
@@ -61,6 +65,7 @@ class DualOcrConverter(DocumentConverter):
61
65
  """Lazily init both converters."""
62
66
  try:
63
67
  from markitdown_glmocr import GlmOcrConverter
68
+
64
69
  # Filter out None values
65
70
  kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None}
66
71
  self._primary = GlmOcrConverter(**kwargs)
@@ -71,6 +76,7 @@ class DualOcrConverter(DocumentConverter):
71
76
 
72
77
  try:
73
78
  from markitdown_paddleocr import PaddleOcrConverter
79
+
74
80
  kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None}
75
81
  self._fallback = PaddleOcrConverter(**kwargs)
76
82
  logger.info("paddleocr converter initialized (fallback)")
@@ -155,6 +161,7 @@ class DualOcrConverter(DocumentConverter):
155
161
  def io_bytes(data: bytes):
156
162
  """Create a seekable BytesIO from bytes."""
157
163
  import io
164
+
158
165
  buf = io.BytesIO(data)
159
166
  buf.seek(0)
160
167
  return buf
@@ -28,7 +28,7 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
28
28
  try:
29
29
  converter = PaddleOcrConverter(
30
30
  token=kwargs.get("token"),
31
- model=kwargs.get("model", "PaddleOCR-VL-1.5"),
31
+ model=kwargs.get("model", "PaddleOCR-VL-1.6"),
32
32
  poll_interval=kwargs.get("poll_interval", 2.0),
33
33
  poll_timeout=kwargs.get("poll_timeout", 300.0),
34
34
  force_ai=kwargs.get("force_ai", False),
@@ -1 +0,0 @@
1
- __version__ = "0.2.2"