markitdown-paddleocr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ .vscode
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+ .test-logs/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
113
+ .pdm.toml
114
+ .pdm-python
115
+ .pdm-build/
116
+
117
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118
+ __pypackages__/
119
+
120
+ # Celery stuff
121
+ celerybeat-schedule
122
+ celerybeat.pid
123
+
124
+ # SageMath parsed files
125
+ *.sage.py
126
+
127
+ # Environments
128
+ .env
129
+ .venv
130
+ env/
131
+ venv/
132
+ ENV/
133
+ env.bak/
134
+ venv.bak/
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ #.idea/
166
+ src/.DS_Store
167
+ .DS_Store
168
+ .cursorrules
169
+
170
+ # Local secrets (never commit)
171
+ .secrets.local
172
+ *.secrets
173
+ .env.local
174
+ test-data/
@@ -0,0 +1,183 @@
1
+ Metadata-Version: 2.4
2
+ Name: markitdown-paddleocr
3
+ Version: 0.1.0
4
+ Summary: Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API
5
+ Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
6
+ Project-URL: Issues, https://github.com/microsoft/markitdown/issues
7
+ Project-URL: Source, https://github.com/microsoft/markitdown
8
+ Author-email: Contributors <noreply@github.com>
9
+ License-Expression: MIT
10
+ Keywords: baidu,markitdown,ocr,paddleocr,pdf,vision
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Programming Language :: Python
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: markitdown>=0.1.0
19
+ Requires-Dist: pdfminer-six>=20251230
20
+ Requires-Dist: pdfplumber>=0.11.9
21
+ Requires-Dist: pillow>=9.0.0
22
+ Requires-Dist: requests>=2.28.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # markitdown-paddleocr
28
+
29
+ 智能 PDF/图片转 Markdown 插件,使用百度 PaddleOCR 云端 API 驱动的 OCR 识别。
30
+
31
+ ## 特性
32
+
33
+ - 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格)
34
+ - 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低
35
+ - 🤖 **AI 增强**:复杂页面(图片、表格)使用 PaddleOCR API 转换为 Markdown
36
+ - 🔄 **异步 Job 模型**:提交 OCR 任务 → 轮询状态 → 获取结果
37
+ - 📊 **结构化输出**:返回 Markdown(含表格、公式、图表等)
38
+
39
+ ## 安装
40
+
41
+ ```bash
42
+ pip install markitdown-paddleocr
43
+ ```
44
+
45
+ ## 配置
46
+
47
+ ### 环境变量(推荐)
48
+
49
+ ```bash
50
+ # 必需:百度 PaddleOCR Token
51
+ export BAIDU_PADDLE_TOKEN="your-paddle-token"
52
+
53
+ # 可选
54
+ export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称
55
+ ```
56
+
57
+ ### 配置优先级
58
+
59
+ ```
60
+ 构造函数参数 > 环境变量 > 内置默认值
61
+ ```
62
+
63
+ ## 使用方法
64
+
65
+ ### 命令行(推荐)
66
+
67
+ ```bash
68
+ # 1. 设置 Token
69
+ export BAIDU_PADDLE_TOKEN="your-token"
70
+
71
+ # 2. 查看已安装插件
72
+ markitdown --list-plugins
73
+
74
+ # 3. 使用插件转换 PDF
75
+ markitdown -p document.pdf
76
+
77
+ # 4. 保存到文件
78
+ markitdown -p document.pdf -o output.md
79
+ ```
80
+
81
+ ### Python API
82
+
83
+ ```python
84
+ from markitdown import MarkItDown
85
+ from markitdown_paddleocr import PaddleOcrConverter
86
+
87
+ # 方式1:自动从环境变量读取 BAIDU_PADDLE_TOKEN
88
+ converter = PaddleOcrConverter()
89
+ md = MarkItDown(enable_plugins=False)
90
+ md.register_converter(converter, priority=-1.0)
91
+ result = md.convert("document.pdf")
92
+ print(result.markdown)
93
+
94
+ # 方式2:手动传入 Token
95
+ converter = PaddleOcrConverter(token="your-token")
96
+ md = MarkItDown(enable_plugins=False)
97
+ md.register_converter(converter, priority=-1.0)
98
+ result = md.convert("document.pdf")
99
+ print(result.markdown)
100
+
101
+ # 方式3:强制所有页面使用 OCR
102
+ converter = PaddleOcrConverter(token="your-token", force_ai=True)
103
+ md = MarkItDown(enable_plugins=False)
104
+ md.register_converter(converter, priority=-1.0)
105
+ result = md.convert("document.pdf")
106
+ print(result.markdown)
107
+ ```
108
+
109
+ ### 直接使用 PaddleClient
110
+
111
+ ```python
112
+ from markitdown_paddleocr import PaddleClient
113
+
114
+ client = PaddleClient(token="your-token")
115
+
116
+ # 本地文件
117
+ markdown = client.ocr(file_bytes=open("image.png", "rb").read(), filename="image.png")
118
+ print(markdown)
119
+
120
+ # URL 模式
121
+ markdown = client.ocr(file_url="https://example.com/document.pdf")
122
+ print(markdown)
123
+ ```
124
+
125
+ ## 配置选项
126
+
127
+ ### PaddleOcrConverter 参数
128
+
129
+ | 参数 | 类型 | 默认值 | 说明 |
130
+ |------|------|--------|------|
131
+ | `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
132
+ | `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 |
133
+ | `poll_interval` | float | 2.0 | 轮询间隔(秒) |
134
+ | `poll_timeout` | float | 300.0 | 轮询超时(秒) |
135
+ | `force_ai` | bool | False | 强制所有页面使用 OCR |
136
+ | `use_doc_orientation_classify` | bool | False | 文档方向分类 |
137
+ | `use_doc_unwarping` | bool | False | 文档去扭曲 |
138
+ | `use_chart_recognition` | bool | False | 图表识别 |
139
+
140
+ ### 环境变量
141
+
142
+ | 变量 | 说明 | 示例 |
143
+ |------|------|------|
144
+ | `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
145
+ | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` |
146
+
147
+ ## 工作原理
148
+
149
+ ```
150
+ PDF/图片 输入
151
+
152
+
153
+ PaddleOcrConverter.convert()
154
+
155
+ ├─ 图片文件 ──► PaddleClient.ocr() ──► markdown
156
+
157
+ └─ PDF 文件 ──► 逐页分析内容类型
158
+
159
+ ├─ 纯文本页 ──► pdfplumber 提取文本
160
+
161
+ └─ 复杂页(图片/表格)
162
+
163
+ └─► 渲染为图片 ──► PaddleClient.ocr()
164
+
165
+ ├─ POST /api/v2/ocr/jobs (提交 Job)
166
+ ├─ GET /api/v2/ocr/jobs/{id} (轮询状态)
167
+ └─ GET jsonUrl (获取 JSONL 结果)
168
+
169
+
170
+ 合并输出完整 Markdown
171
+ ```
172
+
173
+ ## 依赖
174
+
175
+ - `markitdown>=0.1.0` - 基础框架
176
+ - `pdfplumber>=0.11.9` - PDF 解析和截图
177
+ - `pdfminer.six>=20251230` - 文本提取备用
178
+ - `Pillow>=9.0.0` - 图像处理
179
+ - `requests>=2.28.0` - HTTP 请求
180
+
181
+ ## 许可证
182
+
183
+ MIT
@@ -0,0 +1,157 @@
1
+ # markitdown-paddleocr
2
+
3
+ 智能 PDF/图片转 Markdown 插件,使用百度 PaddleOCR 云端 API 驱动的 OCR 识别。
4
+
5
+ ## 特性
6
+
7
+ - 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格)
8
+ - 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低
9
+ - 🤖 **AI 增强**:复杂页面(图片、表格)使用 PaddleOCR API 转换为 Markdown
10
+ - 🔄 **异步 Job 模型**:提交 OCR 任务 → 轮询状态 → 获取结果
11
+ - 📊 **结构化输出**:返回 Markdown(含表格、公式、图表等)
12
+
13
+ ## 安装
14
+
15
+ ```bash
16
+ pip install markitdown-paddleocr
17
+ ```
18
+
19
+ ## 配置
20
+
21
+ ### 环境变量(推荐)
22
+
23
+ ```bash
24
+ # 必需:百度 PaddleOCR Token
25
+ export BAIDU_PADDLE_TOKEN="your-paddle-token"
26
+
27
+ # 可选
28
+ export PADDLE_OCR_MODEL="PaddleOCR-VL-1.5" # 模型名称
29
+ ```
30
+
31
+ ### 配置优先级
32
+
33
+ ```
34
+ 构造函数参数 > 环境变量 > 内置默认值
35
+ ```
36
+
37
+ ## 使用方法
38
+
39
+ ### 命令行(推荐)
40
+
41
+ ```bash
42
+ # 1. 设置 Token
43
+ export BAIDU_PADDLE_TOKEN="your-token"
44
+
45
+ # 2. 查看已安装插件
46
+ markitdown --list-plugins
47
+
48
+ # 3. 使用插件转换 PDF
49
+ markitdown -p document.pdf
50
+
51
+ # 4. 保存到文件
52
+ markitdown -p document.pdf -o output.md
53
+ ```
54
+
55
+ ### Python API
56
+
57
+ ```python
58
+ from markitdown import MarkItDown
59
+ from markitdown_paddleocr import PaddleOcrConverter
60
+
61
+ # 方式1:自动从环境变量读取 BAIDU_PADDLE_TOKEN
62
+ converter = PaddleOcrConverter()
63
+ md = MarkItDown(enable_plugins=False)
64
+ md.register_converter(converter, priority=-1.0)
65
+ result = md.convert("document.pdf")
66
+ print(result.markdown)
67
+
68
+ # 方式2:手动传入 Token
69
+ converter = PaddleOcrConverter(token="your-token")
70
+ md = MarkItDown(enable_plugins=False)
71
+ md.register_converter(converter, priority=-1.0)
72
+ result = md.convert("document.pdf")
73
+ print(result.markdown)
74
+
75
+ # 方式3:强制所有页面使用 OCR
76
+ converter = PaddleOcrConverter(token="your-token", force_ai=True)
77
+ md = MarkItDown(enable_plugins=False)
78
+ md.register_converter(converter, priority=-1.0)
79
+ result = md.convert("document.pdf")
80
+ print(result.markdown)
81
+ ```
82
+
83
+ ### 直接使用 PaddleClient
84
+
85
+ ```python
86
+ from markitdown_paddleocr import PaddleClient
87
+
88
+ client = PaddleClient(token="your-token")
89
+
90
+ # 本地文件
91
+ markdown = client.ocr(file_bytes=open("image.png", "rb").read(), filename="image.png")
92
+ print(markdown)
93
+
94
+ # URL 模式
95
+ markdown = client.ocr(file_url="https://example.com/document.pdf")
96
+ print(markdown)
97
+ ```
98
+
99
+ ## 配置选项
100
+
101
+ ### PaddleOcrConverter 参数
102
+
103
+ | 参数 | 类型 | 默认值 | 说明 |
104
+ |------|------|--------|------|
105
+ | `token` | str | 环境变量 `BAIDU_PADDLE_TOKEN` | PaddleOCR Token |
106
+ | `model` | str | `PaddleOCR-VL-1.5` | OCR 模型名称 |
107
+ | `poll_interval` | float | 2.0 | 轮询间隔(秒) |
108
+ | `poll_timeout` | float | 300.0 | 轮询超时(秒) |
109
+ | `force_ai` | bool | False | 强制所有页面使用 OCR |
110
+ | `use_doc_orientation_classify` | bool | False | 文档方向分类 |
111
+ | `use_doc_unwarping` | bool | False | 文档去扭曲 |
112
+ | `use_chart_recognition` | bool | False | 图表识别 |
113
+
114
+ ### 环境变量
115
+
116
+ | 变量 | 说明 | 示例 |
117
+ |------|------|------|
118
+ | `BAIDU_PADDLE_TOKEN` | Token(必需) | `7963b85a...` |
119
+ | `PADDLE_OCR_MODEL` | 模型名称 | `PaddleOCR-VL-1.5` |
120
+
121
+ ## 工作原理
122
+
123
+ ```
124
+ PDF/图片 输入
125
+
126
+
127
+ PaddleOcrConverter.convert()
128
+
129
+ ├─ 图片文件 ──► PaddleClient.ocr() ──► markdown
130
+
131
+ └─ PDF 文件 ──► 逐页分析内容类型
132
+
133
+ ├─ 纯文本页 ──► pdfplumber 提取文本
134
+
135
+ └─ 复杂页(图片/表格)
136
+
137
+ └─► 渲染为图片 ──► PaddleClient.ocr()
138
+
139
+ ├─ POST /api/v2/ocr/jobs (提交 Job)
140
+ ├─ GET /api/v2/ocr/jobs/{id} (轮询状态)
141
+ └─ GET jsonUrl (获取 JSONL 结果)
142
+
143
+
144
+ 合并输出完整 Markdown
145
+ ```
146
+
147
+ ## 依赖
148
+
149
+ - `markitdown>=0.1.0` - 基础框架
150
+ - `pdfplumber>=0.11.9` - PDF 解析和截图
151
+ - `pdfminer.six>=20251230` - 文本提取备用
152
+ - `Pillow>=9.0.0` - 图像处理
153
+ - `requests>=2.28.0` - HTTP 请求
154
+
155
+ ## 许可证
156
+
157
+ MIT
@@ -0,0 +1,58 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "markitdown-paddleocr"
7
+ dynamic = ["version"]
8
+ description = "Intelligent PDF/Image to Markdown converter using PaddleOCR cloud API"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ keywords = ["markitdown", "pdf", "ocr", "paddleocr", "baidu", "vision"]
13
+ authors = [
14
+ { name = "Contributors", email = "noreply@github.com" },
15
+ ]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Programming Language :: Python",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ ]
24
+
25
+ dependencies = [
26
+ "markitdown>=0.1.0",
27
+ "pdfminer.six>=20251230",
28
+ "pdfplumber>=0.11.9",
29
+ "Pillow>=9.0.0",
30
+ "requests>=2.28.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=7.0.0",
36
+ ]
37
+
38
+ [project.urls]
39
+ Documentation = "https://github.com/microsoft/markitdown#readme"
40
+ Issues = "https://github.com/microsoft/markitdown/issues"
41
+ Source = "https://github.com/microsoft/markitdown"
42
+
43
+ [tool.hatch.version]
44
+ path = "src/markitdown_paddleocr/__about__.py"
45
+
46
+ # Plugin entry point - MarkItDown will discover this plugin
47
+ [project.entry-points."markitdown.plugin"]
48
+ markitdown_paddleocr = "markitdown_paddleocr"
49
+
50
+ [tool.hatch.build.targets.sdist]
51
+ only-include = ["src/markitdown_paddleocr"]
52
+
53
+ [tool.hatch.build.targets.wheel]
54
+ packages = ["src/markitdown_paddleocr"]
55
+
56
+ [tool.pytest.ini_options]
57
+ testpaths = ["tests"]
58
+ python_files = ["test_*.py"]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,16 @@
1
+ """markitdown-paddleocr: PDF/Image to Markdown converter using PaddleOCR cloud API."""
2
+
3
+ from ._plugin import register_converters
4
+ from ._config import PaddleOcrConfig
5
+ from ._converter import PaddleOcrConverter
6
+ from ._paddle_client import PaddleClient
7
+ from ._dual_converter import DualOcrConverter
8
+
9
+ __plugin_interface_version__ = 1
10
+ __all__ = [
11
+ "register_converters",
12
+ "PaddleOcrConfig",
13
+ "PaddleOcrConverter",
14
+ "PaddleClient",
15
+ "DualOcrConverter",
16
+ ]
@@ -0,0 +1,46 @@
1
+ """Configuration for markitdown-paddleocr."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+
6
+
7
@dataclass
class PaddleOcrConfig:
    """Settings shared by the markitdown-paddleocr converter and API client.

    Configuration priority (high to low):
    1. Constructor kwargs
    2. Environment variables (only consulted by :meth:`from_env`)
    3. Built-in defaults
    """

    # API token. Empty by default; from_env() fills it from BAIDU_PADDLE_TOKEN.
    token: str = ""

    # OCR model name sent with each job.
    model: str = "PaddleOCR-VL-1.5"

    # Job endpoint of the PaddleOCR cloud API.
    job_url: str = "https://paddleocr.aistudio-app.com/api/v2/ocr/jobs"

    # Polling configuration.
    poll_interval: float = 2.0  # seconds between polls
    poll_timeout: float = 300.0  # max seconds to wait for job completion

    # Optional OCR features.
    use_doc_orientation_classify: bool = False
    use_doc_unwarping: bool = False
    use_chart_recognition: bool = False

    # Processing strategy: when True every PDF page is sent through OCR.
    force_ai: bool = False

    @classmethod
    def from_env(cls, **overrides) -> "PaddleOcrConfig":
        """Create a config from environment variables with optional overrides.

        Reads ``BAIDU_PADDLE_TOKEN`` and ``PADDLE_OCR_MODEL``. An unset or
        empty variable leaves the corresponding built-in default in place, so
        an empty ``PADDLE_OCR_MODEL`` no longer produces an empty model name.

        Args:
            **overrides: Field values that take precedence over the
                environment. Entries whose value is ``None`` are treated as
                "not provided" and ignored, so callers can forward optional
                arguments without wiping out environment values.

        Returns:
            A new ``PaddleOcrConfig`` instance.

        Raises:
            TypeError: If an override names a field that does not exist.
        """
        values = {}

        env_token = os.environ.get("BAIDU_PADDLE_TOKEN", "")
        if env_token:
            values["token"] = env_token

        env_model = os.environ.get("PADDLE_OCR_MODEL", "")
        if env_model:
            values["model"] = env_model

        # Explicit arguments win, but None means "fall through to env/default".
        values.update(
            {name: value for name, value in overrides.items() if value is not None}
        )
        return cls(**values)
@@ -0,0 +1,304 @@
1
+ """PaddleOcr Converter - PDF/Image to Markdown using PaddleOCR cloud API."""
2
+
3
+ import io
4
+ import sys
5
+ from typing import Any, BinaryIO, Optional
6
+
7
+ from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
8
+ from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
9
+
10
+ from ._config import PaddleOcrConfig
11
+ from ._paddle_client import PaddleClient
12
+
13
+ # Import PDF dependencies
14
+ _dependency_exc_info = None
15
+ try:
16
+ import pdfminer
17
+ import pdfminer.high_level
18
+ import pdfplumber
19
+ except ImportError:
20
+ _dependency_exc_info = sys.exc_info()
21
+
22
+
23
# MIME types handled by the converter; matched as prefixes of the stream's
# (lower-cased) mimetype.
ACCEPTED_MIME_TYPE_PREFIXES = ["application/pdf", "application/x-pdf", "image/jpeg", "image/png"]

# File extensions (lower-case, with leading dot) handled by the converter.
ACCEPTED_FILE_EXTENSIONS = [
    ".pdf",
    ".jpg",
    ".jpeg",
    ".png",
]
31
+
32
+
33
class PaddleOcrConverter(DocumentConverter):
    """Intelligent PDF/Image converter using PaddleOCR cloud API.

    Features:
    - Auto-detect page content type (plain text vs images/tables)
    - Plain text pages use pdfplumber/pdfminer (fast, free)
    - Complex pages use PaddleOCR API for AI-powered OCR
    - Image files (PNG, JPG) use PaddleOCR API directly
    - Asynchronous job model: submit → poll → fetch result
    """

    # Extensions that are sent straight to the OCR API as images.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    # Image MIME prefixes accepted by accepts(), mapped to the extension used
    # for the upload filename when the stream carries no usable extension.
    _IMAGE_MIME_TO_EXTENSION = {"image/jpeg": ".jpg", "image/png": ".png"}

    def __init__(
        self,
        token: Optional[str] = None,
        model: str = "PaddleOCR-VL-1.5",
        poll_interval: float = 2.0,
        poll_timeout: float = 300.0,
        force_ai: bool = False,
        use_doc_orientation_classify: bool = False,
        use_doc_unwarping: bool = False,
        use_chart_recognition: bool = False,
        config: Optional[PaddleOcrConfig] = None,
    ):
        """Initialize converter.

        Args:
            token: Baidu PaddleOCR token (the client falls back to the
                BAIDU_PADDLE_TOKEN env var if no token is available)
            model: OCR model name (default: PaddleOCR-VL-1.5)
            poll_interval: Seconds between status polls (default: 2.0)
            poll_timeout: Max seconds to wait for job completion (default: 300.0)
            force_ai: Force all pages to use OCR (default: False)
            use_doc_orientation_classify: Enable document orientation classification
            use_doc_unwarping: Enable document unwarping
            use_chart_recognition: Enable chart recognition
            config: Optional PaddleOcrConfig instance supplying values for
                every argument that was left at its default
        """
        if config is not None:
            # An argument still at its signature default is treated as "not
            # given" and the config value is used instead. Consequently an
            # explicit argument equal to the default cannot override config,
            # and boolean flags are OR-ed with the config's flags.
            self.token = token or config.token
            self.model = model if model != "PaddleOCR-VL-1.5" else config.model
            self.poll_interval = poll_interval if poll_interval != 2.0 else config.poll_interval
            self.poll_timeout = poll_timeout if poll_timeout != 300.0 else config.poll_timeout
            self.force_ai = force_ai or config.force_ai
            self.use_doc_orientation_classify = (
                use_doc_orientation_classify or config.use_doc_orientation_classify
            )
            self.use_doc_unwarping = use_doc_unwarping or config.use_doc_unwarping
            self.use_chart_recognition = use_chart_recognition or config.use_chart_recognition
        else:
            self.token = token
            self.model = model
            self.poll_interval = poll_interval
            self.poll_timeout = poll_timeout
            self.force_ai = force_ai
            self.use_doc_orientation_classify = use_doc_orientation_classify
            self.use_doc_unwarping = use_doc_unwarping
            self.use_chart_recognition = use_chart_recognition

        # The API client is created on first use, not at construction time.
        self._client: Optional[PaddleClient] = None

    def _get_client(self) -> PaddleClient:
        """Return the cached PaddleClient, creating it from current settings if needed."""
        if self._client is None:
            config = PaddleOcrConfig(
                token=self.token or "",
                model=self.model,
                poll_interval=self.poll_interval,
                poll_timeout=self.poll_timeout,
                force_ai=self.force_ai,
                use_doc_orientation_classify=self.use_doc_orientation_classify,
                use_doc_unwarping=self.use_doc_unwarping,
                use_chart_recognition=self.use_chart_recognition,
            )
            self._client = PaddleClient(config=config)
        return self._client

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for PDF/JPEG/PNG inputs, judged by extension or mimetype.

        The stream itself is never read; only ``stream_info`` is inspected.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an accepted stream to Markdown.

        Images are sent to the OCR API directly; everything else is handled
        as a PDF. The extension decides first; when it is neither an image
        extension nor ``.pdf`` the mimetype is consulted, so that an image
        accepted by mimetype alone is not mistakenly parsed as a PDF.

        Raises:
            MissingDependencyException: If the PDF libraries failed to import.
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )

        extension = (stream_info.extension or "").lower()

        # Image files: use PaddleOCR directly
        if extension in self._IMAGE_EXTENSIONS:
            return self._convert_image(file_stream, extension)

        if extension != ".pdf":
            # No decisive extension: fall back to the mimetype accepts() used.
            mimetype = (stream_info.mimetype or "").lower()
            for prefix, image_extension in self._IMAGE_MIME_TO_EXTENSION.items():
                if mimetype.startswith(prefix):
                    return self._convert_image(file_stream, image_extension)

        # PDF files: use hybrid approach
        return self._convert_pdf(file_stream)

    def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
        """Convert an image stream using the PaddleOCR API.

        Any failure is reported inside the returned Markdown as an HTML
        comment instead of being raised.
        """
        img_bytes = file_stream.read()
        filename = f"image{extension}"

        try:
            markdown = self._get_client().ocr(file_bytes=img_bytes, filename=filename)
            return DocumentConverterResult(markdown=markdown)
        except Exception as e:
            return DocumentConverterResult(
                markdown=f"<!-- Error converting image with PaddleOCR: {e} -->"
            )

    def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
        """Convert PDF using hybrid approach (pdfplumber for text, PaddleOCR for complex pages).

        If pdfplumber processing fails or yields nothing, the whole document
        is re-extracted once with pdfminer as plain text.
        """
        pdf_stream = io.BytesIO(file_stream.read())
        markdown = ""

        try:
            markdown_parts = []
            with pdfplumber.open(pdf_stream) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    page_type = self._analyze_page(page)

                    if self.force_ai or page_type != "plain_text":
                        # Complex content: use PaddleOCR
                        page_markdown = self._convert_with_paddleocr(page, page_num)
                    else:
                        # Plain text: use pdfplumber
                        page_markdown = self._extract_text_with_tables(page)

                    if page_markdown.strip():
                        markdown_parts.append(f"## Page {page_num + 1}\n\n{page_markdown}")

                    page.close()

            markdown = "\n\n".join(markdown_parts).strip()
        except Exception:
            # Best effort: any pdfplumber failure discards partial output and
            # defers to the pdfminer fallback below.
            markdown = ""

        # Single fallback for both "failed" and "produced nothing"; the
        # extraction runs at most once per document.
        if not markdown:
            pdf_stream.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

        return DocumentConverterResult(markdown=markdown)

    def _analyze_page(self, page: Any) -> str:
        """Classify a pdfplumber page as ``"complex"`` or ``"plain_text"``.

        A page is complex when it has images, detected tables, or curves.
        """
        if getattr(page, "images", None):
            return "complex"

        if page.find_tables():
            return "complex"

        if getattr(page, "curves", None):
            return "complex"

        return "plain_text"

    def _convert_with_paddleocr(self, page: Any, page_num: int) -> str:
        """Render a page to PNG and OCR it; fall back to pdfplumber text on any error.

        Args:
            page: pdfplumber page object.
            page_num: Zero-based page index, used only for the upload filename.
        """
        try:
            # Render page to image
            img = page.to_image(resolution=150)
            img_bytes = io.BytesIO()
            img.save(img_bytes, format="PNG")

            return self._get_client().ocr(
                file_bytes=img_bytes.getvalue(),
                filename=f"page_{page_num + 1}.png",
            )
        except Exception:
            # Fallback to pdfplumber text extraction
            return self._extract_text_with_tables(page)

    def _extract_text_with_tables(self, page: Any) -> str:
        """Extract the page text followed by its tables rendered as Markdown."""
        parts = []

        text = page.extract_text() or ""
        if text.strip():
            parts.append(text.strip())

        try:
            for table in page.extract_tables() or []:
                if table:
                    md_table = self._table_to_markdown(table)
                    if md_table.strip():
                        parts.append(md_table)
        except Exception:
            # Table extraction is best effort; the plain text is still returned.
            pass

        return "\n\n".join(parts)

    @staticmethod
    def _clean_cell(cell: Any) -> str:
        """Return a cell as single-line text that is safe inside a Markdown table row."""
        if cell is None:
            return ""
        # A raw line break would end the table row; a raw pipe would start a
        # new column. Join lines with spaces and escape pipes.
        single_line = " ".join(str(cell).splitlines())
        return single_line.replace("|", "\\|")

    def _table_to_markdown(self, table: list[list[str]]) -> str:
        """Convert a table (rows of cells, cells may be None) to a Markdown table.

        Rows that are entirely blank are dropped, short rows are padded, the
        first remaining row becomes the header, and columns are padded to a
        common width. Line breaks inside cells become spaces and ``|`` is
        escaped so cell content cannot break the table structure.

        Returns:
            The Markdown table, or ``""`` when there is nothing to render.
        """
        if not table:
            return ""

        rows = []
        for raw_row in table:
            cells = [self._clean_cell(cell) for cell in raw_row]
            if any(cell.strip() for cell in cells):
                rows.append(cells)

        if not rows:
            return ""

        column_count = max(len(row) for row in rows)
        rows = [row + [""] * (column_count - len(row)) for row in rows]
        col_widths = [max(len(row[i]) for row in rows) for i in range(column_count)]

        lines = []
        for row_idx, row in enumerate(rows):
            lines.append(
                "| "
                + " | ".join(cell.ljust(width) for cell, width in zip(row, col_widths))
                + " |"
            )
            if row_idx == 0:
                # Header separator directly after the first row.
                lines.append("|" + "|".join("-" * (w + 2) for w in col_widths) + "|")

        return "\n".join(lines)

    def close(self):
        """Drop the cached client; a new one is created on next use."""
        self._client = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
@@ -0,0 +1,160 @@
1
+ """DualOcrConverter - glmocr (primary) → paddleocr (fallback) automatic degradation."""
2
+
3
+ import logging
4
+ from typing import Optional
5
+
6
+ from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
7
+ from typing import BinaryIO, Any
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class DualOcrConverter(DocumentConverter):
    """Dual OCR converter with automatic fallback: glmocr → paddleocr.

    Usage:
        converter = DualOcrConverter()
        md = MarkItDown(enable_plugins=False)
        md.register_converter(converter, priority=-1.0)
        result = md.convert("document.pdf")
    """

    def __init__(
        self,
        # glmocr kwargs
        glmocr_api_key: Optional[str] = None,
        glmocr_timeout: int = 1800,
        glmocr_enable_layout: bool = False,
        glmocr_force_ai: bool = False,
        # paddleocr kwargs
        paddleocr_token: Optional[str] = None,
        paddleocr_model: str = "PaddleOCR-VL-1.5",
        paddleocr_poll_interval: float = 2.0,
        paddleocr_poll_timeout: float = 300.0,
        paddleocr_force_ai: bool = False,
        paddleocr_use_doc_orientation_classify: bool = False,
        paddleocr_use_doc_unwarping: bool = False,
        paddleocr_use_chart_recognition: bool = False,
    ):
        """Store the per-engine keyword arguments and build both converters.

        ``glmocr_*`` arguments are forwarded (prefix stripped) to
        ``GlmOcrConverter``; ``paddleocr_*`` arguments to
        ``PaddleOcrConverter``. Arguments left as ``None`` are not forwarded.
        """
        self.glmocr_kwargs = {
            "api_key": glmocr_api_key,
            "timeout": glmocr_timeout,
            "enable_layout": glmocr_enable_layout,
            "force_ai": glmocr_force_ai,
        }
        self.paddleocr_kwargs = {
            "token": paddleocr_token,
            "model": paddleocr_model,
            "poll_interval": paddleocr_poll_interval,
            "poll_timeout": paddleocr_poll_timeout,
            "force_ai": paddleocr_force_ai,
            "use_doc_orientation_classify": paddleocr_use_doc_orientation_classify,
            "use_doc_unwarping": paddleocr_use_doc_unwarping,
            "use_chart_recognition": paddleocr_use_chart_recognition,
        }

        self._primary = None
        self._fallback = None
        self._init_converters()

    def _init_converters(self):
        """Create both converters; called once from ``__init__``.

        Each engine is set up independently. A failure (including a missing
        ``markitdown_glmocr`` package) is logged as a warning and leaves that
        engine as ``None`` instead of raising.
        """
        try:
            from markitdown_glmocr import GlmOcrConverter

            # Filter out None values so the engine's own defaults apply.
            kwargs = {k: v for k, v in self.glmocr_kwargs.items() if v is not None}
            self._primary = GlmOcrConverter(**kwargs)
            logger.info("glmocr converter initialized (primary)")
        except Exception as e:
            logger.warning("glmocr init failed: %s", e)
            self._primary = None

        try:
            # Relative import, consistent with the package's other modules.
            from ._converter import PaddleOcrConverter

            kwargs = {k: v for k, v in self.paddleocr_kwargs.items() if v is not None}
            self._fallback = PaddleOcrConverter(**kwargs)
            logger.info("paddleocr converter initialized (fallback)")
        except Exception as e:
            logger.warning("paddleocr init failed: %s", e)
            self._fallback = None

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept if either converter accepts.

        The stream position found on entry is restored before returning,
        because MarkItDown requires ``accepts`` to leave the stream position
        unchanged. Each delegate sees the stream at that same position.
        Errors raised by a delegate count as "does not accept".
        """
        try:
            start = file_stream.tell()
        except Exception:
            # A stream that cannot be positioned was never accepted before
            # either (the rewind failed), so keep rejecting it.
            return False

        try:
            for converter in (self._primary, self._fallback):
                if not converter:
                    continue
                try:
                    file_stream.seek(start)
                    if converter.accepts(file_stream, stream_info, **kwargs):
                        return True
                except Exception:
                    continue
            return False
        finally:
            # Undo any reading the delegates may have done.
            try:
                file_stream.seek(start)
            except Exception:
                pass

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert with primary, fallback on failure.

        The stream is read once into memory and each engine receives its own
        fresh in-memory copy. An engine counts as failed when it raises or
        returns blank Markdown. If both fail, a result containing only an
        explanatory HTML comment is returned.
        """
        data = file_stream.read()

        # Try primary (glmocr)
        if self._primary:
            try:
                result = self._primary.convert(io_bytes(data), stream_info, **kwargs)
                if result.markdown and result.markdown.strip():
                    logger.info("✓ glmocr succeeded")
                    return result
                logger.warning("glmocr returned empty result, falling back")
            except Exception as e:
                logger.warning("glmocr failed: %s, falling back to paddleocr", e)

        # Fallback (paddleocr)
        if self._fallback:
            try:
                result = self._fallback.convert(io_bytes(data), stream_info, **kwargs)
                if result.markdown and result.markdown.strip():
                    logger.info("✓ paddleocr succeeded (fallback)")
                    return result
                logger.warning("paddleocr returned empty result")
            except Exception as e:
                logger.error("paddleocr also failed: %s", e)

        # Both failed
        return DocumentConverterResult(
            markdown="<!-- Both OCR engines (glmocr, paddleocr) failed to convert this file -->"
        )

    def close(self):
        """Close both converters, where they expose ``close()``.

        The fallback is closed even if closing the primary raises.
        """
        try:
            if self._primary and hasattr(self._primary, "close"):
                self._primary.close()
        finally:
            if self._fallback and hasattr(self._fallback, "close"):
                self._fallback.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
153
+
154
+
155
def io_bytes(data: bytes):
    """Return a new seekable in-memory binary stream over *data*, positioned at the start."""
    from io import BytesIO

    # A freshly constructed BytesIO already sits at offset 0.
    return BytesIO(data)
@@ -0,0 +1,189 @@
1
+ """PaddleOCR API Client - handles job submission, polling, and result fetching."""
2
+
3
+ import json
4
+ import logging
5
+ import time
6
+ from typing import Optional
7
+
8
+ import requests
9
+
10
+ from ._config import PaddleOcrConfig
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class PaddleOcrError(Exception):
    """Error raised when a PaddleOCR API operation fails."""
19
+
20
+
21
class PaddleClient:
    """Client for PaddleOCR cloud API.

    Workflow: submit job → poll status → fetch JSONL result → extract markdown.
    """

    # Per-request HTTP timeout in seconds. Used unless the config object
    # exposes a truthy ``request_timeout`` attribute. Without any timeout a
    # stalled connection would block the caller indefinitely.
    DEFAULT_REQUEST_TIMEOUT = 60.0

    def __init__(self, config: Optional["PaddleOcrConfig"] = None, **kwargs):
        """Create a client.

        Args:
            config: Ready-made configuration. When ``None``, one is built
                from ``**kwargs``.
            **kwargs: Forwarded to ``PaddleOcrConfig`` only when ``config``
                is ``None``.
        """
        if config is None:
            config = PaddleOcrConfig(**kwargs)
        self.config = config

        # Token from config, falling back to the environment.
        self.token = config.token
        if not self.token:
            import os

            self.token = os.environ.get("BAIDU_PADDLE_TOKEN", "")

    @staticmethod
    def _as_dict(value) -> dict:
        """Return ``value`` if it is a dict, otherwise an empty dict.

        API payloads may carry JSON ``null`` (or another non-object) where an
        object is expected; normalising keeps chained ``.get`` calls safe.
        """
        return value if isinstance(value, dict) else {}

    def _request_timeout(self):
        """Return the timeout (seconds) applied to every HTTP request."""
        return getattr(self.config, "request_timeout", None) or self.DEFAULT_REQUEST_TIMEOUT

    def _headers(self) -> dict:
        """Build authorization headers."""
        return {"Authorization": f"bearer {self.token}"}

    def _optional_payload(self) -> dict:
        """Build optional payload flags."""
        return {
            "useDocOrientationClassify": self.config.use_doc_orientation_classify,
            "useDocUnwarping": self.config.use_doc_unwarping,
            "useChartRecognition": self.config.use_chart_recognition,
        }

    def ocr(
        self,
        file_bytes: Optional[bytes] = None,
        filename: Optional[str] = None,
        file_url: Optional[str] = None,
    ) -> str:
        """Run OCR on a file or URL, return concatenated markdown.

        Args:
            file_bytes: File content bytes (for local file upload).
            filename: Filename for multipart upload (e.g. "page.png").
            file_url: File URL (for URL mode, alternative to file_bytes).
                Takes precedence when both it and ``file_bytes`` are given.

        Returns:
            Markdown text extracted from all pages.

        Raises:
            PaddleOcrError: On API errors, missing input, or poll timeout.
            requests.RequestException: On network failures, request
                timeouts, or a bad HTTP status while downloading the result.
        """
        # 1. Submit job
        job_id = self._submit(file_bytes=file_bytes, filename=filename, file_url=file_url)
        logger.info("Job submitted: %s", job_id)

        # 2. Poll until done
        result_url = self._poll(job_id)
        logger.info("Job completed, result URL obtained")

        # 3. Fetch and parse results
        return self._fetch_markdown(result_url)

    def _submit(
        self,
        file_bytes: Optional[bytes] = None,
        filename: Optional[str] = None,
        file_url: Optional[str] = None,
    ) -> str:
        """Submit an OCR job, return job ID.

        Raises:
            PaddleOcrError: If neither input is given, the HTTP status is not
                200, or the response carries no ``jobId``.
        """
        headers = self._headers()
        timeout = self._request_timeout()

        if file_url:
            # URL mode
            headers["Content-Type"] = "application/json"
            payload = {
                "fileUrl": file_url,
                "model": self.config.model,
                "optionalPayload": self._optional_payload(),
            }
            resp = requests.post(
                self.config.job_url, json=payload, headers=headers, timeout=timeout
            )
        elif file_bytes is not None:
            # Local file mode - multipart upload. Form fields are strings, so
            # the optional payload is JSON-encoded here.
            data = {
                "model": self.config.model,
                "optionalPayload": json.dumps(self._optional_payload()),
            }
            fname = filename or "document"
            files = {"file": (fname, file_bytes)}
            resp = requests.post(
                self.config.job_url, headers=headers, data=data, files=files, timeout=timeout
            )
        else:
            raise PaddleOcrError("Either file_bytes or file_url must be provided")

        if resp.status_code != 200:
            raise PaddleOcrError(f"Submit failed (HTTP {resp.status_code}): {resp.text}")

        result = resp.json()
        # Tolerate ``"data": null`` and non-object bodies instead of crashing.
        job_id = self._as_dict(self._as_dict(result).get("data")).get("jobId")
        if not job_id:
            raise PaddleOcrError(f"No jobId in response: {result}")

        return job_id

    def _poll(self, job_id: str) -> str:
        """Poll job status until done, return JSONL result URL.

        Raises:
            PaddleOcrError: If a poll returns a non-200 status, the job
                reports ``failed``, the finished job has no result URL, or
                ``config.poll_timeout`` seconds elapse first.
        """
        headers = self._headers()
        url = f"{self.config.job_url}/{job_id}"
        timeout = self._request_timeout()
        # Monotonic clock: the deadline must not move if the system clock is adjusted.
        start = time.monotonic()

        while True:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code != 200:
                raise PaddleOcrError(f"Poll failed (HTTP {resp.status_code}): {resp.text}")

            data = self._as_dict(self._as_dict(resp.json()).get("data"))
            state = data.get("state", "")

            if state == "done":
                result_url = self._as_dict(data.get("resultUrl")).get("jsonUrl", "")
                if not result_url:
                    raise PaddleOcrError("Job done but no resultUrl in response")
                return result_url

            if state == "failed":
                error_msg = data.get("errorMsg") or "Unknown error"
                raise PaddleOcrError(f"Job failed: {error_msg}")

            # Still pending or running
            if state == "running":
                progress = self._as_dict(data.get("extractProgress"))
                total = progress.get("totalPages", "?")
                extracted = progress.get("extractedPages", "?")
                logger.debug("Running: %s/%s pages", extracted, total)
            else:
                logger.debug("State: %s", state)

            # Check timeout
            elapsed = time.monotonic() - start
            if elapsed > self.config.poll_timeout:
                raise PaddleOcrError(
                    f"Job polling timed out after {self.config.poll_timeout}s (state={state})"
                )

            time.sleep(self.config.poll_interval)

    def _fetch_markdown(self, jsonl_url: str) -> str:
        """Fetch JSONL result and extract markdown from all pages.

        Raises:
            requests.RequestException: On network failure, timeout, or a
                non-success HTTP status (via ``raise_for_status``).
        """
        resp = requests.get(jsonl_url, timeout=self._request_timeout())
        resp.raise_for_status()
        # NOTE(review): ``resp.text`` uses the encoding requests infers from the
        # response headers; confirm the result endpoint declares UTF-8.
        return self._parse_jsonl(resp.text)

    @classmethod
    def _parse_jsonl(cls, text: str) -> str:
        """Extract markdown from JSONL result text.

        Each non-blank line is parsed as one JSON document; every non-blank
        ``result.layoutParsingResults[*].markdown.text`` string is stripped
        and collected. Lines that are not valid JSON are skipped with a
        warning, and missing, ``null`` or wrongly typed fields are ignored.

        Returns:
            The collected fragments joined by blank lines (``""`` if none).
        """
        markdown_parts = []

        # Split on "\n" only: JSON strings may legally contain other Unicode
        # line separators, which ``splitlines()`` would break on.
        for raw_line in text.strip().split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            try:
                page_data = json.loads(line)
            except json.JSONDecodeError:
                logger.warning("Skipping invalid JSONL line")
                continue

            result = cls._as_dict(cls._as_dict(page_data).get("result"))
            layout_results = result.get("layoutParsingResults")
            if not isinstance(layout_results, list):
                continue

            for layout in layout_results:
                md_text = cls._as_dict(cls._as_dict(layout).get("markdown")).get("text")
                if isinstance(md_text, str) and md_text.strip():
                    markdown_parts.append(md_text.strip())

        return "\n\n".join(markdown_parts)
@@ -0,0 +1,35 @@
1
+ """Plugin registration for markitdown-paddleocr."""
2
+
3
+ from typing import Any
4
+ from markitdown import MarkItDown
5
+
6
+ from ._converter import PaddleOcrConverter
7
+
8
+
9
+ __plugin_interface_version__ = 1
10
+
11
+
12
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """Register the PaddleOCR converter on a MarkItDown instance.

    Each converter option is taken from ``kwargs`` when present and from the
    built-in default below otherwise.

    Config sources (priority high to low):
    1. kwargs parameters
    2. Environment variables (BAIDU_PADDLE_TOKEN) -- not read in this
       function; presumably resolved downstream when no token is passed
    3. Built-in defaults
    """
    # Priority value intended to place this converter ahead of the default
    # PDF converter.
    paddleocr_priority = -1.0

    # Option name -> value used when the caller did not supply it.
    option_defaults = {
        "token": None,
        "model": "PaddleOCR-VL-1.5",
        "poll_interval": 2.0,
        "poll_timeout": 300.0,
        "force_ai": False,
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "use_chart_recognition": False,
    }
    options = {
        name: kwargs.get(name, fallback)
        for name, fallback in option_defaults.items()
    }

    converter = PaddleOcrConverter(**options)
    markitdown.register_converter(converter, priority=paddleocr_priority)