markitdown-glmocr 0.1.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/.gitignore +1 -0
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/PKG-INFO +88 -1
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/README.md +87 -0
- markitdown_glmocr-0.2.2/src/markitdown_glmocr/__about__.py +1 -0
- markitdown_glmocr-0.2.2/src/markitdown_glmocr/_config.py +43 -0
- markitdown_glmocr-0.2.2/src/markitdown_glmocr/_converter.py +551 -0
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/_plugin.py +21 -8
- markitdown_glmocr-0.1.0/src/markitdown_glmocr/__about__.py +0 -1
- markitdown_glmocr-0.1.0/src/markitdown_glmocr/_config.py +0 -25
- markitdown_glmocr-0.1.0/src/markitdown_glmocr/_converter.py +0 -304
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/pyproject.toml +0 -0
- {markitdown_glmocr-0.1.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: markitdown-glmocr
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Intelligent PDF to Markdown converter using glmocr SDK
|
|
5
5
|
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
@@ -218,6 +218,93 @@ glmocr SDK 返回的结构化数据支持以下标签:
|
|
|
218
218
|
- `Pillow>=9.0.0` - 图像处理
|
|
219
219
|
- `glmocr` - 智谱 OCR SDK(可选,AI 功能需要)
|
|
220
220
|
|
|
221
|
+
## 发布到 PyPI
|
|
222
|
+
|
|
223
|
+
### 前置条件
|
|
224
|
+
|
|
225
|
+
1. 安装构建工具:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
pip install build twine hatch
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
232
|
+
|
|
233
|
+
```powershell
|
|
234
|
+
# PowerShell 设置用户环境变量
|
|
235
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
或在 Bash/Zsh 中:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### 快速发布(推荐)
|
|
245
|
+
|
|
246
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
247
|
+
|
|
248
|
+
**Bash / Git Bash:**
|
|
249
|
+
```bash
|
|
250
|
+
# 构建两个插件
|
|
251
|
+
cd packages/markitdown-glmocr && hatch build
|
|
252
|
+
|
|
253
|
+
cd ../markitdown-paddleocr && hatch build
|
|
254
|
+
|
|
255
|
+
# 上传(自动上传所有构建的版本)
|
|
256
|
+
cd ../..
|
|
257
|
+
./scripts/pypi-upload.sh
|
|
258
|
+
|
|
259
|
+
# 或指定版本号
|
|
260
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
**PowerShell:**
|
|
264
|
+
```powershell
|
|
265
|
+
# 构建两个插件
|
|
266
|
+
cd packages/markitdown-glmocr; hatch build
|
|
267
|
+
cd ../markitdown-paddleocr; hatch build
|
|
268
|
+
|
|
269
|
+
# 上传
|
|
270
|
+
cd ../..
|
|
271
|
+
.\scripts\pypi-upload.ps1
|
|
272
|
+
|
|
273
|
+
# 或指定版本号
|
|
274
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
### 手动发布
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
# 1. 进入项目目录
|
|
281
|
+
cd packages/markitdown-glmocr
|
|
282
|
+
|
|
283
|
+
# 2. 构建
|
|
284
|
+
hatch build
|
|
285
|
+
|
|
286
|
+
# 3. 检查
|
|
287
|
+
twine check dist/*
|
|
288
|
+
|
|
289
|
+
# 4. 上传
|
|
290
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### 发布到 TestPyPI(测试)
|
|
294
|
+
|
|
295
|
+
```bash
|
|
296
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
297
|
+
|
|
298
|
+
# 从 TestPyPI 安装验证
|
|
299
|
+
pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### 注意事项
|
|
303
|
+
|
|
304
|
+
- 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
|
|
305
|
+
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
306
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
307
|
+
|
|
221
308
|
## 许可证
|
|
222
309
|
|
|
223
310
|
MIT
|
|
@@ -191,6 +191,93 @@ glmocr SDK 返回的结构化数据支持以下标签:
|
|
|
191
191
|
- `Pillow>=9.0.0` - 图像处理
|
|
192
192
|
- `glmocr` - 智谱 OCR SDK(可选,AI 功能需要)
|
|
193
193
|
|
|
194
|
+
## 发布到 PyPI
|
|
195
|
+
|
|
196
|
+
### 前置条件
|
|
197
|
+
|
|
198
|
+
1. 安装构建工具:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
pip install build twine hatch
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
205
|
+
|
|
206
|
+
```powershell
|
|
207
|
+
# PowerShell 设置用户环境变量
|
|
208
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
或在 Bash/Zsh 中:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### 快速发布(推荐)
|
|
218
|
+
|
|
219
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
220
|
+
|
|
221
|
+
**Bash / Git Bash:**
|
|
222
|
+
```bash
|
|
223
|
+
# 构建两个插件
|
|
224
|
+
cd packages/markitdown-glmocr && hatch build
|
|
225
|
+
|
|
226
|
+
cd ../markitdown-paddleocr && hatch build
|
|
227
|
+
|
|
228
|
+
# 上传(自动上传所有构建的版本)
|
|
229
|
+
cd ../..
|
|
230
|
+
./scripts/pypi-upload.sh
|
|
231
|
+
|
|
232
|
+
# 或指定版本号
|
|
233
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**PowerShell:**
|
|
237
|
+
```powershell
|
|
238
|
+
# 构建两个插件
|
|
239
|
+
cd packages/markitdown-glmocr; hatch build
|
|
240
|
+
cd ../markitdown-paddleocr; hatch build
|
|
241
|
+
|
|
242
|
+
# 上传
|
|
243
|
+
cd ../..
|
|
244
|
+
.\scripts\pypi-upload.ps1
|
|
245
|
+
|
|
246
|
+
# 或指定版本号
|
|
247
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### 手动发布
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
# 1. 进入项目目录
|
|
254
|
+
cd packages/markitdown-glmocr
|
|
255
|
+
|
|
256
|
+
# 2. 构建
|
|
257
|
+
hatch build
|
|
258
|
+
|
|
259
|
+
# 3. 检查
|
|
260
|
+
twine check dist/*
|
|
261
|
+
|
|
262
|
+
# 4. 上传
|
|
263
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### 发布到 TestPyPI(测试)
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
270
|
+
|
|
271
|
+
# 从 TestPyPI 安装验证
|
|
272
|
+
pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### 注意事项
|
|
276
|
+
|
|
277
|
+
- 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
|
|
278
|
+
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
279
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
280
|
+
|
|
194
281
|
## 许可证
|
|
195
282
|
|
|
196
283
|
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Configuration for markitdown-glmocr."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ScanDetectionMode(str, Enum):
|
|
8
|
+
"""扫描检测模式。
|
|
9
|
+
|
|
10
|
+
- PAGE_BY_PAGE: 逐页分析,当前默认行为
|
|
11
|
+
- FIRST_PAGE_HINT: 首页是扫描件则全文档使用OCR
|
|
12
|
+
- SAMPLING: 抽样前N页,多数是扫描件则全部OCR
|
|
13
|
+
"""
|
|
14
|
+
PAGE_BY_PAGE = "page_by_page"
|
|
15
|
+
FIRST_PAGE_HINT = "first_page_hint"
|
|
16
|
+
SAMPLING = "sampling"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class GlmOcrConfig:
|
|
21
|
+
"""markitdown-glmocr configuration.
|
|
22
|
+
|
|
23
|
+
Configuration priority (high to low):
|
|
24
|
+
1. Constructor kwargs
|
|
25
|
+
2. Environment variables
|
|
26
|
+
3. .env file
|
|
27
|
+
4. Built-in defaults
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
# API configuration
|
|
31
|
+
api_key: str = "" # Reads from ZHIPU_API_KEY by default
|
|
32
|
+
|
|
33
|
+
# OCR configuration
|
|
34
|
+
timeout: int = 1800
|
|
35
|
+
enable_layout: bool = False
|
|
36
|
+
|
|
37
|
+
# Processing strategy
|
|
38
|
+
force_ai: bool = False
|
|
39
|
+
|
|
40
|
+
# Scan detection mode for optimization
|
|
41
|
+
scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING
|
|
42
|
+
scan_sample_pages: int = 3 # Number of pages to sample in SAMPLING mode
|
|
43
|
+
scan_text_threshold: int = 50 # Min text length to consider page as non-scanned
|
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
"""GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any, BinaryIO, Optional
|
|
7
|
+
|
|
8
|
+
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
9
|
+
from markitdown._exceptions import (
|
|
10
|
+
MISSING_DEPENDENCY_MESSAGE,
|
|
11
|
+
MissingDependencyException,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from ._config import GlmOcrConfig, ScanDetectionMode
|
|
15
|
+
|
|
16
|
+
# Import dependencies
|
|
17
|
+
_dependency_exc_info = None
|
|
18
|
+
try:
|
|
19
|
+
import pdfminer
|
|
20
|
+
import pdfminer.high_level
|
|
21
|
+
import pdfplumber
|
|
22
|
+
except ImportError:
|
|
23
|
+
_dependency_exc_info = sys.exc_info()
|
|
24
|
+
|
|
25
|
+
# glmocr SDK
|
|
26
|
+
try:
|
|
27
|
+
import glmocr
|
|
28
|
+
from glmocr import GlmOcr
|
|
29
|
+
except ImportError:
|
|
30
|
+
glmocr = None
|
|
31
|
+
GlmOcr = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
35
|
+
"application/pdf",
|
|
36
|
+
"application/x-pdf",
|
|
37
|
+
"image/jpeg",
|
|
38
|
+
"image/png",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class GlmOcrConverter(DocumentConverter):
|
|
48
|
+
"""
|
|
49
|
+
Intelligent PDF/Image converter using glmocr SDK.
|
|
50
|
+
|
|
51
|
+
Features:
|
|
52
|
+
- Auto-detect page content type (plain text vs images/tables)
|
|
53
|
+
- Plain text pages use pdfplumber/pdfminer (fast, free)
|
|
54
|
+
- Complex pages use glmocr SDK for AI-powered OCR
|
|
55
|
+
- Image files (PNG, JPG) use glmocr SDK directly
|
|
56
|
+
- One-liner: glmocr.parse("document.pdf") handles everything
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
api_key: Optional[str] = None,
|
|
62
|
+
timeout: int = 1800,
|
|
63
|
+
enable_layout: bool = False,
|
|
64
|
+
force_ai: bool = False,
|
|
65
|
+
scan_detection_mode: Optional[ScanDetectionMode] = None,
|
|
66
|
+
scan_sample_pages: Optional[int] = None,
|
|
67
|
+
scan_text_threshold: Optional[int] = None,
|
|
68
|
+
config: Optional[GlmOcrConfig] = None,
|
|
69
|
+
):
|
|
70
|
+
"""
|
|
71
|
+
Initialize converter.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
api_key: Zhipu API key (reads from ZHIPU_API_KEY env var if not provided)
|
|
75
|
+
timeout: Request timeout in seconds (default: 1800)
|
|
76
|
+
enable_layout: Enable layout detection (default: False)
|
|
77
|
+
force_ai: Force all pages to use AI (default: False)
|
|
78
|
+
scan_detection_mode: 扫描检测模式,优化扫描PDF处理
|
|
79
|
+
scan_sample_pages: SAMPLING模式下抽样页数 (default: 3)
|
|
80
|
+
scan_text_threshold: 判定为扫描件的最小文本长度阈值 (default: 50)
|
|
81
|
+
config: Optional GlmOcrConfig instance
|
|
82
|
+
"""
|
|
83
|
+
if glmocr is None:
|
|
84
|
+
raise ImportError(
|
|
85
|
+
"glmocr is required. Install with: pip install markitdown-glmocr[glmocr]"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Use config if provided
|
|
89
|
+
if config:
|
|
90
|
+
self.api_key = api_key or config.api_key
|
|
91
|
+
self.timeout = timeout if timeout != 1800 else config.timeout
|
|
92
|
+
self.enable_layout = (
|
|
93
|
+
enable_layout if enable_layout else config.enable_layout
|
|
94
|
+
)
|
|
95
|
+
self.force_ai = force_ai or config.force_ai
|
|
96
|
+
self.scan_detection_mode = (
|
|
97
|
+
scan_detection_mode
|
|
98
|
+
if scan_detection_mode is not None
|
|
99
|
+
else config.scan_detection_mode
|
|
100
|
+
)
|
|
101
|
+
self.scan_sample_pages = (
|
|
102
|
+
scan_sample_pages
|
|
103
|
+
if scan_sample_pages is not None
|
|
104
|
+
else config.scan_sample_pages
|
|
105
|
+
)
|
|
106
|
+
self.scan_text_threshold = (
|
|
107
|
+
scan_text_threshold
|
|
108
|
+
if scan_text_threshold is not None
|
|
109
|
+
else config.scan_text_threshold
|
|
110
|
+
)
|
|
111
|
+
else:
|
|
112
|
+
self.api_key = api_key
|
|
113
|
+
self.timeout = timeout
|
|
114
|
+
self.enable_layout = enable_layout
|
|
115
|
+
self.force_ai = force_ai
|
|
116
|
+
self.scan_detection_mode = (
|
|
117
|
+
scan_detection_mode
|
|
118
|
+
if scan_detection_mode is not None
|
|
119
|
+
else ScanDetectionMode.SAMPLING
|
|
120
|
+
)
|
|
121
|
+
self.scan_sample_pages = (
|
|
122
|
+
scan_sample_pages if scan_sample_pages is not None else 3
|
|
123
|
+
)
|
|
124
|
+
self.scan_text_threshold = (
|
|
125
|
+
scan_text_threshold if scan_text_threshold is not None else 50
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Lazy init GlmOcr instance
|
|
129
|
+
self._glmocr: Optional[GlmOcr] = None
|
|
130
|
+
|
|
131
|
+
def _get_glmocr(self) -> GlmOcr:
|
|
132
|
+
"""Get or create GlmOcr instance."""
|
|
133
|
+
if self._glmocr is None:
|
|
134
|
+
kwargs = {"timeout": self.timeout, "enable_layout": self.enable_layout}
|
|
135
|
+
if self.api_key:
|
|
136
|
+
kwargs["api_key"] = self.api_key
|
|
137
|
+
self._glmocr = GlmOcr(**kwargs)
|
|
138
|
+
return self._glmocr
|
|
139
|
+
|
|
140
|
+
def accepts(
|
|
141
|
+
self,
|
|
142
|
+
file_stream: BinaryIO,
|
|
143
|
+
stream_info: StreamInfo,
|
|
144
|
+
**kwargs: Any,
|
|
145
|
+
) -> bool:
|
|
146
|
+
mimetype = (stream_info.mimetype or "").lower()
|
|
147
|
+
extension = (stream_info.extension or "").lower()
|
|
148
|
+
|
|
149
|
+
if extension in ACCEPTED_FILE_EXTENSIONS:
|
|
150
|
+
return True
|
|
151
|
+
|
|
152
|
+
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
|
|
153
|
+
if mimetype.startswith(prefix):
|
|
154
|
+
return True
|
|
155
|
+
|
|
156
|
+
return False
|
|
157
|
+
|
|
158
|
+
def convert(
|
|
159
|
+
self,
|
|
160
|
+
file_stream: BinaryIO,
|
|
161
|
+
stream_info: StreamInfo,
|
|
162
|
+
**kwargs: Any,
|
|
163
|
+
) -> DocumentConverterResult:
|
|
164
|
+
if _dependency_exc_info is not None:
|
|
165
|
+
raise MissingDependencyException(
|
|
166
|
+
MISSING_DEPENDENCY_MESSAGE.format(
|
|
167
|
+
converter=type(self).__name__,
|
|
168
|
+
extension=".pdf",
|
|
169
|
+
feature="pdf",
|
|
170
|
+
)
|
|
171
|
+
) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
|
|
172
|
+
|
|
173
|
+
extension = (stream_info.extension or "").lower()
|
|
174
|
+
|
|
175
|
+
logger.info("GlmOcrConverter: 开始转换, 文件类型=%s", extension)
|
|
176
|
+
|
|
177
|
+
# Image files: use glmocr directly
|
|
178
|
+
if extension in (".jpg", ".jpeg", ".png"):
|
|
179
|
+
return self._convert_image(file_stream, extension)
|
|
180
|
+
|
|
181
|
+
# PDF files: use hybrid approach
|
|
182
|
+
return self._convert_pdf(file_stream)
|
|
183
|
+
|
|
184
|
+
def _convert_image(
|
|
185
|
+
self, file_stream: BinaryIO, extension: str = ".png"
|
|
186
|
+
) -> DocumentConverterResult:
|
|
187
|
+
"""Convert image file using glmocr SDK."""
|
|
188
|
+
img_bytes = file_stream.read()
|
|
189
|
+
|
|
190
|
+
logger.info("GlmOcrConverter: 开始 OCR 识别图片, 格式=%s", extension)
|
|
191
|
+
try:
|
|
192
|
+
result = self._get_glmocr().parse(img_bytes)
|
|
193
|
+
except Exception as e:
|
|
194
|
+
logger.error(
|
|
195
|
+
"GlmOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, e
|
|
196
|
+
)
|
|
197
|
+
raise
|
|
198
|
+
|
|
199
|
+
# Check for errors
|
|
200
|
+
d = result.to_dict()
|
|
201
|
+
if "error" in d:
|
|
202
|
+
logger.error(
|
|
203
|
+
"GlmOcrConverter: 图片 OCR 返回错误, 格式=%s, 错误=%s",
|
|
204
|
+
extension,
|
|
205
|
+
d["error"],
|
|
206
|
+
)
|
|
207
|
+
raise RuntimeError(
|
|
208
|
+
f"GlmOcrConverter: glmocr SDK returned error: {d['error']}"
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
markdown = result.markdown_result or ""
|
|
212
|
+
logger.info("GlmOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(markdown))
|
|
213
|
+
return DocumentConverterResult(markdown=markdown)
|
|
214
|
+
|
|
215
|
+
def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
|
|
216
|
+
pdf_stream = io.BytesIO(file_stream.read())
|
|
217
|
+
pdf_bytes = pdf_stream.getvalue() # Keep original bytes for batch OCR
|
|
218
|
+
markdown_parts = []
|
|
219
|
+
|
|
220
|
+
with pdfplumber.open(pdf_stream) as pdf:
|
|
221
|
+
total_pages = len(pdf.pages)
|
|
222
|
+
logger.info("GlmOcrConverter: 开始处理 PDF, 总页数=%d", total_pages)
|
|
223
|
+
|
|
224
|
+
# Optimization: detect if entire PDF is scanned
|
|
225
|
+
all_scanned = self._detect_all_scanned(pdf)
|
|
226
|
+
|
|
227
|
+
if all_scanned and not self.force_ai:
|
|
228
|
+
# Batch mode: upload entire PDF to glmocr SDK (single API call)
|
|
229
|
+
logger.info(
|
|
230
|
+
"GlmOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d",
|
|
231
|
+
total_pages,
|
|
232
|
+
)
|
|
233
|
+
try:
|
|
234
|
+
markdown = self._convert_pdf_batch(pdf_bytes)
|
|
235
|
+
if markdown.strip():
|
|
236
|
+
logger.info(
|
|
237
|
+
"GlmOcrConverter: 批量OCR完成, 输出长度=%d",
|
|
238
|
+
len(markdown),
|
|
239
|
+
)
|
|
240
|
+
return DocumentConverterResult(markdown=markdown)
|
|
241
|
+
except Exception as e:
|
|
242
|
+
logger.error(
|
|
243
|
+
"GlmOcrConverter: 批量OCR失败, 抛出异常让框架fallback到下一个converter, 错误=%s",
|
|
244
|
+
e,
|
|
245
|
+
)
|
|
246
|
+
raise
|
|
247
|
+
|
|
248
|
+
# Per-page processing (PAGE_BY_PAGE mode or batch failed)
|
|
249
|
+
for page_num, page in enumerate(pdf.pages):
|
|
250
|
+
# Choose processing method
|
|
251
|
+
if self.force_ai or all_scanned:
|
|
252
|
+
# All scanned (after batch failed) or force_ai
|
|
253
|
+
logger.info(
|
|
254
|
+
"GlmOcrConverter: 第 %d/%d 页, 使用 glmocr OCR",
|
|
255
|
+
page_num + 1,
|
|
256
|
+
total_pages,
|
|
257
|
+
)
|
|
258
|
+
try:
|
|
259
|
+
markdown = self._convert_with_glmocr(page, page_num)
|
|
260
|
+
except Exception as e:
|
|
261
|
+
logger.error(
|
|
262
|
+
"GlmOcrConverter: 第 %d/%d 页识别异常, 错误=%s",
|
|
263
|
+
page_num + 1,
|
|
264
|
+
e,
|
|
265
|
+
)
|
|
266
|
+
raise
|
|
267
|
+
else:
|
|
268
|
+
# Per-page analysis (PAGE_BY_PAGE mode or non-scanned doc)
|
|
269
|
+
page_type = self._analyze_page(page)
|
|
270
|
+
|
|
271
|
+
if page_type != "plain_text":
|
|
272
|
+
logger.info(
|
|
273
|
+
"GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 glmocr OCR",
|
|
274
|
+
page_num + 1,
|
|
275
|
+
total_pages,
|
|
276
|
+
page_type,
|
|
277
|
+
)
|
|
278
|
+
try:
|
|
279
|
+
markdown = self._convert_with_glmocr(page, page_num)
|
|
280
|
+
except Exception as e:
|
|
281
|
+
logger.error(
|
|
282
|
+
"GlmOcrConverter: 第 %d/%d 页识别异常, 错误=%s",
|
|
283
|
+
page_num + 1,
|
|
284
|
+
e,
|
|
285
|
+
)
|
|
286
|
+
raise
|
|
287
|
+
else:
|
|
288
|
+
logger.info(
|
|
289
|
+
"GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber",
|
|
290
|
+
page_num + 1,
|
|
291
|
+
total_pages,
|
|
292
|
+
page_type,
|
|
293
|
+
)
|
|
294
|
+
markdown = self._extract_text_with_tables(page)
|
|
295
|
+
|
|
296
|
+
if markdown.strip():
|
|
297
|
+
markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}")
|
|
298
|
+
|
|
299
|
+
page.close()
|
|
300
|
+
|
|
301
|
+
markdown = "\n\n".join(markdown_parts).strip()
|
|
302
|
+
logger.info("GlmOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown))
|
|
303
|
+
return DocumentConverterResult(markdown=markdown)
|
|
304
|
+
|
|
305
|
+
def _convert_pdf_batch(self, pdf_bytes: bytes) -> str:
|
|
306
|
+
"""Convert entire PDF in a single API call.
|
|
307
|
+
|
|
308
|
+
More efficient for scanned PDFs: one API call instead of N calls for N pages.
|
|
309
|
+
|
|
310
|
+
Args:
|
|
311
|
+
pdf_bytes: Raw PDF file content.
|
|
312
|
+
|
|
313
|
+
Returns:
|
|
314
|
+
Markdown text from all pages.
|
|
315
|
+
"""
|
|
316
|
+
logger.info(
|
|
317
|
+
"GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes)
|
|
318
|
+
)
|
|
319
|
+
result = self._get_glmocr().parse(pdf_bytes)
|
|
320
|
+
|
|
321
|
+
# Check for errors
|
|
322
|
+
d = result.to_dict()
|
|
323
|
+
if "error" in d:
|
|
324
|
+
logger.error(
|
|
325
|
+
"GlmOcrConverter: 批量OCR返回错误, 错误=%s",
|
|
326
|
+
d["error"],
|
|
327
|
+
)
|
|
328
|
+
raise RuntimeError(
|
|
329
|
+
f"GlmOcrConverter: glmocr SDK batch OCR error: {d['error']}"
|
|
330
|
+
)
|
|
331
|
+
|
|
332
|
+
markdown = result.markdown_result or ""
|
|
333
|
+
return markdown
|
|
334
|
+
|
|
335
|
+
def _analyze_page(self, page: Any) -> str:
|
|
336
|
+
"""Analyze page content type."""
|
|
337
|
+
# Check for images
|
|
338
|
+
if hasattr(page, "images") and page.images:
|
|
339
|
+
return "complex"
|
|
340
|
+
|
|
341
|
+
# Check for tables
|
|
342
|
+
tables = page.find_tables()
|
|
343
|
+
if tables:
|
|
344
|
+
return "complex"
|
|
345
|
+
|
|
346
|
+
# Check for graphics/curves
|
|
347
|
+
if hasattr(page, "curves") and page.curves:
|
|
348
|
+
return "complex"
|
|
349
|
+
|
|
350
|
+
return "plain_text"
|
|
351
|
+
|
|
352
|
+
def _is_scanned_page(self, page: Any) -> bool:
|
|
353
|
+
"""Check if a page is likely a scanned image.
|
|
354
|
+
|
|
355
|
+
A page is considered scanned if:
|
|
356
|
+
1. It contains images, AND
|
|
357
|
+
2. It has very little extractable text (below threshold)
|
|
358
|
+
|
|
359
|
+
Args:
|
|
360
|
+
page: pdfplumber page object
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
True if the page appears to be a scanned image
|
|
364
|
+
"""
|
|
365
|
+
# Must have images to be a scan
|
|
366
|
+
has_images = hasattr(page, "images") and bool(page.images)
|
|
367
|
+
if not has_images:
|
|
368
|
+
return False
|
|
369
|
+
|
|
370
|
+
# Check extractable text length
|
|
371
|
+
try:
|
|
372
|
+
text = page.extract_text() or ""
|
|
373
|
+
text_len = len(text.strip())
|
|
374
|
+
# If there's substantial text, it might be a mixed page or
|
|
375
|
+
# a digital PDF with embedded images
|
|
376
|
+
if text_len >= self.scan_text_threshold:
|
|
377
|
+
return False
|
|
378
|
+
except Exception:
|
|
379
|
+
# If text extraction fails, assume it's a scan
|
|
380
|
+
return True
|
|
381
|
+
|
|
382
|
+
return True
|
|
383
|
+
|
|
384
|
+
def _detect_all_scanned(self, pdf: Any) -> bool:
|
|
385
|
+
"""Detect if entire PDF is scanned based on scan_detection_mode.
|
|
386
|
+
|
|
387
|
+
Optimization: When first few pages are scanned, we can assume
|
|
388
|
+
all pages are scanned and skip per-page analysis.
|
|
389
|
+
|
|
390
|
+
Args:
|
|
391
|
+
pdf: pdfplumber PDF object
|
|
392
|
+
|
|
393
|
+
Returns:
|
|
394
|
+
True if entire PDF should be treated as scanned
|
|
395
|
+
"""
|
|
396
|
+
if self.scan_detection_mode == ScanDetectionMode.PAGE_BY_PAGE:
|
|
397
|
+
return False
|
|
398
|
+
|
|
399
|
+
total_pages = len(pdf.pages)
|
|
400
|
+
if total_pages == 0:
|
|
401
|
+
return False
|
|
402
|
+
|
|
403
|
+
if self.scan_detection_mode == ScanDetectionMode.FIRST_PAGE_HINT:
|
|
404
|
+
# Check only first page
|
|
405
|
+
first_page = pdf.pages[0]
|
|
406
|
+
is_scanned = self._is_scanned_page(first_page)
|
|
407
|
+
first_page.close()
|
|
408
|
+
if is_scanned:
|
|
409
|
+
logger.info(
|
|
410
|
+
"GlmOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR"
|
|
411
|
+
)
|
|
412
|
+
return is_scanned
|
|
413
|
+
|
|
414
|
+
if self.scan_detection_mode == ScanDetectionMode.SAMPLING:
|
|
415
|
+
# Sample first N pages
|
|
416
|
+
sample_count = min(self.scan_sample_pages, total_pages)
|
|
417
|
+
scanned_count = 0
|
|
418
|
+
|
|
419
|
+
for i in range(sample_count):
|
|
420
|
+
page = pdf.pages[i]
|
|
421
|
+
if self._is_scanned_page(page):
|
|
422
|
+
scanned_count += 1
|
|
423
|
+
|
|
424
|
+
# If majority of sampled pages are scanned, treat all as scanned
|
|
425
|
+
majority_threshold = sample_count // 2 + 1
|
|
426
|
+
all_scanned = scanned_count >= majority_threshold
|
|
427
|
+
|
|
428
|
+
if all_scanned:
|
|
429
|
+
logger.info(
|
|
430
|
+
"GlmOcrConverter: 抽样检测 %d/%d 页为扫描件, 模式=SAMPLING, 全文档使用OCR",
|
|
431
|
+
scanned_count,
|
|
432
|
+
sample_count,
|
|
433
|
+
)
|
|
434
|
+
|
|
435
|
+
return all_scanned
|
|
436
|
+
|
|
437
|
+
return False
|
|
438
|
+
|
|
439
|
+
def _convert_with_glmocr(self, page: Any, page_num: int) -> str:
|
|
440
|
+
"""Convert page using glmocr SDK.
|
|
441
|
+
|
|
442
|
+
Raises RuntimeError on OCR failure so the framework can try the next converter.
|
|
443
|
+
"""
|
|
444
|
+
# Render page to image
|
|
445
|
+
img = page.to_image(resolution=150)
|
|
446
|
+
img_bytes = io.BytesIO()
|
|
447
|
+
img.save(img_bytes, format="PNG")
|
|
448
|
+
|
|
449
|
+
logger.info("GlmOcrConverter: glmocr SDK 开始识别第 %d 页", page_num + 1)
|
|
450
|
+
try:
|
|
451
|
+
result = self._get_glmocr().parse(img_bytes.getvalue())
|
|
452
|
+
except Exception as e:
|
|
453
|
+
logger.error(
|
|
454
|
+
"GlmOcrConverter: glmocr SDK 第 %d 页识别异常, 错误=%s", page_num + 1, e
|
|
455
|
+
)
|
|
456
|
+
raise
|
|
457
|
+
|
|
458
|
+
# Check for errors
|
|
459
|
+
d = result.to_dict()
|
|
460
|
+
if "error" in d:
|
|
461
|
+
logger.error(
|
|
462
|
+
"GlmOcrConverter: glmocr SDK 第 %d 页返回错误, 错误=%s",
|
|
463
|
+
page_num + 1,
|
|
464
|
+
d["error"],
|
|
465
|
+
)
|
|
466
|
+
raise RuntimeError(
|
|
467
|
+
f"GlmOcrConverter: glmocr SDK returned error on page {page_num + 1}: {d['error']}"
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
markdown = result.markdown_result or ""
|
|
471
|
+
logger.info(
|
|
472
|
+
"GlmOcrConverter: glmocr SDK 第 %d 页识别完成, 输出长度=%d",
|
|
473
|
+
page_num + 1,
|
|
474
|
+
len(markdown),
|
|
475
|
+
)
|
|
476
|
+
return markdown
|
|
477
|
+
|
|
478
|
+
def _extract_text_with_tables(self, page: Any) -> str:
|
|
479
|
+
"""Extract text and tables from page."""
|
|
480
|
+
parts = []
|
|
481
|
+
|
|
482
|
+
# Extract text
|
|
483
|
+
text = page.extract_text() or ""
|
|
484
|
+
if text.strip():
|
|
485
|
+
parts.append(text.strip())
|
|
486
|
+
|
|
487
|
+
# Extract tables
|
|
488
|
+
try:
|
|
489
|
+
tables = page.extract_tables()
|
|
490
|
+
if tables:
|
|
491
|
+
for table in tables:
|
|
492
|
+
if table:
|
|
493
|
+
md_table = self._table_to_markdown(table)
|
|
494
|
+
if md_table.strip():
|
|
495
|
+
parts.append(md_table)
|
|
496
|
+
except Exception:
|
|
497
|
+
pass
|
|
498
|
+
|
|
499
|
+
return "\n\n".join(parts)
|
|
500
|
+
|
|
501
|
+
def _table_to_markdown(self, table: list[list[str]]) -> str:
|
|
502
|
+
"""Convert table to Markdown."""
|
|
503
|
+
if not table:
|
|
504
|
+
return ""
|
|
505
|
+
|
|
506
|
+
# Filter None values
|
|
507
|
+
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
|
508
|
+
|
|
509
|
+
# Filter empty rows
|
|
510
|
+
table = [row for row in table if any(cell.strip() for cell in row)]
|
|
511
|
+
|
|
512
|
+
if not table:
|
|
513
|
+
return ""
|
|
514
|
+
|
|
515
|
+
# Calculate column widths
|
|
516
|
+
col_widths = [
|
|
517
|
+
max(len(str(row[i])) if i < len(row) else 0 for row in table)
|
|
518
|
+
for i in range(max(len(row) for row in table))
|
|
519
|
+
]
|
|
520
|
+
|
|
521
|
+
# Format table
|
|
522
|
+
lines = []
|
|
523
|
+
for row_idx, row in enumerate(table):
|
|
524
|
+
padded_row = row + [""] * (len(col_widths) - len(row))
|
|
525
|
+
line = (
|
|
526
|
+
"| "
|
|
527
|
+
+ " | ".join(
|
|
528
|
+
str(cell).ljust(width)
|
|
529
|
+
for cell, width in zip(padded_row, col_widths)
|
|
530
|
+
)
|
|
531
|
+
+ " |"
|
|
532
|
+
)
|
|
533
|
+
lines.append(line)
|
|
534
|
+
|
|
535
|
+
if row_idx == 0:
|
|
536
|
+
sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
|
|
537
|
+
lines.append(sep)
|
|
538
|
+
|
|
539
|
+
return "\n".join(lines)
|
|
540
|
+
|
|
541
|
+
def close(self):
|
|
542
|
+
"""Close the GlmOcr instance."""
|
|
543
|
+
if self._glmocr:
|
|
544
|
+
self._glmocr.close()
|
|
545
|
+
self._glmocr = None
|
|
546
|
+
|
|
547
|
+
def __enter__(self):
|
|
548
|
+
return self
|
|
549
|
+
|
|
550
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
551
|
+
self.close()
|
|
@@ -1,33 +1,46 @@
|
|
|
1
1
|
"""Plugin registration for markitdown-glmocr."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Any
|
|
5
|
+
|
|
4
6
|
from markitdown import MarkItDown
|
|
5
7
|
|
|
6
8
|
from ._converter import GlmOcrConverter
|
|
7
9
|
|
|
8
|
-
|
|
9
10
|
__plugin_interface_version__ = 1
|
|
10
11
|
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
|
|
13
16
|
"""
|
|
14
17
|
Register markitdown-glmocr converter.
|
|
15
|
-
|
|
18
|
+
|
|
16
19
|
Config sources (priority high to low):
|
|
17
20
|
1. kwargs parameters
|
|
18
21
|
2. Environment variables (ZHIPU_API_KEY)
|
|
19
22
|
3. .env file
|
|
20
23
|
4. Built-in defaults
|
|
21
24
|
"""
|
|
25
|
+
logger.info("markitdown-glmocr: 开始注册插件")
|
|
26
|
+
|
|
22
27
|
# Register converter
|
|
28
|
+
# Priority -1.0: same level as PaddleOcrConverter,
|
|
29
|
+
# the upper-level agent's skills control which plugin to call first.
|
|
23
30
|
PRIORITY_GLMOCR = -1.0
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
GlmOcrConverter(
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
converter = GlmOcrConverter(
|
|
27
34
|
api_key=kwargs.get("api_key"),
|
|
28
35
|
timeout=kwargs.get("timeout", 1800),
|
|
29
36
|
enable_layout=kwargs.get("enable_layout", False),
|
|
30
37
|
force_ai=kwargs.get("force_ai", False),
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
|
|
38
|
+
)
|
|
39
|
+
markitdown.register_converter(
|
|
40
|
+
converter,
|
|
41
|
+
priority=PRIORITY_GLMOCR,
|
|
42
|
+
)
|
|
43
|
+
logger.info("markitdown-glmocr: 插件注册成功, priority=%.1f", PRIORITY_GLMOCR)
|
|
44
|
+
except Exception as e:
|
|
45
|
+
logger.error("markitdown-glmocr: 插件注册失败, 错误=%s", e)
|
|
46
|
+
raise
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.1.0"
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
"""Configuration for markitdown-glmocr."""
|
|
2
|
-
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
@dataclass
|
|
7
|
-
class GlmOcrConfig:
|
|
8
|
-
"""markitdown-glmocr configuration.
|
|
9
|
-
|
|
10
|
-
Configuration priority (high to low):
|
|
11
|
-
1. Constructor kwargs
|
|
12
|
-
2. Environment variables
|
|
13
|
-
3. .env file
|
|
14
|
-
4. Built-in defaults
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
# API configuration
|
|
18
|
-
api_key: str = "" # Reads from ZHIPU_API_KEY by default
|
|
19
|
-
|
|
20
|
-
# OCR configuration
|
|
21
|
-
timeout: int = 1800
|
|
22
|
-
enable_layout: bool = False
|
|
23
|
-
|
|
24
|
-
# Processing strategy
|
|
25
|
-
force_ai: bool = False
|
|
@@ -1,304 +0,0 @@
|
|
|
1
|
-
"""GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion."""
|
|
2
|
-
|
|
3
|
-
import io
|
|
4
|
-
import sys
|
|
5
|
-
from typing import Any, BinaryIO, Optional
|
|
6
|
-
|
|
7
|
-
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
8
|
-
from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
9
|
-
|
|
10
|
-
from ._config import GlmOcrConfig
|
|
11
|
-
|
|
12
|
-
# Import dependencies
|
|
13
|
-
_dependency_exc_info = None
|
|
14
|
-
try:
|
|
15
|
-
import pdfminer
|
|
16
|
-
import pdfminer.high_level
|
|
17
|
-
import pdfplumber
|
|
18
|
-
except ImportError:
|
|
19
|
-
_dependency_exc_info = sys.exc_info()
|
|
20
|
-
|
|
21
|
-
# glmocr SDK
|
|
22
|
-
try:
|
|
23
|
-
import glmocr
|
|
24
|
-
from glmocr import GlmOcr
|
|
25
|
-
except ImportError:
|
|
26
|
-
glmocr = None
|
|
27
|
-
GlmOcr = None
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
ACCEPTED_MIME_TYPE_PREFIXES = [
|
|
31
|
-
"application/pdf",
|
|
32
|
-
"application/x-pdf",
|
|
33
|
-
"image/jpeg",
|
|
34
|
-
"image/png",
|
|
35
|
-
]
|
|
36
|
-
|
|
37
|
-
ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class GlmOcrConverter(DocumentConverter):
    """
    Intelligent PDF/Image converter using glmocr SDK.

    Features:
    - Auto-detect page content type (plain text vs images/tables)
    - Plain text pages use pdfplumber/pdfminer (fast, free)
    - Complex pages use glmocr SDK for AI-powered OCR
    - Image files (PNG, JPG) use glmocr SDK directly
    - One-liner: glmocr.parse("document.pdf") handles everything

    The converter can be used as a context manager; leaving the ``with``
    block calls :meth:`close`.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        timeout: int = 1800,
        enable_layout: bool = False,
        force_ai: bool = False,
        config: Optional[GlmOcrConfig] = None,
    ):
        """
        Initialize converter.

        Args:
            api_key: Zhipu API key (reads from ZHIPU_API_KEY env var if not provided)
            timeout: Request timeout in seconds (default: 1800)
            enable_layout: Enable layout detection (default: False)
            force_ai: Force all pages to use AI (default: False)
            config: Optional GlmOcrConfig instance; its values are used for
                every argument that was left at its default.

        Raises:
            ImportError: If the optional ``glmocr`` package is not installed.
        """
        if glmocr is None:
            raise ImportError(
                "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]"
            )

        if config:
            # Explicit arguments win over the config object. "Explicit" is
            # detected by comparing against the signature defaults, so passing
            # timeout=1800 or a False flag cannot override a config value.
            self.api_key = api_key or config.api_key
            self.timeout = timeout if timeout != 1800 else config.timeout
            self.enable_layout = enable_layout or config.enable_layout
            self.force_ai = force_ai or config.force_ai
        else:
            self.api_key = api_key
            self.timeout = timeout
            self.enable_layout = enable_layout
            self.force_ai = force_ai

        # The SDK client is created lazily on first use (see _get_glmocr).
        self._glmocr: Optional[GlmOcr] = None

    def _get_glmocr(self) -> GlmOcr:
        """Return the cached GlmOcr client, creating it on first call."""
        if self._glmocr is None:
            kwargs = {"timeout": self.timeout, "enable_layout": self.enable_layout}
            # Only forward the key when one was configured; otherwise the
            # keyword is omitted entirely and the SDK's own default applies.
            if self.api_key:
                kwargs["api_key"] = self.api_key
            self._glmocr = GlmOcr(**kwargs)
        return self._glmocr

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the extension or MIME type is a supported PDF/image type."""
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        mimetype = (stream_info.mimetype or "").lower()
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a PDF or image stream to Markdown.

        Images are sent straight to the glmocr SDK; PDFs go through the
        per-page hybrid pipeline in :meth:`_convert_pdf`.

        Raises:
            MissingDependencyException: If a PDF is being converted and the
                pdfminer/pdfplumber imports failed at module load time.
        """
        extension = (stream_info.extension or "").lower()
        mimetype = (stream_info.mimetype or "").lower()

        # Image files: use glmocr directly. The MIME type is consulted as well
        # because accepts() admits streams that carry no file extension.
        if self._is_image(extension, mimetype):
            return self._convert_image(file_stream, extension or ".png")

        # The PDF libraries are only needed on the PDF path, so the
        # dependency check is deferred until we know we are on it.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )

        # PDF files: use hybrid approach
        return self._convert_pdf(file_stream)

    @staticmethod
    def _is_image(extension: str, mimetype: str) -> bool:
        """Decide image vs. PDF; a known extension takes priority over the MIME type."""
        if extension in (".jpg", ".jpeg", ".png"):
            return True
        if extension == ".pdf":
            return False
        return mimetype.startswith(("image/jpeg", "image/png"))

    def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
        """
        Convert image file using glmocr SDK.

        ``extension`` is accepted for interface compatibility but is not
        used: the raw bytes are handed to the SDK as-is.

        Failures never raise. A result whose ``to_dict()`` contains an
        "error" key yields empty Markdown, and any exception is reported as
        an HTML comment in the returned Markdown.
        """
        img_bytes = file_stream.read()

        try:
            result = self._get_glmocr().parse(img_bytes)

            # Check for errors
            if "error" in result.to_dict():
                return DocumentConverterResult(markdown="")

            return DocumentConverterResult(
                markdown=result.markdown_result or ""
            )
        except Exception as e:
            # Best effort: surface the failure in the output instead of raising.
            return DocumentConverterResult(
                markdown=f"<!-- Error converting image: {e} -->"
            )

    def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
        """
        Convert a PDF page by page, falling back to pdfminer text extraction.

        The fallback runs exactly once, either because the pdfplumber
        pipeline raised or because it produced no Markdown at all.
        """
        # Buffer the whole input so it can be rewound for the fallback.
        pdf_stream = io.BytesIO(file_stream.read())

        try:
            markdown = self._convert_pdf_pages(pdf_stream)
        except Exception:
            # Deliberate best effort: any failure in the page pipeline
            # degrades to plain text extraction below.
            markdown = ""

        if not markdown:
            pdf_stream.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

        return DocumentConverterResult(markdown=markdown)

    def _convert_pdf_pages(self, pdf_stream: BinaryIO) -> str:
        """Run the per-page hybrid pipeline and join non-empty pages under "## Page N" headings."""
        markdown_parts = []

        with pdfplumber.open(pdf_stream) as pdf:
            for page_num, page in enumerate(pdf.pages):
                try:
                    # Skip the layout analysis entirely when AI is forced.
                    if self.force_ai or self._analyze_page(page) != "plain_text":
                        # Complex content: use glmocr
                        page_markdown = self._convert_with_glmocr(page, page_num)
                    else:
                        # Plain text: use pdfplumber
                        page_markdown = self._extract_text_with_tables(page)
                finally:
                    # Close the page even when its processing fails.
                    page.close()

                if page_markdown.strip():
                    markdown_parts.append(f"## Page {page_num + 1}\n\n{page_markdown}")

        return "\n\n".join(markdown_parts).strip()

    def _analyze_page(self, page: Any) -> str:
        """
        Analyze page content type.

        Returns:
            "complex" if the page has images, detected tables, or curves;
            otherwise "plain_text".
        """
        # Check for images
        if hasattr(page, "images") and page.images:
            return "complex"

        # Check for tables
        if page.find_tables():
            return "complex"

        # Check for graphics/curves
        if hasattr(page, "curves") and page.curves:
            return "complex"

        return "plain_text"

    def _convert_with_glmocr(self, page: Any, page_num: int) -> str:
        """
        Convert page using glmocr SDK.

        The page is rendered to a PNG and sent to the SDK. On an SDK error
        result or any exception, the page falls back to
        :meth:`_extract_text_with_tables`. ``page_num`` is currently unused.
        """
        try:
            # Render page to image
            img = page.to_image(resolution=150)
            img_bytes = io.BytesIO()
            img.save(img_bytes, format="PNG")
            result = self._get_glmocr().parse(img_bytes.getvalue())

            # Check for errors
            if "error" in result.to_dict():
                return self._extract_text_with_tables(page)

            return result.markdown_result or ""

        except Exception:
            # Best effort: OCR failures degrade to local text extraction.
            return self._extract_text_with_tables(page)

    def _extract_text_with_tables(self, page: Any) -> str:
        """Extract the page text followed by its tables rendered as Markdown."""
        parts = []

        # Extract text
        text = (page.extract_text() or "").strip()
        if text:
            parts.append(text)

        # Extract tables
        try:
            for table in page.extract_tables() or []:
                if not table:
                    continue
                md_table = self._table_to_markdown(table)
                if md_table.strip():
                    parts.append(md_table)
        except Exception:
            # Table extraction is optional; keep whatever was collected so far.
            pass

        return "\n\n".join(parts)

    @staticmethod
    def _clean_cell(cell: Any) -> str:
        """Normalise one table cell so it cannot break a Markdown table row."""
        if cell is None:
            return ""
        # Collapse newlines and whitespace runs: a raw newline would end the row.
        text = " ".join(str(cell).split())
        # A bare pipe would be parsed as a column separator.
        return text.replace("|", "\\|")

    def _table_to_markdown(self, table: list[list[str]]) -> str:
        """
        Convert table to Markdown.

        The first non-empty row becomes the header. ``None`` cells become
        empty strings, non-string cells are stringified, short rows are
        padded, and rows with no content are dropped.

        Returns:
            The Markdown table, or "" when there is nothing to render.
        """
        if not table:
            return ""

        cleaned = [[self._clean_cell(cell) for cell in row] for row in table if row]

        # Filter empty rows
        rows = [row for row in cleaned if any(row)]
        if not rows:
            return ""

        # Pad every row to the widest row, then calculate column widths.
        n_cols = max(len(row) for row in rows)
        rows = [row + [""] * (n_cols - len(row)) for row in rows]
        col_widths = [max(len(row[i]) for row in rows) for i in range(n_cols)]

        # Format table
        lines = []
        for row_idx, row in enumerate(rows):
            lines.append(
                "| "
                + " | ".join(cell.ljust(width) for cell, width in zip(row, col_widths))
                + " |"
            )

            if row_idx == 0:
                # Header separator; "+ 2" covers the padding spaces around each cell.
                lines.append("|" + "|".join("-" * (w + 2) for w in col_widths) + "|")

        return "\n".join(lines)

    def close(self):
        """Close the GlmOcr instance."""
        if self._glmocr:
            self._glmocr.close()
            self._glmocr = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
|
|
File without changes
|
|
File without changes
|