markitdown-glmocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitdown_glmocr/__about__.py +1 -0
- markitdown_glmocr/__init__.py +12 -0
- markitdown_glmocr/_config.py +25 -0
- markitdown_glmocr/_converter.py +304 -0
- markitdown_glmocr/_plugin.py +33 -0
- markitdown_glmocr-0.1.0.dist-info/METADATA +223 -0
- markitdown_glmocr-0.1.0.dist-info/RECORD +9 -0
- markitdown_glmocr-0.1.0.dist-info/WHEEL +4 -0
- markitdown_glmocr-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the markitdown-glmocr package.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""markitdown-glmocr: Intelligent PDF to Markdown converter using glmocr SDK."""
|
|
2
|
+
|
|
3
|
+
from ._plugin import register_converters
|
|
4
|
+
from ._config import GlmOcrConfig
|
|
5
|
+
from ._converter import GlmOcrConverter
|
|
6
|
+
|
|
7
|
+
# Plugin interface version declared by this package.
# NOTE(review): presumably read by MarkItDown's plugin loader — confirm.
__plugin_interface_version__ = 1
|
|
8
|
+
__all__ = [
|
|
9
|
+
"register_converters",
|
|
10
|
+
"GlmOcrConfig",
|
|
11
|
+
"GlmOcrConverter",
|
|
12
|
+
]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Configuration for markitdown-glmocr."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class GlmOcrConfig:
    """Settings bundle for markitdown-glmocr.

    Value resolution implemented by this class (high to low):
        1. Explicit constructor arguments
        2. The ``ZHIPU_API_KEY`` environment variable (``api_key`` only)
        3. Built-in defaults

    NOTE(review): a ``.env`` file and other environment variables are not
    read here; presumably the glmocr SDK handles those itself — confirm.

    Attributes:
        api_key: Zhipu API key. When left empty it is filled from the
            ``ZHIPU_API_KEY`` environment variable, if that is set.
        timeout: Request timeout in seconds.
        enable_layout: Whether layout detection is requested.
        force_ai: Whether every PDF page is sent through OCR.
    """

    # API configuration
    api_key: str = ""  # Falls back to ZHIPU_API_KEY when left empty

    # OCR configuration
    timeout: int = 1800
    enable_layout: bool = False

    # Processing strategy
    force_ai: bool = False

    def __post_init__(self) -> None:
        """Fill ``api_key`` from ``ZHIPU_API_KEY`` when no key was given."""
        if not self.api_key:
            # Local import keeps the module's import block untouched.
            import os

            self.api_key = os.environ.get("ZHIPU_API_KEY", "")
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any, BinaryIO, Optional
|
|
6
|
+
|
|
7
|
+
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
|
8
|
+
from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
|
9
|
+
|
|
10
|
+
from ._config import GlmOcrConfig
|
|
11
|
+
|
|
12
|
+
# Import dependencies
|
|
13
|
+
_dependency_exc_info = None
|
|
14
|
+
try:
|
|
15
|
+
import pdfminer
|
|
16
|
+
import pdfminer.high_level
|
|
17
|
+
import pdfplumber
|
|
18
|
+
except ImportError:
|
|
19
|
+
_dependency_exc_info = sys.exc_info()
|
|
20
|
+
|
|
21
|
+
# glmocr SDK
|
|
22
|
+
try:
|
|
23
|
+
import glmocr
|
|
24
|
+
from glmocr import GlmOcr
|
|
25
|
+
except ImportError:
|
|
26
|
+
glmocr = None
|
|
27
|
+
GlmOcr = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# MIME type prefixes the converter accepts: PDF first, then images.
ACCEPTED_MIME_TYPE_PREFIXES = ["application/pdf", "application/x-pdf"] + [
    "image/jpeg",
    "image/png",
]

# File extensions the converter accepts (compared in lower case).
ACCEPTED_FILE_EXTENSIONS = [
    ".pdf",
    ".jpg",
    ".jpeg",
    ".png",
]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class GlmOcrConverter(DocumentConverter):
|
|
41
|
+
"""
|
|
42
|
+
Intelligent PDF/Image converter using glmocr SDK.
|
|
43
|
+
|
|
44
|
+
Features:
|
|
45
|
+
- Auto-detect page content type (plain text vs images/tables)
|
|
46
|
+
- Plain text pages use pdfplumber/pdfminer (fast, free)
|
|
47
|
+
- Complex pages use glmocr SDK for AI-powered OCR
|
|
48
|
+
- Image files (PNG, JPG) use glmocr SDK directly
|
|
49
|
+
- One-liner: glmocr.parse("document.pdf") handles everything
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
def __init__(
    self,
    api_key: Optional[str] = None,
    timeout: int = 1800,
    enable_layout: bool = False,
    force_ai: bool = False,
    config: Optional[GlmOcrConfig] = None,
):
    """Store the OCR settings; the GlmOcr client itself is created lazily.

    Args:
        api_key: Zhipu API key. Only forwarded to the SDK when non-empty.
        timeout: Request timeout in seconds (default: 1800).
        enable_layout: Enable layout detection (default: False).
        force_ai: Send every PDF page through OCR (default: False).
        config: Optional GlmOcrConfig supplying fallback values.

    Raises:
        ImportError: If the ``glmocr`` package could not be imported.

    NOTE(review): when ``config`` is given, an argument left at its default
    (``timeout == 1800``, a falsy ``api_key``/``enable_layout``/``force_ai``)
    yields to the config value, so an explicit default cannot override the
    config.
    """
    if glmocr is None:
        raise ImportError(
            "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]"
        )

    if not config:
        # No config object: the arguments are taken as-is.
        self.api_key = api_key
        self.timeout = timeout
        self.enable_layout = enable_layout
        self.force_ai = force_ai
    else:
        # Non-default arguments win; everything else comes from the config.
        self.api_key = api_key or config.api_key
        self.timeout = config.timeout if timeout == 1800 else timeout
        self.enable_layout = enable_layout or config.enable_layout
        self.force_ai = force_ai or config.force_ai

    # Created on first use by _get_glmocr().
    self._glmocr: Optional[GlmOcr] = None
|
|
89
|
+
|
|
90
|
+
def _get_glmocr(self) -> GlmOcr:
    """Return the cached GlmOcr client, building it on first use."""
    if self._glmocr is not None:
        return self._glmocr

    options = {"timeout": self.timeout, "enable_layout": self.enable_layout}
    # Only pass a key when one is set; otherwise the SDK call gets none.
    if self.api_key:
        options["api_key"] = self.api_key

    self._glmocr = GlmOcr(**options)
    return self._glmocr
|
|
98
|
+
|
|
99
|
+
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Report whether this converter handles the described stream.

    A stream is accepted when its lower-cased extension is listed in
    ACCEPTED_FILE_EXTENSIONS or its lower-cased MIME type starts with one
    of ACCEPTED_MIME_TYPE_PREFIXES. ``file_stream`` is not read.
    """
    ext = (stream_info.extension or "").lower()
    mime = (stream_info.mimetype or "").lower()

    return ext in ACCEPTED_FILE_EXTENSIONS or any(
        mime.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES
    )
|
|
116
|
+
|
|
117
|
+
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert an accepted PDF or image stream to Markdown.

    Args:
        file_stream: Binary stream holding the document bytes.
        stream_info: Stream metadata; ``extension`` and ``mimetype`` are
            read, and either may be empty or None.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The result of ``_convert_image`` for JPEG/PNG input, otherwise the
        result of ``_convert_pdf``.

    Raises:
        MissingDependencyException: If importing pdfminer/pdfplumber failed
            when this module was loaded.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(
            _dependency_exc_info[2]
        )

    extension = (stream_info.extension or "").lower()
    mimetype = (stream_info.mimetype or "").lower()

    # accepts() admits images by MIME type as well as by extension, so the
    # routing has to honour both; otherwise an extension-less image stream
    # would be parsed as a PDF. An explicit ".pdf" extension still wins.
    image_extensions = (".jpg", ".jpeg", ".png")
    image_mimetypes = ("image/jpeg", "image/png")
    is_image = extension in image_extensions or (
        extension != ".pdf" and mimetype.startswith(image_mimetypes)
    )

    # Image files: use glmocr directly
    if is_image:
        return self._convert_image(file_stream, extension)

    # PDF files: use hybrid approach
    return self._convert_pdf(file_stream)
|
|
142
|
+
|
|
143
|
+
def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
    """Run OCR on a whole image stream and wrap the Markdown result.

    Args:
        file_stream: Binary stream with the image bytes; read to the end.
        extension: Image file extension. Not used by this method.

    Returns:
        A result holding the SDK's Markdown, an empty string when the SDK
        result dict contains an ``"error"`` key, or an HTML comment with
        the exception text when the OCR call raises.
    """
    payload = file_stream.read()

    try:
        ocr_result = self._get_glmocr().parse(payload)
        # A result dict carrying "error" is treated as "no output".
        failed = "error" in ocr_result.to_dict()
        text = "" if failed else (ocr_result.markdown_result or "")
        return DocumentConverterResult(markdown=text)
    except Exception as e:
        return DocumentConverterResult(
            markdown=f"<!-- Error converting image: {e} -->"
        )
|
|
162
|
+
|
|
163
|
+
def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
    """Convert a PDF page by page, with a whole-file pdfminer fallback.

    Each page is either sent through ``_convert_with_glmocr`` (when
    ``force_ai`` is set or ``_analyze_page`` does not report
    ``"plain_text"``) or extracted with ``_extract_text_with_tables``.
    Non-empty pages are emitted under a ``## Page N`` heading.

    If the page-by-page pass raises or produces no text, the whole file is
    extracted once with ``pdfminer.high_level.extract_text``.

    Args:
        file_stream: Binary stream with the PDF bytes; read to the end.

    Returns:
        A result holding the combined Markdown (possibly empty).
    """
    # Buffer the input so it can be rewound for the pdfminer fallback.
    pdf_stream = io.BytesIO(file_stream.read())
    markdown = ""

    try:
        sections = []
        with pdfplumber.open(pdf_stream) as pdf:
            for page_num, page in enumerate(pdf.pages):
                try:
                    # With force_ai the analysis result is irrelevant, so
                    # the page analysis is skipped entirely.
                    use_ai = (
                        self.force_ai
                        or self._analyze_page(page) != "plain_text"
                    )
                    if use_ai:
                        # Complex content: use glmocr
                        page_markdown = self._convert_with_glmocr(page, page_num)
                    else:
                        # Plain text: use pdfplumber
                        page_markdown = self._extract_text_with_tables(page)

                    if page_markdown.strip():
                        sections.append(
                            f"## Page {page_num + 1}\n\n{page_markdown}"
                        )
                finally:
                    # Close the page even when processing it failed.
                    page.close()

        markdown = "\n\n".join(sections).strip()
    except Exception:
        # Best effort: any failure in the page-by-page pass discards its
        # output and hands the whole document to pdfminer below.
        markdown = ""

    # Single fallback, used both after a failure and for empty output.
    if not markdown:
        pdf_stream.seek(0)
        markdown = pdfminer.high_level.extract_text(pdf_stream) or ""

    return DocumentConverterResult(markdown=markdown)
|
|
199
|
+
|
|
200
|
+
def _analyze_page(self, page: Any) -> str:
|
|
201
|
+
"""Analyze page content type."""
|
|
202
|
+
# Check for images
|
|
203
|
+
if hasattr(page, "images") and page.images:
|
|
204
|
+
return "complex"
|
|
205
|
+
|
|
206
|
+
# Check for tables
|
|
207
|
+
tables = page.find_tables()
|
|
208
|
+
if tables:
|
|
209
|
+
return "complex"
|
|
210
|
+
|
|
211
|
+
# Check for graphics/curves
|
|
212
|
+
if hasattr(page, "curves") and page.curves:
|
|
213
|
+
return "complex"
|
|
214
|
+
|
|
215
|
+
return "plain_text"
|
|
216
|
+
|
|
217
|
+
def _convert_with_glmocr(self, page: Any, page_num: int) -> str:
|
|
218
|
+
"""Convert page using glmocr SDK."""
|
|
219
|
+
try:
|
|
220
|
+
# Render page to image
|
|
221
|
+
img = page.to_image(resolution=150)
|
|
222
|
+
img_bytes = io.BytesIO()
|
|
223
|
+
img.save(img_bytes, format="PNG")
|
|
224
|
+
result = self._get_glmocr().parse(img_bytes.getvalue())
|
|
225
|
+
|
|
226
|
+
# Check for errors
|
|
227
|
+
d = result.to_dict()
|
|
228
|
+
if "error" in d:
|
|
229
|
+
return self._extract_text_with_tables(page)
|
|
230
|
+
|
|
231
|
+
return result.markdown_result or ""
|
|
232
|
+
|
|
233
|
+
except Exception:
|
|
234
|
+
return self._extract_text_with_tables(page)
|
|
235
|
+
|
|
236
|
+
def _extract_text_with_tables(self, page: Any) -> str:
|
|
237
|
+
"""Extract text and tables from page."""
|
|
238
|
+
parts = []
|
|
239
|
+
|
|
240
|
+
# Extract text
|
|
241
|
+
text = page.extract_text() or ""
|
|
242
|
+
if text.strip():
|
|
243
|
+
parts.append(text.strip())
|
|
244
|
+
|
|
245
|
+
# Extract tables
|
|
246
|
+
try:
|
|
247
|
+
tables = page.extract_tables()
|
|
248
|
+
if tables:
|
|
249
|
+
for table in tables:
|
|
250
|
+
if table:
|
|
251
|
+
md_table = self._table_to_markdown(table)
|
|
252
|
+
if md_table.strip():
|
|
253
|
+
parts.append(md_table)
|
|
254
|
+
except Exception:
|
|
255
|
+
pass
|
|
256
|
+
|
|
257
|
+
return "\n\n".join(parts)
|
|
258
|
+
|
|
259
|
+
def _table_to_markdown(self, table: list[list[str]]) -> str:
|
|
260
|
+
"""Convert table to Markdown."""
|
|
261
|
+
if not table:
|
|
262
|
+
return ""
|
|
263
|
+
|
|
264
|
+
# Filter None values
|
|
265
|
+
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
|
266
|
+
|
|
267
|
+
# Filter empty rows
|
|
268
|
+
table = [row for row in table if any(cell.strip() for cell in row)]
|
|
269
|
+
|
|
270
|
+
if not table:
|
|
271
|
+
return ""
|
|
272
|
+
|
|
273
|
+
# Calculate column widths
|
|
274
|
+
col_widths = [
|
|
275
|
+
max(len(str(row[i])) if i < len(row) else 0 for row in table)
|
|
276
|
+
for i in range(max(len(row) for row in table))
|
|
277
|
+
]
|
|
278
|
+
|
|
279
|
+
# Format table
|
|
280
|
+
lines = []
|
|
281
|
+
for row_idx, row in enumerate(table):
|
|
282
|
+
padded_row = row + [""] * (len(col_widths) - len(row))
|
|
283
|
+
line = "| " + " | ".join(
|
|
284
|
+
str(cell).ljust(width) for cell, width in zip(padded_row, col_widths)
|
|
285
|
+
) + " |"
|
|
286
|
+
lines.append(line)
|
|
287
|
+
|
|
288
|
+
if row_idx == 0:
|
|
289
|
+
sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
|
|
290
|
+
lines.append(sep)
|
|
291
|
+
|
|
292
|
+
return "\n".join(lines)
|
|
293
|
+
|
|
294
|
+
def close(self):
    """Close the GlmOcr client, if one was created, and forget it.

    The handle is cleared before the client's ``close()`` runs, so the
    converter does not keep a half-closed client when that call raises, and
    calling this method again is a no-op.
    """
    client, self._glmocr = self._glmocr, None
    if client is not None:
        client.close()

def __enter__(self):
    """Enter a ``with`` block; returns the converter itself."""
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a ``with`` block by closing the GlmOcr client.

    Returns None, so an exception raised inside the block is not suppressed.
    """
    self.close()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Plugin registration for markitdown-glmocr."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
from markitdown import MarkItDown
|
|
5
|
+
|
|
6
|
+
from ._converter import GlmOcrConverter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Plugin interface version declared by this plugin module.
# NOTE(review): presumably read by MarkItDown's plugin loader — confirm.
__plugin_interface_version__ = 1
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """Build a GlmOcrConverter and register it on ``markitdown``.

    Args:
        markitdown: The MarkItDown instance to register the converter on.
        **kwargs: Optional settings. Only ``api_key``, ``timeout`` (default
            1800), ``enable_layout`` (default False) and ``force_ai``
            (default False) are read; other keys are ignored.

    NOTE(review): no environment variable or ``.env`` file is read in this
    function; presumably the glmocr SDK resolves those itself — confirm.
    """
    PRIORITY_GLMOCR = -1.0

    converter = GlmOcrConverter(
        api_key=kwargs.get("api_key"),
        timeout=kwargs.get("timeout", 1800),
        enable_layout=kwargs.get("enable_layout", False),
        force_ai=kwargs.get("force_ai", False),
    )
    markitdown.register_converter(converter, priority=PRIORITY_GLMOCR)
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markitdown-glmocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Intelligent PDF to Markdown converter using glmocr SDK
|
|
5
|
+
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
7
|
+
Project-URL: Source, https://github.com/microsoft/markitdown
|
|
8
|
+
Author-email: Contributors <noreply@github.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: ai,glm-ocr,glmocr,llm,markitdown,ocr,pdf,vision
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: markitdown>=0.1.0
|
|
19
|
+
Requires-Dist: pdfminer-six>=20251230
|
|
20
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
21
|
+
Requires-Dist: pillow>=9.0.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
24
|
+
Provides-Extra: glmocr
|
|
25
|
+
Requires-Dist: glmocr; extra == 'glmocr'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# markitdown-glmocr
|
|
29
|
+
|
|
30
|
+
智能 PDF/图片转 Markdown 插件:使用 glmocr SDK(智谱 GLM-OCR)识别并提取含图片和表格的页面内容。
|
|
31
|
+
|
|
32
|
+
## 特性
|
|
33
|
+
|
|
34
|
+
- 🔍 **智能检测**:自动识别每页内容类型(纯文本 vs 图片/表格)
|
|
35
|
+
- 📄 **默认解析**:纯文本页面使用 pdfplumber/pdfminer 提取,速度快、成本低
|
|
36
|
+
- 🤖 **AI 增强**:复杂页面(图片、表格)使用 glmocr SDK 转换为 Markdown
|
|
37
|
+
- ⚡ **一行调用**:`glmocr.parse("document.pdf")` 完成 OCR,无需手动截图编码
|
|
38
|
+
- 📊 **结构化输出**:返回 Markdown + JSON 结构(含区域标签、边界框)
|
|
39
|
+
|
|
40
|
+
## 安装
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# 基础安装(不含 glmocr SDK;未安装 glmocr 时创建 GlmOcrConverter 会抛出 ImportError)
|
|
44
|
+
pip install markitdown-glmocr
|
|
45
|
+
|
|
46
|
+
# 安装 AI 功能
|
|
47
|
+
pip install markitdown-glmocr[glmocr]
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## 配置
|
|
51
|
+
|
|
52
|
+
### 环境变量(推荐)
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# 必需:智谱 API Key
|
|
56
|
+
export ZHIPU_API_KEY="your-zhipu-api-key"
|
|
57
|
+
|
|
58
|
+
# 可选
|
|
59
|
+
export GLMOCR_MODEL="glm-ocr" # 模型名称
|
|
60
|
+
export GLMOCR_TIMEOUT="600" # 请求超时(秒)
|
|
61
|
+
export GLMOCR_ENABLE_LAYOUT="true" # 启用布局检测
|
|
62
|
+
export GLMOCR_LOG_LEVEL="INFO" # 日志级别
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### 配置优先级
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
构造函数参数 > 环境变量 > .env 文件 > config.yaml > 内置默认值
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### 本地敏感配置
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# 创建 .env 文件(自动读取)
|
|
75
|
+
echo "ZHIPU_API_KEY=your-api-key" > .env
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## 使用方法
|
|
79
|
+
|
|
80
|
+
### 命令行(推荐)
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# 1. 设置 API Key
|
|
84
|
+
export ZHIPU_API_KEY="sk-xxx"
|
|
85
|
+
|
|
86
|
+
# 2. 查看已安装插件
|
|
87
|
+
markitdown --list-plugins
|
|
88
|
+
|
|
89
|
+
# 3. 使用插件转换 PDF
|
|
90
|
+
markitdown -p document.pdf
|
|
91
|
+
|
|
92
|
+
# 4. 保存到文件
|
|
93
|
+
markitdown -p document.pdf -o output.md
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Python API
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from markitdown import MarkItDown
|
|
100
|
+
from markitdown_glmocr import GlmOcrConverter
|
|
101
|
+
|
|
102
|
+
# 方式1:自动从环境变量读取 ZHIPU_API_KEY
|
|
103
|
+
converter = GlmOcrConverter()
|
|
104
|
+
md = MarkItDown(enable_plugins=False)
|
|
105
|
+
md.register_converter(converter, priority=-1.0)
|
|
106
|
+
result = md.convert("document.pdf")
|
|
107
|
+
print(result.markdown)
|
|
108
|
+
|
|
109
|
+
# 方式2:手动传入 API Key
|
|
110
|
+
converter = GlmOcrConverter(api_key="sk-xxx")
|
|
111
|
+
md = MarkItDown(enable_plugins=False)
|
|
112
|
+
md.register_converter(converter, priority=-1.0)
|
|
113
|
+
result = md.convert("document.pdf")
|
|
114
|
+
print(result.markdown)
|
|
115
|
+
|
|
116
|
+
# 方式3:直接使用 glmocr SDK(更简单)
|
|
117
|
+
import glmocr
|
|
118
|
+
result = glmocr.parse("document.pdf")
|
|
119
|
+
print(result.markdown_result) # Markdown 输出
|
|
120
|
+
print(result.json_result) # 结构化 JSON(区域标签、边界框)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 处理结果
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
import glmocr
|
|
127
|
+
|
|
128
|
+
result = glmocr.parse("report.pdf")
|
|
129
|
+
|
|
130
|
+
# 获取 Markdown
|
|
131
|
+
print(result.markdown_result)
|
|
132
|
+
|
|
133
|
+
# 获取结构化数据(按页分组)
|
|
134
|
+
for page_idx, page_regions in enumerate(result.json_result):
|
|
135
|
+
print(f"Page {page_idx + 1}: {len(page_regions)} regions")
|
|
136
|
+
for region in page_regions:
|
|
137
|
+
print(f" [{region['label']}] {region['content'][:60]}")
|
|
138
|
+
|
|
139
|
+
# 按标签筛选
|
|
140
|
+
tables = [r for r in result.json_result[0] if r["label"] == "table"]
|
|
141
|
+
formulas = [r for r in result.json_result[0] if r["label"] == "formula"]
|
|
142
|
+
|
|
143
|
+
# 保存到磁盘
|
|
144
|
+
result.save(output_dir="./output")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## 配置选项
|
|
148
|
+
|
|
149
|
+
### GlmOcrConverter 参数
|
|
150
|
+
|
|
151
|
+
| 参数 | 类型 | 默认值 | 说明 |
|
|
152
|
+
|------|------|--------|------|
|
|
153
|
+
| `api_key` | str | 环境变量 `ZHIPU_API_KEY` | 智谱 API Key |
|
|
154
|
+
| `timeout` | int | 1800 | 请求超时(秒) |
|
|
155
|
+
| `enable_layout` | bool | False | 启用布局检测 |
|
|
156
|
+
| `force_ai` | bool | False | 强制所有页面使用 AI |
|
|
157
|
+
|
|
158
|
+
### 环境变量
|
|
159
|
+
|
|
160
|
+
| 变量 | 说明 | 示例 |
|
|
161
|
+
|------|------|------|
|
|
162
|
+
| `ZHIPU_API_KEY` | API Key(必需) | `sk-abc123` |
|
|
163
|
+
| `GLMOCR_MODEL` | 模型名称 | `glm-ocr` |
|
|
164
|
+
| `GLMOCR_TIMEOUT` | 请求超时(秒) | `600` |
|
|
165
|
+
| `GLMOCR_ENABLE_LAYOUT` | 布局检测 | `true` |
|
|
166
|
+
| `GLMOCR_LOG_LEVEL` | 日志级别 | `INFO` |
|
|
167
|
+
|
|
168
|
+
## 工作原理
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
PDF 输入
|
|
172
|
+
│
|
|
173
|
+
▼
|
|
174
|
+
逐页分析内容类型
|
|
175
|
+
│
|
|
176
|
+
├─ 纯文本页面 ──► pdfplumber 提取文本
|
|
177
|
+
│
|
|
178
|
+
└─ 复杂页面(图片/表格)
|
|
179
|
+
│
|
|
180
|
+
└─► glmocr.parse() 一行调用
|
|
181
|
+
│
|
|
182
|
+
├─ 内置截图渲染
|
|
183
|
+
├─ 内置 base64 编码
|
|
184
|
+
└─ 内置 OCR 识别
|
|
185
|
+
│
|
|
186
|
+
▼
|
|
187
|
+
合并输出完整 Markdown
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## 区域标签(json_result)
|
|
191
|
+
|
|
192
|
+
glmocr SDK 返回的结构化数据支持以下标签:
|
|
193
|
+
|
|
194
|
+
| 标签 | 说明 |
|
|
195
|
+
|------|------|
|
|
196
|
+
| `title` | 标题 |
|
|
197
|
+
| `text` | 正文文本 |
|
|
198
|
+
| `table` | 表格 |
|
|
199
|
+
| `figure` | 图片 |
|
|
200
|
+
| `formula` | 公式 |
|
|
201
|
+
| `header` | 页眉 |
|
|
202
|
+
| `footer` | 页脚 |
|
|
203
|
+
| `page_number` | 页码 |
|
|
204
|
+
| `reference` | 参考文献 |
|
|
205
|
+
| `seal` | 印章 |
|
|
206
|
+
|
|
207
|
+
## 技术架构
|
|
208
|
+
|
|
209
|
+
- **glmocr**: 智谱 OCR SDK,一行代码完成 PDF/图片解析
|
|
210
|
+
- **pdfplumber**: PDF 页面分析和纯文本提取
|
|
211
|
+
- **pdfminer**: 纯文本页面提取备用
|
|
212
|
+
|
|
213
|
+
## 依赖
|
|
214
|
+
|
|
215
|
+
- `markitdown>=0.1.0` - 基础框架
|
|
216
|
+
- `pdfplumber>=0.11.9` - PDF 解析和截图
|
|
217
|
+
- `pdfminer.six>=20251230` - 文本提取备用
|
|
218
|
+
- `Pillow>=9.0.0` - 图像处理
|
|
219
|
+
- `glmocr` - 智谱 OCR SDK(可选,AI 功能需要)
|
|
220
|
+
|
|
221
|
+
## 许可证
|
|
222
|
+
|
|
223
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
markitdown_glmocr/__about__.py,sha256=QTYqXqSTHFRkM9TEgpDFcHvwLbvqHDqvqfQ9EiXkcAM,23
|
|
2
|
+
markitdown_glmocr/__init__.py,sha256=wQfsyeo8WQbF8cARbtIWgYysGJCVXHT9nijia2onuI8,325
|
|
3
|
+
markitdown_glmocr/_config.py,sha256=QASW-I7n7ctCRXMwtCsewyxWSt8YQt8rVfQN75wrC6g,566
|
|
4
|
+
markitdown_glmocr/_converter.py,sha256=cj5gQs_Yi5VHxMdDpBMsvBCnWhyKQO9ZbORCCo3jLVs,9953
|
|
5
|
+
markitdown_glmocr/_plugin.py,sha256=khzqfGLj91eaLlV7g7epQSYiVEy8Jgh9lVTsLw5gNUQ,880
|
|
6
|
+
markitdown_glmocr-0.1.0.dist-info/METADATA,sha256=ZNPudtIkbHaqAvwqJwhZBpmPG7Nf1y9gZ736qGVgolk,6015
|
|
7
|
+
markitdown_glmocr-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
8
|
+
markitdown_glmocr-0.1.0.dist-info/entry_points.txt,sha256=RuVnDvfZHzj1V8wBvEj74xhS5YLWQhtDAgCzbOBXxwY,58
|
|
9
|
+
markitdown_glmocr-0.1.0.dist-info/RECORD,,
|