markitdown-glmocr 0.1.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -171,3 +171,4 @@ src/.DS_Store
171
171
  .secrets.local
172
172
  *.secrets
173
173
  .env.local
174
+ test-data/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: markitdown-glmocr
3
- Version: 0.1.0
3
+ Version: 0.2.2
4
4
  Summary: Intelligent PDF to Markdown converter using glmocr SDK
5
5
  Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
6
6
  Project-URL: Issues, https://github.com/microsoft/markitdown/issues
@@ -218,6 +218,93 @@ glmocr SDK 返回的结构化数据支持以下标签:
218
218
  - `Pillow>=9.0.0` - 图像处理
219
219
  - `glmocr` - 智谱 OCR SDK(可选,AI 功能需要)
220
220
 
221
+ ## 发布到 PyPI
222
+
223
+ ### 前置条件
224
+
225
+ 1. 安装构建工具:
226
+
227
+ ```bash
228
+ pip install build twine hatch
229
+ ```
230
+
231
+ 2. 配置 PyPI API Token(Windows 用户环境变量):
232
+
233
+ ```powershell
234
+ # PowerShell 设置用户环境变量
235
+ [System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
236
+ ```
237
+
238
+ 或在 Bash/Zsh 中:
239
+
240
+ ```bash
241
+ export PYPI_API_TOKEN="pypi-..."
242
+ ```
243
+
244
+ ### 快速发布(推荐)
245
+
246
+ 项目根目录提供了上传脚本,可一键发布两个插件:
247
+
248
+ **Bash / Git Bash:**
249
+ ```bash
250
+ # 构建两个插件
251
+ cd packages/markitdown-glmocr && hatch build
252
+
253
+ cd ../markitdown-paddleocr && hatch build
254
+
255
+ # 上传(自动上传所有构建的版本)
256
+ cd ../..
257
+ ./scripts/pypi-upload.sh
258
+
259
+ # 或指定版本号
260
+ ./scripts/pypi-upload.sh 0.2.0
261
+ ```
262
+
263
+ **PowerShell:**
264
+ ```powershell
265
+ # 构建两个插件
266
+ cd packages/markitdown-glmocr; hatch build
267
+ cd ../markitdown-paddleocr; hatch build
268
+
269
+ # 上传
270
+ cd ../..
271
+ .\scripts\pypi-upload.ps1
272
+
273
+ # 或指定版本号
274
+ .\scripts\pypi-upload.ps1 -Version "0.2.0"
275
+ ```
276
+
277
+ ### 手动发布
278
+
279
+ ```bash
280
+ # 1. 进入项目目录
281
+ cd packages/markitdown-glmocr
282
+
283
+ # 2. 构建
284
+ hatch build
285
+
286
+ # 3. 检查
287
+ twine check dist/*
288
+
289
+ # 4. 上传
290
+ twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
291
+ ```
292
+
293
+ ### 发布到 TestPyPI(测试)
294
+
295
+ ```bash
296
+ twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
297
+
298
+ # 从 TestPyPI 安装验证
299
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ markitdown-glmocr
300
+ ```
301
+
302
+ ### 注意事项
303
+
304
+ - 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
305
+ - 同一版本号不能重复上传,如需修正必须 bump 版本号
306
+ - `PYPI_API_TOKEN` 切勿提交到代码仓库
307
+
221
308
  ## 许可证
222
309
 
223
310
  MIT
@@ -191,6 +191,93 @@ glmocr SDK 返回的结构化数据支持以下标签:
191
191
  - `Pillow>=9.0.0` - 图像处理
192
192
  - `glmocr` - 智谱 OCR SDK(可选,AI 功能需要)
193
193
 
194
+ ## 发布到 PyPI
195
+
196
+ ### 前置条件
197
+
198
+ 1. 安装构建工具:
199
+
200
+ ```bash
201
+ pip install build twine hatch
202
+ ```
203
+
204
+ 2. 配置 PyPI API Token(Windows 用户环境变量):
205
+
206
+ ```powershell
207
+ # PowerShell 设置用户环境变量
208
+ [System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
209
+ ```
210
+
211
+ 或在 Bash/Zsh 中:
212
+
213
+ ```bash
214
+ export PYPI_API_TOKEN="pypi-..."
215
+ ```
216
+
217
+ ### 快速发布(推荐)
218
+
219
+ 项目根目录提供了上传脚本,可一键发布两个插件:
220
+
221
+ **Bash / Git Bash:**
222
+ ```bash
223
+ # 构建两个插件
224
+ cd packages/markitdown-glmocr && hatch build
225
+
226
+ cd ../markitdown-paddleocr && hatch build
227
+
228
+ # 上传(自动上传所有构建的版本)
229
+ cd ../..
230
+ ./scripts/pypi-upload.sh
231
+
232
+ # 或指定版本号
233
+ ./scripts/pypi-upload.sh 0.2.0
234
+ ```
235
+
236
+ **PowerShell:**
237
+ ```powershell
238
+ # 构建两个插件
239
+ cd packages/markitdown-glmocr; hatch build
240
+ cd ../markitdown-paddleocr; hatch build
241
+
242
+ # 上传
243
+ cd ../..
244
+ .\scripts\pypi-upload.ps1
245
+
246
+ # 或指定版本号
247
+ .\scripts\pypi-upload.ps1 -Version "0.2.0"
248
+ ```
249
+
250
+ ### 手动发布
251
+
252
+ ```bash
253
+ # 1. 进入项目目录
254
+ cd packages/markitdown-glmocr
255
+
256
+ # 2. 构建
257
+ hatch build
258
+
259
+ # 3. 检查
260
+ twine check dist/*
261
+
262
+ # 4. 上传
263
+ twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
264
+ ```
265
+
266
+ ### 发布到 TestPyPI(测试)
267
+
268
+ ```bash
269
+ twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
270
+
271
+ # 从 TestPyPI 安装验证
272
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ markitdown-glmocr
273
+ ```
274
+
275
+ ### 注意事项
276
+
277
+ - 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
278
+ - 同一版本号不能重复上传,如需修正必须 bump 版本号
279
+ - `PYPI_API_TOKEN` 切勿提交到代码仓库
280
+
194
281
  ## 许可证
195
282
 
196
283
  MIT
@@ -0,0 +1 @@
1
+ __version__ = "0.2.2"
@@ -0,0 +1,43 @@
1
+ """Configuration for markitdown-glmocr."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+
6
+
7
class ScanDetectionMode(str, Enum):
    """Strategy for deciding whether a whole PDF is treated as scanned.

    - PAGE_BY_PAGE: never short-circuit; every page is analysed on its own.
    - FIRST_PAGE_HINT: if the first page looks scanned, OCR the whole document.
    - SAMPLING: inspect the first N pages; if the majority look scanned,
      OCR the whole document. This is the default used by ``GlmOcrConfig``
      and ``GlmOcrConverter``.

    Members are also ``str`` instances, so they compare equal to their
    plain string values (e.g. ``"sampling"``).
    """

    PAGE_BY_PAGE = "page_by_page"
    FIRST_PAGE_HINT = "first_page_hint"
    SAMPLING = "sampling"
17
+
18
+
19
@dataclass
class GlmOcrConfig:
    """Plain value container for markitdown-glmocr settings.

    NOTE(review): the intended resolution order is constructor kwargs >
    environment variables > .env file > built-in defaults. This dataclass
    only stores values; it does not read the environment itself, so that
    order must be implemented by whoever builds the instance - confirm.
    """

    # --- API ---
    # An empty string means "no explicit key". The converter then omits
    # ``api_key`` when creating the SDK client (the SDK presumably falls
    # back to ZHIPU_API_KEY - confirm against the glmocr documentation).
    api_key: str = ""

    # --- OCR ---
    timeout: int = 1800  # passed to the SDK client as ``timeout``
    enable_layout: bool = False  # passed to the SDK client as ``enable_layout``

    # --- Processing strategy ---
    force_ai: bool = False  # send every PDF page through OCR

    # --- Scanned-document detection ---
    scan_detection_mode: ScanDetectionMode = ScanDetectionMode.SAMPLING
    scan_sample_pages: int = 3  # pages inspected in SAMPLING mode
    scan_text_threshold: int = 50  # text length at/above which a page is not a scan
@@ -0,0 +1,551 @@
1
+ """GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion."""
2
+
3
+ import io
4
+ import logging
5
+ import sys
6
+ from typing import Any, BinaryIO, Optional
7
+
8
+ from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
9
+ from markitdown._exceptions import (
10
+ MISSING_DEPENDENCY_MESSAGE,
11
+ MissingDependencyException,
12
+ )
13
+
14
+ from ._config import GlmOcrConfig, ScanDetectionMode
15
+
16
+ # Import dependencies
17
+ _dependency_exc_info = None
18
+ try:
19
+ import pdfminer
20
+ import pdfminer.high_level
21
+ import pdfplumber
22
+ except ImportError:
23
+ _dependency_exc_info = sys.exc_info()
24
+
25
+ # glmocr SDK
26
+ try:
27
+ import glmocr
28
+ from glmocr import GlmOcr
29
+ except ImportError:
30
+ glmocr = None
31
+ GlmOcr = None
32
+
33
+
34
+ ACCEPTED_MIME_TYPE_PREFIXES = [
35
+ "application/pdf",
36
+ "application/x-pdf",
37
+ "image/jpeg",
38
+ "image/png",
39
+ ]
40
+
41
+ ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
42
+
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ class GlmOcrConverter(DocumentConverter):
48
+ """
49
+ Intelligent PDF/Image converter using glmocr SDK.
50
+
51
+ Features:
52
+ - Auto-detect page content type (plain text vs images/tables)
53
+ - Plain text pages use pdfplumber/pdfminer (fast, free)
54
+ - Complex pages use glmocr SDK for AI-powered OCR
55
+ - Image files (PNG, JPG) use glmocr SDK directly
56
+ - One-liner: glmocr.parse("document.pdf") handles everything
57
+ """
58
+
59
def __init__(
    self,
    api_key: Optional[str] = None,
    timeout: int = 1800,
    enable_layout: bool = False,
    force_ai: bool = False,
    scan_detection_mode: Optional[ScanDetectionMode] = None,
    scan_sample_pages: Optional[int] = None,
    scan_text_threshold: Optional[int] = None,
    config: Optional[GlmOcrConfig] = None,
):
    """
    Set up the converter; the SDK client itself is created lazily.

    Args:
        api_key: Zhipu API key. When falsy, no key is passed to the SDK.
        timeout: Request timeout in seconds (default: 1800).
        enable_layout: Enable layout detection (default: False).
        force_ai: Send every PDF page through OCR (default: False).
        scan_detection_mode: How to decide that a whole PDF is scanned
            (default: SAMPLING, or the value from ``config``).
        scan_sample_pages: Pages inspected in SAMPLING mode (default: 3).
        scan_text_threshold: Extracted-text length at or above which a
            page is not considered a scan (default: 50).
        config: Optional GlmOcrConfig supplying fallbacks. Note that an
            argument left at its signature default (timeout=1800,
            enable_layout=False, force_ai=False) is indistinguishable
            from "not given", so the config value wins in that case.

    Raises:
        ImportError: If the glmocr SDK could not be imported.
    """
    if glmocr is None:
        raise ImportError(
            "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]"
        )

    if config:
        # Explicit arguments take precedence; config fills the gaps.
        self.api_key = api_key or config.api_key
        self.timeout = config.timeout if timeout == 1800 else timeout
        self.enable_layout = enable_layout or config.enable_layout
        self.force_ai = force_ai or config.force_ai
        if scan_detection_mode is None:
            scan_detection_mode = config.scan_detection_mode
        if scan_sample_pages is None:
            scan_sample_pages = config.scan_sample_pages
        if scan_text_threshold is None:
            scan_text_threshold = config.scan_text_threshold
    else:
        # No config object: fall back to the built-in defaults.
        self.api_key = api_key
        self.timeout = timeout
        self.enable_layout = enable_layout
        self.force_ai = force_ai
        if scan_detection_mode is None:
            scan_detection_mode = ScanDetectionMode.SAMPLING
        if scan_sample_pages is None:
            scan_sample_pages = 3
        if scan_text_threshold is None:
            scan_text_threshold = 50

    self.scan_detection_mode = scan_detection_mode
    self.scan_sample_pages = scan_sample_pages
    self.scan_text_threshold = scan_text_threshold

    # Created on first use by _get_glmocr().
    self._glmocr: Optional[GlmOcr] = None
130
+
131
def _get_glmocr(self) -> GlmOcr:
    """Return the cached SDK client, constructing it on first call."""
    client = self._glmocr
    if client is None:
        options = {"timeout": self.timeout, "enable_layout": self.enable_layout}
        # Only pass a key when one was configured.
        if self.api_key:
            options["api_key"] = self.api_key
        client = GlmOcr(**options)
        self._glmocr = client
    return client
139
+
140
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Return True when the stream's extension or MIME type is supported.

    The extension is matched exactly against ACCEPTED_FILE_EXTENSIONS;
    the MIME type is matched by prefix against ACCEPTED_MIME_TYPE_PREFIXES.
    Both comparisons are case-insensitive.
    """
    content_type = (stream_info.mimetype or "").lower()
    suffix = (stream_info.extension or "").lower()

    if suffix in ACCEPTED_FILE_EXTENSIONS:
        return True

    return any(
        content_type.startswith(candidate)
        for candidate in ACCEPTED_MIME_TYPE_PREFIXES
    )
157
+
158
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PDF or image stream to Markdown.

    Images are sent straight to the glmocr SDK; everything else is handled
    by the hybrid PDF pipeline.

    Args:
        file_stream: Binary stream positioned at the start of the document.
        stream_info: Extension / MIME type metadata for the stream.
        **kwargs: Unused; accepted for interface compatibility.

    Returns:
        The conversion result wrapping the Markdown text.

    Raises:
        MissingDependencyException: If the PDF dependencies failed to import.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])

    extension = (stream_info.extension or "").lower()
    mimetype = (stream_info.mimetype or "").lower()

    logger.info("GlmOcrConverter: 开始转换, 文件类型=%s", extension)

    # accepts() admits a stream on MIME type alone, so an image may arrive
    # without a recognised extension. Trust the extension when it is one we
    # know; otherwise fall back to the MIME type instead of sending image
    # bytes to the PDF parser.
    is_image = extension in (".jpg", ".jpeg", ".png") or (
        extension != ".pdf" and mimetype.startswith(("image/jpeg", "image/png"))
    )

    if is_image:
        # The second argument is only used for logging.
        return self._convert_image(file_stream, extension or mimetype)

    # PDF files: use hybrid approach
    return self._convert_pdf(file_stream)
183
+
184
def _convert_image(
    self, file_stream: BinaryIO, extension: str = ".png"
) -> DocumentConverterResult:
    """Run OCR on a single image and wrap the Markdown result.

    Args:
        file_stream: Binary stream holding the image bytes.
        extension: Image format label; used only in log messages.

    Raises:
        RuntimeError: If the SDK result dictionary contains an "error" key.
        Exception: Anything raised by the SDK call is logged and re-raised.
    """
    payload = file_stream.read()

    logger.info("GlmOcrConverter: 开始 OCR 识别图片, 格式=%s", extension)
    try:
        ocr_result = self._get_glmocr().parse(payload)
    except Exception as exc:
        logger.error(
            "GlmOcrConverter: 图片 OCR 识别异常, 格式=%s, 错误=%s", extension, exc
        )
        raise

    # The SDK reports some failures in-band rather than by raising.
    as_dict = ocr_result.to_dict()
    if "error" in as_dict:
        failure = as_dict["error"]
        logger.error(
            "GlmOcrConverter: 图片 OCR 返回错误, 格式=%s, 错误=%s",
            extension,
            failure,
        )
        raise RuntimeError(f"GlmOcrConverter: glmocr SDK returned error: {failure}")

    text = ocr_result.markdown_result or ""
    logger.info("GlmOcrConverter: 图片 OCR 识别完成, 输出长度=%d", len(text))
    return DocumentConverterResult(markdown=text)
214
+
215
def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
    """Convert a PDF using text extraction where possible and OCR otherwise.

    Flow:
      1. If the document is detected as fully scanned (and ``force_ai`` is
         off), upload the whole PDF in one SDK call. A non-empty result is
         returned immediately; an SDK failure is re-raised.
      2. Otherwise (or when the batch result was empty) process page by
         page: OCR for every page when ``force_ai`` or the scanned flag is
         set, else OCR only for pages classified as non plain text.

    Args:
        file_stream: Binary stream holding the PDF.

    Returns:
        Result whose Markdown has one "## Page N" section per non-empty page
        (or the raw batch OCR output).

    Raises:
        Exception: OCR failures are logged and propagated so the caller can
            fall back to another converter.
    """
    pdf_bytes = file_stream.read()  # kept for the single-call batch upload
    pdf_stream = io.BytesIO(pdf_bytes)
    markdown_parts = []

    with pdfplumber.open(pdf_stream) as pdf:
        total_pages = len(pdf.pages)
        logger.info("GlmOcrConverter: 开始处理 PDF, 总页数=%d", total_pages)

        # Optimization: detect if entire PDF is scanned
        all_scanned = self._detect_all_scanned(pdf)

        if all_scanned and not self.force_ai:
            # Batch mode: upload entire PDF to glmocr SDK (single API call)
            logger.info(
                "GlmOcrConverter: 全文档扫描模式, 批量上传PDF, 页数=%d",
                total_pages,
            )
            try:
                markdown = self._convert_pdf_batch(pdf_bytes)
            except Exception as e:
                logger.error(
                    "GlmOcrConverter: 批量OCR失败, 抛出异常让框架fallback到下一个converter, 错误=%s",
                    e,
                )
                raise
            if markdown.strip():
                logger.info(
                    "GlmOcrConverter: 批量OCR完成, 输出长度=%d",
                    len(markdown),
                )
                return DocumentConverterResult(markdown=markdown)
            # Empty batch output: fall through to per-page OCR below.

        for page_num, page in enumerate(pdf.pages):
            page_no = page_num + 1

            if self.force_ai or all_scanned:
                # OCR unconditionally; no per-page classification needed.
                page_type = None
                use_ocr = True
            else:
                page_type = self._analyze_page(page)
                use_ocr = page_type != "plain_text"

            if use_ocr:
                if page_type is None:
                    logger.info(
                        "GlmOcrConverter: 第 %d/%d 页, 使用 glmocr OCR",
                        page_no,
                        total_pages,
                    )
                else:
                    logger.info(
                        "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 glmocr OCR",
                        page_no,
                        total_pages,
                        page_type,
                    )
                try:
                    markdown = self._convert_with_glmocr(page, page_num)
                except Exception as e:
                    # The format has three placeholders, so total_pages must
                    # be supplied or the log record fails to format.
                    logger.error(
                        "GlmOcrConverter: 第 %d/%d 页识别异常, 错误=%s",
                        page_no,
                        total_pages,
                        e,
                    )
                    raise
            else:
                logger.info(
                    "GlmOcrConverter: 第 %d/%d 页, 类型=%s, 使用 pdfplumber",
                    page_no,
                    total_pages,
                    page_type,
                )
                markdown = self._extract_text_with_tables(page)

            if markdown.strip():
                markdown_parts.append(f"## Page {page_no}\n\n{markdown}")

            page.close()

    markdown = "\n\n".join(markdown_parts).strip()
    logger.info("GlmOcrConverter: PDF 转换完成, 输出长度=%d", len(markdown))
    return DocumentConverterResult(markdown=markdown)
304
+
305
def _convert_pdf_batch(self, pdf_bytes: bytes) -> str:
    """Send the whole PDF to the SDK in one call and return its Markdown.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        The SDK's Markdown output, or "" when it produced none.

    Raises:
        RuntimeError: If the SDK result dictionary contains an "error" key.
    """
    logger.info(
        "GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes)
    )
    ocr_result = self._get_glmocr().parse(pdf_bytes)

    # In-band error reporting from the SDK.
    as_dict = ocr_result.to_dict()
    if "error" in as_dict:
        failure = as_dict["error"]
        logger.error(
            "GlmOcrConverter: 批量OCR返回错误, 错误=%s",
            failure,
        )
        raise RuntimeError(f"GlmOcrConverter: glmocr SDK batch OCR error: {failure}")

    return ocr_result.markdown_result or ""
334
+
335
+ def _analyze_page(self, page: Any) -> str:
336
+ """Analyze page content type."""
337
+ # Check for images
338
+ if hasattr(page, "images") and page.images:
339
+ return "complex"
340
+
341
+ # Check for tables
342
+ tables = page.find_tables()
343
+ if tables:
344
+ return "complex"
345
+
346
+ # Check for graphics/curves
347
+ if hasattr(page, "curves") and page.curves:
348
+ return "complex"
349
+
350
+ return "plain_text"
351
+
352
+ def _is_scanned_page(self, page: Any) -> bool:
353
+ """Check if a page is likely a scanned image.
354
+
355
+ A page is considered scanned if:
356
+ 1. It contains images, AND
357
+ 2. It has very little extractable text (below threshold)
358
+
359
+ Args:
360
+ page: pdfplumber page object
361
+
362
+ Returns:
363
+ True if the page appears to be a scanned image
364
+ """
365
+ # Must have images to be a scan
366
+ has_images = hasattr(page, "images") and bool(page.images)
367
+ if not has_images:
368
+ return False
369
+
370
+ # Check extractable text length
371
+ try:
372
+ text = page.extract_text() or ""
373
+ text_len = len(text.strip())
374
+ # If there's substantial text, it might be a mixed page or
375
+ # a digital PDF with embedded images
376
+ if text_len >= self.scan_text_threshold:
377
+ return False
378
+ except Exception:
379
+ # If text extraction fails, assume it's a scan
380
+ return True
381
+
382
+ return True
383
+
384
def _detect_all_scanned(self, pdf: Any) -> bool:
    """Decide whether the whole PDF should be treated as scanned.

    Behaviour depends on ``self.scan_detection_mode``:
      - PAGE_BY_PAGE: always False.
      - FIRST_PAGE_HINT: True when the first page looks scanned.
      - SAMPLING: True when a strict majority of the first
        ``scan_sample_pages`` pages look scanned.

    Args:
        pdf: pdfplumber PDF object.

    Returns:
        True if the entire PDF should be treated as scanned.
    """
    mode = self.scan_detection_mode
    if mode == ScanDetectionMode.PAGE_BY_PAGE:
        return False

    pages = pdf.pages
    total_pages = len(pages)
    if total_pages == 0:
        return False

    if mode == ScanDetectionMode.FIRST_PAGE_HINT:
        # Check only first page
        first_page = pages[0]
        is_scanned = self._is_scanned_page(first_page)
        first_page.close()
        if is_scanned:
            logger.info(
                "GlmOcrConverter: 首页检测为扫描件, 模式=FIRST_PAGE_HINT, 全文档使用OCR"
            )
        return is_scanned

    if mode == ScanDetectionMode.SAMPLING:
        sample_count = min(self.scan_sample_pages, total_pages)
        # A zero or negative sample size inspects nothing. Without this
        # guard a negative value makes the majority threshold 0, and
        # "0 scanned >= 0" would wrongly mark the whole PDF as scanned.
        if sample_count <= 0:
            return False

        scanned_count = 0
        for i in range(sample_count):
            if self._is_scanned_page(pages[i]):
                scanned_count += 1

        # Strict majority of the sampled pages.
        majority_threshold = sample_count // 2 + 1
        all_scanned = scanned_count >= majority_threshold

        if all_scanned:
            logger.info(
                "GlmOcrConverter: 抽样检测 %d/%d 页为扫描件, 模式=SAMPLING, 全文档使用OCR",
                scanned_count,
                sample_count,
            )

        return all_scanned

    # Unknown mode: behave like PAGE_BY_PAGE.
    return False
438
+
439
def _convert_with_glmocr(self, page: Any, page_num: int) -> str:
    """OCR a single PDF page by rendering it to PNG and calling the SDK.

    Args:
        page: pdfplumber page object.
        page_num: Zero-based page index (logged as ``page_num + 1``).

    Returns:
        The SDK's Markdown for the page, or "" when it produced none.

    Raises:
        RuntimeError: If the SDK result dictionary contains an "error" key.
        Exception: Anything raised by the SDK call is logged and re-raised.
    """
    human_page = page_num + 1

    # Rasterise the page at 150 DPI into an in-memory PNG.
    rendered = page.to_image(resolution=150)
    png_buffer = io.BytesIO()
    rendered.save(png_buffer, format="PNG")

    logger.info("GlmOcrConverter: glmocr SDK 开始识别第 %d 页", human_page)
    try:
        ocr_result = self._get_glmocr().parse(png_buffer.getvalue())
    except Exception as exc:
        logger.error(
            "GlmOcrConverter: glmocr SDK 第 %d 页识别异常, 错误=%s", human_page, exc
        )
        raise

    # In-band error reporting from the SDK.
    as_dict = ocr_result.to_dict()
    if "error" in as_dict:
        failure = as_dict["error"]
        logger.error(
            "GlmOcrConverter: glmocr SDK 第 %d 页返回错误, 错误=%s",
            human_page,
            failure,
        )
        raise RuntimeError(
            f"GlmOcrConverter: glmocr SDK returned error on page {human_page}: {failure}"
        )

    text = ocr_result.markdown_result or ""
    logger.info(
        "GlmOcrConverter: glmocr SDK 第 %d 页识别完成, 输出长度=%d",
        human_page,
        len(text),
    )
    return text
477
+
478
+ def _extract_text_with_tables(self, page: Any) -> str:
479
+ """Extract text and tables from page."""
480
+ parts = []
481
+
482
+ # Extract text
483
+ text = page.extract_text() or ""
484
+ if text.strip():
485
+ parts.append(text.strip())
486
+
487
+ # Extract tables
488
+ try:
489
+ tables = page.extract_tables()
490
+ if tables:
491
+ for table in tables:
492
+ if table:
493
+ md_table = self._table_to_markdown(table)
494
+ if md_table.strip():
495
+ parts.append(md_table)
496
+ except Exception:
497
+ pass
498
+
499
+ return "\n\n".join(parts)
500
+
501
+ def _table_to_markdown(self, table: list[list[str]]) -> str:
502
+ """Convert table to Markdown."""
503
+ if not table:
504
+ return ""
505
+
506
+ # Filter None values
507
+ table = [[cell if cell is not None else "" for cell in row] for row in table]
508
+
509
+ # Filter empty rows
510
+ table = [row for row in table if any(cell.strip() for cell in row)]
511
+
512
+ if not table:
513
+ return ""
514
+
515
+ # Calculate column widths
516
+ col_widths = [
517
+ max(len(str(row[i])) if i < len(row) else 0 for row in table)
518
+ for i in range(max(len(row) for row in table))
519
+ ]
520
+
521
+ # Format table
522
+ lines = []
523
+ for row_idx, row in enumerate(table):
524
+ padded_row = row + [""] * (len(col_widths) - len(row))
525
+ line = (
526
+ "| "
527
+ + " | ".join(
528
+ str(cell).ljust(width)
529
+ for cell, width in zip(padded_row, col_widths)
530
+ )
531
+ + " |"
532
+ )
533
+ lines.append(line)
534
+
535
+ if row_idx == 0:
536
+ sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
537
+ lines.append(sep)
538
+
539
+ return "\n".join(lines)
540
+
541
def close(self):
    """Close the SDK client, if one was created, and forget it."""
    if not self._glmocr:
        return
    self._glmocr.close()
    self._glmocr = None
546
+
547
+ def __enter__(self):
548
+ return self
549
+
550
+ def __exit__(self, exc_type, exc_val, exc_tb):
551
+ self.close()
@@ -1,33 +1,46 @@
1
1
  """Plugin registration for markitdown-glmocr."""
2
2
 
3
+ import logging
3
4
  from typing import Any
5
+
4
6
  from markitdown import MarkItDown
5
7
 
6
8
  from ._converter import GlmOcrConverter
7
9
 
8
-
9
10
  __plugin_interface_version__ = 1
10
11
 
12
+ logger = logging.getLogger(__name__)
13
+
11
14
 
12
15
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """
    Register the GlmOcrConverter with a MarkItDown instance.

    Recognised kwargs (all optional): ``api_key``, ``timeout``,
    ``enable_layout``, ``force_ai``, ``scan_detection_mode``,
    ``scan_sample_pages``, ``scan_text_threshold``. Missing scan options are
    passed as None so the converter applies its own defaults.

    NOTE(review): the intended config priority is kwargs > environment
    variables (ZHIPU_API_KEY) > .env file > built-in defaults; only the
    kwargs layer is handled here.

    Raises:
        Exception: Any failure while constructing or registering the
            converter is logged and re-raised.
    """
    logger.info("markitdown-glmocr: 开始注册插件")

    # Priority -1.0: same level as PaddleOcrConverter,
    # the upper-level agent's skills control which plugin to call first.
    PRIORITY_GLMOCR = -1.0

    try:
        converter = GlmOcrConverter(
            api_key=kwargs.get("api_key"),
            timeout=kwargs.get("timeout", 1800),
            enable_layout=kwargs.get("enable_layout", False),
            force_ai=kwargs.get("force_ai", False),
            # Forward the scan-detection options so they can be configured
            # through the plugin entry point as well.
            scan_detection_mode=kwargs.get("scan_detection_mode"),
            scan_sample_pages=kwargs.get("scan_sample_pages"),
            scan_text_threshold=kwargs.get("scan_text_threshold"),
        )
        markitdown.register_converter(
            converter,
            priority=PRIORITY_GLMOCR,
        )
        logger.info("markitdown-glmocr: 插件注册成功, priority=%.1f", PRIORITY_GLMOCR)
    except Exception as e:
        logger.error("markitdown-glmocr: 插件注册失败, 错误=%s", e)
        raise
@@ -1 +0,0 @@
1
- __version__ = "0.1.0"
@@ -1,25 +0,0 @@
1
- """Configuration for markitdown-glmocr."""
2
-
3
- from dataclasses import dataclass, field
4
-
5
-
6
- @dataclass
7
- class GlmOcrConfig:
8
- """markitdown-glmocr configuration.
9
-
10
- Configuration priority (high to low):
11
- 1. Constructor kwargs
12
- 2. Environment variables
13
- 3. .env file
14
- 4. Built-in defaults
15
- """
16
-
17
- # API configuration
18
- api_key: str = "" # Reads from ZHIPU_API_KEY by default
19
-
20
- # OCR configuration
21
- timeout: int = 1800
22
- enable_layout: bool = False
23
-
24
- # Processing strategy
25
- force_ai: bool = False
@@ -1,304 +0,0 @@
1
- """GlmOcr PDF/Image Converter - Intelligent PDF and Image to Markdown conversion."""
2
-
3
- import io
4
- import sys
5
- from typing import Any, BinaryIO, Optional
6
-
7
- from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
8
- from markitdown._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
9
-
10
- from ._config import GlmOcrConfig
11
-
12
- # Import dependencies
13
- _dependency_exc_info = None
14
- try:
15
- import pdfminer
16
- import pdfminer.high_level
17
- import pdfplumber
18
- except ImportError:
19
- _dependency_exc_info = sys.exc_info()
20
-
21
- # glmocr SDK
22
- try:
23
- import glmocr
24
- from glmocr import GlmOcr
25
- except ImportError:
26
- glmocr = None
27
- GlmOcr = None
28
-
29
-
30
- ACCEPTED_MIME_TYPE_PREFIXES = [
31
- "application/pdf",
32
- "application/x-pdf",
33
- "image/jpeg",
34
- "image/png",
35
- ]
36
-
37
- ACCEPTED_FILE_EXTENSIONS = [".pdf", ".jpg", ".jpeg", ".png"]
38
-
39
-
40
- class GlmOcrConverter(DocumentConverter):
41
- """
42
- Intelligent PDF/Image converter using glmocr SDK.
43
-
44
- Features:
45
- - Auto-detect page content type (plain text vs images/tables)
46
- - Plain text pages use pdfplumber/pdfminer (fast, free)
47
- - Complex pages use glmocr SDK for AI-powered OCR
48
- - Image files (PNG, JPG) use glmocr SDK directly
49
- - One-liner: glmocr.parse("document.pdf") handles everything
50
- """
51
-
52
- def __init__(
53
- self,
54
- api_key: Optional[str] = None,
55
- timeout: int = 1800,
56
- enable_layout: bool = False,
57
- force_ai: bool = False,
58
- config: Optional[GlmOcrConfig] = None,
59
- ):
60
- """
61
- Initialize converter.
62
-
63
- Args:
64
- api_key: Zhipu API key (reads from ZHIPU_API_KEY env var if not provided)
65
- timeout: Request timeout in seconds (default: 1800)
66
- enable_layout: Enable layout detection (default: False)
67
- force_ai: Force all pages to use AI (default: False)
68
- config: Optional GlmOcrConfig instance
69
- """
70
- if glmocr is None:
71
- raise ImportError(
72
- "glmocr is required. Install with: pip install markitdown-glmocr[glmocr]"
73
- )
74
-
75
- # Use config if provided
76
- if config:
77
- self.api_key = api_key or config.api_key
78
- self.timeout = timeout if timeout != 1800 else config.timeout
79
- self.enable_layout = enable_layout if enable_layout else config.enable_layout
80
- self.force_ai = force_ai or config.force_ai
81
- else:
82
- self.api_key = api_key
83
- self.timeout = timeout
84
- self.enable_layout = enable_layout
85
- self.force_ai = force_ai
86
-
87
- # Lazy init GlmOcr instance
88
- self._glmocr: Optional[GlmOcr] = None
89
-
90
- def _get_glmocr(self) -> GlmOcr:
91
- """Get or create GlmOcr instance."""
92
- if self._glmocr is None:
93
- kwargs = {"timeout": self.timeout, "enable_layout": self.enable_layout}
94
- if self.api_key:
95
- kwargs["api_key"] = self.api_key
96
- self._glmocr = GlmOcr(**kwargs)
97
- return self._glmocr
98
-
99
- def accepts(
100
- self,
101
- file_stream: BinaryIO,
102
- stream_info: StreamInfo,
103
- **kwargs: Any,
104
- ) -> bool:
105
- mimetype = (stream_info.mimetype or "").lower()
106
- extension = (stream_info.extension or "").lower()
107
-
108
- if extension in ACCEPTED_FILE_EXTENSIONS:
109
- return True
110
-
111
- for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
112
- if mimetype.startswith(prefix):
113
- return True
114
-
115
- return False
116
-
117
- def convert(
118
- self,
119
- file_stream: BinaryIO,
120
- stream_info: StreamInfo,
121
- **kwargs: Any,
122
- ) -> DocumentConverterResult:
123
- if _dependency_exc_info is not None:
124
- raise MissingDependencyException(
125
- MISSING_DEPENDENCY_MESSAGE.format(
126
- converter=type(self).__name__,
127
- extension=".pdf",
128
- feature="pdf",
129
- )
130
- ) from _dependency_exc_info[1].with_traceback(
131
- _dependency_exc_info[2]
132
- )
133
-
134
- extension = (stream_info.extension or "").lower()
135
-
136
- # Image files: use glmocr directly
137
- if extension in (".jpg", ".jpeg", ".png"):
138
- return self._convert_image(file_stream, extension)
139
-
140
- # PDF files: use hybrid approach
141
- return self._convert_pdf(file_stream)
142
-
143
- def _convert_image(self, file_stream: BinaryIO, extension: str = ".png") -> DocumentConverterResult:
144
- """Convert image file using glmocr SDK."""
145
- img_bytes = file_stream.read()
146
-
147
- try:
148
- result = self._get_glmocr().parse(img_bytes)
149
-
150
- # Check for errors
151
- d = result.to_dict()
152
- if "error" in d:
153
- return DocumentConverterResult(markdown="")
154
-
155
- return DocumentConverterResult(
156
- markdown=result.markdown_result or ""
157
- )
158
- except Exception as e:
159
- return DocumentConverterResult(
160
- markdown=f"<!-- Error converting image: {e} -->"
161
- )
162
-
163
- def _convert_pdf(self, file_stream: BinaryIO) -> DocumentConverterResult:
164
- pdf_stream = io.BytesIO(file_stream.read())
165
- markdown_parts = []
166
-
167
- try:
168
- with pdfplumber.open(pdf_stream) as pdf:
169
- for page_num, page in enumerate(pdf.pages):
170
- # Analyze page type
171
- page_type = self._analyze_page(page)
172
-
173
- # Choose processing method
174
- if self.force_ai or page_type != "plain_text":
175
- # Complex content: use glmocr
176
- markdown = self._convert_with_glmocr(page, page_num)
177
- else:
178
- # Plain text: use pdfplumber
179
- markdown = self._extract_text_with_tables(page)
180
-
181
- if markdown.strip():
182
- markdown_parts.append(f"## Page {page_num + 1}\n\n{markdown}")
183
-
184
- page.close()
185
-
186
- markdown = "\n\n".join(markdown_parts).strip()
187
-
188
- except Exception:
189
- # Fallback to pdfminer
190
- pdf_stream.seek(0)
191
- markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
192
-
193
- # Final fallback
194
- if not markdown:
195
- pdf_stream.seek(0)
196
- markdown = pdfminer.high_level.extract_text(pdf_stream) or ""
197
-
198
- return DocumentConverterResult(markdown=markdown)
199
-
200
- def _analyze_page(self, page: Any) -> str:
201
- """Analyze page content type."""
202
- # Check for images
203
- if hasattr(page, "images") and page.images:
204
- return "complex"
205
-
206
- # Check for tables
207
- tables = page.find_tables()
208
- if tables:
209
- return "complex"
210
-
211
- # Check for graphics/curves
212
- if hasattr(page, "curves") and page.curves:
213
- return "complex"
214
-
215
- return "plain_text"
216
-
217
- def _convert_with_glmocr(self, page: Any, page_num: int) -> str:
218
- """Convert page using glmocr SDK."""
219
- try:
220
- # Render page to image
221
- img = page.to_image(resolution=150)
222
- img_bytes = io.BytesIO()
223
- img.save(img_bytes, format="PNG")
224
- result = self._get_glmocr().parse(img_bytes.getvalue())
225
-
226
- # Check for errors
227
- d = result.to_dict()
228
- if "error" in d:
229
- return self._extract_text_with_tables(page)
230
-
231
- return result.markdown_result or ""
232
-
233
- except Exception:
234
- return self._extract_text_with_tables(page)
235
-
236
- def _extract_text_with_tables(self, page: Any) -> str:
237
- """Extract text and tables from page."""
238
- parts = []
239
-
240
- # Extract text
241
- text = page.extract_text() or ""
242
- if text.strip():
243
- parts.append(text.strip())
244
-
245
- # Extract tables
246
- try:
247
- tables = page.extract_tables()
248
- if tables:
249
- for table in tables:
250
- if table:
251
- md_table = self._table_to_markdown(table)
252
- if md_table.strip():
253
- parts.append(md_table)
254
- except Exception:
255
- pass
256
-
257
- return "\n\n".join(parts)
258
-
259
- def _table_to_markdown(self, table: list[list[str]]) -> str:
260
- """Convert table to Markdown."""
261
- if not table:
262
- return ""
263
-
264
- # Filter None values
265
- table = [[cell if cell is not None else "" for cell in row] for row in table]
266
-
267
- # Filter empty rows
268
- table = [row for row in table if any(cell.strip() for cell in row)]
269
-
270
- if not table:
271
- return ""
272
-
273
- # Calculate column widths
274
- col_widths = [
275
- max(len(str(row[i])) if i < len(row) else 0 for row in table)
276
- for i in range(max(len(row) for row in table))
277
- ]
278
-
279
- # Format table
280
- lines = []
281
- for row_idx, row in enumerate(table):
282
- padded_row = row + [""] * (len(col_widths) - len(row))
283
- line = "| " + " | ".join(
284
- str(cell).ljust(width) for cell, width in zip(padded_row, col_widths)
285
- ) + " |"
286
- lines.append(line)
287
-
288
- if row_idx == 0:
289
- sep = "|" + "|".join("-" * (w + 2) for w in col_widths) + "|"
290
- lines.append(sep)
291
-
292
- return "\n".join(lines)
293
-
294
- def close(self):
295
- """Close the GlmOcr instance."""
296
- if self._glmocr:
297
- self._glmocr.close()
298
- self._glmocr = None
299
-
300
- def __enter__(self):
301
- return self
302
-
303
- def __exit__(self, exc_type, exc_val, exc_tb):
304
- self.close()