markitdown-glmocr 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/PKG-INFO +55 -16
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/README.md +54 -15
- markitdown_glmocr-0.2.2/src/markitdown_glmocr/__about__.py +1 -0
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/_converter.py +9 -5
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/_plugin.py +3 -3
- markitdown_glmocr-0.2.0/src/markitdown_glmocr/__about__.py +0 -1
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/.gitignore +0 -0
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/pyproject.toml +0 -0
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/__init__.py +0 -0
- {markitdown_glmocr-0.2.0 → markitdown_glmocr-0.2.2}/src/markitdown_glmocr/_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: markitdown-glmocr
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Intelligent PDF to Markdown converter using glmocr SDK
|
|
5
5
|
Project-URL: Documentation, https://github.com/microsoft/markitdown#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/microsoft/markitdown/issues
|
|
@@ -222,39 +222,78 @@ glmocr SDK 返回的结构化数据支持以下标签:
|
|
|
222
222
|
|
|
223
223
|
### 前置条件
|
|
224
224
|
|
|
225
|
-
|
|
225
|
+
1. 安装构建工具:
|
|
226
226
|
|
|
227
227
|
```bash
|
|
228
|
-
pip install build twine
|
|
228
|
+
pip install build twine hatch
|
|
229
229
|
```
|
|
230
230
|
|
|
231
|
-
|
|
231
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
232
232
|
|
|
233
|
+
```powershell
|
|
234
|
+
# PowerShell 设置用户环境变量
|
|
235
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
或在 Bash/Zsh 中:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### 快速发布(推荐)
|
|
245
|
+
|
|
246
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
247
|
+
|
|
248
|
+
**Bash / Git Bash:**
|
|
233
249
|
```bash
|
|
234
|
-
|
|
250
|
+
# 构建两个插件
|
|
251
|
+
cd packages/markitdown-glmocr && hatch build
|
|
252
|
+
|
|
253
|
+
cd ../markitdown-paddleocr && hatch build
|
|
254
|
+
|
|
255
|
+
# 上传(自动上传所有构建的版本)
|
|
256
|
+
cd ../..
|
|
257
|
+
./scripts/pypi-upload.sh
|
|
258
|
+
|
|
259
|
+
# 或指定版本号
|
|
260
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
**PowerShell:**
|
|
264
|
+
```powershell
|
|
265
|
+
# 构建两个插件
|
|
266
|
+
cd packages/markitdown-glmocr; hatch build
|
|
267
|
+
cd ../markitdown-paddleocr; hatch build
|
|
268
|
+
|
|
269
|
+
# 上传
|
|
270
|
+
cd ../..
|
|
271
|
+
.\scripts\pypi-upload.ps1
|
|
272
|
+
|
|
273
|
+
# 或指定版本号
|
|
274
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
235
275
|
```
|
|
236
276
|
|
|
237
|
-
###
|
|
277
|
+
### 手动发布
|
|
238
278
|
|
|
239
279
|
```bash
|
|
240
|
-
# 1.
|
|
280
|
+
# 1. 进入项目目录
|
|
241
281
|
cd packages/markitdown-glmocr
|
|
242
282
|
|
|
243
|
-
# 2.
|
|
244
|
-
|
|
283
|
+
# 2. 构建
|
|
284
|
+
hatch build
|
|
245
285
|
|
|
246
|
-
# 3.
|
|
286
|
+
# 3. 检查
|
|
247
287
|
twine check dist/*
|
|
248
288
|
|
|
249
|
-
# 4.
|
|
250
|
-
twine upload
|
|
289
|
+
# 4. 上传
|
|
290
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
251
291
|
```
|
|
252
292
|
|
|
253
293
|
### 发布到 TestPyPI(测试)
|
|
254
294
|
|
|
255
295
|
```bash
|
|
256
|
-
|
|
257
|
-
twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token"
|
|
296
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
258
297
|
|
|
259
298
|
# 从 TestPyPI 安装验证
|
|
260
299
|
pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
@@ -262,9 +301,9 @@ pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
|
262
301
|
|
|
263
302
|
### 注意事项
|
|
264
303
|
|
|
265
|
-
- 发布前确保 `
|
|
304
|
+
- 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
|
|
266
305
|
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
267
|
-
- `
|
|
306
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
268
307
|
|
|
269
308
|
## 许可证
|
|
270
309
|
|
|
@@ -195,39 +195,78 @@ glmocr SDK 返回的结构化数据支持以下标签:
|
|
|
195
195
|
|
|
196
196
|
### 前置条件
|
|
197
197
|
|
|
198
|
-
|
|
198
|
+
1. 安装构建工具:
|
|
199
199
|
|
|
200
200
|
```bash
|
|
201
|
-
pip install build twine
|
|
201
|
+
pip install build twine hatch
|
|
202
202
|
```
|
|
203
203
|
|
|
204
|
-
|
|
204
|
+
2. 配置 PyPI API Token(Windows 用户环境变量):
|
|
205
205
|
|
|
206
|
+
```powershell
|
|
207
|
+
# PowerShell 设置用户环境变量
|
|
208
|
+
[System.Environment]::SetEnvironmentVariable('PYPI_API_TOKEN', 'pypi-...', 'User')
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
或在 Bash/Zsh 中:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
export PYPI_API_TOKEN="pypi-..."
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### 快速发布(推荐)
|
|
218
|
+
|
|
219
|
+
项目根目录提供了上传脚本,可一键发布两个插件:
|
|
220
|
+
|
|
221
|
+
**Bash / Git Bash:**
|
|
206
222
|
```bash
|
|
207
|
-
|
|
223
|
+
# 构建两个插件
|
|
224
|
+
cd packages/markitdown-glmocr && hatch build
|
|
225
|
+
|
|
226
|
+
cd ../markitdown-paddleocr && hatch build
|
|
227
|
+
|
|
228
|
+
# 上传(自动上传所有构建的版本)
|
|
229
|
+
cd ../..
|
|
230
|
+
./scripts/pypi-upload.sh
|
|
231
|
+
|
|
232
|
+
# 或指定版本号
|
|
233
|
+
./scripts/pypi-upload.sh 0.2.0
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**PowerShell:**
|
|
237
|
+
```powershell
|
|
238
|
+
# 构建两个插件
|
|
239
|
+
cd packages/markitdown-glmocr; hatch build
|
|
240
|
+
cd ../markitdown-paddleocr; hatch build
|
|
241
|
+
|
|
242
|
+
# 上传
|
|
243
|
+
cd ../..
|
|
244
|
+
.\scripts\pypi-upload.ps1
|
|
245
|
+
|
|
246
|
+
# 或指定版本号
|
|
247
|
+
.\scripts\pypi-upload.ps1 -Version "0.2.0"
|
|
208
248
|
```
|
|
209
249
|
|
|
210
|
-
###
|
|
250
|
+
### 手动发布
|
|
211
251
|
|
|
212
252
|
```bash
|
|
213
|
-
# 1.
|
|
253
|
+
# 1. 进入项目目录
|
|
214
254
|
cd packages/markitdown-glmocr
|
|
215
255
|
|
|
216
|
-
# 2.
|
|
217
|
-
|
|
256
|
+
# 2. 构建
|
|
257
|
+
hatch build
|
|
218
258
|
|
|
219
|
-
# 3.
|
|
259
|
+
# 3. 检查
|
|
220
260
|
twine check dist/*
|
|
221
261
|
|
|
222
|
-
# 4.
|
|
223
|
-
twine upload
|
|
262
|
+
# 4. 上传
|
|
263
|
+
twine upload --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
224
264
|
```
|
|
225
265
|
|
|
226
266
|
### 发布到 TestPyPI(测试)
|
|
227
267
|
|
|
228
268
|
```bash
|
|
229
|
-
|
|
230
|
-
twine upload --repository testpypi dist/* -u __token__ -p "$PyPI_API_Token"
|
|
269
|
+
twine upload --repository testpypi --username __token__ --password "$PYPI_API_TOKEN" --disable-progress-bar dist/*
|
|
231
270
|
|
|
232
271
|
# 从 TestPyPI 安装验证
|
|
233
272
|
pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
@@ -235,9 +274,9 @@ pip install --index-url https://test.pypi.org/simple/ markitdown-glmocr
|
|
|
235
274
|
|
|
236
275
|
### 注意事项
|
|
237
276
|
|
|
238
|
-
- 发布前确保 `
|
|
277
|
+
- 发布前确保 `src/markitdown_glmocr/__about__.py` 中的版本号已更新
|
|
239
278
|
- 同一版本号不能重复上传,如需修正必须 bump 版本号
|
|
240
|
-
- `
|
|
279
|
+
- `PYPI_API_TOKEN` 切勿提交到代码仓库
|
|
241
280
|
|
|
242
281
|
## 许可证
|
|
243
282
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.2"
|
|
@@ -118,7 +118,9 @@ class GlmOcrConverter(DocumentConverter):
|
|
|
118
118
|
if scan_detection_mode is not None
|
|
119
119
|
else ScanDetectionMode.SAMPLING
|
|
120
120
|
)
|
|
121
|
-
self.scan_sample_pages =
|
|
121
|
+
self.scan_sample_pages = (
|
|
122
|
+
scan_sample_pages if scan_sample_pages is not None else 3
|
|
123
|
+
)
|
|
122
124
|
self.scan_text_threshold = (
|
|
123
125
|
scan_text_threshold if scan_text_threshold is not None else 50
|
|
124
126
|
)
|
|
@@ -237,11 +239,11 @@ class GlmOcrConverter(DocumentConverter):
|
|
|
237
239
|
)
|
|
238
240
|
return DocumentConverterResult(markdown=markdown)
|
|
239
241
|
except Exception as e:
|
|
240
|
-
logger.
|
|
241
|
-
"GlmOcrConverter: 批量OCR失败,
|
|
242
|
+
logger.error(
|
|
243
|
+
"GlmOcrConverter: 批量OCR失败, 抛出异常让框架fallback到下一个converter, 错误=%s",
|
|
242
244
|
e,
|
|
243
245
|
)
|
|
244
|
-
|
|
246
|
+
raise
|
|
245
247
|
|
|
246
248
|
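# Per-page processing (PAGE_BY_PAGE mode). NOTE(review): batch OCR exceptions are re-raised above, so the "batch failed" fallback may no longer reach this loop - confirm.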
|
|
247
249
|
for page_num, page in enumerate(pdf.pages):
|
|
@@ -311,7 +313,9 @@ class GlmOcrConverter(DocumentConverter):
|
|
|
311
313
|
Returns:
|
|
312
314
|
Markdown text from all pages.
|
|
313
315
|
"""
|
|
314
|
-
logger.info(
|
|
316
|
+
logger.info(
|
|
317
|
+
"GlmOcrConverter: 批量上传PDF到glmocr SDK, 大小=%d bytes", len(pdf_bytes)
|
|
318
|
+
)
|
|
315
319
|
result = self._get_glmocr().parse(pdf_bytes)
|
|
316
320
|
|
|
317
321
|
# Check for errors
|
|
@@ -25,9 +25,9 @@ def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
|
|
|
25
25
|
logger.info("markitdown-glmocr: 开始注册插件")
|
|
26
26
|
|
|
27
27
|
# Register converter
|
|
28
|
-
# Priority -
|
|
29
|
-
#
|
|
30
|
-
PRIORITY_GLMOCR = -
|
|
28
|
+
# Priority -1.0: same level as PaddleOcrConverter,
|
|
29
|
+
# the upper-level agent's skills control which plugin to call first.
|
|
30
|
+
PRIORITY_GLMOCR = -1.0
|
|
31
31
|
|
|
32
32
|
try:
|
|
33
33
|
converter = GlmOcrConverter(
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|