mineru 2.7.3__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/model_init.py +8 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -1
- mineru/backend/vlm/utils.py +2 -0
- mineru/backend/vlm/vlm_analyze.py +41 -26
- mineru/cli/fast_api.py +159 -79
- mineru/model/vlm/vllm_server.py +15 -10
- mineru/utils/block_sort.py +4 -0
- mineru/utils/config_reader.py +5 -1
- mineru/utils/model_utils.py +6 -0
- mineru/utils/os_env_config.py +5 -0
- mineru/utils/pdf_image_tools.py +73 -25
- mineru/version.py +1 -1
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/METADATA +9 -4
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/RECORD +18 -18
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/entry_points.txt +1 -1
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/WHEEL +0 -0
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.3.dist-info → mineru-2.7.5.dist-info}/top_level.txt +0 -0
|
@@ -297,7 +297,14 @@ def ocr_det_batch_setting(device):
|
|
|
297
297
|
# 检测torch的版本号
|
|
298
298
|
import torch
|
|
299
299
|
from packaging import version
|
|
300
|
-
|
|
300
|
+
|
|
301
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
302
|
+
|
|
303
|
+
if (
|
|
304
|
+
version.parse(torch.__version__) >= version.parse("2.8.0")
|
|
305
|
+
or str(device).startswith('mps')
|
|
306
|
+
or device_type.lower() in ["corex"]
|
|
307
|
+
):
|
|
301
308
|
enable_ocr_det_batch = False
|
|
302
309
|
else:
|
|
303
310
|
enable_ocr_det_batch = True
|
|
@@ -193,7 +193,12 @@ def batch_image_analyze(
|
|
|
193
193
|
# 检测torch的版本号
|
|
194
194
|
import torch
|
|
195
195
|
from packaging import version
|
|
196
|
-
|
|
196
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
197
|
+
if (
|
|
198
|
+
version.parse(torch.__version__) >= version.parse("2.8.0")
|
|
199
|
+
or str(device).startswith('mps')
|
|
200
|
+
or device_type.lower() in ["corex"]
|
|
201
|
+
):
|
|
197
202
|
enable_ocr_det_batch = False
|
|
198
203
|
else:
|
|
199
204
|
enable_ocr_det_batch = True
|
mineru/backend/vlm/utils.py
CHANGED
|
@@ -22,6 +22,8 @@ def enable_custom_logits_processors() -> bool:
|
|
|
22
22
|
compute_capability = "8.0"
|
|
23
23
|
elif hasattr(torch, 'musa') and torch.musa.is_available():
|
|
24
24
|
compute_capability = "8.0"
|
|
25
|
+
elif hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
26
|
+
compute_capability = "8.0"
|
|
25
27
|
else:
|
|
26
28
|
logger.info("CUDA not available, disabling custom_logits_processors")
|
|
27
29
|
return False
|
|
@@ -101,20 +101,27 @@ class ModelSingleton:
|
|
|
101
101
|
except ImportError:
|
|
102
102
|
raise ImportError("Please install vllm to use the vllm-engine backend.")
|
|
103
103
|
|
|
104
|
-
"""
|
|
105
104
|
# musa vllm v1 引擎特殊配置
|
|
106
|
-
device = get_device()
|
|
107
|
-
if
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
105
|
+
# device = get_device()
|
|
106
|
+
# if device_type.startswith("musa"):
|
|
107
|
+
# import torch
|
|
108
|
+
# if torch.musa.is_available():
|
|
109
|
+
# compilation_config = {
|
|
110
|
+
# "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
111
|
+
# "simple_cuda_graph": True
|
|
112
|
+
# }
|
|
113
|
+
# block_size = 32
|
|
114
|
+
# kwargs["compilation_config"] = compilation_config
|
|
115
|
+
# kwargs["block_size"] = block_size
|
|
116
|
+
|
|
117
|
+
# corex vllm v1 引擎特殊配置
|
|
118
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
119
|
+
if device_type.lower() == "corex":
|
|
120
|
+
compilation_config = {
|
|
121
|
+
"cudagraph_mode": "FULL_DECODE_ONLY",
|
|
122
|
+
"level": 0
|
|
123
|
+
}
|
|
124
|
+
kwargs["compilation_config"] = compilation_config
|
|
118
125
|
|
|
119
126
|
if "compilation_config" in kwargs:
|
|
120
127
|
if isinstance(kwargs["compilation_config"], str):
|
|
@@ -141,20 +148,28 @@ class ModelSingleton:
|
|
|
141
148
|
except ImportError:
|
|
142
149
|
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
143
150
|
|
|
144
|
-
|
|
151
|
+
|
|
145
152
|
# musa vllm v1 引擎特殊配置
|
|
146
|
-
device = get_device()
|
|
147
|
-
if device.startswith("musa"):
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
153
|
+
# device = get_device()
|
|
154
|
+
# if device.startswith("musa"):
|
|
155
|
+
# import torch
|
|
156
|
+
# if torch.musa.is_available():
|
|
157
|
+
# compilation_config = CompilationConfig(
|
|
158
|
+
# cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
159
|
+
# simple_cuda_graph=True
|
|
160
|
+
# )
|
|
161
|
+
# block_size = 32
|
|
162
|
+
# kwargs["compilation_config"] = compilation_config
|
|
163
|
+
# kwargs["block_size"] = block_size
|
|
164
|
+
|
|
165
|
+
# corex vllm v1 引擎特殊配置
|
|
166
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
167
|
+
if device_type.lower() == "corex":
|
|
168
|
+
compilation_config = CompilationConfig(
|
|
169
|
+
cudagraph_mode="FULL_DECODE_ONLY",
|
|
170
|
+
level=0
|
|
171
|
+
)
|
|
172
|
+
kwargs["compilation_config"] = compilation_config
|
|
158
173
|
|
|
159
174
|
if "compilation_config" in kwargs:
|
|
160
175
|
if isinstance(kwargs["compilation_config"], dict):
|
mineru/cli/fast_api.py
CHANGED
|
@@ -7,12 +7,12 @@ import asyncio
|
|
|
7
7
|
import uvicorn
|
|
8
8
|
import click
|
|
9
9
|
import zipfile
|
|
10
|
+
import shutil
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
import glob
|
|
12
|
-
from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form
|
|
13
|
+
from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
13
14
|
from fastapi.middleware.gzip import GZipMiddleware
|
|
14
15
|
from fastapi.responses import JSONResponse, FileResponse
|
|
15
|
-
from starlette.background import BackgroundTask
|
|
16
16
|
from typing import List, Optional
|
|
17
17
|
from loguru import logger
|
|
18
18
|
|
|
@@ -30,23 +30,30 @@ from mineru.version import __version__
|
|
|
30
30
|
# 并发控制器
|
|
31
31
|
_request_semaphore: Optional[asyncio.Semaphore] = None
|
|
32
32
|
|
|
33
|
+
|
|
33
34
|
# 并发控制依赖函数
|
|
34
35
|
async def limit_concurrency():
|
|
35
36
|
if _request_semaphore is not None:
|
|
36
|
-
|
|
37
|
+
# 检查信号量是否已用尽,如果是则拒绝请求
|
|
38
|
+
if _request_semaphore._value == 0:
|
|
37
39
|
raise HTTPException(
|
|
38
40
|
status_code=503,
|
|
39
|
-
detail=f"Server is at maximum capacity: {os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')}. Please try again later."
|
|
41
|
+
detail=f"Server is at maximum capacity: {os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')}. Please try again later.",
|
|
40
42
|
)
|
|
41
43
|
async with _request_semaphore:
|
|
42
44
|
yield
|
|
43
45
|
else:
|
|
44
46
|
yield
|
|
45
47
|
|
|
48
|
+
|
|
46
49
|
def create_app():
|
|
47
50
|
# By default, the OpenAPI documentation endpoints (openapi_url, docs_url, redoc_url) are enabled.
|
|
48
51
|
# To disable the FastAPI docs and schema endpoints, set the environment variable MINERU_API_ENABLE_FASTAPI_DOCS=0.
|
|
49
|
-
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
|
|
52
|
+
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
|
|
53
|
+
"1",
|
|
54
|
+
"true",
|
|
55
|
+
"yes",
|
|
56
|
+
)
|
|
50
57
|
app = FastAPI(
|
|
51
58
|
openapi_url="/openapi.json" if enable_docs else None,
|
|
52
59
|
docs_url="/docs" if enable_docs else None,
|
|
@@ -56,7 +63,9 @@ def create_app():
|
|
|
56
63
|
# 初始化并发控制器:从环境变量MINERU_API_MAX_CONCURRENT_REQUESTS读取
|
|
57
64
|
global _request_semaphore
|
|
58
65
|
try:
|
|
59
|
-
max_concurrent_requests = int(
|
|
66
|
+
max_concurrent_requests = int(
|
|
67
|
+
os.getenv("MINERU_API_MAX_CONCURRENT_REQUESTS", "0")
|
|
68
|
+
)
|
|
60
69
|
except ValueError:
|
|
61
70
|
max_concurrent_requests = 0
|
|
62
71
|
|
|
@@ -67,6 +76,7 @@ def create_app():
|
|
|
67
76
|
app.add_middleware(GZipMiddleware, minimum_size=1000)
|
|
68
77
|
return app
|
|
69
78
|
|
|
79
|
+
|
|
70
80
|
app = create_app()
|
|
71
81
|
|
|
72
82
|
|
|
@@ -76,27 +86,34 @@ def sanitize_filename(filename: str) -> str:
|
|
|
76
86
|
移除路径遍历字符, 保留 Unicode 字母、数字、._-
|
|
77
87
|
禁止隐藏文件
|
|
78
88
|
"""
|
|
79
|
-
sanitized = re.sub(r
|
|
80
|
-
sanitized = re.sub(r
|
|
81
|
-
if sanitized.startswith(
|
|
82
|
-
sanitized =
|
|
83
|
-
return sanitized or
|
|
89
|
+
sanitized = re.sub(r"[/\\.]{2,}|[/\\]", "", filename)
|
|
90
|
+
sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
|
|
91
|
+
if sanitized.startswith("."):
|
|
92
|
+
sanitized = "_" + sanitized[1:]
|
|
93
|
+
return sanitized or "unnamed"
|
|
94
|
+
|
|
84
95
|
|
|
85
96
|
def cleanup_file(file_path: str) -> None:
|
|
86
|
-
"""
|
|
97
|
+
"""清理临时文件或目录"""
|
|
87
98
|
try:
|
|
88
99
|
if os.path.exists(file_path):
|
|
89
|
-
os.
|
|
100
|
+
if os.path.isfile(file_path):
|
|
101
|
+
os.remove(file_path)
|
|
102
|
+
elif os.path.isdir(file_path):
|
|
103
|
+
shutil.rmtree(file_path)
|
|
90
104
|
except Exception as e:
|
|
91
105
|
logger.warning(f"fail clean file {file_path}: {e}")
|
|
92
106
|
|
|
107
|
+
|
|
93
108
|
def encode_image(image_path: str) -> str:
|
|
94
109
|
"""Encode image using base64"""
|
|
95
110
|
with open(image_path, "rb") as f:
|
|
96
111
|
return b64encode(f.read()).decode()
|
|
97
112
|
|
|
98
113
|
|
|
99
|
-
def get_infer_result(
|
|
114
|
+
def get_infer_result(
|
|
115
|
+
file_suffix_identifier: str, pdf_name: str, parse_dir: str
|
|
116
|
+
) -> Optional[str]:
|
|
100
117
|
"""从结果文件中读取推理结果"""
|
|
101
118
|
result_file_path = os.path.join(parse_dir, f"{pdf_name}{file_suffix_identifier}")
|
|
102
119
|
if os.path.exists(result_file_path):
|
|
@@ -107,11 +124,14 @@ def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str)
|
|
|
107
124
|
|
|
108
125
|
@app.post(path="/file_parse", dependencies=[Depends(limit_concurrency)])
|
|
109
126
|
async def parse_pdf(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
127
|
+
background_tasks: BackgroundTasks,
|
|
128
|
+
files: List[UploadFile] = File(
|
|
129
|
+
..., description="Upload pdf or image files for parsing"
|
|
130
|
+
),
|
|
131
|
+
output_dir: str = Form("./output", description="Output local directory"),
|
|
132
|
+
lang_list: List[str] = Form(
|
|
133
|
+
["ch"],
|
|
134
|
+
description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
|
|
115
135
|
- ch: Chinese, English, Chinese Traditional.
|
|
116
136
|
- ch_lite: Chinese, English, Chinese Traditional, Japanese.
|
|
117
137
|
- ch_server: Chinese, English, Chinese Traditional, Japanese.
|
|
@@ -129,41 +149,54 @@ async def parse_pdf(
|
|
|
129
149
|
- east_slavic: Russian, Belarusian, Ukrainian, English.
|
|
130
150
|
- cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
|
|
131
151
|
- devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
|
|
132
|
-
"""
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
152
|
+
""",
|
|
153
|
+
),
|
|
154
|
+
backend: str = Form(
|
|
155
|
+
"hybrid-auto-engine",
|
|
156
|
+
description="""The backend for parsing:
|
|
137
157
|
- pipeline: More general, supports multiple languages, hallucination-free.
|
|
138
158
|
- vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
|
|
139
159
|
- vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
|
|
140
160
|
- hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
|
|
141
|
-
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages."""
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
161
|
+
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages.""",
|
|
162
|
+
),
|
|
163
|
+
parse_method: str = Form(
|
|
164
|
+
"auto",
|
|
165
|
+
description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
|
|
146
166
|
- auto: Automatically determine the method based on the file type
|
|
147
167
|
- txt: Use text extraction method
|
|
148
168
|
- ocr: Use OCR method for image-based PDFs
|
|
149
|
-
"""
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
169
|
+
""",
|
|
170
|
+
),
|
|
171
|
+
formula_enable: bool = Form(True, description="Enable formula parsing."),
|
|
172
|
+
table_enable: bool = Form(True, description="Enable table parsing."),
|
|
173
|
+
server_url: Optional[str] = Form(
|
|
174
|
+
None,
|
|
175
|
+
description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000",
|
|
176
|
+
),
|
|
177
|
+
return_md: bool = Form(True, description="Return markdown content in response"),
|
|
178
|
+
return_middle_json: bool = Form(
|
|
179
|
+
False, description="Return middle JSON in response"
|
|
180
|
+
),
|
|
181
|
+
return_model_output: bool = Form(
|
|
182
|
+
False, description="Return model output JSON in response"
|
|
183
|
+
),
|
|
184
|
+
return_content_list: bool = Form(
|
|
185
|
+
False, description="Return content list JSON in response"
|
|
186
|
+
),
|
|
187
|
+
return_images: bool = Form(
|
|
188
|
+
False, description="Return extracted images in response"
|
|
189
|
+
),
|
|
190
|
+
response_format_zip: bool = Form(
|
|
191
|
+
False, description="Return results as a ZIP file instead of JSON"
|
|
192
|
+
),
|
|
193
|
+
start_page_id: int = Form(
|
|
194
|
+
0, description="The starting page for PDF parsing, beginning from 0"
|
|
195
|
+
),
|
|
196
|
+
end_page_id: int = Form(
|
|
197
|
+
99999, description="The ending page for PDF parsing, beginning from 0"
|
|
198
|
+
),
|
|
165
199
|
):
|
|
166
|
-
|
|
167
200
|
# 获取命令行配置参数
|
|
168
201
|
config = getattr(app.state, "config", {})
|
|
169
202
|
|
|
@@ -171,6 +204,7 @@ async def parse_pdf(
|
|
|
171
204
|
# 创建唯一的输出目录
|
|
172
205
|
unique_dir = os.path.join(output_dir, str(uuid.uuid4()))
|
|
173
206
|
os.makedirs(unique_dir, exist_ok=True)
|
|
207
|
+
background_tasks.add_task(cleanup_file, unique_dir)
|
|
174
208
|
|
|
175
209
|
# 处理上传的PDF文件
|
|
176
210
|
pdf_file_names = []
|
|
@@ -196,20 +230,21 @@ async def parse_pdf(
|
|
|
196
230
|
except Exception as e:
|
|
197
231
|
return JSONResponse(
|
|
198
232
|
status_code=400,
|
|
199
|
-
content={"error": f"Failed to load file: {str(e)}"}
|
|
233
|
+
content={"error": f"Failed to load file: {str(e)}"},
|
|
200
234
|
)
|
|
201
235
|
else:
|
|
202
236
|
return JSONResponse(
|
|
203
237
|
status_code=400,
|
|
204
|
-
content={"error": f"Unsupported file type: {file_suffix}"}
|
|
238
|
+
content={"error": f"Unsupported file type: {file_suffix}"},
|
|
205
239
|
)
|
|
206
240
|
|
|
207
|
-
|
|
208
241
|
# 设置语言列表,确保与文件数量一致
|
|
209
242
|
actual_lang_list = lang_list
|
|
210
243
|
if len(actual_lang_list) != len(pdf_file_names):
|
|
211
244
|
# 如果语言列表长度不匹配,使用第一个语言或默认"ch"
|
|
212
|
-
actual_lang_list = [
|
|
245
|
+
actual_lang_list = [
|
|
246
|
+
actual_lang_list[0] if actual_lang_list else "ch"
|
|
247
|
+
] * len(pdf_file_names)
|
|
213
248
|
|
|
214
249
|
# 调用异步处理函数
|
|
215
250
|
await aio_do_parse(
|
|
@@ -231,13 +266,15 @@ async def parse_pdf(
|
|
|
231
266
|
f_dump_content_list=return_content_list,
|
|
232
267
|
start_page_id=start_page_id,
|
|
233
268
|
end_page_id=end_page_id,
|
|
234
|
-
**config
|
|
269
|
+
**config,
|
|
235
270
|
)
|
|
236
271
|
|
|
237
272
|
# 根据 response_format_zip 决定返回类型
|
|
238
273
|
if response_format_zip:
|
|
239
274
|
zip_fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="mineru_results_")
|
|
240
275
|
os.close(zip_fd)
|
|
276
|
+
background_tasks.add_task(cleanup_file, zip_path)
|
|
277
|
+
|
|
241
278
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
|
242
279
|
for pdf_name in pdf_file_names:
|
|
243
280
|
safe_pdf_name = sanitize_filename(pdf_name)
|
|
@@ -247,7 +284,13 @@ async def parse_pdf(
|
|
|
247
284
|
elif backend.startswith("vlm"):
|
|
248
285
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
249
286
|
elif backend.startswith("hybrid"):
|
|
250
|
-
parse_dir = os.path.join(
|
|
287
|
+
parse_dir = os.path.join(
|
|
288
|
+
unique_dir, pdf_name, f"hybrid_{parse_method}"
|
|
289
|
+
)
|
|
290
|
+
else:
|
|
291
|
+
# 未知 backend,跳过此文件
|
|
292
|
+
logger.warning(f"Unknown backend type: {backend}, skipping {pdf_name}")
|
|
293
|
+
continue
|
|
251
294
|
|
|
252
295
|
if not os.path.exists(parse_dir):
|
|
253
296
|
continue
|
|
@@ -256,35 +299,63 @@ async def parse_pdf(
|
|
|
256
299
|
if return_md:
|
|
257
300
|
path = os.path.join(parse_dir, f"{pdf_name}.md")
|
|
258
301
|
if os.path.exists(path):
|
|
259
|
-
zf.write(
|
|
302
|
+
zf.write(
|
|
303
|
+
path,
|
|
304
|
+
arcname=os.path.join(
|
|
305
|
+
safe_pdf_name, f"{safe_pdf_name}.md"
|
|
306
|
+
),
|
|
307
|
+
)
|
|
260
308
|
|
|
261
309
|
if return_middle_json:
|
|
262
310
|
path = os.path.join(parse_dir, f"{pdf_name}_middle.json")
|
|
263
311
|
if os.path.exists(path):
|
|
264
|
-
zf.write(
|
|
312
|
+
zf.write(
|
|
313
|
+
path,
|
|
314
|
+
arcname=os.path.join(
|
|
315
|
+
safe_pdf_name, f"{safe_pdf_name}_middle.json"
|
|
316
|
+
),
|
|
317
|
+
)
|
|
265
318
|
|
|
266
319
|
if return_model_output:
|
|
267
320
|
path = os.path.join(parse_dir, f"{pdf_name}_model.json")
|
|
268
321
|
if os.path.exists(path):
|
|
269
|
-
zf.write(
|
|
322
|
+
zf.write(
|
|
323
|
+
path,
|
|
324
|
+
arcname=os.path.join(
|
|
325
|
+
safe_pdf_name, f"{safe_pdf_name}_model.json"
|
|
326
|
+
),
|
|
327
|
+
)
|
|
270
328
|
|
|
271
329
|
if return_content_list:
|
|
272
330
|
path = os.path.join(parse_dir, f"{pdf_name}_content_list.json")
|
|
273
331
|
if os.path.exists(path):
|
|
274
|
-
zf.write(
|
|
332
|
+
zf.write(
|
|
333
|
+
path,
|
|
334
|
+
arcname=os.path.join(
|
|
335
|
+
safe_pdf_name, f"{safe_pdf_name}_content_list.json"
|
|
336
|
+
),
|
|
337
|
+
)
|
|
275
338
|
|
|
276
339
|
# 写入图片
|
|
277
340
|
if return_images:
|
|
278
341
|
images_dir = os.path.join(parse_dir, "images")
|
|
279
|
-
image_paths = glob.glob(
|
|
342
|
+
image_paths = glob.glob(
|
|
343
|
+
os.path.join(glob.escape(images_dir), "*.jpg")
|
|
344
|
+
)
|
|
280
345
|
for image_path in image_paths:
|
|
281
|
-
zf.write(
|
|
346
|
+
zf.write(
|
|
347
|
+
image_path,
|
|
348
|
+
arcname=os.path.join(
|
|
349
|
+
safe_pdf_name,
|
|
350
|
+
"images",
|
|
351
|
+
os.path.basename(image_path),
|
|
352
|
+
),
|
|
353
|
+
)
|
|
282
354
|
|
|
283
355
|
return FileResponse(
|
|
284
356
|
path=zip_path,
|
|
285
357
|
media_type="application/zip",
|
|
286
358
|
filename="results.zip",
|
|
287
|
-
background=BackgroundTask(cleanup_file, zip_path)
|
|
288
359
|
)
|
|
289
360
|
else:
|
|
290
361
|
# 构建 JSON 结果
|
|
@@ -298,17 +369,31 @@ async def parse_pdf(
|
|
|
298
369
|
elif backend.startswith("vlm"):
|
|
299
370
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
300
371
|
elif backend.startswith("hybrid"):
|
|
301
|
-
parse_dir = os.path.join(
|
|
372
|
+
parse_dir = os.path.join(
|
|
373
|
+
unique_dir, pdf_name, f"hybrid_{parse_method}"
|
|
374
|
+
)
|
|
375
|
+
else:
|
|
376
|
+
# 未知 backend,跳过此文件
|
|
377
|
+
logger.warning(f"Unknown backend type: {backend}, skipping {pdf_name}")
|
|
378
|
+
continue
|
|
302
379
|
|
|
303
380
|
if os.path.exists(parse_dir):
|
|
304
381
|
if return_md:
|
|
305
|
-
data["md_content"] = get_infer_result(
|
|
382
|
+
data["md_content"] = get_infer_result(
|
|
383
|
+
".md", pdf_name, parse_dir
|
|
384
|
+
)
|
|
306
385
|
if return_middle_json:
|
|
307
|
-
data["middle_json"] = get_infer_result(
|
|
386
|
+
data["middle_json"] = get_infer_result(
|
|
387
|
+
"_middle.json", pdf_name, parse_dir
|
|
388
|
+
)
|
|
308
389
|
if return_model_output:
|
|
309
|
-
data["model_output"] = get_infer_result(
|
|
390
|
+
data["model_output"] = get_infer_result(
|
|
391
|
+
"_model.json", pdf_name, parse_dir
|
|
392
|
+
)
|
|
310
393
|
if return_content_list:
|
|
311
|
-
data["content_list"] = get_infer_result(
|
|
394
|
+
data["content_list"] = get_infer_result(
|
|
395
|
+
"_content_list.json", pdf_name, parse_dir
|
|
396
|
+
)
|
|
312
397
|
if return_images:
|
|
313
398
|
images_dir = os.path.join(parse_dir, "images")
|
|
314
399
|
safe_pattern = os.path.join(glob.escape(images_dir), "*.jpg")
|
|
@@ -325,24 +410,24 @@ async def parse_pdf(
|
|
|
325
410
|
content={
|
|
326
411
|
"backend": backend,
|
|
327
412
|
"version": __version__,
|
|
328
|
-
"results": result_dict
|
|
329
|
-
}
|
|
413
|
+
"results": result_dict,
|
|
414
|
+
},
|
|
330
415
|
)
|
|
331
416
|
except Exception as e:
|
|
332
417
|
logger.exception(e)
|
|
333
418
|
return JSONResponse(
|
|
334
|
-
status_code=500,
|
|
335
|
-
content={"error": f"Failed to process file: {str(e)}"}
|
|
419
|
+
status_code=500, content={"error": f"Failed to process file: {str(e)}"}
|
|
336
420
|
)
|
|
337
421
|
|
|
338
422
|
|
|
339
|
-
@click.command(
|
|
423
|
+
@click.command(
|
|
424
|
+
context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)
|
|
425
|
+
)
|
|
340
426
|
@click.pass_context
|
|
341
|
-
@click.option(
|
|
342
|
-
@click.option(
|
|
343
|
-
@click.option(
|
|
427
|
+
@click.option("--host", default="127.0.0.1", help="Server host (default: 127.0.0.1)")
|
|
428
|
+
@click.option("--port", default=8000, type=int, help="Server port (default: 8000)")
|
|
429
|
+
@click.option("--reload", is_flag=True, help="Enable auto-reload (development mode)")
|
|
344
430
|
def main(ctx, host, port, reload, **kwargs):
|
|
345
|
-
|
|
346
431
|
kwargs.update(arg_parse(ctx))
|
|
347
432
|
|
|
348
433
|
# 将配置参数存储到应用状态中
|
|
@@ -359,12 +444,7 @@ def main(ctx, host, port, reload, **kwargs):
|
|
|
359
444
|
print(f"Start MinerU FastAPI Service: http://{host}:{port}")
|
|
360
445
|
print(f"API documentation: http://{host}:{port}/docs")
|
|
361
446
|
|
|
362
|
-
uvicorn.run(
|
|
363
|
-
"mineru.cli.fast_api:app",
|
|
364
|
-
host=host,
|
|
365
|
-
port=port,
|
|
366
|
-
reload=reload
|
|
367
|
-
)
|
|
447
|
+
uvicorn.run("mineru.cli.fast_api:app", host=host, port=port, reload=reload)
|
|
368
448
|
|
|
369
449
|
|
|
370
450
|
if __name__ == "__main__":
|
mineru/model/vlm/vllm_server.py
CHANGED
|
@@ -56,17 +56,22 @@ def main():
|
|
|
56
56
|
model_path = auto_download_and_get_model_root_path("/", "vlm")
|
|
57
57
|
if (not has_logits_processors_arg) and custom_logits_processors:
|
|
58
58
|
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# musa vllm v1 引擎特殊配置
|
|
61
|
-
device = get_device()
|
|
62
|
-
if device.startswith("musa"):
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
61
|
+
# device = get_device()
|
|
62
|
+
# if device.startswith("musa"):
|
|
63
|
+
# import torch
|
|
64
|
+
# if torch.musa.is_available():
|
|
65
|
+
# if not has_block_size_arg:
|
|
66
|
+
# args.extend(["--block-size", "32"])
|
|
67
|
+
# if not has_compilation_config:
|
|
68
|
+
# args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
|
|
69
|
+
|
|
70
|
+
# corex vllm v1 引擎特殊配置
|
|
71
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
72
|
+
if device_type.lower() == "corex":
|
|
73
|
+
if not has_compilation_config:
|
|
74
|
+
args.extend(["--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'])
|
|
70
75
|
|
|
71
76
|
# 重构参数,将模型路径作为位置参数
|
|
72
77
|
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
|
mineru/utils/block_sort.py
CHANGED
|
@@ -198,6 +198,10 @@ def model_init(model_name: str):
|
|
|
198
198
|
if hasattr(torch, 'npu') and torch.npu.is_available():
|
|
199
199
|
if torch.npu.is_bf16_supported():
|
|
200
200
|
bf_16_support = True
|
|
201
|
+
elif device_name.startswith("mlu"):
|
|
202
|
+
if hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
203
|
+
if torch.mlu.is_bf16_supported():
|
|
204
|
+
bf_16_support = True
|
|
201
205
|
|
|
202
206
|
if model_name == 'layoutreader':
|
|
203
207
|
# 检测modelscope的缓存目录是否存在
|
mineru/utils/config_reader.py
CHANGED
mineru/utils/model_utils.py
CHANGED
|
@@ -429,6 +429,9 @@ def clean_memory(device='cuda'):
|
|
|
429
429
|
elif str(device).startswith("musa"):
|
|
430
430
|
if torch.musa.is_available():
|
|
431
431
|
torch.musa.empty_cache()
|
|
432
|
+
elif str(device).startswith("mlu"):
|
|
433
|
+
if torch.mlu.is_available():
|
|
434
|
+
torch.mlu.empty_cache()
|
|
432
435
|
gc.collect()
|
|
433
436
|
|
|
434
437
|
|
|
@@ -470,5 +473,8 @@ def get_vram(device) -> int:
|
|
|
470
473
|
elif str(device).startswith("musa"):
|
|
471
474
|
if torch.musa.is_available():
|
|
472
475
|
total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
476
|
+
elif str(device).startswith("mlu"):
|
|
477
|
+
if torch.mlu.is_available():
|
|
478
|
+
total_memory = round(torch.mlu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
473
479
|
|
|
474
480
|
return total_memory
|
mineru/utils/os_env_config.py
CHANGED
|
@@ -11,6 +11,11 @@ def get_load_images_timeout() -> int:
|
|
|
11
11
|
return get_value_from_string(env_value, 300)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def get_load_images_threads() -> int:
|
|
15
|
+
env_value = os.getenv('MINERU_PDF_RENDER_THREADS', None)
|
|
16
|
+
return get_value_from_string(env_value, 4)
|
|
17
|
+
|
|
18
|
+
|
|
14
19
|
def get_value_from_string(env_value: str, default_value: int) -> int:
|
|
15
20
|
if env_value is not None:
|
|
16
21
|
try:
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import os
|
|
3
|
+
import signal
|
|
4
|
+
import time
|
|
3
5
|
from io import BytesIO
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
@@ -9,13 +11,13 @@ from PIL import Image, ImageOps
|
|
|
9
11
|
|
|
10
12
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
13
|
from mineru.utils.check_sys_env import is_windows_environment
|
|
12
|
-
from mineru.utils.os_env_config import get_load_images_timeout
|
|
14
|
+
from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads
|
|
13
15
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
|
14
16
|
from mineru.utils.enum_class import ImageType
|
|
15
17
|
from mineru.utils.hash_utils import str_sha256
|
|
16
18
|
from mineru.utils.pdf_page_id import get_end_page_id
|
|
17
19
|
|
|
18
|
-
from concurrent.futures import ProcessPoolExecutor,
|
|
20
|
+
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
|
@@ -57,7 +59,7 @@ def load_images_from_pdf(
|
|
|
57
59
|
end_page_id=None,
|
|
58
60
|
image_type=ImageType.PIL,
|
|
59
61
|
timeout=None,
|
|
60
|
-
threads=
|
|
62
|
+
threads=None,
|
|
61
63
|
):
|
|
62
64
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
63
65
|
|
|
@@ -67,8 +69,8 @@ def load_images_from_pdf(
|
|
|
67
69
|
start_page_id (int, optional): 起始页码. Defaults to 0.
|
|
68
70
|
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
|
69
71
|
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
|
70
|
-
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量
|
|
71
|
-
threads (int):
|
|
72
|
+
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
|
|
73
|
+
threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4.
|
|
72
74
|
|
|
73
75
|
Raises:
|
|
74
76
|
TimeoutError: 当转换超时时抛出
|
|
@@ -86,6 +88,9 @@ def load_images_from_pdf(
|
|
|
86
88
|
else:
|
|
87
89
|
if timeout is None:
|
|
88
90
|
timeout = get_load_images_timeout()
|
|
91
|
+
if threads is None:
|
|
92
|
+
threads = get_load_images_threads()
|
|
93
|
+
|
|
89
94
|
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
|
90
95
|
|
|
91
96
|
# 计算总页数
|
|
@@ -108,11 +113,13 @@ def load_images_from_pdf(
|
|
|
108
113
|
|
|
109
114
|
page_ranges.append((range_start, range_end))
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
|
112
117
|
|
|
113
|
-
|
|
118
|
+
executor = ProcessPoolExecutor(max_workers=actual_threads)
|
|
119
|
+
try:
|
|
114
120
|
# 提交所有任务
|
|
115
121
|
futures = []
|
|
122
|
+
future_to_range = {}
|
|
116
123
|
for range_start, range_end in page_ranges:
|
|
117
124
|
future = executor.submit(
|
|
118
125
|
_load_images_from_pdf_worker,
|
|
@@ -122,27 +129,68 @@ def load_images_from_pdf(
|
|
|
122
129
|
range_end,
|
|
123
130
|
image_type,
|
|
124
131
|
)
|
|
125
|
-
futures.append(
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
all_results.sort(key=lambda x: x[0])
|
|
136
|
-
images_list = []
|
|
137
|
-
for _, imgs in all_results:
|
|
138
|
-
images_list.extend(imgs)
|
|
139
|
-
|
|
140
|
-
return images_list, pdf_doc
|
|
141
|
-
except FuturesTimeoutError:
|
|
132
|
+
futures.append(future)
|
|
133
|
+
future_to_range[future] = range_start
|
|
134
|
+
|
|
135
|
+
# 使用 wait() 设置单一全局超时
|
|
136
|
+
done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
|
|
137
|
+
|
|
138
|
+
# 检查是否有未完成的任务(超时情况)
|
|
139
|
+
if not_done:
|
|
140
|
+
# 超时:强制终止所有子进程
|
|
141
|
+
_terminate_executor_processes(executor)
|
|
142
142
|
pdf_doc.close()
|
|
143
|
-
executor.shutdown(wait=False, cancel_futures=True)
|
|
144
143
|
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
|
145
144
|
|
|
145
|
+
# 所有任务完成,收集结果
|
|
146
|
+
all_results = []
|
|
147
|
+
for future in futures:
|
|
148
|
+
range_start = future_to_range[future]
|
|
149
|
+
# 这里不需要 timeout,因为任务已完成
|
|
150
|
+
images_list = future.result()
|
|
151
|
+
all_results.append((range_start, images_list))
|
|
152
|
+
|
|
153
|
+
# 按起始页码排序并合并结果
|
|
154
|
+
all_results.sort(key=lambda x: x[0])
|
|
155
|
+
images_list = []
|
|
156
|
+
for _, imgs in all_results:
|
|
157
|
+
images_list.extend(imgs)
|
|
158
|
+
|
|
159
|
+
return images_list, pdf_doc
|
|
160
|
+
|
|
161
|
+
except Exception as e:
|
|
162
|
+
# 发生任何异常时,确保清理子进程
|
|
163
|
+
_terminate_executor_processes(executor)
|
|
164
|
+
pdf_doc.close()
|
|
165
|
+
if isinstance(e, TimeoutError):
|
|
166
|
+
raise
|
|
167
|
+
raise
|
|
168
|
+
finally:
|
|
169
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _terminate_executor_processes(executor):
    """Forcefully stop every live worker process of a ProcessPoolExecutor.

    Best-effort cleanup used when work has timed out or failed: each live
    worker is first sent SIGTERM so it can exit gracefully and, if it is
    still alive after a short grace period, is then killed outright.

    Args:
        executor: The executor whose workers should be stopped. Its private
            ``_processes`` mapping (pid -> process object exposing
            ``is_alive()``) is read; if the attribute is missing, ``None``
            or empty, the call is a no-op.

    Returns:
        None. Failures to signal an individual process are ignored.
    """
    # `_processes` is a private attribute: it may be absent, and CPython
    # resets it to None once the executor has been shut down, so a plain
    # hasattr() check is not enough to make `.items()` safe.
    processes = getattr(executor, '_processes', None)
    if not processes:
        return

    # Work on a snapshot so that the executor's own bookkeeping removing
    # entries concurrently cannot raise "dictionary changed size during
    # iteration" here.
    live = [(pid, proc) for pid, proc in list(processes.items()) if proc.is_alive()]
    if not live:
        # Nothing to signal, so there is no reason to pay the grace period.
        return

    # Send SIGTERM first to allow a graceful exit.
    for pid, _ in live:
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            # Deliberate best-effort: the process may already be gone
            # (ProcessLookupError is an OSError) or not be signalable.
            pass

    # Give the children a moment to react to SIGTERM.
    time.sleep(0.1)

    # SIGKILL is not defined on Windows; there os.kill() with SIGTERM already
    # terminates the process unconditionally, so fall back to it instead of
    # failing with AttributeError.
    kill_signal = getattr(signal, 'SIGKILL', signal.SIGTERM)

    # Force-kill whatever is still alive after the grace period.
    for pid, proc in live:
        if proc.is_alive():
            try:
                os.kill(pid, kill_signal)
            except OSError:
                pass
|
|
193
|
+
|
|
146
194
|
|
|
147
195
|
def load_images_from_pdf_core(
|
|
148
196
|
pdf_bytes: bytes,
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.3"
|
|
1
|
+
__version__ = "2.7.5"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.7.3
|
|
3
|
+
Version: 2.7.5
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -135,16 +135,21 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
-
- 2026/01/
|
|
139
|
-
- Added support for domestic computing platforms
|
|
138
|
+
- 2026/01/30 2.7.4 Release
|
|
139
|
+
- Added support for domestic computing platforms IluvatarCorex and Cambricon. Currently, the officially supported domestic computing platforms include:
|
|
140
140
|
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
|
|
141
141
|
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
|
|
142
142
|
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
|
|
143
143
|
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
|
144
144
|
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
|
145
145
|
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
|
146
|
+
- [IluvatarCorex](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/IluvatarCorex/)
|
|
147
|
+
- [Cambricon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Cambricon/)
|
|
146
148
|
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
|
|
147
|
-
|
|
149
|
+
|
|
150
|
+
- 2026/01/23 2.7.2 Release
|
|
151
|
+
- Added support for domestic computing platforms Hygon, Enflame, and Moore Threads.
|
|
152
|
+
- Cross-page table merging optimization, improving merge success rate and merge quality.
|
|
148
153
|
|
|
149
154
|
- 2026/01/06 2.7.1 Release
|
|
150
155
|
- fix bug: #4300
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=lBcjVwt4I0-VUeE_7gM1gQBbtKOi9jGT3DavJbzcYnQ,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
@@ -8,23 +8,23 @@ mineru/backend/hybrid/hybrid_magic_model.py,sha256=_DvBq5WP_UZvmHfhZloxqv-MKoWWe
|
|
|
8
8
|
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
|
|
9
9
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
10
10
|
mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
|
|
11
|
-
mineru/backend/pipeline/model_init.py,sha256=
|
|
11
|
+
mineru/backend/pipeline/model_init.py,sha256=2DHYwqrWXtK3v6u5EfoFHZqfUNE00MLfzuEGh-OhoBg,12609
|
|
12
12
|
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
|
|
13
13
|
mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
|
|
14
14
|
mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
|
|
15
|
-
mineru/backend/pipeline/pipeline_analyze.py,sha256=
|
|
15
|
+
mineru/backend/pipeline/pipeline_analyze.py,sha256=82XH7hVynuD_nuk-v7a_zhx_3Z_MHS31sIurQ0lHmXQ,6737
|
|
16
16
|
mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
|
|
17
17
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
|
|
18
18
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
19
19
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
20
|
-
mineru/backend/vlm/utils.py,sha256=
|
|
21
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
20
|
+
mineru/backend/vlm/utils.py,sha256=PIYqOStLCZlxU9TiZK4EhP90rPYIe_0thEZeP01YPls,3940
|
|
21
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=_2-xJC2C2rT87lZw8JZfC6PFFY0FfEbM9PK2TOkIJao,15604
|
|
22
22
|
mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
25
25
|
mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
|
|
26
26
|
mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
|
|
27
|
-
mineru/cli/fast_api.py,sha256=
|
|
27
|
+
mineru/cli/fast_api.py,sha256=Zgbp8giikcuOngZalmzrsfUMrdKnOd9TAAZrMfGQWXs,18664
|
|
28
28
|
mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
|
|
29
29
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
30
30
|
mineru/cli/vlm_server.py,sha256=27HaqO3wpMXSA_nA3CC6JOBTHK3q66SP00cD6m9HuQE,1974
|
|
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
|
|
|
151
151
|
mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
|
|
152
152
|
mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
153
|
mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
|
|
154
|
-
mineru/model/vlm/vllm_server.py,sha256=
|
|
154
|
+
mineru/model/vlm/vllm_server.py,sha256=7taySlBANFBTS81Q8PJ6gJWjTgjnFQoGvMknK4NuyLY,3360
|
|
155
155
|
mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
|
|
156
156
|
mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
|
157
157
|
mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
158
158
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
159
|
-
mineru/utils/block_sort.py,sha256=
|
|
159
|
+
mineru/utils/block_sort.py,sha256=e6nNjdUeRixT70OfvlEzM1FGwKxFSVwiLtwYGrsG_U0,13724
|
|
160
160
|
mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
|
|
161
161
|
mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
|
|
162
162
|
mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
|
|
163
163
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
164
|
-
mineru/utils/config_reader.py,sha256=
|
|
164
|
+
mineru/utils/config_reader.py,sha256=mwXYVuj52mA__2BU2qOPP0Pn9m0dDLi4mAqPS9a4Pjo,4575
|
|
165
165
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
166
166
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
167
167
|
mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
|
|
@@ -172,12 +172,12 @@ mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,85
|
|
|
172
172
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
173
173
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
174
174
|
mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
|
|
175
|
-
mineru/utils/model_utils.py,sha256=
|
|
175
|
+
mineru/utils/model_utils.py,sha256=xlw5hUYKa6o1NiM8PoXO1HFvHfrgY5e4Ut_upGEY9yI,19909
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
|
-
mineru/utils/os_env_config.py,sha256=
|
|
178
|
+
mineru/utils/os_env_config.py,sha256=VHK9lS3QFJhrwWa9FOFU1Swm7oXnby4SaNNjTyonTTg,990
|
|
179
179
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
180
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
180
|
+
mineru/utils/pdf_image_tools.py,sha256=tTSk39fgJKLEshwPAuJGLl_pVSrmEKjWA55F6dGcr4g,9987
|
|
181
181
|
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
182
182
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
183
183
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
|
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
187
|
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
188
|
+
mineru-2.7.5.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.5.dist-info/METADATA,sha256=MvPv4AgyLwaHz3hspAPrZ0wEeSE0wnu0MkMwfAJ5hTs,36928
|
|
190
|
+
mineru-2.7.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
191
|
+
mineru-2.7.5.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
|
|
192
|
+
mineru-2.7.5.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|