mineru 2.7.2__py3-none-any.whl → 2.7.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/hybrid_magic_model.py +2 -1
- mineru/backend/pipeline/model_init.py +8 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -1
- mineru/backend/vlm/utils.py +2 -0
- mineru/backend/vlm/vlm_analyze.py +41 -26
- mineru/backend/vlm/vlm_magic_model.py +2 -1
- mineru/cli/fast_api.py +159 -79
- mineru/model/vlm/vllm_server.py +15 -10
- mineru/utils/block_sort.py +4 -0
- mineru/utils/config_reader.py +5 -1
- mineru/utils/magic_model_utils.py +62 -14
- mineru/utils/model_utils.py +6 -0
- mineru/version.py +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/METADATA +9 -4
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/RECORD +19 -19
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/WHEEL +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/entry_points.txt +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.2.dist-info → mineru-2.7.4.dist-info}/top_level.txt +0 -0
|
@@ -297,7 +297,14 @@ def ocr_det_batch_setting(device):
|
|
|
297
297
|
# 检测torch的版本号
|
|
298
298
|
import torch
|
|
299
299
|
from packaging import version
|
|
300
|
-
|
|
300
|
+
|
|
301
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
302
|
+
|
|
303
|
+
if (
|
|
304
|
+
version.parse(torch.__version__) >= version.parse("2.8.0")
|
|
305
|
+
or str(device).startswith('mps')
|
|
306
|
+
or device_type.lower() in ["corex"]
|
|
307
|
+
):
|
|
301
308
|
enable_ocr_det_batch = False
|
|
302
309
|
else:
|
|
303
310
|
enable_ocr_det_batch = True
|
|
@@ -193,7 +193,12 @@ def batch_image_analyze(
|
|
|
193
193
|
# 检测torch的版本号
|
|
194
194
|
import torch
|
|
195
195
|
from packaging import version
|
|
196
|
-
|
|
196
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
197
|
+
if (
|
|
198
|
+
version.parse(torch.__version__) >= version.parse("2.8.0")
|
|
199
|
+
or str(device).startswith('mps')
|
|
200
|
+
or device_type.lower() in ["corex"]
|
|
201
|
+
):
|
|
197
202
|
enable_ocr_det_batch = False
|
|
198
203
|
else:
|
|
199
204
|
enable_ocr_det_batch = True
|
mineru/backend/vlm/utils.py
CHANGED
|
@@ -22,6 +22,8 @@ def enable_custom_logits_processors() -> bool:
|
|
|
22
22
|
compute_capability = "8.0"
|
|
23
23
|
elif hasattr(torch, 'musa') and torch.musa.is_available():
|
|
24
24
|
compute_capability = "8.0"
|
|
25
|
+
elif hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
26
|
+
compute_capability = "8.0"
|
|
25
27
|
else:
|
|
26
28
|
logger.info("CUDA not available, disabling custom_logits_processors")
|
|
27
29
|
return False
|
|
@@ -101,20 +101,27 @@ class ModelSingleton:
|
|
|
101
101
|
except ImportError:
|
|
102
102
|
raise ImportError("Please install vllm to use the vllm-engine backend.")
|
|
103
103
|
|
|
104
|
-
"""
|
|
105
104
|
# musa vllm v1 引擎特殊配置
|
|
106
|
-
device = get_device()
|
|
107
|
-
if
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
105
|
+
# device = get_device()
|
|
106
|
+
# if device_type.startswith("musa"):
|
|
107
|
+
# import torch
|
|
108
|
+
# if torch.musa.is_available():
|
|
109
|
+
# compilation_config = {
|
|
110
|
+
# "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
111
|
+
# "simple_cuda_graph": True
|
|
112
|
+
# }
|
|
113
|
+
# block_size = 32
|
|
114
|
+
# kwargs["compilation_config"] = compilation_config
|
|
115
|
+
# kwargs["block_size"] = block_size
|
|
116
|
+
|
|
117
|
+
# corex vllm v1 引擎特殊配置
|
|
118
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
119
|
+
if device_type.lower() == "corex":
|
|
120
|
+
compilation_config = {
|
|
121
|
+
"cudagraph_mode": "FULL_DECODE_ONLY",
|
|
122
|
+
"level": 0
|
|
123
|
+
}
|
|
124
|
+
kwargs["compilation_config"] = compilation_config
|
|
118
125
|
|
|
119
126
|
if "compilation_config" in kwargs:
|
|
120
127
|
if isinstance(kwargs["compilation_config"], str):
|
|
@@ -141,20 +148,28 @@ class ModelSingleton:
|
|
|
141
148
|
except ImportError:
|
|
142
149
|
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
143
150
|
|
|
144
|
-
|
|
151
|
+
|
|
145
152
|
# musa vllm v1 引擎特殊配置
|
|
146
|
-
device = get_device()
|
|
147
|
-
if device.startswith("musa"):
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
153
|
+
# device = get_device()
|
|
154
|
+
# if device.startswith("musa"):
|
|
155
|
+
# import torch
|
|
156
|
+
# if torch.musa.is_available():
|
|
157
|
+
# compilation_config = CompilationConfig(
|
|
158
|
+
# cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
159
|
+
# simple_cuda_graph=True
|
|
160
|
+
# )
|
|
161
|
+
# block_size = 32
|
|
162
|
+
# kwargs["compilation_config"] = compilation_config
|
|
163
|
+
# kwargs["block_size"] = block_size
|
|
164
|
+
|
|
165
|
+
# corex vllm v1 引擎特殊配置
|
|
166
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
167
|
+
if device_type.lower() == "corex":
|
|
168
|
+
compilation_config = CompilationConfig(
|
|
169
|
+
cudagraph_mode="FULL_DECODE_ONLY",
|
|
170
|
+
level=0
|
|
171
|
+
)
|
|
172
|
+
kwargs["compilation_config"] = compilation_config
|
|
158
173
|
|
|
159
174
|
if "compilation_config" in kwargs:
|
|
160
175
|
if isinstance(kwargs["compilation_config"], dict):
|
mineru/cli/fast_api.py
CHANGED
|
@@ -7,12 +7,12 @@ import asyncio
|
|
|
7
7
|
import uvicorn
|
|
8
8
|
import click
|
|
9
9
|
import zipfile
|
|
10
|
+
import shutil
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
import glob
|
|
12
|
-
from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form
|
|
13
|
+
from fastapi import Depends, FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
13
14
|
from fastapi.middleware.gzip import GZipMiddleware
|
|
14
15
|
from fastapi.responses import JSONResponse, FileResponse
|
|
15
|
-
from starlette.background import BackgroundTask
|
|
16
16
|
from typing import List, Optional
|
|
17
17
|
from loguru import logger
|
|
18
18
|
|
|
@@ -30,23 +30,30 @@ from mineru.version import __version__
|
|
|
30
30
|
# 并发控制器
|
|
31
31
|
_request_semaphore: Optional[asyncio.Semaphore] = None
|
|
32
32
|
|
|
33
|
+
|
|
33
34
|
# 并发控制依赖函数
|
|
34
35
|
async def limit_concurrency():
|
|
35
36
|
if _request_semaphore is not None:
|
|
36
|
-
|
|
37
|
+
# 检查信号量是否已用尽,如果是则拒绝请求
|
|
38
|
+
if _request_semaphore._value == 0:
|
|
37
39
|
raise HTTPException(
|
|
38
40
|
status_code=503,
|
|
39
|
-
detail=f"Server is at maximum capacity: {os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')}. Please try again later."
|
|
41
|
+
detail=f"Server is at maximum capacity: {os.getenv('MINERU_API_MAX_CONCURRENT_REQUESTS', 'unset')}. Please try again later.",
|
|
40
42
|
)
|
|
41
43
|
async with _request_semaphore:
|
|
42
44
|
yield
|
|
43
45
|
else:
|
|
44
46
|
yield
|
|
45
47
|
|
|
48
|
+
|
|
46
49
|
def create_app():
|
|
47
50
|
# By default, the OpenAPI documentation endpoints (openapi_url, docs_url, redoc_url) are enabled.
|
|
48
51
|
# To disable the FastAPI docs and schema endpoints, set the environment variable MINERU_API_ENABLE_FASTAPI_DOCS=0.
|
|
49
|
-
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
|
|
52
|
+
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
|
|
53
|
+
"1",
|
|
54
|
+
"true",
|
|
55
|
+
"yes",
|
|
56
|
+
)
|
|
50
57
|
app = FastAPI(
|
|
51
58
|
openapi_url="/openapi.json" if enable_docs else None,
|
|
52
59
|
docs_url="/docs" if enable_docs else None,
|
|
@@ -56,7 +63,9 @@ def create_app():
|
|
|
56
63
|
# 初始化并发控制器:从环境变量MINERU_API_MAX_CONCURRENT_REQUESTS读取
|
|
57
64
|
global _request_semaphore
|
|
58
65
|
try:
|
|
59
|
-
max_concurrent_requests = int(
|
|
66
|
+
max_concurrent_requests = int(
|
|
67
|
+
os.getenv("MINERU_API_MAX_CONCURRENT_REQUESTS", "0")
|
|
68
|
+
)
|
|
60
69
|
except ValueError:
|
|
61
70
|
max_concurrent_requests = 0
|
|
62
71
|
|
|
@@ -67,6 +76,7 @@ def create_app():
|
|
|
67
76
|
app.add_middleware(GZipMiddleware, minimum_size=1000)
|
|
68
77
|
return app
|
|
69
78
|
|
|
79
|
+
|
|
70
80
|
app = create_app()
|
|
71
81
|
|
|
72
82
|
|
|
@@ -76,27 +86,34 @@ def sanitize_filename(filename: str) -> str:
|
|
|
76
86
|
移除路径遍历字符, 保留 Unicode 字母、数字、._-
|
|
77
87
|
禁止隐藏文件
|
|
78
88
|
"""
|
|
79
|
-
sanitized = re.sub(r
|
|
80
|
-
sanitized = re.sub(r
|
|
81
|
-
if sanitized.startswith(
|
|
82
|
-
sanitized =
|
|
83
|
-
return sanitized or
|
|
89
|
+
sanitized = re.sub(r"[/\\.]{2,}|[/\\]", "", filename)
|
|
90
|
+
sanitized = re.sub(r"[^\w.-]", "_", sanitized, flags=re.UNICODE)
|
|
91
|
+
if sanitized.startswith("."):
|
|
92
|
+
sanitized = "_" + sanitized[1:]
|
|
93
|
+
return sanitized or "unnamed"
|
|
94
|
+
|
|
84
95
|
|
|
85
96
|
def cleanup_file(file_path: str) -> None:
|
|
86
|
-
"""
|
|
97
|
+
"""清理临时文件或目录"""
|
|
87
98
|
try:
|
|
88
99
|
if os.path.exists(file_path):
|
|
89
|
-
os.
|
|
100
|
+
if os.path.isfile(file_path):
|
|
101
|
+
os.remove(file_path)
|
|
102
|
+
elif os.path.isdir(file_path):
|
|
103
|
+
shutil.rmtree(file_path)
|
|
90
104
|
except Exception as e:
|
|
91
105
|
logger.warning(f"fail clean file {file_path}: {e}")
|
|
92
106
|
|
|
107
|
+
|
|
93
108
|
def encode_image(image_path: str) -> str:
|
|
94
109
|
"""Encode image using base64"""
|
|
95
110
|
with open(image_path, "rb") as f:
|
|
96
111
|
return b64encode(f.read()).decode()
|
|
97
112
|
|
|
98
113
|
|
|
99
|
-
def get_infer_result(
|
|
114
|
+
def get_infer_result(
|
|
115
|
+
file_suffix_identifier: str, pdf_name: str, parse_dir: str
|
|
116
|
+
) -> Optional[str]:
|
|
100
117
|
"""从结果文件中读取推理结果"""
|
|
101
118
|
result_file_path = os.path.join(parse_dir, f"{pdf_name}{file_suffix_identifier}")
|
|
102
119
|
if os.path.exists(result_file_path):
|
|
@@ -107,11 +124,14 @@ def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str)
|
|
|
107
124
|
|
|
108
125
|
@app.post(path="/file_parse", dependencies=[Depends(limit_concurrency)])
|
|
109
126
|
async def parse_pdf(
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
127
|
+
background_tasks: BackgroundTasks,
|
|
128
|
+
files: List[UploadFile] = File(
|
|
129
|
+
..., description="Upload pdf or image files for parsing"
|
|
130
|
+
),
|
|
131
|
+
output_dir: str = Form("./output", description="Output local directory"),
|
|
132
|
+
lang_list: List[str] = Form(
|
|
133
|
+
["ch"],
|
|
134
|
+
description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
|
|
115
135
|
- ch: Chinese, English, Chinese Traditional.
|
|
116
136
|
- ch_lite: Chinese, English, Chinese Traditional, Japanese.
|
|
117
137
|
- ch_server: Chinese, English, Chinese Traditional, Japanese.
|
|
@@ -129,41 +149,54 @@ async def parse_pdf(
|
|
|
129
149
|
- east_slavic: Russian, Belarusian, Ukrainian, English.
|
|
130
150
|
- cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
|
|
131
151
|
- devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
|
|
132
|
-
"""
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
152
|
+
""",
|
|
153
|
+
),
|
|
154
|
+
backend: str = Form(
|
|
155
|
+
"hybrid-auto-engine",
|
|
156
|
+
description="""The backend for parsing:
|
|
137
157
|
- pipeline: More general, supports multiple languages, hallucination-free.
|
|
138
158
|
- vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
|
|
139
159
|
- vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
|
|
140
160
|
- hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
|
|
141
|
-
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages."""
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
161
|
+
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages.""",
|
|
162
|
+
),
|
|
163
|
+
parse_method: str = Form(
|
|
164
|
+
"auto",
|
|
165
|
+
description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
|
|
146
166
|
- auto: Automatically determine the method based on the file type
|
|
147
167
|
- txt: Use text extraction method
|
|
148
168
|
- ocr: Use OCR method for image-based PDFs
|
|
149
|
-
"""
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
169
|
+
""",
|
|
170
|
+
),
|
|
171
|
+
formula_enable: bool = Form(True, description="Enable formula parsing."),
|
|
172
|
+
table_enable: bool = Form(True, description="Enable table parsing."),
|
|
173
|
+
server_url: Optional[str] = Form(
|
|
174
|
+
None,
|
|
175
|
+
description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000",
|
|
176
|
+
),
|
|
177
|
+
return_md: bool = Form(True, description="Return markdown content in response"),
|
|
178
|
+
return_middle_json: bool = Form(
|
|
179
|
+
False, description="Return middle JSON in response"
|
|
180
|
+
),
|
|
181
|
+
return_model_output: bool = Form(
|
|
182
|
+
False, description="Return model output JSON in response"
|
|
183
|
+
),
|
|
184
|
+
return_content_list: bool = Form(
|
|
185
|
+
False, description="Return content list JSON in response"
|
|
186
|
+
),
|
|
187
|
+
return_images: bool = Form(
|
|
188
|
+
False, description="Return extracted images in response"
|
|
189
|
+
),
|
|
190
|
+
response_format_zip: bool = Form(
|
|
191
|
+
False, description="Return results as a ZIP file instead of JSON"
|
|
192
|
+
),
|
|
193
|
+
start_page_id: int = Form(
|
|
194
|
+
0, description="The starting page for PDF parsing, beginning from 0"
|
|
195
|
+
),
|
|
196
|
+
end_page_id: int = Form(
|
|
197
|
+
99999, description="The ending page for PDF parsing, beginning from 0"
|
|
198
|
+
),
|
|
165
199
|
):
|
|
166
|
-
|
|
167
200
|
# 获取命令行配置参数
|
|
168
201
|
config = getattr(app.state, "config", {})
|
|
169
202
|
|
|
@@ -171,6 +204,7 @@ async def parse_pdf(
|
|
|
171
204
|
# 创建唯一的输出目录
|
|
172
205
|
unique_dir = os.path.join(output_dir, str(uuid.uuid4()))
|
|
173
206
|
os.makedirs(unique_dir, exist_ok=True)
|
|
207
|
+
background_tasks.add_task(cleanup_file, unique_dir)
|
|
174
208
|
|
|
175
209
|
# 处理上传的PDF文件
|
|
176
210
|
pdf_file_names = []
|
|
@@ -196,20 +230,21 @@ async def parse_pdf(
|
|
|
196
230
|
except Exception as e:
|
|
197
231
|
return JSONResponse(
|
|
198
232
|
status_code=400,
|
|
199
|
-
content={"error": f"Failed to load file: {str(e)}"}
|
|
233
|
+
content={"error": f"Failed to load file: {str(e)}"},
|
|
200
234
|
)
|
|
201
235
|
else:
|
|
202
236
|
return JSONResponse(
|
|
203
237
|
status_code=400,
|
|
204
|
-
content={"error": f"Unsupported file type: {file_suffix}"}
|
|
238
|
+
content={"error": f"Unsupported file type: {file_suffix}"},
|
|
205
239
|
)
|
|
206
240
|
|
|
207
|
-
|
|
208
241
|
# 设置语言列表,确保与文件数量一致
|
|
209
242
|
actual_lang_list = lang_list
|
|
210
243
|
if len(actual_lang_list) != len(pdf_file_names):
|
|
211
244
|
# 如果语言列表长度不匹配,使用第一个语言或默认"ch"
|
|
212
|
-
actual_lang_list = [
|
|
245
|
+
actual_lang_list = [
|
|
246
|
+
actual_lang_list[0] if actual_lang_list else "ch"
|
|
247
|
+
] * len(pdf_file_names)
|
|
213
248
|
|
|
214
249
|
# 调用异步处理函数
|
|
215
250
|
await aio_do_parse(
|
|
@@ -231,13 +266,15 @@ async def parse_pdf(
|
|
|
231
266
|
f_dump_content_list=return_content_list,
|
|
232
267
|
start_page_id=start_page_id,
|
|
233
268
|
end_page_id=end_page_id,
|
|
234
|
-
**config
|
|
269
|
+
**config,
|
|
235
270
|
)
|
|
236
271
|
|
|
237
272
|
# 根据 response_format_zip 决定返回类型
|
|
238
273
|
if response_format_zip:
|
|
239
274
|
zip_fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="mineru_results_")
|
|
240
275
|
os.close(zip_fd)
|
|
276
|
+
background_tasks.add_task(cleanup_file, zip_path)
|
|
277
|
+
|
|
241
278
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
|
242
279
|
for pdf_name in pdf_file_names:
|
|
243
280
|
safe_pdf_name = sanitize_filename(pdf_name)
|
|
@@ -247,7 +284,13 @@ async def parse_pdf(
|
|
|
247
284
|
elif backend.startswith("vlm"):
|
|
248
285
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
249
286
|
elif backend.startswith("hybrid"):
|
|
250
|
-
parse_dir = os.path.join(
|
|
287
|
+
parse_dir = os.path.join(
|
|
288
|
+
unique_dir, pdf_name, f"hybrid_{parse_method}"
|
|
289
|
+
)
|
|
290
|
+
else:
|
|
291
|
+
# 未知 backend,跳过此文件
|
|
292
|
+
logger.warning(f"Unknown backend type: {backend}, skipping {pdf_name}")
|
|
293
|
+
continue
|
|
251
294
|
|
|
252
295
|
if not os.path.exists(parse_dir):
|
|
253
296
|
continue
|
|
@@ -256,35 +299,63 @@ async def parse_pdf(
|
|
|
256
299
|
if return_md:
|
|
257
300
|
path = os.path.join(parse_dir, f"{pdf_name}.md")
|
|
258
301
|
if os.path.exists(path):
|
|
259
|
-
zf.write(
|
|
302
|
+
zf.write(
|
|
303
|
+
path,
|
|
304
|
+
arcname=os.path.join(
|
|
305
|
+
safe_pdf_name, f"{safe_pdf_name}.md"
|
|
306
|
+
),
|
|
307
|
+
)
|
|
260
308
|
|
|
261
309
|
if return_middle_json:
|
|
262
310
|
path = os.path.join(parse_dir, f"{pdf_name}_middle.json")
|
|
263
311
|
if os.path.exists(path):
|
|
264
|
-
zf.write(
|
|
312
|
+
zf.write(
|
|
313
|
+
path,
|
|
314
|
+
arcname=os.path.join(
|
|
315
|
+
safe_pdf_name, f"{safe_pdf_name}_middle.json"
|
|
316
|
+
),
|
|
317
|
+
)
|
|
265
318
|
|
|
266
319
|
if return_model_output:
|
|
267
320
|
path = os.path.join(parse_dir, f"{pdf_name}_model.json")
|
|
268
321
|
if os.path.exists(path):
|
|
269
|
-
zf.write(
|
|
322
|
+
zf.write(
|
|
323
|
+
path,
|
|
324
|
+
arcname=os.path.join(
|
|
325
|
+
safe_pdf_name, f"{safe_pdf_name}_model.json"
|
|
326
|
+
),
|
|
327
|
+
)
|
|
270
328
|
|
|
271
329
|
if return_content_list:
|
|
272
330
|
path = os.path.join(parse_dir, f"{pdf_name}_content_list.json")
|
|
273
331
|
if os.path.exists(path):
|
|
274
|
-
zf.write(
|
|
332
|
+
zf.write(
|
|
333
|
+
path,
|
|
334
|
+
arcname=os.path.join(
|
|
335
|
+
safe_pdf_name, f"{safe_pdf_name}_content_list.json"
|
|
336
|
+
),
|
|
337
|
+
)
|
|
275
338
|
|
|
276
339
|
# 写入图片
|
|
277
340
|
if return_images:
|
|
278
341
|
images_dir = os.path.join(parse_dir, "images")
|
|
279
|
-
image_paths = glob.glob(
|
|
342
|
+
image_paths = glob.glob(
|
|
343
|
+
os.path.join(glob.escape(images_dir), "*.jpg")
|
|
344
|
+
)
|
|
280
345
|
for image_path in image_paths:
|
|
281
|
-
zf.write(
|
|
346
|
+
zf.write(
|
|
347
|
+
image_path,
|
|
348
|
+
arcname=os.path.join(
|
|
349
|
+
safe_pdf_name,
|
|
350
|
+
"images",
|
|
351
|
+
os.path.basename(image_path),
|
|
352
|
+
),
|
|
353
|
+
)
|
|
282
354
|
|
|
283
355
|
return FileResponse(
|
|
284
356
|
path=zip_path,
|
|
285
357
|
media_type="application/zip",
|
|
286
358
|
filename="results.zip",
|
|
287
|
-
background=BackgroundTask(cleanup_file, zip_path)
|
|
288
359
|
)
|
|
289
360
|
else:
|
|
290
361
|
# 构建 JSON 结果
|
|
@@ -298,17 +369,31 @@ async def parse_pdf(
|
|
|
298
369
|
elif backend.startswith("vlm"):
|
|
299
370
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
300
371
|
elif backend.startswith("hybrid"):
|
|
301
|
-
parse_dir = os.path.join(
|
|
372
|
+
parse_dir = os.path.join(
|
|
373
|
+
unique_dir, pdf_name, f"hybrid_{parse_method}"
|
|
374
|
+
)
|
|
375
|
+
else:
|
|
376
|
+
# 未知 backend,跳过此文件
|
|
377
|
+
logger.warning(f"Unknown backend type: {backend}, skipping {pdf_name}")
|
|
378
|
+
continue
|
|
302
379
|
|
|
303
380
|
if os.path.exists(parse_dir):
|
|
304
381
|
if return_md:
|
|
305
|
-
data["md_content"] = get_infer_result(
|
|
382
|
+
data["md_content"] = get_infer_result(
|
|
383
|
+
".md", pdf_name, parse_dir
|
|
384
|
+
)
|
|
306
385
|
if return_middle_json:
|
|
307
|
-
data["middle_json"] = get_infer_result(
|
|
386
|
+
data["middle_json"] = get_infer_result(
|
|
387
|
+
"_middle.json", pdf_name, parse_dir
|
|
388
|
+
)
|
|
308
389
|
if return_model_output:
|
|
309
|
-
data["model_output"] = get_infer_result(
|
|
390
|
+
data["model_output"] = get_infer_result(
|
|
391
|
+
"_model.json", pdf_name, parse_dir
|
|
392
|
+
)
|
|
310
393
|
if return_content_list:
|
|
311
|
-
data["content_list"] = get_infer_result(
|
|
394
|
+
data["content_list"] = get_infer_result(
|
|
395
|
+
"_content_list.json", pdf_name, parse_dir
|
|
396
|
+
)
|
|
312
397
|
if return_images:
|
|
313
398
|
images_dir = os.path.join(parse_dir, "images")
|
|
314
399
|
safe_pattern = os.path.join(glob.escape(images_dir), "*.jpg")
|
|
@@ -325,24 +410,24 @@ async def parse_pdf(
|
|
|
325
410
|
content={
|
|
326
411
|
"backend": backend,
|
|
327
412
|
"version": __version__,
|
|
328
|
-
"results": result_dict
|
|
329
|
-
}
|
|
413
|
+
"results": result_dict,
|
|
414
|
+
},
|
|
330
415
|
)
|
|
331
416
|
except Exception as e:
|
|
332
417
|
logger.exception(e)
|
|
333
418
|
return JSONResponse(
|
|
334
|
-
status_code=500,
|
|
335
|
-
content={"error": f"Failed to process file: {str(e)}"}
|
|
419
|
+
status_code=500, content={"error": f"Failed to process file: {str(e)}"}
|
|
336
420
|
)
|
|
337
421
|
|
|
338
422
|
|
|
339
|
-
@click.command(
|
|
423
|
+
@click.command(
|
|
424
|
+
context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)
|
|
425
|
+
)
|
|
340
426
|
@click.pass_context
|
|
341
|
-
@click.option(
|
|
342
|
-
@click.option(
|
|
343
|
-
@click.option(
|
|
427
|
+
@click.option("--host", default="127.0.0.1", help="Server host (default: 127.0.0.1)")
|
|
428
|
+
@click.option("--port", default=8000, type=int, help="Server port (default: 8000)")
|
|
429
|
+
@click.option("--reload", is_flag=True, help="Enable auto-reload (development mode)")
|
|
344
430
|
def main(ctx, host, port, reload, **kwargs):
|
|
345
|
-
|
|
346
431
|
kwargs.update(arg_parse(ctx))
|
|
347
432
|
|
|
348
433
|
# 将配置参数存储到应用状态中
|
|
@@ -359,12 +444,7 @@ def main(ctx, host, port, reload, **kwargs):
|
|
|
359
444
|
print(f"Start MinerU FastAPI Service: http://{host}:{port}")
|
|
360
445
|
print(f"API documentation: http://{host}:{port}/docs")
|
|
361
446
|
|
|
362
|
-
uvicorn.run(
|
|
363
|
-
"mineru.cli.fast_api:app",
|
|
364
|
-
host=host,
|
|
365
|
-
port=port,
|
|
366
|
-
reload=reload
|
|
367
|
-
)
|
|
447
|
+
uvicorn.run("mineru.cli.fast_api:app", host=host, port=port, reload=reload)
|
|
368
448
|
|
|
369
449
|
|
|
370
450
|
if __name__ == "__main__":
|
mineru/model/vlm/vllm_server.py
CHANGED
|
@@ -56,17 +56,22 @@ def main():
|
|
|
56
56
|
model_path = auto_download_and_get_model_root_path("/", "vlm")
|
|
57
57
|
if (not has_logits_processors_arg) and custom_logits_processors:
|
|
58
58
|
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# musa vllm v1 引擎特殊配置
|
|
61
|
-
device = get_device()
|
|
62
|
-
if device.startswith("musa"):
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
61
|
+
# device = get_device()
|
|
62
|
+
# if device.startswith("musa"):
|
|
63
|
+
# import torch
|
|
64
|
+
# if torch.musa.is_available():
|
|
65
|
+
# if not has_block_size_arg:
|
|
66
|
+
# args.extend(["--block-size", "32"])
|
|
67
|
+
# if not has_compilation_config:
|
|
68
|
+
# args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
|
|
69
|
+
|
|
70
|
+
# corex vllm v1 引擎特殊配置
|
|
71
|
+
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
72
|
+
if device_type.lower() == "corex":
|
|
73
|
+
if not has_compilation_config:
|
|
74
|
+
args.extend(["--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'])
|
|
70
75
|
|
|
71
76
|
# 重构参数,将模型路径作为位置参数
|
|
72
77
|
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
|
mineru/utils/block_sort.py
CHANGED
|
@@ -198,6 +198,10 @@ def model_init(model_name: str):
|
|
|
198
198
|
if hasattr(torch, 'npu') and torch.npu.is_available():
|
|
199
199
|
if torch.npu.is_bf16_supported():
|
|
200
200
|
bf_16_support = True
|
|
201
|
+
elif device_name.startswith("mlu"):
|
|
202
|
+
if hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
203
|
+
if torch.mlu.is_bf16_supported():
|
|
204
|
+
bf_16_support = True
|
|
201
205
|
|
|
202
206
|
if model_name == 'layoutreader':
|
|
203
207
|
# 检测modelscope的缓存目录是否存在
|
mineru/utils/config_reader.py
CHANGED
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
包含两个MagicModel类中重复使用的方法和逻辑
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Dict, Any, Callable
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
5
7
|
from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
|
|
6
8
|
|
|
7
9
|
|
|
@@ -172,11 +174,15 @@ def tie_up_category_by_index(
|
|
|
172
174
|
get_subjects_func: Callable,
|
|
173
175
|
get_objects_func: Callable,
|
|
174
176
|
extract_subject_func: Callable = None,
|
|
175
|
-
extract_object_func: Callable = None
|
|
177
|
+
extract_object_func: Callable = None,
|
|
178
|
+
object_block_type: str = "object",
|
|
176
179
|
):
|
|
177
180
|
"""
|
|
178
181
|
基于index的类别关联方法,用于将主体对象与客体对象进行关联
|
|
179
|
-
客体优先匹配给index
|
|
182
|
+
客体优先匹配给index最接近的主体,匹配优先级为:
|
|
183
|
+
1. index差值(最高优先级)
|
|
184
|
+
2. bbox边缘距离(相邻边距离)
|
|
185
|
+
3. bbox中心点距离(最低优先级,作为最终tiebreaker)
|
|
180
186
|
|
|
181
187
|
参数:
|
|
182
188
|
get_subjects_func: 函数,提取主体对象
|
|
@@ -207,6 +213,29 @@ def tie_up_category_by_index(
|
|
|
207
213
|
"sub_idx": i,
|
|
208
214
|
}
|
|
209
215
|
|
|
216
|
+
# 提取所有客体的index集合,用于计算有效index差值
|
|
217
|
+
object_indices = set(obj["index"] for obj in objects)
|
|
218
|
+
|
|
219
|
+
def calc_effective_index_diff(obj_index: int, sub_index: int) -> int:
|
|
220
|
+
"""
|
|
221
|
+
计算有效的index差值
|
|
222
|
+
有效差值 = 绝对差值 - 区间内其他客体的数量
|
|
223
|
+
即:如果obj_index和sub_index之间的差值是由其他客体造成的,则应该扣除这部分差值
|
|
224
|
+
"""
|
|
225
|
+
if obj_index == sub_index:
|
|
226
|
+
return 0
|
|
227
|
+
|
|
228
|
+
start, end = min(obj_index, sub_index), max(obj_index, sub_index)
|
|
229
|
+
abs_diff = end - start
|
|
230
|
+
|
|
231
|
+
# 计算区间(start, end)内有多少个其他客体的index
|
|
232
|
+
other_objects_count = 0
|
|
233
|
+
for idx in range(start + 1, end):
|
|
234
|
+
if idx in object_indices:
|
|
235
|
+
other_objects_count += 1
|
|
236
|
+
|
|
237
|
+
return abs_diff - other_objects_count
|
|
238
|
+
|
|
210
239
|
# 为每个客体找到最匹配的主体
|
|
211
240
|
for obj in objects:
|
|
212
241
|
if len(subjects) == 0:
|
|
@@ -217,10 +246,10 @@ def tie_up_category_by_index(
|
|
|
217
246
|
min_index_diff = float("inf")
|
|
218
247
|
best_subject_indices = []
|
|
219
248
|
|
|
220
|
-
#
|
|
249
|
+
# 找出有效index差值最小的所有主体
|
|
221
250
|
for i, subject in enumerate(subjects):
|
|
222
251
|
sub_index = subject["index"]
|
|
223
|
-
index_diff =
|
|
252
|
+
index_diff = calc_effective_index_diff(obj_index, sub_index)
|
|
224
253
|
|
|
225
254
|
if index_diff < min_index_diff:
|
|
226
255
|
min_index_diff = index_diff
|
|
@@ -228,18 +257,37 @@ def tie_up_category_by_index(
|
|
|
228
257
|
elif index_diff == min_index_diff:
|
|
229
258
|
best_subject_indices.append(i)
|
|
230
259
|
|
|
231
|
-
|
|
232
|
-
if len(best_subject_indices) > 1:
|
|
233
|
-
min_center_dist = float("inf")
|
|
260
|
+
if len(best_subject_indices) == 1:
|
|
234
261
|
best_subject_idx = best_subject_indices[0]
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
262
|
+
# 如果有多个主体的index差值相同(最多两个),根据边缘距离进行筛选
|
|
263
|
+
elif len(best_subject_indices) == 2:
|
|
264
|
+
# 计算所有候选主体的边缘距离
|
|
265
|
+
edge_distances = [(idx, bbox_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
|
266
|
+
edge_dist_diff = abs(edge_distances[0][1] - edge_distances[1][1])
|
|
267
|
+
|
|
268
|
+
for idx, edge_dist in edge_distances:
|
|
269
|
+
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Edge distance: {edge_dist}")
|
|
270
|
+
|
|
271
|
+
if edge_dist_diff > 2:
|
|
272
|
+
# 边缘距离差值大于2,匹配边缘距离更小的主体
|
|
273
|
+
best_subject_idx = min(edge_distances, key=lambda x: x[1])[0]
|
|
274
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff > 2, matching to subject with min edge distance, index: {subjects[best_subject_idx]['index']}")
|
|
275
|
+
elif object_block_type == "table_caption":
|
|
276
|
+
# 边缘距离差值<=2且为table_caption,匹配index更大的主体
|
|
277
|
+
best_subject_idx = max(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
|
278
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and table_caption, matching to later subject with index: {subjects[best_subject_idx]['index']}")
|
|
279
|
+
elif object_block_type.endswith("footnote"):
|
|
280
|
+
# 边缘距离差值<=2且为footnote,匹配index更小的主体
|
|
281
|
+
best_subject_idx = min(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
|
282
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and footnote, matching to earlier subject with index: {subjects[best_subject_idx]['index']}")
|
|
283
|
+
else:
|
|
284
|
+
# 边缘距离差值<=2 且不适用特殊匹配规则,使用中心点距离匹配
|
|
285
|
+
center_distances = [(idx, bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
|
286
|
+
for idx, center_dist in center_distances:
|
|
287
|
+
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Center distance: {center_dist}")
|
|
288
|
+
best_subject_idx = min(center_distances, key=lambda x: x[1])[0]
|
|
241
289
|
else:
|
|
242
|
-
|
|
290
|
+
raise ValueError("More than two subjects have the same minimal index difference, which is unexpected.")
|
|
243
291
|
|
|
244
292
|
# 将客体添加到最佳主体的obj_bboxes中
|
|
245
293
|
result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
|
mineru/utils/model_utils.py
CHANGED
|
@@ -429,6 +429,9 @@ def clean_memory(device='cuda'):
|
|
|
429
429
|
elif str(device).startswith("musa"):
|
|
430
430
|
if torch.musa.is_available():
|
|
431
431
|
torch.musa.empty_cache()
|
|
432
|
+
elif str(device).startswith("mlu"):
|
|
433
|
+
if torch.mlu.is_available():
|
|
434
|
+
torch.mlu.empty_cache()
|
|
432
435
|
gc.collect()
|
|
433
436
|
|
|
434
437
|
|
|
@@ -470,5 +473,8 @@ def get_vram(device) -> int:
|
|
|
470
473
|
elif str(device).startswith("musa"):
|
|
471
474
|
if torch.musa.is_available():
|
|
472
475
|
total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
476
|
+
elif str(device).startswith("mlu"):
|
|
477
|
+
if torch.mlu.is_available():
|
|
478
|
+
total_memory = round(torch.mlu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
473
479
|
|
|
474
480
|
return total_memory
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.
|
|
1
|
+
__version__ = "2.7.4"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.7.
|
|
3
|
+
Version: 2.7.4
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -135,16 +135,21 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
-
- 2026/01/23 2.7.2 Release
|
|
139
|
-
- Added support for domestic computing platforms
|
|
138
|
+
- 2026/01/30 2.7.4 Release
|
|
139
|
+
- Added support for domestic computing platforms IluvatarCorex and Cambricon. Currently, the officially supported domestic computing platforms include:
|
|
140
140
|
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
|
|
141
141
|
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
|
|
142
142
|
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
|
|
143
143
|
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
|
144
144
|
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
|
145
145
|
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
|
146
|
+
- [IluvatarCorex](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/IluvatarCorex/)
|
|
147
|
+
- [Cambricon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Cambricon/)
|
|
146
148
|
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
|
|
147
|
-
|
|
149
|
+
|
|
150
|
+
- 2026/01/23 2.7.2 Release
|
|
151
|
+
- Added support for domestic computing platforms Hygon, Enflame, and Moore Threads.
|
|
152
|
+
- Cross-page table merging optimization, improving merge success rate and merge quality.
|
|
148
153
|
|
|
149
154
|
- 2026/01/06 2.7.1 Release
|
|
150
155
|
- fix bug: #4300
|
|
@@ -1,30 +1,30 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=yLdxKZXyzrDqew_33G4dvZoqgGxRCyEx9vhYW3y2Je4,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
6
6
|
mineru/backend/hybrid/hybrid_analyze.py,sha256=Sckw6T-pvMv3V_nqZkBeW8kY4zNIBlWxqeS2vXqNqtY,20939
|
|
7
|
-
mineru/backend/hybrid/hybrid_magic_model.py,sha256=
|
|
7
|
+
mineru/backend/hybrid/hybrid_magic_model.py,sha256=_DvBq5WP_UZvmHfhZloxqv-MKoWWe_ye1kNLv6RA5rU,24713
|
|
8
8
|
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
|
|
9
9
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
10
10
|
mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
|
|
11
|
-
mineru/backend/pipeline/model_init.py,sha256=
|
|
11
|
+
mineru/backend/pipeline/model_init.py,sha256=2DHYwqrWXtK3v6u5EfoFHZqfUNE00MLfzuEGh-OhoBg,12609
|
|
12
12
|
mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
|
|
13
13
|
mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
|
|
14
14
|
mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
|
|
15
|
-
mineru/backend/pipeline/pipeline_analyze.py,sha256=
|
|
15
|
+
mineru/backend/pipeline/pipeline_analyze.py,sha256=82XH7hVynuD_nuk-v7a_zhx_3Z_MHS31sIurQ0lHmXQ,6737
|
|
16
16
|
mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
|
|
17
17
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
|
|
18
18
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
19
19
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
20
|
-
mineru/backend/vlm/utils.py,sha256=
|
|
21
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
22
|
-
mineru/backend/vlm/vlm_magic_model.py,sha256=
|
|
20
|
+
mineru/backend/vlm/utils.py,sha256=PIYqOStLCZlxU9TiZK4EhP90rPYIe_0thEZeP01YPls,3940
|
|
21
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=_2-xJC2C2rT87lZw8JZfC6PFFY0FfEbM9PK2TOkIJao,15604
|
|
22
|
+
mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
25
25
|
mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
|
|
26
26
|
mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
|
|
27
|
-
mineru/cli/fast_api.py,sha256=
|
|
27
|
+
mineru/cli/fast_api.py,sha256=Zgbp8giikcuOngZalmzrsfUMrdKnOd9TAAZrMfGQWXs,18664
|
|
28
28
|
mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
|
|
29
29
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
30
30
|
mineru/cli/vlm_server.py,sha256=27HaqO3wpMXSA_nA3CC6JOBTHK3q66SP00cD6m9HuQE,1974
|
|
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
|
|
|
151
151
|
mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
|
|
152
152
|
mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
153
|
mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
|
|
154
|
-
mineru/model/vlm/vllm_server.py,sha256=
|
|
154
|
+
mineru/model/vlm/vllm_server.py,sha256=7taySlBANFBTS81Q8PJ6gJWjTgjnFQoGvMknK4NuyLY,3360
|
|
155
155
|
mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
|
|
156
156
|
mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
|
157
157
|
mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
158
158
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
159
|
-
mineru/utils/block_sort.py,sha256=
|
|
159
|
+
mineru/utils/block_sort.py,sha256=e6nNjdUeRixT70OfvlEzM1FGwKxFSVwiLtwYGrsG_U0,13724
|
|
160
160
|
mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
|
|
161
161
|
mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
|
|
162
162
|
mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
|
|
163
163
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
164
|
-
mineru/utils/config_reader.py,sha256=
|
|
164
|
+
mineru/utils/config_reader.py,sha256=mwXYVuj52mA__2BU2qOPP0Pn9m0dDLi4mAqPS9a4Pjo,4575
|
|
165
165
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
166
166
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
167
167
|
mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
|
|
@@ -171,8 +171,8 @@ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6o
|
|
|
171
171
|
mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
|
|
172
172
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
173
173
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
174
|
-
mineru/utils/magic_model_utils.py,sha256=
|
|
175
|
-
mineru/utils/model_utils.py,sha256=
|
|
174
|
+
mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
|
|
175
|
+
mineru/utils/model_utils.py,sha256=xlw5hUYKa6o1NiM8PoXO1HFvHfrgY5e4Ut_upGEY9yI,19909
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
178
|
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
|
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
187
|
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
188
|
+
mineru-2.7.4.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.4.dist-info/METADATA,sha256=lNxDREB_s7eDnknMUeBn5FCgtDc8qPQS-hEs4Wb6WTg,36928
|
|
190
|
+
mineru-2.7.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
191
|
+
mineru-2.7.4.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
|
|
192
|
+
mineru-2.7.4.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|