mineru 2.7.4__py3-none-any.whl → 2.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/vlm/utils.py +128 -1
- mineru/backend/vlm/vlm_analyze.py +3 -44
- mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py +7 -1
- mineru/model/vlm/vllm_server.py +3 -23
- mineru/utils/block_sort.py +4 -0
- mineru/utils/config_reader.py +6 -1
- mineru/utils/model_utils.py +6 -0
- mineru/utils/os_env_config.py +5 -0
- mineru/utils/pdf_image_tools.py +73 -25
- mineru/version.py +1 -1
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/METADATA +12 -7
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/RECORD +16 -16
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/WHEEL +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.6.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/utils.py
CHANGED
|
@@ -24,6 +24,9 @@ def enable_custom_logits_processors() -> bool:
|
|
|
24
24
|
compute_capability = "8.0"
|
|
25
25
|
elif hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
26
26
|
compute_capability = "8.0"
|
|
27
|
+
elif hasattr(torch, 'sdaa') and torch.sdaa.is_available():
|
|
28
|
+
compute_capability = "8.0"
|
|
29
|
+
|
|
27
30
|
else:
|
|
28
31
|
logger.info("CUDA not available, disabling custom_logits_processors")
|
|
29
32
|
return False
|
|
@@ -102,4 +105,128 @@ def set_default_batch_size() -> int:
|
|
|
102
105
|
except Exception as e:
|
|
103
106
|
logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
|
|
104
107
|
batch_size = 1
|
|
105
|
-
return batch_size
|
|
108
|
+
return batch_size
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _get_device_config(device_type: str) -> dict | None:
|
|
112
|
+
"""获取不同设备类型的配置参数"""
|
|
113
|
+
|
|
114
|
+
# 各设备类型的配置定义
|
|
115
|
+
DEVICE_CONFIGS = {
|
|
116
|
+
# "musa": {
|
|
117
|
+
# "compilation_config_dict": {
|
|
118
|
+
# "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
119
|
+
# "simple_cuda_graph": True
|
|
120
|
+
# },
|
|
121
|
+
# "block_size": 32,
|
|
122
|
+
# },
|
|
123
|
+
"corex": {
|
|
124
|
+
"compilation_config_dict": {
|
|
125
|
+
"cudagraph_mode": "FULL_DECODE_ONLY",
|
|
126
|
+
"level": 0
|
|
127
|
+
},
|
|
128
|
+
},
|
|
129
|
+
"kxpu": {
|
|
130
|
+
"compilation_config_dict": {
|
|
131
|
+
"splitting_ops": [
|
|
132
|
+
"vllm.unified_attention", "vllm.unified_attention_with_output",
|
|
133
|
+
"vllm.unified_attention_with_output_kunlun", "vllm.mamba_mixer2",
|
|
134
|
+
"vllm.mamba_mixer", "vllm.short_conv", "vllm.linear_attention",
|
|
135
|
+
"vllm.plamo2_mamba_mixer", "vllm.gdn_attention", "vllm.sparse_attn_indexer"
|
|
136
|
+
]
|
|
137
|
+
},
|
|
138
|
+
"block_size": 128,
|
|
139
|
+
"dtype": "float16",
|
|
140
|
+
"distributed_executor_backend": "mp",
|
|
141
|
+
"enable_chunked_prefill": False,
|
|
142
|
+
"enable_prefix_caching": False,
|
|
143
|
+
},
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
return DEVICE_CONFIGS.get(device_type.lower())
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _check_server_arg_exists(args: list, arg_name: str) -> bool:
|
|
150
|
+
"""检查命令行参数列表中是否已存在指定参数"""
|
|
151
|
+
return any(arg == f"--{arg_name}" or arg.startswith(f"--{arg_name}=") for arg in args)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _add_server_arg_if_missing(args: list, arg_name: str, value: str) -> None:
|
|
155
|
+
"""如果参数不存在,则添加到命令行参数列表"""
|
|
156
|
+
if not _check_server_arg_exists(args, arg_name):
|
|
157
|
+
args.extend([f"--{arg_name}", value])
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _add_server_flag_if_missing(args: list, flag_name: str) -> None:
|
|
161
|
+
"""如果 flag 不存在,则添加到命令行参数列表"""
|
|
162
|
+
if not _check_server_arg_exists(args, flag_name):
|
|
163
|
+
args.append(f"--{flag_name}")
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _add_engine_kwarg_if_missing(kwargs: dict, key: str, value) -> None:
|
|
167
|
+
"""如果参数不存在,则添加到 kwargs 字典"""
|
|
168
|
+
if key not in kwargs:
|
|
169
|
+
kwargs[key] = value
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def mod_kwargs_by_device_type(kwargs_or_args: dict | list, vllm_mode: str) -> dict | list:
|
|
173
|
+
"""根据设备类型修改 vllm 配置参数
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
kwargs_or_args: 配置参数,server 模式为 list,engine 模式为 dict
|
|
177
|
+
vllm_mode: vllm 运行模式 ("server", "sync_engine", "async_engine")
|
|
178
|
+
|
|
179
|
+
Returns:
|
|
180
|
+
修改后的配置参数
|
|
181
|
+
"""
|
|
182
|
+
device_type = os.getenv("MINERU_VLLM_DEVICE", "")
|
|
183
|
+
config = _get_device_config(device_type)
|
|
184
|
+
|
|
185
|
+
if config is None:
|
|
186
|
+
return kwargs_or_args
|
|
187
|
+
|
|
188
|
+
if vllm_mode == "server":
|
|
189
|
+
_apply_server_config(kwargs_or_args, config)
|
|
190
|
+
else:
|
|
191
|
+
_apply_engine_config(kwargs_or_args, config, vllm_mode)
|
|
192
|
+
|
|
193
|
+
return kwargs_or_args
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _apply_server_config(args: list, config: dict) -> None:
|
|
197
|
+
"""应用 server 模式的配置"""
|
|
198
|
+
import json
|
|
199
|
+
|
|
200
|
+
for key, value in config.items():
|
|
201
|
+
if key == "compilation_config_dict":
|
|
202
|
+
_add_server_arg_if_missing(
|
|
203
|
+
args, "compilation-config",
|
|
204
|
+
json.dumps(value, separators=(',', ':'))
|
|
205
|
+
)
|
|
206
|
+
else:
|
|
207
|
+
# 转换 key 格式: block_size -> block-size
|
|
208
|
+
arg_name = key.replace("_", "-")
|
|
209
|
+
if arg_name in {"enable-chunked-prefill", "enable-prefix-caching"} and value is False:
|
|
210
|
+
_add_server_flag_if_missing(args, f"no-{arg_name}")
|
|
211
|
+
continue
|
|
212
|
+
_add_server_arg_if_missing(args, arg_name, str(value))
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _apply_engine_config(kwargs: dict, config: dict, vllm_mode: str) -> None:
|
|
216
|
+
"""应用 engine 模式的配置"""
|
|
217
|
+
try:
|
|
218
|
+
from vllm.config import CompilationConfig
|
|
219
|
+
except ImportError:
|
|
220
|
+
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
221
|
+
|
|
222
|
+
for key, value in config.items():
|
|
223
|
+
if key == "compilation_config_dict":
|
|
224
|
+
if vllm_mode == "sync_engine":
|
|
225
|
+
compilation_config = value
|
|
226
|
+
elif vllm_mode == "async_engine":
|
|
227
|
+
compilation_config = CompilationConfig(**value)
|
|
228
|
+
else:
|
|
229
|
+
continue
|
|
230
|
+
_add_engine_kwarg_if_missing(kwargs, "compilation_config", compilation_config)
|
|
231
|
+
else:
|
|
232
|
+
_add_engine_kwarg_if_missing(kwargs, key, value)
|
|
@@ -6,7 +6,7 @@ import json
|
|
|
6
6
|
from loguru import logger
|
|
7
7
|
|
|
8
8
|
from .utils import enable_custom_logits_processors, set_default_gpu_memory_utilization, set_default_batch_size, \
|
|
9
|
-
set_lmdeploy_backend
|
|
9
|
+
set_lmdeploy_backend, mod_kwargs_by_device_type
|
|
10
10
|
from .model_output_to_middle_json import result_to_middle_json
|
|
11
11
|
from ...data.data_reader_writer import DataWriter
|
|
12
12
|
from mineru.utils.pdf_image_tools import load_images_from_pdf
|
|
@@ -101,27 +101,7 @@ class ModelSingleton:
|
|
|
101
101
|
except ImportError:
|
|
102
102
|
raise ImportError("Please install vllm to use the vllm-engine backend.")
|
|
103
103
|
|
|
104
|
-
|
|
105
|
-
# device = get_device()
|
|
106
|
-
# if device_type.startswith("musa"):
|
|
107
|
-
# import torch
|
|
108
|
-
# if torch.musa.is_available():
|
|
109
|
-
# compilation_config = {
|
|
110
|
-
# "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
111
|
-
# "simple_cuda_graph": True
|
|
112
|
-
# }
|
|
113
|
-
# block_size = 32
|
|
114
|
-
# kwargs["compilation_config"] = compilation_config
|
|
115
|
-
# kwargs["block_size"] = block_size
|
|
116
|
-
|
|
117
|
-
# corex vllm v1 引擎特殊配置
|
|
118
|
-
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
119
|
-
if device_type.lower() == "corex":
|
|
120
|
-
compilation_config = {
|
|
121
|
-
"cudagraph_mode": "FULL_DECODE_ONLY",
|
|
122
|
-
"level": 0
|
|
123
|
-
}
|
|
124
|
-
kwargs["compilation_config"] = compilation_config
|
|
104
|
+
kwargs = mod_kwargs_by_device_type(kwargs, vllm_mode="sync_engine")
|
|
125
105
|
|
|
126
106
|
if "compilation_config" in kwargs:
|
|
127
107
|
if isinstance(kwargs["compilation_config"], str):
|
|
@@ -148,28 +128,7 @@ class ModelSingleton:
|
|
|
148
128
|
except ImportError:
|
|
149
129
|
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
150
130
|
|
|
151
|
-
|
|
152
|
-
# musa vllm v1 引擎特殊配置
|
|
153
|
-
# device = get_device()
|
|
154
|
-
# if device.startswith("musa"):
|
|
155
|
-
# import torch
|
|
156
|
-
# if torch.musa.is_available():
|
|
157
|
-
# compilation_config = CompilationConfig(
|
|
158
|
-
# cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
159
|
-
# simple_cuda_graph=True
|
|
160
|
-
# )
|
|
161
|
-
# block_size = 32
|
|
162
|
-
# kwargs["compilation_config"] = compilation_config
|
|
163
|
-
# kwargs["block_size"] = block_size
|
|
164
|
-
|
|
165
|
-
# corex vllm v1 引擎特殊配置
|
|
166
|
-
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
167
|
-
if device_type.lower() == "corex":
|
|
168
|
-
compilation_config = CompilationConfig(
|
|
169
|
-
cudagraph_mode="FULL_DECODE_ONLY",
|
|
170
|
-
level=0
|
|
171
|
-
)
|
|
172
|
-
kwargs["compilation_config"] = compilation_config
|
|
131
|
+
kwargs = mod_kwargs_by_device_type(kwargs, vllm_mode="async_engine")
|
|
173
132
|
|
|
174
133
|
if "compilation_config" in kwargs:
|
|
175
134
|
if isinstance(kwargs["compilation_config"], dict):
|
|
@@ -89,7 +89,11 @@ class FormulaRecognizer(BaseOCRV20):
|
|
|
89
89
|
return rec_formula
|
|
90
90
|
|
|
91
91
|
def batch_predict(
|
|
92
|
-
self,
|
|
92
|
+
self,
|
|
93
|
+
images_mfd_res: list,
|
|
94
|
+
images: list,
|
|
95
|
+
batch_size: int = 64,
|
|
96
|
+
interline_enable: bool = True,
|
|
93
97
|
) -> list:
|
|
94
98
|
images_formula_list = []
|
|
95
99
|
mf_image_list = []
|
|
@@ -105,6 +109,8 @@ class FormulaRecognizer(BaseOCRV20):
|
|
|
105
109
|
for idx, (xyxy, conf, cla) in enumerate(
|
|
106
110
|
zip(mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls)
|
|
107
111
|
):
|
|
112
|
+
if not interline_enable and cla.item() == 1:
|
|
113
|
+
continue # Skip interline regions if not enabled
|
|
108
114
|
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
|
|
109
115
|
new_item = {
|
|
110
116
|
"category_id": 13 + int(cla.item()),
|
mineru/model/vlm/vllm_server.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
|
-
from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
|
|
5
|
-
|
|
4
|
+
from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors, \
|
|
5
|
+
mod_kwargs_by_device_type
|
|
6
6
|
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
|
|
7
7
|
|
|
8
8
|
from vllm.entrypoints.cli.main import main as vllm_main
|
|
@@ -14,8 +14,6 @@ def main():
|
|
|
14
14
|
has_port_arg = False
|
|
15
15
|
has_gpu_memory_utilization_arg = False
|
|
16
16
|
has_logits_processors_arg = False
|
|
17
|
-
has_block_size_arg = False
|
|
18
|
-
has_compilation_config = False
|
|
19
17
|
model_path = None
|
|
20
18
|
model_arg_indices = []
|
|
21
19
|
|
|
@@ -27,10 +25,6 @@ def main():
|
|
|
27
25
|
has_gpu_memory_utilization_arg = True
|
|
28
26
|
if arg == "--logits-processors" or arg.startswith("--logits-processors="):
|
|
29
27
|
has_logits_processors_arg = True
|
|
30
|
-
if arg == "--block-size" or arg.startswith("--block-size="):
|
|
31
|
-
has_block_size_arg = True
|
|
32
|
-
if arg == "--compilation-config" or arg.startswith("--compilation-config="):
|
|
33
|
-
has_compilation_config = True
|
|
34
28
|
if arg == "--model":
|
|
35
29
|
if i + 1 < len(args):
|
|
36
30
|
model_path = args[i + 1]
|
|
@@ -57,21 +51,7 @@ def main():
|
|
|
57
51
|
if (not has_logits_processors_arg) and custom_logits_processors:
|
|
58
52
|
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
|
|
59
53
|
|
|
60
|
-
|
|
61
|
-
# device = get_device()
|
|
62
|
-
# if device.startswith("musa"):
|
|
63
|
-
# import torch
|
|
64
|
-
# if torch.musa.is_available():
|
|
65
|
-
# if not has_block_size_arg:
|
|
66
|
-
# args.extend(["--block-size", "32"])
|
|
67
|
-
# if not has_compilation_config:
|
|
68
|
-
# args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
|
|
69
|
-
|
|
70
|
-
# corex vllm v1 引擎特殊配置
|
|
71
|
-
device_type = os.getenv("MINERU_LMDEPLOY_DEVICE", "")
|
|
72
|
-
if device_type.lower() == "corex":
|
|
73
|
-
if not has_compilation_config:
|
|
74
|
-
args.extend(["--compilation-config", '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'])
|
|
54
|
+
args = mod_kwargs_by_device_type(args, vllm_mode="server")
|
|
75
55
|
|
|
76
56
|
# 重构参数,将模型路径作为位置参数
|
|
77
57
|
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
|
mineru/utils/block_sort.py
CHANGED
|
@@ -202,6 +202,10 @@ def model_init(model_name: str):
|
|
|
202
202
|
if hasattr(torch, 'mlu') and torch.mlu.is_available():
|
|
203
203
|
if torch.mlu.is_bf16_supported():
|
|
204
204
|
bf_16_support = True
|
|
205
|
+
elif device_name.startswith("sdaa"):
|
|
206
|
+
if hasattr(torch, 'sdaa') and torch.sdaa.is_available():
|
|
207
|
+
if torch.sdaa.is_bf16_supported():
|
|
208
|
+
bf_16_support = True
|
|
205
209
|
|
|
206
210
|
if model_name == 'layoutreader':
|
|
207
211
|
# 检测modelscope的缓存目录是否存在
|
mineru/utils/config_reader.py
CHANGED
mineru/utils/model_utils.py
CHANGED
|
@@ -432,6 +432,9 @@ def clean_memory(device='cuda'):
|
|
|
432
432
|
elif str(device).startswith("mlu"):
|
|
433
433
|
if torch.mlu.is_available():
|
|
434
434
|
torch.mlu.empty_cache()
|
|
435
|
+
elif str(device).startswith("sdaa"):
|
|
436
|
+
if torch.sdaa.is_available():
|
|
437
|
+
torch.sdaa.empty_cache()
|
|
435
438
|
gc.collect()
|
|
436
439
|
|
|
437
440
|
|
|
@@ -476,5 +479,8 @@ def get_vram(device) -> int:
|
|
|
476
479
|
elif str(device).startswith("mlu"):
|
|
477
480
|
if torch.mlu.is_available():
|
|
478
481
|
total_memory = round(torch.mlu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
482
|
+
elif str(device).startswith("sdaa"):
|
|
483
|
+
if torch.sdaa.is_available():
|
|
484
|
+
total_memory = round(torch.sdaa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
479
485
|
|
|
480
486
|
return total_memory
|
mineru/utils/os_env_config.py
CHANGED
|
@@ -11,6 +11,11 @@ def get_load_images_timeout() -> int:
|
|
|
11
11
|
return get_value_from_string(env_value, 300)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def get_load_images_threads() -> int:
|
|
15
|
+
env_value = os.getenv('MINERU_PDF_RENDER_THREADS', None)
|
|
16
|
+
return get_value_from_string(env_value, 4)
|
|
17
|
+
|
|
18
|
+
|
|
14
19
|
def get_value_from_string(env_value: str, default_value: int) -> int:
|
|
15
20
|
if env_value is not None:
|
|
16
21
|
try:
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import os
|
|
3
|
+
import signal
|
|
4
|
+
import time
|
|
3
5
|
from io import BytesIO
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
@@ -9,13 +11,13 @@ from PIL import Image, ImageOps
|
|
|
9
11
|
|
|
10
12
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
13
|
from mineru.utils.check_sys_env import is_windows_environment
|
|
12
|
-
from mineru.utils.os_env_config import get_load_images_timeout
|
|
14
|
+
from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads
|
|
13
15
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
|
14
16
|
from mineru.utils.enum_class import ImageType
|
|
15
17
|
from mineru.utils.hash_utils import str_sha256
|
|
16
18
|
from mineru.utils.pdf_page_id import get_end_page_id
|
|
17
19
|
|
|
18
|
-
from concurrent.futures import ProcessPoolExecutor,
|
|
20
|
+
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
|
@@ -57,7 +59,7 @@ def load_images_from_pdf(
|
|
|
57
59
|
end_page_id=None,
|
|
58
60
|
image_type=ImageType.PIL,
|
|
59
61
|
timeout=None,
|
|
60
|
-
threads=
|
|
62
|
+
threads=None,
|
|
61
63
|
):
|
|
62
64
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
63
65
|
|
|
@@ -67,8 +69,8 @@ def load_images_from_pdf(
|
|
|
67
69
|
start_page_id (int, optional): 起始页码. Defaults to 0.
|
|
68
70
|
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
|
69
71
|
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
|
70
|
-
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量
|
|
71
|
-
threads (int):
|
|
72
|
+
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
|
|
73
|
+
threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4.
|
|
72
74
|
|
|
73
75
|
Raises:
|
|
74
76
|
TimeoutError: 当转换超时时抛出
|
|
@@ -86,6 +88,9 @@ def load_images_from_pdf(
|
|
|
86
88
|
else:
|
|
87
89
|
if timeout is None:
|
|
88
90
|
timeout = get_load_images_timeout()
|
|
91
|
+
if threads is None:
|
|
92
|
+
threads = get_load_images_threads()
|
|
93
|
+
|
|
89
94
|
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
|
90
95
|
|
|
91
96
|
# 计算总页数
|
|
@@ -108,11 +113,13 @@ def load_images_from_pdf(
|
|
|
108
113
|
|
|
109
114
|
page_ranges.append((range_start, range_end))
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
|
112
117
|
|
|
113
|
-
|
|
118
|
+
executor = ProcessPoolExecutor(max_workers=actual_threads)
|
|
119
|
+
try:
|
|
114
120
|
# 提交所有任务
|
|
115
121
|
futures = []
|
|
122
|
+
future_to_range = {}
|
|
116
123
|
for range_start, range_end in page_ranges:
|
|
117
124
|
future = executor.submit(
|
|
118
125
|
_load_images_from_pdf_worker,
|
|
@@ -122,27 +129,68 @@ def load_images_from_pdf(
|
|
|
122
129
|
range_end,
|
|
123
130
|
image_type,
|
|
124
131
|
)
|
|
125
|
-
futures.append(
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
all_results.sort(key=lambda x: x[0])
|
|
136
|
-
images_list = []
|
|
137
|
-
for _, imgs in all_results:
|
|
138
|
-
images_list.extend(imgs)
|
|
139
|
-
|
|
140
|
-
return images_list, pdf_doc
|
|
141
|
-
except FuturesTimeoutError:
|
|
132
|
+
futures.append(future)
|
|
133
|
+
future_to_range[future] = range_start
|
|
134
|
+
|
|
135
|
+
# 使用 wait() 设置单一全局超时
|
|
136
|
+
done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
|
|
137
|
+
|
|
138
|
+
# 检查是否有未完成的任务(超时情况)
|
|
139
|
+
if not_done:
|
|
140
|
+
# 超时:强制终止所有子进程
|
|
141
|
+
_terminate_executor_processes(executor)
|
|
142
142
|
pdf_doc.close()
|
|
143
|
-
executor.shutdown(wait=False, cancel_futures=True)
|
|
144
143
|
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
|
145
144
|
|
|
145
|
+
# 所有任务完成,收集结果
|
|
146
|
+
all_results = []
|
|
147
|
+
for future in futures:
|
|
148
|
+
range_start = future_to_range[future]
|
|
149
|
+
# 这里不需要 timeout,因为任务已完成
|
|
150
|
+
images_list = future.result()
|
|
151
|
+
all_results.append((range_start, images_list))
|
|
152
|
+
|
|
153
|
+
# 按起始页码排序并合并结果
|
|
154
|
+
all_results.sort(key=lambda x: x[0])
|
|
155
|
+
images_list = []
|
|
156
|
+
for _, imgs in all_results:
|
|
157
|
+
images_list.extend(imgs)
|
|
158
|
+
|
|
159
|
+
return images_list, pdf_doc
|
|
160
|
+
|
|
161
|
+
except Exception as e:
|
|
162
|
+
# 发生任何异常时,确保清理子进程
|
|
163
|
+
_terminate_executor_processes(executor)
|
|
164
|
+
pdf_doc.close()
|
|
165
|
+
if isinstance(e, TimeoutError):
|
|
166
|
+
raise
|
|
167
|
+
raise
|
|
168
|
+
finally:
|
|
169
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _terminate_executor_processes(executor):
|
|
173
|
+
"""强制终止 ProcessPoolExecutor 中的所有子进程"""
|
|
174
|
+
if hasattr(executor, '_processes'):
|
|
175
|
+
for pid, process in executor._processes.items():
|
|
176
|
+
if process.is_alive():
|
|
177
|
+
try:
|
|
178
|
+
# 先发送 SIGTERM 允许优雅退出
|
|
179
|
+
os.kill(pid, signal.SIGTERM)
|
|
180
|
+
except (ProcessLookupError, OSError):
|
|
181
|
+
pass
|
|
182
|
+
|
|
183
|
+
# 给子进程一点时间响应 SIGTERM
|
|
184
|
+
time.sleep(0.1)
|
|
185
|
+
|
|
186
|
+
# 对仍然存活的进程发送 SIGKILL 强制终止
|
|
187
|
+
for pid, process in executor._processes.items():
|
|
188
|
+
if process.is_alive():
|
|
189
|
+
try:
|
|
190
|
+
os.kill(pid, signal.SIGKILL)
|
|
191
|
+
except (ProcessLookupError, OSError):
|
|
192
|
+
pass
|
|
193
|
+
|
|
146
194
|
|
|
147
195
|
def load_images_from_pdf_core(
|
|
148
196
|
pdf_bytes: bytes,
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.4"
|
|
1
|
+
__version__ = "2.7.6"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.7.4
|
|
3
|
+
Version: 2.7.6
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -135,17 +135,22 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
-
- 2026/01/30 2.7.4 Release
|
|
139
|
-
- Added support for domestic computing platforms
|
|
140
|
-
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
|
|
141
|
-
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
|
|
142
|
-
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
|
|
138
|
+
- 2026/02/06 2.7.6 Release
|
|
139
|
+
- Added support for the domestic computing platforms Kunlunxin and Tecorigin; currently, the domestic computing platforms that have been adapted and supported by the official team and vendors include:
|
|
140
|
+
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend)
|
|
141
|
+
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead)
|
|
142
|
+
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX)
|
|
143
143
|
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
|
144
144
|
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
|
145
145
|
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
|
146
146
|
- [IluvatarCorex](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/IluvatarCorex/)
|
|
147
147
|
- [Cambricon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Cambricon/)
|
|
148
|
-
|
|
148
|
+
- [Kunlunxin](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Kunlunxin/)
|
|
149
|
+
- [Tecorigin](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Tecorigin/)
|
|
150
|
+
- MinerU continues to support domestic hardware platforms and mainstream chip architectures. With secure and reliable technology, it helps research, government, and enterprise users reach new heights in document digitization!
|
|
151
|
+
|
|
152
|
+
- 2026/01/30 2.7.4 Release
|
|
153
|
+
- Added support for domestic computing platforms IluvatarCorex and Cambricon.
|
|
149
154
|
|
|
150
155
|
- 2026/01/23 2.7.2 Release
|
|
151
156
|
- Added support for domestic computing platforms Hygon, Enflame, and Moore Threads.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=6xG2XfctNZV_iMAbDf3PscewWwjPfwfmAC2zaeMR2KI,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
@@ -17,8 +17,8 @@ mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc
|
|
|
17
17
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
|
|
18
18
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
19
19
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
20
|
-
mineru/backend/vlm/utils.py,sha256=
|
|
21
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
20
|
+
mineru/backend/vlm/utils.py,sha256=igxgc-ZXje-TKQvZ2p_YJZTMkHS9yXE7u1-FcaGEVZ0,8523
|
|
21
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=Vc8rRzvcE5egjW_J7L0bueo2dLK3b3KKIzvCK2AyBRk,13500
|
|
22
22
|
mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -51,7 +51,7 @@ mineru/model/mfd/yolo_v8.py,sha256=OI5AxVgt3FvXp4NYk0BDXXvpDlo9YjM6byDyC_TZ8Js,3
|
|
|
51
51
|
mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
52
52
|
mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
|
|
53
53
|
mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=
|
|
54
|
+
mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=tYbxdG_oNLb18CsQkusZA-r3fxHQd1uDnfzIFQ6IIU4,5783
|
|
55
55
|
mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
|
|
56
56
|
mineru/model/mfr/unimernet/Unimernet.py,sha256=ZK0M9fPmZziK4D33H3YND7RnHiQkRVCS-lvNfY-N7do,7912
|
|
57
57
|
mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
|
|
|
151
151
|
mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
|
|
152
152
|
mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
153
|
mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
|
|
154
|
-
mineru/model/vlm/vllm_server.py,sha256=
|
|
154
|
+
mineru/model/vlm/vllm_server.py,sha256=gC4bkwBbnQXpmxaiq1nPf7RgWF-pUYESjLssveJq6Do,2360
|
|
155
155
|
mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
|
|
156
156
|
mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
|
157
157
|
mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
158
158
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
159
|
-
mineru/utils/block_sort.py,sha256=
|
|
159
|
+
mineru/utils/block_sort.py,sha256=5S1VdpRgI72D2dRb3Qp5XQiqSmiPpELwFIqbpshH1jA,13916
|
|
160
160
|
mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
|
|
161
161
|
mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
|
|
162
162
|
mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
|
|
163
163
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
164
|
-
mineru/utils/config_reader.py,sha256=
|
|
164
|
+
mineru/utils/config_reader.py,sha256=03ASqJUJIl6CkXVcsewpnPDAo9I7WYdj_hx-osUKrlE,4835
|
|
165
165
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
166
166
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
167
167
|
mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
|
|
@@ -172,12 +172,12 @@ mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,85
|
|
|
172
172
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
173
173
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
174
174
|
mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
|
|
175
|
-
mineru/utils/model_utils.py,sha256=
|
|
175
|
+
mineru/utils/model_utils.py,sha256=YadxNuRvuWZ5yW2NkSpD0ZYTJdj0ZVS2X8KF_hlGWCA,20231
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
|
-
mineru/utils/os_env_config.py,sha256=
|
|
178
|
+
mineru/utils/os_env_config.py,sha256=VHK9lS3QFJhrwWa9FOFU1Swm7oXnby4SaNNjTyonTTg,990
|
|
179
179
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
180
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
180
|
+
mineru/utils/pdf_image_tools.py,sha256=tTSk39fgJKLEshwPAuJGLl_pVSrmEKjWA55F6dGcr4g,9987
|
|
181
181
|
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
182
182
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
183
183
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
|
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
187
|
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
188
|
+
mineru-2.7.6.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.6.dist-info/METADATA,sha256=m6EbuSPR6iPDZp-fBf90urMSPi9JbGLKZZC5EneGsKc,37245
|
|
190
|
+
mineru-2.7.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
191
|
+
mineru-2.7.6.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
|
|
192
|
+
mineru-2.7.6.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|