mineru 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/utils/os_env_config.py +5 -0
- mineru/utils/pdf_image_tools.py +73 -25
- mineru/version.py +1 -1
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/METADATA +1 -1
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/RECORD +9 -9
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/WHEEL +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.4.dist-info → mineru-2.7.5.dist-info}/top_level.txt +0 -0
mineru/utils/os_env_config.py
CHANGED
|
@@ -11,6 +11,11 @@ def get_load_images_timeout() -> int:
|
|
|
11
11
|
return get_value_from_string(env_value, 300)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def get_load_images_threads() -> int:
    """Return the PDF render worker count configured through the environment.

    Reads the ``MINERU_PDF_RENDER_THREADS`` environment variable and hands the
    raw value (``None`` when the variable is unset) to
    ``get_value_from_string`` together with a default of 4.
    """
    raw_setting = os.environ.get('MINERU_PDF_RENDER_THREADS')
    threads = get_value_from_string(raw_setting, 4)
    return threads
|
|
17
|
+
|
|
18
|
+
|
|
14
19
|
def get_value_from_string(env_value: str, default_value: int) -> int:
|
|
15
20
|
if env_value is not None:
|
|
16
21
|
try:
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import os
|
|
3
|
+
import signal
|
|
4
|
+
import time
|
|
3
5
|
from io import BytesIO
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
@@ -9,13 +11,13 @@ from PIL import Image, ImageOps
|
|
|
9
11
|
|
|
10
12
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
13
|
from mineru.utils.check_sys_env import is_windows_environment
|
|
12
|
-
from mineru.utils.os_env_config import get_load_images_timeout
|
|
14
|
+
from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads
|
|
13
15
|
from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
|
|
14
16
|
from mineru.utils.enum_class import ImageType
|
|
15
17
|
from mineru.utils.hash_utils import str_sha256
|
|
16
18
|
from mineru.utils.pdf_page_id import get_end_page_id
|
|
17
19
|
|
|
18
|
-
from concurrent.futures import ProcessPoolExecutor,
|
|
20
|
+
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
|
|
@@ -57,7 +59,7 @@ def load_images_from_pdf(
|
|
|
57
59
|
end_page_id=None,
|
|
58
60
|
image_type=ImageType.PIL,
|
|
59
61
|
timeout=None,
|
|
60
|
-
threads=
|
|
62
|
+
threads=None,
|
|
61
63
|
):
|
|
62
64
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
63
65
|
|
|
@@ -67,8 +69,8 @@ def load_images_from_pdf(
|
|
|
67
69
|
start_page_id (int, optional): 起始页码. Defaults to 0.
|
|
68
70
|
end_page_id (int | None, optional): 结束页码. Defaults to None.
|
|
69
71
|
image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
|
|
70
|
-
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量
|
|
71
|
-
threads (int):
|
|
72
|
+
timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
|
|
73
|
+
threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4.
|
|
72
74
|
|
|
73
75
|
Raises:
|
|
74
76
|
TimeoutError: 当转换超时时抛出
|
|
@@ -86,6 +88,9 @@ def load_images_from_pdf(
|
|
|
86
88
|
else:
|
|
87
89
|
if timeout is None:
|
|
88
90
|
timeout = get_load_images_timeout()
|
|
91
|
+
if threads is None:
|
|
92
|
+
threads = get_load_images_threads()
|
|
93
|
+
|
|
89
94
|
end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
|
|
90
95
|
|
|
91
96
|
# 计算总页数
|
|
@@ -108,11 +113,13 @@ def load_images_from_pdf(
|
|
|
108
113
|
|
|
109
114
|
page_ranges.append((range_start, range_end))
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
|
|
112
117
|
|
|
113
|
-
|
|
118
|
+
executor = ProcessPoolExecutor(max_workers=actual_threads)
|
|
119
|
+
try:
|
|
114
120
|
# 提交所有任务
|
|
115
121
|
futures = []
|
|
122
|
+
future_to_range = {}
|
|
116
123
|
for range_start, range_end in page_ranges:
|
|
117
124
|
future = executor.submit(
|
|
118
125
|
_load_images_from_pdf_worker,
|
|
@@ -122,27 +129,68 @@ def load_images_from_pdf(
|
|
|
122
129
|
range_end,
|
|
123
130
|
image_type,
|
|
124
131
|
)
|
|
125
|
-
futures.append(
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
all_results.sort(key=lambda x: x[0])
|
|
136
|
-
images_list = []
|
|
137
|
-
for _, imgs in all_results:
|
|
138
|
-
images_list.extend(imgs)
|
|
139
|
-
|
|
140
|
-
return images_list, pdf_doc
|
|
141
|
-
except FuturesTimeoutError:
|
|
132
|
+
futures.append(future)
|
|
133
|
+
future_to_range[future] = range_start
|
|
134
|
+
|
|
135
|
+
# 使用 wait() 设置单一全局超时
|
|
136
|
+
done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
|
|
137
|
+
|
|
138
|
+
# 检查是否有未完成的任务(超时情况)
|
|
139
|
+
if not_done:
|
|
140
|
+
# 超时:强制终止所有子进程
|
|
141
|
+
_terminate_executor_processes(executor)
|
|
142
142
|
pdf_doc.close()
|
|
143
|
-
executor.shutdown(wait=False, cancel_futures=True)
|
|
144
143
|
raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
|
|
145
144
|
|
|
145
|
+
# 所有任务完成,收集结果
|
|
146
|
+
all_results = []
|
|
147
|
+
for future in futures:
|
|
148
|
+
range_start = future_to_range[future]
|
|
149
|
+
# 这里不需要 timeout,因为任务已完成
|
|
150
|
+
images_list = future.result()
|
|
151
|
+
all_results.append((range_start, images_list))
|
|
152
|
+
|
|
153
|
+
# 按起始页码排序并合并结果
|
|
154
|
+
all_results.sort(key=lambda x: x[0])
|
|
155
|
+
images_list = []
|
|
156
|
+
for _, imgs in all_results:
|
|
157
|
+
images_list.extend(imgs)
|
|
158
|
+
|
|
159
|
+
return images_list, pdf_doc
|
|
160
|
+
|
|
161
|
+
except Exception as e:
|
|
162
|
+
# 发生任何异常时,确保清理子进程
|
|
163
|
+
_terminate_executor_processes(executor)
|
|
164
|
+
pdf_doc.close()
|
|
165
|
+
if isinstance(e, TimeoutError):
|
|
166
|
+
raise
|
|
167
|
+
raise
|
|
168
|
+
finally:
|
|
169
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _terminate_executor_processes(executor, grace_period=0.1):
    """Forcefully stop all worker processes of a ProcessPoolExecutor.

    Every live worker is first asked to terminate; after ``grace_period``
    seconds any worker that is still alive is killed.  The process objects'
    own ``terminate()`` / ``kill()`` methods are used instead of
    ``os.kill(pid, signal.SIG*)`` so the function also works on platforms
    that do not define ``signal.SIGKILL`` (e.g. Windows).

    Args:
        executor: The executor whose workers should be stopped.  Objects
            without a ``_processes`` mapping (or with it set to ``None``)
            are ignored.
        grace_period (float): Seconds to wait between the terminate pass and
            the kill pass.  Defaults to 0.1.
    """
    # ``_processes`` is a private pid -> process mapping; it may be missing
    # on other executor types or be None (e.g. once the pool was shut down).
    processes = getattr(executor, '_processes', None)
    if not processes:
        return

    # Work on a snapshot: the executor may drop entries from the mapping
    # while workers exit, which would break iteration over the live dict.
    workers = list(processes.values())

    def _stop_alive(method_name):
        """Call ``method_name`` on each live worker; report if any was hit."""
        stopped_any = False
        for worker in workers:
            try:
                if worker.is_alive():
                    getattr(worker, method_name)()
                    stopped_any = True
            except (OSError, ValueError):
                # Best effort: the worker exited (or its handle was closed)
                # between the liveness check and the signal.
                continue
        return stopped_any

    # First pass: polite termination request.
    if not _stop_alive('terminate'):
        # Nothing was running, so there is nothing to wait for or kill.
        return

    # Give the workers a moment to react before escalating.
    if grace_period > 0:
        time.sleep(grace_period)

    # Second pass: hard kill for anything that ignored the first request.
    _stop_alive('kill')
|
|
193
|
+
|
|
146
194
|
|
|
147
195
|
def load_images_from_pdf_core(
|
|
148
196
|
pdf_bytes: bytes,
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.4"
|
|
1
|
+
__version__ = "2.7.5"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=lBcjVwt4I0-VUeE_7gM1gQBbtKOi9jGT3DavJbzcYnQ,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
@@ -175,9 +175,9 @@ mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSp
|
|
|
175
175
|
mineru/utils/model_utils.py,sha256=xlw5hUYKa6o1NiM8PoXO1HFvHfrgY5e4Ut_upGEY9yI,19909
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
|
-
mineru/utils/os_env_config.py,sha256=
|
|
178
|
+
mineru/utils/os_env_config.py,sha256=VHK9lS3QFJhrwWa9FOFU1Swm7oXnby4SaNNjTyonTTg,990
|
|
179
179
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
180
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
180
|
+
mineru/utils/pdf_image_tools.py,sha256=tTSk39fgJKLEshwPAuJGLl_pVSrmEKjWA55F6dGcr4g,9987
|
|
181
181
|
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
182
182
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
183
183
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
|
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
187
|
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
188
|
+
mineru-2.7.5.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.5.dist-info/METADATA,sha256=MvPv4AgyLwaHz3hspAPrZ0wEeSE0wnu0MkMwfAJ5hTs,36928
|
|
190
|
+
mineru-2.7.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
191
|
+
mineru-2.7.5.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
|
|
192
|
+
mineru-2.7.5.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|