mineru 2.7.4__py3-none-any.whl → 2.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,11 @@ def get_load_images_timeout() -> int:
11
11
  return get_value_from_string(env_value, 300)
12
12
 
13
13
 
14
+ def get_load_images_threads() -> int:
15
+ env_value = os.getenv('MINERU_PDF_RENDER_THREADS', None)
16
+ return get_value_from_string(env_value, 4)
17
+
18
+
14
19
  def get_value_from_string(env_value: str, default_value: int) -> int:
15
20
  if env_value is not None:
16
21
  try:
@@ -1,5 +1,7 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
2
  import os
3
+ import signal
4
+ import time
3
5
  from io import BytesIO
4
6
 
5
7
  import numpy as np
@@ -9,13 +11,13 @@ from PIL import Image, ImageOps
9
11
 
10
12
  from mineru.data.data_reader_writer import FileBasedDataWriter
11
13
  from mineru.utils.check_sys_env import is_windows_environment
12
- from mineru.utils.os_env_config import get_load_images_timeout
14
+ from mineru.utils.os_env_config import get_load_images_timeout, get_load_images_threads
13
15
  from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
14
16
  from mineru.utils.enum_class import ImageType
15
17
  from mineru.utils.hash_utils import str_sha256
16
18
  from mineru.utils.pdf_page_id import get_end_page_id
17
19
 
18
- from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
20
+ from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
19
21
 
20
22
 
21
23
  def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
@@ -57,7 +59,7 @@ def load_images_from_pdf(
57
59
  end_page_id=None,
58
60
  image_type=ImageType.PIL,
59
61
  timeout=None,
60
- threads=4,
62
+ threads=None,
61
63
  ):
62
64
  """带超时控制的 PDF 转图片函数,支持多进程加速
63
65
 
@@ -67,8 +69,8 @@ def load_images_from_pdf(
67
69
  start_page_id (int, optional): 起始页码. Defaults to 0.
68
70
  end_page_id (int | None, optional): 结束页码. Defaults to None.
69
71
  image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
70
- timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_LOAD_IMAGES_TIMEOUT 读取,若未设置则默认为 300 秒。
71
- threads (int): 进程数,默认 4
72
+ timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
73
+ threads (int): 进程数, 如果为 None,则从环境变量 MINERU_PDF_RENDER_THREADS 读取,若未设置则默认为 4.
72
74
 
73
75
  Raises:
74
76
  TimeoutError: 当转换超时时抛出
@@ -86,6 +88,9 @@ def load_images_from_pdf(
86
88
  else:
87
89
  if timeout is None:
88
90
  timeout = get_load_images_timeout()
91
+ if threads is None:
92
+ threads = get_load_images_threads()
93
+
89
94
  end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
90
95
 
91
96
  # 计算总页数
@@ -108,11 +113,13 @@ def load_images_from_pdf(
108
113
 
109
114
  page_ranges.append((range_start, range_end))
110
115
 
111
- # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
116
+ logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
112
117
 
113
- with ProcessPoolExecutor(max_workers=actual_threads) as executor:
118
+ executor = ProcessPoolExecutor(max_workers=actual_threads)
119
+ try:
114
120
  # 提交所有任务
115
121
  futures = []
122
+ future_to_range = {}
116
123
  for range_start, range_end in page_ranges:
117
124
  future = executor.submit(
118
125
  _load_images_from_pdf_worker,
@@ -122,27 +129,68 @@ def load_images_from_pdf(
122
129
  range_end,
123
130
  image_type,
124
131
  )
125
- futures.append((range_start, future))
126
-
127
- try:
128
- # 收集结果并按页码排序
129
- all_results = []
130
- for range_start, future in futures:
131
- images_list = future.result(timeout=timeout)
132
- all_results.append((range_start, images_list))
133
-
134
- # 按起始页码排序并合并结果
135
- all_results.sort(key=lambda x: x[0])
136
- images_list = []
137
- for _, imgs in all_results:
138
- images_list.extend(imgs)
139
-
140
- return images_list, pdf_doc
141
- except FuturesTimeoutError:
132
+ futures.append(future)
133
+ future_to_range[future] = range_start
134
+
135
+ # 使用 wait() 设置单一全局超时
136
+ done, not_done = wait(futures, timeout=timeout, return_when=ALL_COMPLETED)
137
+
138
+ # 检查是否有未完成的任务(超时情况)
139
+ if not_done:
140
+ # 超时:强制终止所有子进程
141
+ _terminate_executor_processes(executor)
142
142
  pdf_doc.close()
143
- executor.shutdown(wait=False, cancel_futures=True)
144
143
  raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
145
144
 
145
+ # 所有任务完成,收集结果
146
+ all_results = []
147
+ for future in futures:
148
+ range_start = future_to_range[future]
149
+ # 这里不需要 timeout,因为任务已完成
150
+ images_list = future.result()
151
+ all_results.append((range_start, images_list))
152
+
153
+ # 按起始页码排序并合并结果
154
+ all_results.sort(key=lambda x: x[0])
155
+ images_list = []
156
+ for _, imgs in all_results:
157
+ images_list.extend(imgs)
158
+
159
+ return images_list, pdf_doc
160
+
161
+ except Exception as e:
162
+ # 发生任何异常时,确保清理子进程
163
+ _terminate_executor_processes(executor)
164
+ pdf_doc.close()
165
+ if isinstance(e, TimeoutError):
166
+ raise
167
+ raise
168
+ finally:
169
+ executor.shutdown(wait=False, cancel_futures=True)
170
+
171
+
172
+ def _terminate_executor_processes(executor):
173
+ """强制终止 ProcessPoolExecutor 中的所有子进程"""
174
+ if hasattr(executor, '_processes'):
175
+ for pid, process in executor._processes.items():
176
+ if process.is_alive():
177
+ try:
178
+ # 先发送 SIGTERM 允许优雅退出
179
+ os.kill(pid, signal.SIGTERM)
180
+ except (ProcessLookupError, OSError):
181
+ pass
182
+
183
+ # 给子进程一点时间响应 SIGTERM
184
+ time.sleep(0.1)
185
+
186
+ # 对仍然存活的进程发送 SIGKILL 强制终止
187
+ for pid, process in executor._processes.items():
188
+ if process.is_alive():
189
+ try:
190
+ os.kill(pid, signal.SIGKILL)
191
+ except (ProcessLookupError, OSError):
192
+ pass
193
+
146
194
 
147
195
  def load_images_from_pdf_core(
148
196
  pdf_bytes: bytes,
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.7.4"
1
+ __version__ = "2.7.5"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.7.4
3
+ Version: 2.7.5
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -1,5 +1,5 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=yLdxKZXyzrDqew_33G4dvZoqgGxRCyEx9vhYW3y2Je4,22
2
+ mineru/version.py,sha256=lBcjVwt4I0-VUeE_7gM1gQBbtKOi9jGT3DavJbzcYnQ,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
@@ -175,9 +175,9 @@ mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSp
175
175
  mineru/utils/model_utils.py,sha256=xlw5hUYKa6o1NiM8PoXO1HFvHfrgY5e4Ut_upGEY9yI,19909
176
176
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
177
177
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
178
- mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
178
+ mineru/utils/os_env_config.py,sha256=VHK9lS3QFJhrwWa9FOFU1Swm7oXnby4SaNNjTyonTTg,990
179
179
  mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
180
- mineru/utils/pdf_image_tools.py,sha256=L2kHKoFaQo4CGjS1d68JACrlBycx6gyCnnFlbBFRKuw,8273
180
+ mineru/utils/pdf_image_tools.py,sha256=tTSk39fgJKLEshwPAuJGLl_pVSrmEKjWA55F6dGcr4g,9987
181
181
  mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
182
182
  mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
183
183
  mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
185
185
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
186
186
  mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
187
187
  mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
188
- mineru-2.7.4.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
- mineru-2.7.4.dist-info/METADATA,sha256=lNxDREB_s7eDnknMUeBn5FCgtDc8qPQS-hEs4Wb6WTg,36928
190
- mineru-2.7.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
191
- mineru-2.7.4.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
192
- mineru-2.7.4.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
- mineru-2.7.4.dist-info/RECORD,,
188
+ mineru-2.7.5.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
+ mineru-2.7.5.dist-info/METADATA,sha256=MvPv4AgyLwaHz3hspAPrZ0wEeSE0wnu0MkMwfAJ5hTs,36928
190
+ mineru-2.7.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
191
+ mineru-2.7.5.dist-info/entry_points.txt,sha256=a9AHBIiYe3dpT3oofVQJC8fI0WjDhQASCUlhdMOK120,376
192
+ mineru-2.7.5.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
+ mineru-2.7.5.dist-info/RECORD,,
File without changes