mineru 2.7.0__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/cli/common.py +2 -3
- mineru/utils/pdf_image_tools.py +37 -17
- mineru/utils/table_merge.py +8 -1
- mineru/version.py +1 -1
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/METADATA +8 -3
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/RECORD +10 -10
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.0.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0
mineru/cli/common.py
CHANGED
|
@@ -17,8 +17,6 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
|
17
17
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
18
18
|
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
|
19
19
|
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
|
20
|
-
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
|
21
|
-
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
|
22
20
|
from mineru.utils.pdf_page_id import get_end_page_id
|
|
23
21
|
|
|
24
22
|
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
|
@@ -326,6 +324,7 @@ def _process_hybrid(
|
|
|
326
324
|
server_url=None,
|
|
327
325
|
**kwargs,
|
|
328
326
|
):
|
|
327
|
+
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
|
329
328
|
"""同步处理hybrid后端逻辑"""
|
|
330
329
|
if not backend.endswith("client"):
|
|
331
330
|
server_url = None
|
|
@@ -378,8 +377,8 @@ async def _async_process_hybrid(
|
|
|
378
377
|
server_url=None,
|
|
379
378
|
**kwargs,
|
|
380
379
|
):
|
|
380
|
+
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
|
381
381
|
"""异步处理hybrid后端逻辑"""
|
|
382
|
-
|
|
383
382
|
if not backend.endswith("client"):
|
|
384
383
|
server_url = None
|
|
385
384
|
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -5,7 +5,7 @@ from io import BytesIO
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pypdfium2 as pdfium
|
|
7
7
|
from loguru import logger
|
|
8
|
-
from PIL import Image
|
|
8
|
+
from PIL import Image, ImageOps
|
|
9
9
|
|
|
10
10
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
11
|
from mineru.utils.check_sys_env import is_windows_environment
|
|
@@ -41,19 +41,23 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
|
|
|
41
41
|
return image_dict
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
def _load_images_from_pdf_worker(
|
|
44
|
+
def _load_images_from_pdf_worker(
|
|
45
|
+
pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
46
|
+
):
|
|
45
47
|
"""用于进程池的包装函数"""
|
|
46
|
-
return load_images_from_pdf_core(
|
|
48
|
+
return load_images_from_pdf_core(
|
|
49
|
+
pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
50
|
+
)
|
|
47
51
|
|
|
48
52
|
|
|
49
53
|
def load_images_from_pdf(
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
54
|
+
pdf_bytes: bytes,
|
|
55
|
+
dpi=200,
|
|
56
|
+
start_page_id=0,
|
|
57
|
+
end_page_id=None,
|
|
58
|
+
image_type=ImageType.PIL,
|
|
59
|
+
timeout=None,
|
|
60
|
+
threads=4,
|
|
57
61
|
):
|
|
58
62
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
59
63
|
|
|
@@ -77,7 +81,7 @@ def load_images_from_pdf(
|
|
|
77
81
|
dpi,
|
|
78
82
|
start_page_id,
|
|
79
83
|
get_end_page_id(end_page_id, len(pdf_doc)),
|
|
80
|
-
image_type
|
|
84
|
+
image_type,
|
|
81
85
|
), pdf_doc
|
|
82
86
|
else:
|
|
83
87
|
if timeout is None:
|
|
@@ -116,7 +120,7 @@ def load_images_from_pdf(
|
|
|
116
120
|
dpi,
|
|
117
121
|
range_start,
|
|
118
122
|
range_end,
|
|
119
|
-
image_type
|
|
123
|
+
image_type,
|
|
120
124
|
)
|
|
121
125
|
futures.append((range_start, future))
|
|
122
126
|
|
|
@@ -163,7 +167,14 @@ def load_images_from_pdf_core(
|
|
|
163
167
|
return images_list
|
|
164
168
|
|
|
165
169
|
|
|
166
|
-
def cut_image(
|
|
170
|
+
def cut_image(
|
|
171
|
+
bbox: tuple,
|
|
172
|
+
page_num: int,
|
|
173
|
+
page_pil_img,
|
|
174
|
+
return_path,
|
|
175
|
+
image_writer: FileBasedDataWriter,
|
|
176
|
+
scale=2,
|
|
177
|
+
):
|
|
167
178
|
"""从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
|
|
168
179
|
图片存放在save_path下,文件名是:
|
|
169
180
|
{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
|
|
@@ -197,7 +208,6 @@ def get_crop_img(bbox: tuple, pil_img, scale=2):
|
|
|
197
208
|
|
|
198
209
|
|
|
199
210
|
def get_crop_np_img(bbox: tuple, input_img, scale=2):
|
|
200
|
-
|
|
201
211
|
if isinstance(input_img, Image.Image):
|
|
202
212
|
np_img = np.asarray(input_img)
|
|
203
213
|
elif isinstance(input_img, np.ndarray):
|
|
@@ -212,17 +222,27 @@ def get_crop_np_img(bbox: tuple, input_img, scale=2):
|
|
|
212
222
|
int(bbox[3] * scale),
|
|
213
223
|
)
|
|
214
224
|
|
|
215
|
-
return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
|
|
225
|
+
return np_img[scale_bbox[1] : scale_bbox[3], scale_bbox[0] : scale_bbox[2]]
|
|
226
|
+
|
|
216
227
|
|
|
217
228
|
def images_bytes_to_pdf_bytes(image_bytes):
|
|
218
229
|
# 内存缓冲区
|
|
219
230
|
pdf_buffer = BytesIO()
|
|
220
231
|
|
|
221
232
|
# 载入并转换所有图像为 RGB 模式
|
|
222
|
-
image = Image.open(BytesIO(image_bytes))
|
|
233
|
+
image = Image.open(BytesIO(image_bytes))
|
|
234
|
+
# 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
|
|
235
|
+
image = ImageOps.exif_transpose(image) or image
|
|
236
|
+
# 只在必要时转换
|
|
237
|
+
if image.mode != "RGB":
|
|
238
|
+
image = image.convert("RGB")
|
|
223
239
|
|
|
224
240
|
# 第一张图保存为 PDF,其余追加
|
|
225
|
-
image.save(
|
|
241
|
+
image.save(
|
|
242
|
+
pdf_buffer,
|
|
243
|
+
format="PDF",
|
|
244
|
+
# save_all=True
|
|
245
|
+
)
|
|
226
246
|
|
|
227
247
|
# 获取 PDF bytes 并重置指针(可选)
|
|
228
248
|
pdf_bytes = pdf_buffer.getvalue()
|
mineru/utils/table_merge.py
CHANGED
|
@@ -9,7 +9,14 @@ from mineru.utils.char_utils import full_to_half
|
|
|
9
9
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
CONTINUATION_MARKERS = [
|
|
12
|
+
CONTINUATION_MARKERS = [
|
|
13
|
+
"(续)",
|
|
14
|
+
"(续表)",
|
|
15
|
+
"(续上表)",
|
|
16
|
+
"(continued)",
|
|
17
|
+
"(cont.)",
|
|
18
|
+
"(cont’d)",
|
|
19
|
+
]
|
|
13
20
|
|
|
14
21
|
|
|
15
22
|
def calculate_table_total_columns(soup):
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.0"
|
|
1
|
+
__version__ = "2.7.1"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.7.0
|
|
3
|
+
Version: 2.7.1
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -19,7 +19,7 @@ Requires-Dist: boto3>=1.28.43
|
|
|
19
19
|
Requires-Dist: click>=8.1.7
|
|
20
20
|
Requires-Dist: loguru>=0.7.2
|
|
21
21
|
Requires-Dist: numpy>=1.21.6
|
|
22
|
-
Requires-Dist: pdfminer.six
|
|
22
|
+
Requires-Dist: pdfminer.six>=20251230
|
|
23
23
|
Requires-Dist: tqdm>=4.67.1
|
|
24
24
|
Requires-Dist: requests
|
|
25
25
|
Requires-Dist: httpx
|
|
@@ -81,9 +81,9 @@ Requires-Dist: mineru[vlm]; extra == "core"
|
|
|
81
81
|
Requires-Dist: mineru[pipeline]; extra == "core"
|
|
82
82
|
Requires-Dist: mineru[api]; extra == "core"
|
|
83
83
|
Requires-Dist: mineru[gradio]; extra == "core"
|
|
84
|
-
Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "core"
|
|
85
84
|
Provides-Extra: all
|
|
86
85
|
Requires-Dist: mineru[core]; extra == "all"
|
|
86
|
+
Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "all"
|
|
87
87
|
Requires-Dist: mineru[vllm]; sys_platform == "linux" and extra == "all"
|
|
88
88
|
Requires-Dist: mineru[lmdeploy]; sys_platform == "windows" and extra == "all"
|
|
89
89
|
Dynamic: license-file
|
|
@@ -135,6 +135,11 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
+
- 2026/01/06 2.7.1 Release
|
|
139
|
+
- fix bug: #4300
|
|
140
|
+
- Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
|
|
141
|
+
- Support automatic correction of input image exif orientation to improve OCR recognition accuracy #4283
|
|
142
|
+
|
|
138
143
|
- 2025/12/30 2.7.0 Release
|
|
139
144
|
- Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
|
|
140
145
|
- Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=yRpSH6mBb4BJgbFlT7rt8MSjCUW17Ycx0RziLf-lQLA,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
@@ -23,7 +23,7 @@ mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflb
|
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
25
25
|
mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
|
|
26
|
-
mineru/cli/common.py,sha256=
|
|
26
|
+
mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
|
|
27
27
|
mineru/cli/fast_api.py,sha256=TGpZqyUE1kg2eXsP76pr0p1yqNOOU9jyjL5Pc0FJwRc,16637
|
|
28
28
|
mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
|
|
29
29
|
mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
|
|
@@ -177,17 +177,17 @@ mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym
|
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
178
|
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
179
179
|
mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
|
|
180
|
-
mineru/utils/pdf_image_tools.py,sha256=
|
|
180
|
+
mineru/utils/pdf_image_tools.py,sha256=L2kHKoFaQo4CGjS1d68JACrlBycx6gyCnnFlbBFRKuw,8273
|
|
181
181
|
mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
|
|
182
182
|
mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
|
|
183
183
|
mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
|
|
184
184
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
|
-
mineru/utils/table_merge.py,sha256=
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
187
|
+
mineru/utils/table_merge.py,sha256=X2vQCCKx8hG9Iipn4UEP8pXHc9jeNmYNYvl5zxaTS2E,15185
|
|
188
|
+
mineru-2.7.1.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.1.dist-info/METADATA,sha256=gtaeoZmMvmHA8JDW1QnpszDa0-cTwogQ-5BOPTdikWA,35540
|
|
190
|
+
mineru-2.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
191
|
+
mineru-2.7.1.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
|
|
192
|
+
mineru-2.7.1.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|