mineru 2.7.0__py3-none-any.whl → 2.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
mineru/cli/common.py CHANGED
@@ -17,8 +17,6 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
17
17
  from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
18
18
  from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
19
19
  from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
20
- from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
21
- from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
22
20
  from mineru.utils.pdf_page_id import get_end_page_id
23
21
 
24
22
  if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -326,6 +324,7 @@ def _process_hybrid(
326
324
  server_url=None,
327
325
  **kwargs,
328
326
  ):
327
+ from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
329
328
  """同步处理hybrid后端逻辑"""
330
329
  if not backend.endswith("client"):
331
330
  server_url = None
@@ -378,8 +377,8 @@ async def _async_process_hybrid(
378
377
  server_url=None,
379
378
  **kwargs,
380
379
  ):
380
+ from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
381
381
  """异步处理hybrid后端逻辑"""
382
-
383
382
  if not backend.endswith("client"):
384
383
  server_url = None
385
384
 
@@ -5,7 +5,7 @@ from io import BytesIO
5
5
  import numpy as np
6
6
  import pypdfium2 as pdfium
7
7
  from loguru import logger
8
- from PIL import Image
8
+ from PIL import Image, ImageOps
9
9
 
10
10
  from mineru.data.data_reader_writer import FileBasedDataWriter
11
11
  from mineru.utils.check_sys_env import is_windows_environment
@@ -41,19 +41,23 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
41
41
  return image_dict
42
42
 
43
43
 
44
- def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
44
+ def _load_images_from_pdf_worker(
45
+ pdf_bytes, dpi, start_page_id, end_page_id, image_type
46
+ ):
45
47
  """用于进程池的包装函数"""
46
- return load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
48
+ return load_images_from_pdf_core(
49
+ pdf_bytes, dpi, start_page_id, end_page_id, image_type
50
+ )
47
51
 
48
52
 
49
53
  def load_images_from_pdf(
50
- pdf_bytes: bytes,
51
- dpi=200,
52
- start_page_id=0,
53
- end_page_id=None,
54
- image_type=ImageType.PIL,
55
- timeout=None,
56
- threads=4,
54
+ pdf_bytes: bytes,
55
+ dpi=200,
56
+ start_page_id=0,
57
+ end_page_id=None,
58
+ image_type=ImageType.PIL,
59
+ timeout=None,
60
+ threads=4,
57
61
  ):
58
62
  """带超时控制的 PDF 转图片函数,支持多进程加速
59
63
 
@@ -77,7 +81,7 @@ def load_images_from_pdf(
77
81
  dpi,
78
82
  start_page_id,
79
83
  get_end_page_id(end_page_id, len(pdf_doc)),
80
- image_type
84
+ image_type,
81
85
  ), pdf_doc
82
86
  else:
83
87
  if timeout is None:
@@ -116,7 +120,7 @@ def load_images_from_pdf(
116
120
  dpi,
117
121
  range_start,
118
122
  range_end,
119
- image_type
123
+ image_type,
120
124
  )
121
125
  futures.append((range_start, future))
122
126
 
@@ -163,7 +167,14 @@ def load_images_from_pdf_core(
163
167
  return images_list
164
168
 
165
169
 
166
- def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
170
+ def cut_image(
171
+ bbox: tuple,
172
+ page_num: int,
173
+ page_pil_img,
174
+ return_path,
175
+ image_writer: FileBasedDataWriter,
176
+ scale=2,
177
+ ):
167
178
  """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
168
179
  图片存放在save_path下,文件名是:
169
180
  {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
@@ -197,7 +208,6 @@ def get_crop_img(bbox: tuple, pil_img, scale=2):
197
208
 
198
209
 
199
210
  def get_crop_np_img(bbox: tuple, input_img, scale=2):
200
-
201
211
  if isinstance(input_img, Image.Image):
202
212
  np_img = np.asarray(input_img)
203
213
  elif isinstance(input_img, np.ndarray):
@@ -212,17 +222,27 @@ def get_crop_np_img(bbox: tuple, input_img, scale=2):
212
222
  int(bbox[3] * scale),
213
223
  )
214
224
 
215
- return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
225
+ return np_img[scale_bbox[1] : scale_bbox[3], scale_bbox[0] : scale_bbox[2]]
226
+
216
227
 
217
228
  def images_bytes_to_pdf_bytes(image_bytes):
218
229
  # 内存缓冲区
219
230
  pdf_buffer = BytesIO()
220
231
 
221
232
  # 载入并转换所有图像为 RGB 模式
222
- image = Image.open(BytesIO(image_bytes)).convert("RGB")
233
+ image = Image.open(BytesIO(image_bytes))
234
+ # 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
235
+ image = ImageOps.exif_transpose(image) or image
236
+ # 只在必要时转换
237
+ if image.mode != "RGB":
238
+ image = image.convert("RGB")
223
239
 
224
240
  # 第一张图保存为 PDF,其余追加
225
- image.save(pdf_buffer, format="PDF", save_all=True)
241
+ image.save(
242
+ pdf_buffer,
243
+ format="PDF",
244
+ # save_all=True
245
+ )
226
246
 
227
247
  # 获取 PDF bytes 并重置指针(可选)
228
248
  pdf_bytes = pdf_buffer.getvalue()
@@ -9,7 +9,14 @@ from mineru.utils.char_utils import full_to_half
9
9
  from mineru.utils.enum_class import BlockType, SplitFlag
10
10
 
11
11
 
12
- CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
12
+ CONTINUATION_MARKERS = [
13
+ "(续)",
14
+ "(续表)",
15
+ "(续上表)",
16
+ "(continued)",
17
+ "(cont.)",
18
+ "(cont’d)",
19
+ ]
13
20
 
14
21
 
15
22
  def calculate_table_total_columns(soup):
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.7.0"
1
+ __version__ = "2.7.1"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.7.0
3
+ Version: 2.7.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -19,7 +19,7 @@ Requires-Dist: boto3>=1.28.43
19
19
  Requires-Dist: click>=8.1.7
20
20
  Requires-Dist: loguru>=0.7.2
21
21
  Requires-Dist: numpy>=1.21.6
22
- Requires-Dist: pdfminer.six==20250506
22
+ Requires-Dist: pdfminer.six>=20251230
23
23
  Requires-Dist: tqdm>=4.67.1
24
24
  Requires-Dist: requests
25
25
  Requires-Dist: httpx
@@ -81,9 +81,9 @@ Requires-Dist: mineru[vlm]; extra == "core"
81
81
  Requires-Dist: mineru[pipeline]; extra == "core"
82
82
  Requires-Dist: mineru[api]; extra == "core"
83
83
  Requires-Dist: mineru[gradio]; extra == "core"
84
- Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "core"
85
84
  Provides-Extra: all
86
85
  Requires-Dist: mineru[core]; extra == "all"
86
+ Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "all"
87
87
  Requires-Dist: mineru[vllm]; sys_platform == "linux" and extra == "all"
88
88
  Requires-Dist: mineru[lmdeploy]; sys_platform == "windows" and extra == "all"
89
89
  Dynamic: license-file
@@ -135,6 +135,11 @@ Dynamic: license-file
135
135
 
136
136
  # Changelog
137
137
 
138
+ - 2026/01/06 2.7.1 Release
139
+ - fix bug: #4300
140
+ - Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
141
+ - Support automatic correction of input image exif orientation to improve OCR recognition accuracy #4283
142
+
138
143
  - 2025/12/30 2.7.0 Release
139
144
  - Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
140
145
  - Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:
@@ -1,5 +1,5 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=EtKWW0Hnl5oWglRNH0HZigvcDT2FEs58ek8buJdwW1E,22
2
+ mineru/version.py,sha256=yRpSH6mBb4BJgbFlT7rt8MSjCUW17Ycx0RziLf-lQLA,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
@@ -23,7 +23,7 @@ mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflb
23
23
  mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
24
24
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
25
25
  mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
26
- mineru/cli/common.py,sha256=srzkJeh0dMlaccDlGWaDBIToqXLPFeyYL4JsARCGHTc,20468
26
+ mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
27
27
  mineru/cli/fast_api.py,sha256=TGpZqyUE1kg2eXsP76pr0p1yqNOOU9jyjL5Pc0FJwRc,16637
28
28
  mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
29
29
  mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
@@ -177,17 +177,17 @@ mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym
177
177
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
178
178
  mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
179
179
  mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
180
- mineru/utils/pdf_image_tools.py,sha256=86_xvsGOEde5QGlKz5uJemjoO1upr6n_K7o3lCdyIjQ,7981
180
+ mineru/utils/pdf_image_tools.py,sha256=L2kHKoFaQo4CGjS1d68JACrlBycx6gyCnnFlbBFRKuw,8273
181
181
  mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
182
182
  mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
183
183
  mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
184
184
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
185
185
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
186
186
  mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
187
- mineru/utils/table_merge.py,sha256=zPjDwso28bHEaH9mwlQMtPvwAPcdNfwDK-pinXTf7E8,15129
188
- mineru-2.7.0.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
- mineru-2.7.0.dist-info/METADATA,sha256=NoPOfPd9ZWB153FfrhRLnlqBcNShcZP1WEDiCc8U93k,35263
190
- mineru-2.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
- mineru-2.7.0.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
- mineru-2.7.0.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
- mineru-2.7.0.dist-info/RECORD,,
187
+ mineru/utils/table_merge.py,sha256=X2vQCCKx8hG9Iipn4UEP8pXHc9jeNmYNYvl5zxaTS2E,15185
188
+ mineru-2.7.1.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
+ mineru-2.7.1.dist-info/METADATA,sha256=gtaeoZmMvmHA8JDW1QnpszDa0-cTwogQ-5BOPTdikWA,35540
190
+ mineru-2.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
+ mineru-2.7.1.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
+ mineru-2.7.1.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
+ mineru-2.7.1.dist-info/RECORD,,
File without changes