mineru 2.7.0__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/vlm/utils.py +4 -0
- mineru/backend/vlm/vlm_analyze.py +55 -0
- mineru/cli/common.py +2 -3
- mineru/model/mfd/yolo_v8.py +4 -2
- mineru/model/mfr/unimernet/Unimernet.py +3 -3
- mineru/model/table/rec/unet_table/utils_table_line_rec.py +7 -1
- mineru/model/vlm/vllm_server.py +18 -0
- mineru/utils/block_sort.py +12 -0
- mineru/utils/config_reader.py +9 -1
- mineru/utils/model_utils.py +13 -1
- mineru/utils/pdf_image_tools.py +37 -17
- mineru/utils/table_merge.py +209 -40
- mineru/version.py +1 -1
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/METADATA +20 -4
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/RECORD +19 -19
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/WHEEL +1 -1
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/utils.py
CHANGED
@@ -18,6 +18,10 @@ def enable_custom_logits_processors() -> bool:
         compute_capability = f"{major}.{minor}"
     elif hasattr(torch, 'npu') and torch.npu.is_available():
         compute_capability = "8.0"
+    elif hasattr(torch, 'gcu') and torch.gcu.is_available():
+        compute_capability = "8.0"
+    elif hasattr(torch, 'musa') and torch.musa.is_available():
+        compute_capability = "8.0"
     else:
         logger.info("CUDA not available, disabling custom_logits_processors")
         return False
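The new branches follow the same probe-and-fallback pattern already used for NPU: a vendor extension module on torch is only consulted when it exists, and non-CUDA accelerators are assigned the placeholder capability "8.0". A minimal standalone sketch of that pattern (the helper name is illustrative, not part of MinerU's API):

from typing import Optional

import torch

def probe_compute_capability() -> Optional[str]:
    """Return a compute-capability string for the active accelerator, or None.

    Vendor back ends (npu/gcu/musa) are assumed to expose is_available()
    once their torch plugin is imported; "8.0" is the placeholder value
    the diff assigns to them.
    """
    if torch.cuda.is_available():
        major, minor = torch.cuda.get_device_capability()
        return f"{major}.{minor}"
    for backend in ("npu", "gcu", "musa"):
        mod = getattr(torch, backend, None)
        if mod is not None and mod.is_available():
            return "8.0"
    return None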
mineru/backend/vlm/vlm_analyze.py
CHANGED

@@ -1,6 +1,7 @@
 # Copyright (c) Opendatalab. All rights reserved.
 import os
 import time
+import json
 
 from loguru import logger
 
@@ -99,6 +100,30 @@ class ModelSingleton:
                     import vllm
                 except ImportError:
                     raise ImportError("Please install vllm to use the vllm-engine backend.")
+
+                """
+                # musa vllm v1 引擎特殊配置
+                device = get_device()
+                if device.startswith("musa"):
+                    import torch
+                    if torch.musa.is_available():
+                        compilation_config = {
+                            "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
+                            "simple_cuda_graph": True
+                        }
+                        block_size = 32
+                        kwargs["compilation_config"] = compilation_config
+                        kwargs["block_size"] = block_size
+                """
+
+                if "compilation_config" in kwargs:
+                    if isinstance(kwargs["compilation_config"], str):
+                        try:
+                            kwargs["compilation_config"] = json.loads(kwargs["compilation_config"])
+                        except json.JSONDecodeError:
+                            logger.warning(
+                                f"Failed to parse compilation_config as JSON: {kwargs['compilation_config']}")
+                            del kwargs["compilation_config"]
                 if "gpu_memory_utilization" not in kwargs:
                     kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
                 if "model" not in kwargs:
@@ -112,8 +137,38 @@ class ModelSingleton:
                 try:
                     from vllm.engine.arg_utils import AsyncEngineArgs
                     from vllm.v1.engine.async_llm import AsyncLLM
+                    from vllm.config import CompilationConfig
                 except ImportError:
                     raise ImportError("Please install vllm to use the vllm-async-engine backend.")
+
+                """
+                # musa vllm v1 引擎特殊配置
+                device = get_device()
+                if device.startswith("musa"):
+                    import torch
+                    if torch.musa.is_available():
+                        compilation_config = CompilationConfig(
+                            cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
+                            simple_cuda_graph=True
+                        )
+                        block_size = 32
+                        kwargs["compilation_config"] = compilation_config
+                        kwargs["block_size"] = block_size
+                """
+
+                if "compilation_config" in kwargs:
+                    if isinstance(kwargs["compilation_config"], dict):
+                        # 如果是字典,转换为 CompilationConfig 对象
+                        kwargs["compilation_config"] = CompilationConfig(**kwargs["compilation_config"])
+                    elif isinstance(kwargs["compilation_config"], str):
+                        # 如果是 JSON 字符串,先解析再转换
+                        try:
+                            config_dict = json.loads(kwargs["compilation_config"])
+                            kwargs["compilation_config"] = CompilationConfig(**config_dict)
+                        except (json.JSONDecodeError, TypeError) as e:
+                            logger.warning(
+                                f"Failed to parse compilation_config: {kwargs['compilation_config']}, error: {e}")
+                            del kwargs["compilation_config"]
                 if "gpu_memory_utilization" not in kwargs:
                     kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
                 if "model" not in kwargs:
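Both hunks boil down to normalizing a compilation_config kwarg that may arrive as a JSON string or a dict before it is handed to vllm, and dropping it when it cannot be parsed. A self-contained sketch of that normalization, using a stand-in dataclass rather than vllm.config.CompilationConfig (whose exact fields are not assumed here):

import json
from dataclasses import dataclass, field

@dataclass
class CompilationConfig:  # stand-in for vllm.config.CompilationConfig
    cudagraph_capture_sizes: list = field(default_factory=list)
    simple_cuda_graph: bool = False

def normalize_compilation_config(kwargs: dict) -> dict:
    """Coerce kwargs['compilation_config'] into a CompilationConfig, or drop it."""
    raw = kwargs.get("compilation_config")
    if raw is None:
        return kwargs
    try:
        if isinstance(raw, str):
            raw = json.loads(raw)           # JSON string -> dict
        if isinstance(raw, dict):
            raw = CompilationConfig(**raw)  # dict -> config object
        kwargs["compilation_config"] = raw
    except (json.JSONDecodeError, TypeError):
        kwargs.pop("compilation_config")    # unparseable: fall back to engine defaults
    return kwargs

# Example: a value passed through as a CLI/JSON string
print(normalize_compilation_config(
    {"compilation_config": '{"cudagraph_capture_sizes": [1, 2, 4], "simple_cuda_graph": true}'}
))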
mineru/cli/common.py
CHANGED
@@ -17,8 +17,6 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
 from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
 from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
-from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
-from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
 from mineru.utils.pdf_page_id import get_end_page_id
 
 if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -326,6 +324,7 @@ def _process_hybrid(
     server_url=None,
     **kwargs,
 ):
+    from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
     """同步处理hybrid后端逻辑"""
     if not backend.endswith("client"):
         server_url = None
@@ -378,8 +377,8 @@ async def _async_process_hybrid(
     server_url=None,
     **kwargs,
 ):
+    from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
     """异步处理hybrid后端逻辑"""
-
     if not backend.endswith("client"):
         server_url = None
 
mineru/model/mfd/yolo_v8.py
CHANGED
@@ -1,5 +1,7 @@
 import os
 from typing import List, Union
+
+import torch
 from tqdm import tqdm
 from ultralytics import YOLO
 import numpy as np
@@ -18,8 +20,8 @@ class YOLOv8MFDModel:
         conf: float = 0.25,
         iou: float = 0.45,
     ):
-        self.…
-        self.…
+        self.device = torch.device(device)
+        self.model = YOLO(weight).to(self.device)
         self.imgsz = imgsz
         self.conf = conf
         self.iou = iou

mineru/model/mfr/unimernet/Unimernet.py
CHANGED

@@ -23,12 +23,12 @@ class MathDataset(Dataset):
 class UnimernetModel(object):
     def __init__(self, weight_dir, _device_="cpu"):
         from .unimernet_hf import UnimernetModel
-        if _device_.startswith("mps") or _device_.startswith("npu"):
+        if _device_.startswith("mps") or _device_.startswith("npu") or _device_.startswith("musa"):
             self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
         else:
             self.model = UnimernetModel.from_pretrained(weight_dir)
-        self.device = _device_
-        self.model.to(…
+        self.device = torch.device(_device_)
+        self.model.to(self.device)
         if not _device_.startswith("cpu"):
             self.model = self.model.to(dtype=torch.float16)
         self.model.eval()

mineru/model/table/rec/unet_table/utils_table_line_rec.py
CHANGED

@@ -4,6 +4,8 @@ import cv2
 import numpy as np
 from scipy.spatial import distance as dist
 from skimage import measure
+from skimage import __version__ as skimage_version
+from packaging import version
 
 
 def transform_preds(coords, center, scale, output_size, rot=0):
@@ -295,7 +297,11 @@ def min_area_rect_box(
     """
     boxes = []
     for region in regions:
-        if …
+        if version.parse(skimage_version) >= version.parse("0.26.0"):
+            region_bbox_area = region.area_bbox
+        else:
+            region_bbox_area = region.bbox_area
+        if region_bbox_area > H * W * 3 / 4:  # 过滤大的单元格
             continue
         rect = cv2.minAreaRect(region.coords[:, ::-1])
 
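The utils_table_line_rec.py change works around scikit-image's renaming of the regionprops bounding-box area attribute (bbox_area vs. area_bbox). A small hedged helper showing the same compatibility shim in isolation (the function name is illustrative):

from packaging import version
from skimage import __version__ as skimage_version

def region_bbox_area(region):
    """Bounding-box area of a regionprops entry across skimage API versions.

    Mirrors the diff: newer releases (>= 0.26.0 here) expose `area_bbox`,
    older ones expose `bbox_area`.
    """
    if version.parse(skimage_version) >= version.parse("0.26.0"):
        return region.area_bbox
    return region.bbox_area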
mineru/model/vlm/vllm_server.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import sys
 
 from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
+from mineru.utils.config_reader import get_device
 from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
 
 from vllm.entrypoints.cli.main import main as vllm_main
@@ -13,6 +14,8 @@ def main():
     has_port_arg = False
     has_gpu_memory_utilization_arg = False
     has_logits_processors_arg = False
+    has_block_size_arg = False
+    has_compilation_config = False
     model_path = None
     model_arg_indices = []
 
@@ -24,6 +27,10 @@ def main():
             has_gpu_memory_utilization_arg = True
         if arg == "--logits-processors" or arg.startswith("--logits-processors="):
            has_logits_processors_arg = True
+        if arg == "--block-size" or arg.startswith("--block-size="):
+            has_block_size_arg = True
+        if arg == "--compilation-config" or arg.startswith("--compilation-config="):
+            has_compilation_config = True
         if arg == "--model":
             if i + 1 < len(args):
                 model_path = args[i + 1]
@@ -49,6 +56,17 @@ def main():
         model_path = auto_download_and_get_model_root_path("/", "vlm")
     if (not has_logits_processors_arg) and custom_logits_processors:
         args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
+    """
+    # musa vllm v1 引擎特殊配置
+    device = get_device()
+    if device.startswith("musa"):
+        import torch
+        if torch.musa.is_available():
+            if not has_block_size_arg:
+                args.extend(["--block-size", "32"])
+            if not has_compilation_config:
+                args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
+    """
 
     # 重构参数,将模型路径作为位置参数
     sys.argv = [sys.argv[0]] + ["serve", model_path] + args
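The launcher only appends its defaults when the user has not already passed the corresponding flag, accepting both "--flag value" and "--flag=value" spellings. A small sketch of that check (helper name illustrative):

def ensure_default_flag(args, flag, value):
    """Append `flag value` unless the flag is already present in args.

    Both "--flag value" and "--flag=value" spellings count as present,
    mirroring the has_block_size_arg / has_compilation_config checks.
    """
    if any(a == flag or a.startswith(flag + "=") for a in args):
        return list(args)
    return list(args) + [flag, value]

# e.g. ensure_default_flag(["--port", "30000"], "--block-size", "32")
# -> ["--port", "30000", "--block-size", "32"]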
mineru/utils/block_sort.py
CHANGED
@@ -186,6 +186,18 @@ def model_init(model_name: str):
         bf_16_support = True
     elif device_name.startswith("mps"):
         bf_16_support = True
+    elif device_name.startswith("gcu"):
+        if hasattr(torch, 'gcu') and torch.gcu.is_available():
+            if torch.gcu.is_bf16_supported():
+                bf_16_support = True
+    elif device_name.startswith("musa"):
+        if hasattr(torch, 'musa') and torch.musa.is_available():
+            if torch.musa.is_bf16_supported():
+                bf_16_support = True
+    elif device_name.startswith("npu"):
+        if hasattr(torch, 'npu') and torch.npu.is_available():
+            if torch.npu.is_bf16_supported():
+                bf_16_support = True
 
     if model_name == 'layoutreader':
         # 检测modelscope的缓存目录是否存在
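Each new branch guards the vendor-specific probe twice: the module must exist on torch and report an available device before is_bf16_supported() is consulted. A compact sketch of that ladder (function name illustrative; vendor modules are assumed to mirror torch.cuda's API, as the diff does):

import torch

def vendor_bf16_supported(device_name):
    """bf16 probe for the gcu/musa/npu back ends added in 2.7.2 (sketch)."""
    for prefix in ("gcu", "musa", "npu"):
        if device_name.startswith(prefix):
            mod = getattr(torch, prefix, None)
            return bool(mod is not None and mod.is_available() and mod.is_bf16_supported())
    return False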
mineru/utils/config_reader.py
CHANGED
@@ -86,7 +86,15 @@ def get_device():
             if torch_npu.npu.is_available():
                 return "npu"
         except Exception as e:
-            …
+            try:
+                if torch.gcu.is_available():
+                    return "gcu"
+            except Exception as e:
+                try:
+                    if torch.musa.is_available():
+                        return "musa"
+                except Exception as e:
+                    pass
     return "cpu"
 
 
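get_device() now walks nested try/except blocks so that a missing or broken vendor plugin never raises; it just falls through to the next candidate. A flattened sketch of the same idea (the helper and the exact probe order are illustrative; the real function nests the probes):

import torch

def detect_device():
    """First available accelerator name, falling back to "cpu" (sketch)."""
    probes = (
        ("cuda", lambda: torch.cuda.is_available()),
        ("npu",  lambda: torch.npu.is_available()),
        ("gcu",  lambda: torch.gcu.is_available()),
        ("musa", lambda: torch.musa.is_available()),
    )
    for name, probe in probes:
        try:
            if probe():
                return name
        except Exception:
            continue  # plugin missing or broken: try the next back end
    return "cpu"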
mineru/utils/model_utils.py
CHANGED
@@ -414,7 +414,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold
 
 
 def clean_memory(device='cuda'):
-    if device…
+    if str(device).startswith("cuda"):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.ipc_collect()
@@ -423,6 +423,12 @@ def clean_memory(device='cuda'):
             torch_npu.npu.empty_cache()
     elif str(device).startswith("mps"):
         torch.mps.empty_cache()
+    elif str(device).startswith("gcu"):
+        if torch.gcu.is_available():
+            torch.gcu.empty_cache()
+    elif str(device).startswith("musa"):
+        if torch.musa.is_available():
+            torch.musa.empty_cache()
     gc.collect()
 
 
@@ -458,5 +464,11 @@ def get_vram(device) -> int:
     elif str(device).startswith("npu"):
         if torch_npu.npu.is_available():
             total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3))  # 转为 GB
+    elif str(device).startswith("gcu"):
+        if torch.gcu.is_available():
+            total_memory = round(torch.gcu.get_device_properties(device).total_memory / (1024 ** 3))  # 转为 GB
+    elif str(device).startswith("musa"):
+        if torch.musa.is_available():
+            total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3))  # 转为 GB
 
     return total_memory
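Both clean_memory() and get_vram() dispatch on the device-string prefix and assume each vendor module mirrors the torch.cuda API (empty_cache(), get_device_properties()). A hedged sketch of the VRAM query under that assumption (helper name illustrative):

import torch

def total_vram_gb(device) -> int:
    """Total memory of an accelerator device in GB, or 0 if unavailable (sketch).

    Assumes the back-end module (torch.cuda, torch.gcu, torch.musa, ...) mirrors
    torch.cuda's is_available() / get_device_properties() API, as the diff does.
    """
    prefix = str(device).split(":")[0]
    if prefix == "cpu":
        return 0
    mod = getattr(torch, prefix, None)
    if mod is None or not mod.is_available():
        return 0
    total_bytes = mod.get_device_properties(device).total_memory
    return round(total_bytes / (1024 ** 3))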
mineru/utils/pdf_image_tools.py
CHANGED
@@ -5,7 +5,7 @@ from io import BytesIO
 import numpy as np
 import pypdfium2 as pdfium
 from loguru import logger
-from PIL import Image
+from PIL import Image, ImageOps
 
 from mineru.data.data_reader_writer import FileBasedDataWriter
 from mineru.utils.check_sys_env import is_windows_environment
@@ -41,19 +41,23 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
     return image_dict
 
 
-def _load_images_from_pdf_worker(…
+def _load_images_from_pdf_worker(
+    pdf_bytes, dpi, start_page_id, end_page_id, image_type
+):
     """用于进程池的包装函数"""
-    return load_images_from_pdf_core(…
+    return load_images_from_pdf_core(
+        pdf_bytes, dpi, start_page_id, end_page_id, image_type
+    )
 
 
 def load_images_from_pdf(
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    pdf_bytes: bytes,
+    dpi=200,
+    start_page_id=0,
+    end_page_id=None,
+    image_type=ImageType.PIL,
+    timeout=None,
+    threads=4,
 ):
     """带超时控制的 PDF 转图片函数,支持多进程加速
 
@@ -77,7 +81,7 @@ def load_images_from_pdf(
             dpi,
             start_page_id,
             get_end_page_id(end_page_id, len(pdf_doc)),
-            image_type
+            image_type,
         ), pdf_doc
     else:
         if timeout is None:
@@ -116,7 +120,7 @@ def load_images_from_pdf(
                     dpi,
                     range_start,
                     range_end,
-                    image_type
+                    image_type,
                 )
                 futures.append((range_start, future))
 
@@ -163,7 +167,14 @@ def load_images_from_pdf_core(
     return images_list
 
 
-def cut_image(…
+def cut_image(
+    bbox: tuple,
+    page_num: int,
+    page_pil_img,
+    return_path,
+    image_writer: FileBasedDataWriter,
+    scale=2,
+):
     """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
     图片存放在save_path下,文件名是:
     {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
@@ -197,7 +208,6 @@ def get_crop_img(bbox: tuple, pil_img, scale=2):
 
 
 def get_crop_np_img(bbox: tuple, input_img, scale=2):
-
     if isinstance(input_img, Image.Image):
         np_img = np.asarray(input_img)
     elif isinstance(input_img, np.ndarray):
@@ -212,17 +222,27 @@ def get_crop_np_img(bbox: tuple, input_img, scale=2):
         int(bbox[3] * scale),
     )
 
-    return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
+    return np_img[scale_bbox[1] : scale_bbox[3], scale_bbox[0] : scale_bbox[2]]
+
 
 def images_bytes_to_pdf_bytes(image_bytes):
     # 内存缓冲区
     pdf_buffer = BytesIO()
 
     # 载入并转换所有图像为 RGB 模式
-    image = Image.open(BytesIO(image_bytes))
+    image = Image.open(BytesIO(image_bytes))
+    # 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
+    image = ImageOps.exif_transpose(image) or image
+    # 只在必要时转换
+    if image.mode != "RGB":
+        image = image.convert("RGB")
 
     # 第一张图保存为 PDF,其余追加
-    image.save(…
+    image.save(
+        pdf_buffer,
+        format="PDF",
+        # save_all=True
+    )
 
     # 获取 PDF bytes 并重置指针(可选)
     pdf_bytes = pdf_buffer.getvalue()
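The functional change here is the EXIF handling: images_bytes_to_pdf_bytes() now applies ImageOps.exif_transpose() so photos carrying an EXIF Orientation tag are physically rotated before being wrapped in a PDF, and the RGB conversion only runs when needed. A standalone sketch of that flow (function name illustrative):

from io import BytesIO

from PIL import Image, ImageOps

def image_bytes_to_upright_pdf(image_bytes: bytes) -> bytes:
    """Wrap one image in a single-page PDF, honouring EXIF orientation (sketch)."""
    image = Image.open(BytesIO(image_bytes))
    # Rotate/flip pixels according to the EXIF Orientation tag, if present.
    image = ImageOps.exif_transpose(image) or image
    if image.mode != "RGB":  # PDF export expects RGB
        image = image.convert("RGB")
    buffer = BytesIO()
    image.save(buffer, format="PDF")
    return buffer.getvalue()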
mineru/utils/table_merge.py
CHANGED
@@ -9,7 +9,20 @@ from mineru.utils.char_utils import full_to_half
 from mineru.utils.enum_class import BlockType, SplitFlag
 
 
-…
+CONTINUATION_END_MARKERS = [
+    "(续)",
+    "(续表)",
+    "(续上表)",
+    "(continued)",
+    "(cont.)",
+    "(cont’d)",
+    "(…continued)",
+    "续表",
+]
+
+CONTINUATION_INLINE_MARKERS = [
+    "(continued)",
+]
 
 
 def calculate_table_total_columns(soup):
@@ -57,6 +70,69 @@ def calculate_table_total_columns(soup):
     return max_cols
 
 
+def build_table_occupied_matrix(soup):
+    """构建表格的占用矩阵,返回每行的有效列数
+
+    Args:
+        soup: BeautifulSoup解析的表格
+
+    Returns:
+        dict: {row_idx: effective_columns} 每行的有效列数(考虑rowspan占用)
+    """
+    rows = soup.find_all("tr")
+    if not rows:
+        return {}
+
+    occupied = {}  # {row_idx: {col_idx: True}}
+    row_effective_cols = {}  # {row_idx: effective_columns}
+
+    for row_idx, row in enumerate(rows):
+        col_idx = 0
+        cells = row.find_all(["td", "th"])
+
+        if row_idx not in occupied:
+            occupied[row_idx] = {}
+
+        for cell in cells:
+            # 找到下一个未被占用的列位置
+            while col_idx in occupied[row_idx]:
+                col_idx += 1
+
+            colspan = int(cell.get("colspan", 1))
+            rowspan = int(cell.get("rowspan", 1))
+
+            # 标记被这个单元格占用的所有位置
+            for r in range(row_idx, row_idx + rowspan):
+                if r not in occupied:
+                    occupied[r] = {}
+                for c in range(col_idx, col_idx + colspan):
+                    occupied[r][c] = True
+
+            col_idx += colspan
+
+        # 该行的有效列数为已占用的最大列索引+1
+        if occupied[row_idx]:
+            row_effective_cols[row_idx] = max(occupied[row_idx].keys()) + 1
+        else:
+            row_effective_cols[row_idx] = 0
+
+    return row_effective_cols
+
+
+def calculate_row_effective_columns(soup, row_idx):
+    """计算指定行的有效列数(考虑rowspan占用)
+
+    Args:
+        soup: BeautifulSoup解析的表格
+        row_idx: 行索引
+
+    Returns:
+        int: 该行的有效列数
+    """
+    row_effective_cols = build_table_occupied_matrix(soup)
+    return row_effective_cols.get(row_idx, 0)
+
+
 def calculate_row_columns(row):
     """
     计算表格行的实际列数,考虑colspan属性
@@ -106,6 +182,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
     rows1 = soup1.find_all("tr")
     rows2 = soup2.find_all("tr")
 
+    # 构建两个表格的有效列数矩阵
+    effective_cols1 = build_table_occupied_matrix(soup1)
+    effective_cols2 = build_table_occupied_matrix(soup2)
+
     min_rows = min(len(rows1), len(rows2), max_header_rows)
     header_rows = 0
     headers_match = True
@@ -123,20 +203,24 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
         if len(cells1) != len(cells2):
             structure_match = False
         else:
-            #…
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
-            …
+            # 检查有效列数是否一致(考虑rowspan影响)
+            if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
+                structure_match = False
+            else:
+                # 然后检查单元格的属性和内容
+                for cell1, cell2 in zip(cells1, cells2):
+                    colspan1 = int(cell1.get("colspan", 1))
+                    rowspan1 = int(cell1.get("rowspan", 1))
+                    colspan2 = int(cell2.get("colspan", 1))
+                    rowspan2 = int(cell2.get("rowspan", 1))
+
+                    # 去除所有空白字符(包括空格、换行、制表符等)
+                    text1 = ''.join(full_to_half(cell1.get_text()).split())
+                    text2 = ''.join(full_to_half(cell2.get_text()).split())
+
+                    if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
+                        structure_match = False
+                        break
 
         if structure_match:
             header_rows += 1
@@ -146,7 +230,54 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
             headers_match = header_rows > 0  # 只有当至少匹配了一行时,才认为表头匹配
             break
 
-    #…
+    # 如果严格匹配失败,尝试视觉一致性匹配(只比较文本内容)
+    if header_rows == 0:
+        header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
+
+    return header_rows, headers_match, header_texts
+
+
+def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
+    """
+    基于视觉一致性检测表头(只比较文本内容,忽略colspan/rowspan差异)
+
+    Args:
+        soup1: 第一个表格的BeautifulSoup对象
+        soup2: 第二个表格的BeautifulSoup对象
+        rows1: 第一个表格的行列表
+        rows2: 第二个表格的行列表
+        max_header_rows: 最大可能的表头行数
+
+    Returns:
+        tuple: (表头行数, 表头是否一致, 表头文本列表)
+    """
+    # 构建两个表格的有效列数矩阵
+    effective_cols1 = build_table_occupied_matrix(soup1)
+    effective_cols2 = build_table_occupied_matrix(soup2)
+
+    min_rows = min(len(rows1), len(rows2), max_header_rows)
+    header_rows = 0
+    headers_match = True
+    header_texts = []
+
+    for i in range(min_rows):
+        cells1 = rows1[i].find_all(["td", "th"])
+        cells2 = rows2[i].find_all(["td", "th"])
+
+        # 提取每行的文本内容列表(去除空白字符)
+        texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
+        texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
+
+        # 检查视觉一致性:文本内容完全相同,且有效列数一致
+        effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
+        if texts1 == texts2 and effective_cols_match:
+            header_rows += 1
+            row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
+            header_texts.append(row_texts)
+        else:
+            headers_match = header_rows > 0
+            break
+
     if header_rows == 0:
         headers_match = False
 
@@ -156,20 +287,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
 def can_merge_tables(current_table_block, previous_table_block):
     """判断两个表格是否可以合并"""
     # 检查表格是否有caption和footnote
+    # 计算previous_table_block中的footnote数量
+    footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
     # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
     caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
     if caption_blocks:
-        #…
+        # 检查是否至少有一个caption包含续表标识
+        has_continuation_marker = False
+        for block in caption_blocks:
+            caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
+            if (
+                any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
+                or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
+            ):
+                has_continuation_marker = True
+                break
 
-        …
-        …
-            for marker in CONTINUATION_MARKERS)
-            for block in caption_blocks
-        ):
+        # 如果所有caption都不包含续表标识,则不允许合并
+        if not has_continuation_marker:
             return False, None, None, None, None
 
-        …
-        …
+        # 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
+        if footnote_count > 1:
+            return False, None, None, None, None
+    else:
+        if footnote_count > 0:
+            return False, None, None, None, None
 
     # 获取两个表格的HTML内容
     current_html = ""
@@ -219,34 +362,44 @@ def check_rows_match(soup1, soup2):
     if not (rows1 and rows2):
         return False
 
-    #…
+    # 获取第一个表的最后一行数据行索引
+    last_row_idx = None
     last_row = None
-    for…
-        if…
-            …
+    for idx in range(len(rows1) - 1, -1, -1):
+        if rows1[idx].find_all(["td", "th"]):
+            last_row_idx = idx
+            last_row = rows1[idx]
             break
 
     # 检测表头行数,以便获取第二个表的首个数据行
     header_count, _, _ = detect_table_headers(soup1, soup2)
 
     # 获取第二个表的首个数据行
+    first_data_row_idx = None
     first_data_row = None
     if len(rows2) > header_count:
+        first_data_row_idx = header_count
         first_data_row = rows2[header_count]  # 第一个非表头行
 
     if not (last_row and first_data_row):
         return False
 
-    #…
+    # 计算有效列数(考虑rowspan和colspan)
+    last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
+    first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
+
+    # 计算实际列数(仅考虑colspan)和视觉列数
     last_row_cols = calculate_row_columns(last_row)
     first_row_cols = calculate_row_columns(first_data_row)
     last_row_visual_cols = calculate_visual_columns(last_row)
     first_row_visual_cols = calculate_visual_columns(first_data_row)
 
-    # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(视觉列数:{first_row_visual_cols})")
+    # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(有效列数:{last_row_effective_cols}, 视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(有效列数:{first_row_effective_cols}, 视觉列数:{first_row_visual_cols})")
 
-    #…
-    return…
+    # 同时考虑有效列数匹配、实际列数匹配和视觉列数匹配
+    return (last_row_effective_cols == first_row_effective_cols or
+            last_row_cols == first_row_cols or
+            last_row_visual_cols == first_row_visual_cols)
 
 
 def check_row_columns_match(row1, row2):
@@ -263,12 +416,13 @@ def check_row_columns_match(row1, row2):
     return True
 
 
-def adjust_table_rows_colspan(rows, start_idx, end_idx,
+def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
                               reference_structure, reference_visual_cols,
                               target_cols, current_cols, reference_row):
     """调整表格行的colspan属性以匹配目标列数
 
     Args:
+        soup: BeautifulSoup解析的表格对象(用于计算有效列数)
         rows: 表格行列表
        start_idx: 起始行索引
        end_idx: 结束行索引(不包含)
@@ -280,14 +434,21 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
     """
     reference_row_copy = deepcopy(reference_row)
 
+    # 构建有效列数矩阵
+    effective_cols_matrix = build_table_occupied_matrix(soup)
+
     for i in range(start_idx, end_idx):
         row = rows[i]
         cells = row.find_all(["td", "th"])
         if not cells:
             continue
 
+        # 使用有效列数(考虑rowspan)判断是否需要调整
+        current_row_effective_cols = effective_cols_matrix.get(i, 0)
         current_row_cols = calculate_row_columns(row)
-        …
+
+        # 如果有效列数或实际列数已经达到目标,则跳过
+        if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
            continue
 
        # 检查是否与参考行结构匹配
@@ -299,9 +460,12 @@
                    cell["colspan"] = str(reference_structure[j])
        else:
            # 扩展最后一个单元格以填补列数差异
-            …
-            …
-            …
+            # 使用有效列数来计算差异
+            cols_diff = target_cols - current_row_effective_cols
+            if cols_diff > 0:
+                last_cell = cells[-1]
+                current_last_span = int(last_cell.get("colspan", 1))
+                last_cell["colspan"] = str(current_last_span + cols_diff)
 
 
 def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
@@ -332,7 +496,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
         reference_visual_cols = calculate_visual_columns(last_row1)
         # 以表1的最后一行为参考,调整表2的行
         adjust_table_rows_colspan(
-            rows2, header_count, len(rows2),
+            soup2, rows2, header_count, len(rows2),
             reference_structure, reference_visual_cols,
             table_cols1, table_cols2, first_data_row2
         )
@@ -342,7 +506,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
         reference_visual_cols = calculate_visual_columns(first_data_row2)
         # 以表2的第一个数据行为参考,调整表1的行
         adjust_table_rows_colspan(
-            rows1, 0, len(rows1),
+            soup1, rows1, 0, len(rows1),
            reference_structure, reference_visual_cols,
            table_cols2, table_cols1, last_row1
        )
@@ -356,6 +520,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
        row.extract()
        tbody1.append(row)
 
+    # 清空previous_table_block的footnote
+    previous_table_block["blocks"] = [
+        block for block in previous_table_block["blocks"]
+        if block["type"] != BlockType.TABLE_FOOTNOTE
+    ]
     # 添加待合并表格的footnote到前一个表格中
     for table_footnote in wait_merge_table_footnotes:
         temp_table_footnote = table_footnote.copy()
@@ -416,4 +585,4 @@ def merge_table(page_info_list):
         # 删除当前页的table
         for block in current_table_block["blocks"]:
             block['lines'] = []
-            block[SplitFlag.LINES_DELETED] = True
+            block[SplitFlag.LINES_DELETED] = True
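The centerpiece of the cross-page merge changes is build_table_occupied_matrix(), which walks the table once and counts, per row, how many columns are effectively occupied after rowspan carry-over is accounted for. A compact standalone sketch of the same idea (BeautifulSoup assumed, as in table_merge.py):

from bs4 import BeautifulSoup

def row_effective_columns(table_html: str) -> dict:
    """Effective column count per <tr>, counting cells carried down by rowspan (sketch)."""
    soup = BeautifulSoup(table_html, "html.parser")
    occupied, result = {}, {}
    for r, row in enumerate(soup.find_all("tr")):
        occupied.setdefault(r, {})
        col = 0
        for cell in row.find_all(["td", "th"]):
            while col in occupied[r]:          # skip slots filled by an earlier rowspan
                col += 1
            colspan = int(cell.get("colspan", 1))
            rowspan = int(cell.get("rowspan", 1))
            for rr in range(r, r + rowspan):   # mark every slot this cell covers
                occupied.setdefault(rr, {})
                for cc in range(col, col + colspan):
                    occupied[rr][cc] = True
            col += colspan
        result[r] = max(occupied[r]) + 1 if occupied[r] else 0
    return result

print(row_effective_columns(
    "<table><tr><td rowspan='2'>a</td><td>b</td></tr><tr><td>c</td></tr></table>"
))  # {0: 2, 1: 2}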
mineru/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "2.7.0"
+__version__ = "2.7.2"
{mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mineru
-Version: 2.7.0
+Version: 2.7.2
 Summary: A practical tool for converting PDF to Markdown
 License: AGPL-3.0
 Project-URL: homepage, https://mineru.net/
@@ -19,7 +19,7 @@ Requires-Dist: boto3>=1.28.43
 Requires-Dist: click>=8.1.7
 Requires-Dist: loguru>=0.7.2
 Requires-Dist: numpy>=1.21.6
-Requires-Dist: pdfminer.six
+Requires-Dist: pdfminer.six>=20251230
 Requires-Dist: tqdm>=4.67.1
 Requires-Dist: requests
 Requires-Dist: httpx
@@ -60,7 +60,7 @@ Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
 Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
 Requires-Dist: doclayout_yolo==0.0.4; extra == "pipeline"
 Requires-Dist: dill<1,>=0.3.8; extra == "pipeline"
-Requires-Dist: PyYAML<7,>=6.0.…
+Requires-Dist: PyYAML<7,>=6.0.1; extra == "pipeline"
 Requires-Dist: ftfy<7,>=6.3.1; extra == "pipeline"
 Requires-Dist: shapely<3,>=2.0.7; extra == "pipeline"
 Requires-Dist: pyclipper<2,>=1.3.0; extra == "pipeline"
@@ -81,9 +81,9 @@ Requires-Dist: mineru[vlm]; extra == "core"
 Requires-Dist: mineru[pipeline]; extra == "core"
 Requires-Dist: mineru[api]; extra == "core"
 Requires-Dist: mineru[gradio]; extra == "core"
-Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "core"
 Provides-Extra: all
 Requires-Dist: mineru[core]; extra == "all"
+Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "all"
 Requires-Dist: mineru[vllm]; sys_platform == "linux" and extra == "all"
 Requires-Dist: mineru[lmdeploy]; sys_platform == "windows" and extra == "all"
 Dynamic: license-file
@@ -135,6 +135,22 @@ Dynamic: license-file
 
 # Changelog
 
+- 2026/01/23 2.7.2 Release
+  - Added support for the domestic computing platforms Hygon, Enflame, and Moore Threads. The officially supported domestic computing platforms now include:
+    - [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
+    - [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
+    - [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
+    - [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
+    - [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
+    - [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
+  - MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
+  - Cross-page table merging optimization, improving merge success rate and merge quality
+
+- 2026/01/06 2.7.1 Release
+  - Fix bug #4300
+  - Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
+  - Support automatic correction of input image EXIF orientation to improve OCR recognition accuracy #4283
+
 - 2025/12/30 2.7.0 Release
   - Simplified installation process. No need to separately install `vlm` acceleration engine dependencies. Using `uv pip install mineru[all]` during installation will install all optional backend dependencies.
   - Added new `hybrid` backend, which combines the advantages of `pipeline` and `vlm` backends. Built on vlm, it integrates some capabilities of pipeline, adding extra extensibility on top of high accuracy:
{mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
-mineru/version.py,sha256=…
+mineru/version.py,sha256=H1WLrviWKvrPzDle8EWdCYYkzljxs0mtbXigYc-xaKA,22
 mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
 mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
@@ -17,13 +17,13 @@ mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc
 mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
 mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
-mineru/backend/vlm/utils.py,sha256=…
-mineru/backend/vlm/vlm_analyze.py,sha256=…
+mineru/backend/vlm/utils.py,sha256=1qma_KmDjRfOckcPbriGgRhS1XMk_johsyACfwcmDr4,3844
+mineru/backend/vlm/vlm_analyze.py,sha256=ttnQBUy1PEm9JZoF2G1_z-7gA3MgUUUBhz6OypCb4_g,14765
 mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflbB4a-TmsU,21402
 mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
 mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
-mineru/cli/common.py,sha256=…
+mineru/cli/common.py,sha256=fMPc235DtnupQkh9uFIMHUpxOSvCp5yc3A56sAabAWY,20475
 mineru/cli/fast_api.py,sha256=TGpZqyUE1kg2eXsP76pr0p1yqNOOU9jyjL5Pc0FJwRc,16637
 mineru/cli/gradio_app.py,sha256=2IIWOm2bEHHq5BZMlfmN3yAJw1Nf8SUALTQ95o-bYy0,21863
 mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
@@ -47,13 +47,13 @@ mineru/model/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/model/layout/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/model/layout/doclayoutyolo.py,sha256=DttINdulzTiYcVDl_70oDtUdfVmGc9qkKWmbPOGAeV0,3867
 mineru/model/mfd/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
-mineru/model/mfd/yolo_v8.py,sha256=…
+mineru/model/mfd/yolo_v8.py,sha256=OI5AxVgt3FvXp4NYk0BDXXvpDlo9YjM6byDyC_TZ8Js,3714
 mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
 mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=alGX_sPJxZh_7v1sOK3DJ8akfkWO-2c5I_JR7aXMTLU,5588
 mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
-mineru/model/mfr/unimernet/Unimernet.py,sha256=…
+mineru/model/mfr/unimernet/Unimernet.py,sha256=ZK0M9fPmZziK4D33H3YND7RnHiQkRVCS-lvNfY-N7do,7912
 mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mineru/model/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
 mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=_lN3zDKxeqsW-h9tXx79DYiT5uT4P9ixG49WrSYKFxE,7551
@@ -88,7 +88,7 @@ mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGf
 mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
 mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
 mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
-mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=…
+mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=6z0jYO6S8wAmfHe5tAyEfzWZIQv8wrn_dRU9GC7oKro,11435
 mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
 mineru/model/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/model/utils/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
 mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
 mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
-mineru/model/vlm/vllm_server.py,sha256=…
+mineru/model/vlm/vllm_server.py,sha256=DtYRAHINYN4qkR2onVMofvANPTkSP6tE4IRY_vZgpiA,3079
 mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
 mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
 mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
-mineru/utils/block_sort.py,sha256=…
+mineru/utils/block_sort.py,sha256=MmgjZBcmaWssAglzE75VixjtJ_BLNUHO0gvCNQHvlY4,13538
 mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
 mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
 mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
 mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
-mineru/utils/config_reader.py,sha256=…
+mineru/utils/config_reader.py,sha256=mfulokOzI-33sZy7S-wEVbY3z01PdC7X3761fuhqR3s,4393
 mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
 mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
 mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
@@ -172,22 +172,22 @@ mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,85
 mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
 mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
 mineru/utils/magic_model_utils.py,sha256=I6vdN56aqhQBGOasoWHiJbjnXsBwUojw6xFjbWZSHaU,8656
-mineru/utils/model_utils.py,sha256=…
+mineru/utils/model_utils.py,sha256=w-jSN7Ilh27FlMjPpKNO6MPbo_dT5Ln7zCQcXaREl_k,19605
 mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
 mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
 mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
 mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
-mineru/utils/pdf_image_tools.py,sha256=…
+mineru/utils/pdf_image_tools.py,sha256=L2kHKoFaQo4CGjS1d68JACrlBycx6gyCnnFlbBFRKuw,8273
 mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
 mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
 mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
 mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
 mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
 mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
-mineru/utils/table_merge.py,sha256=…
-mineru-2.7.…
-mineru-2.7.…
-mineru-2.7.…
-mineru-2.7.…
-mineru-2.7.…
-mineru-2.7.…
+mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
+mineru-2.7.2.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
+mineru-2.7.2.dist-info/METADATA,sha256=w3qS7X-Wjvqz8Ra5fp0QH-Wvq_RbZHGyaVOL8WIrerw,36621
+mineru-2.7.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+mineru-2.7.2.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
+mineru-2.7.2.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
+mineru-2.7.2.dist-info/RECORD,,
{mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/entry_points.txt
File without changes

{mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/licenses/LICENSE.md
File without changes

{mineru-2.7.0.dist-info → mineru-2.7.2.dist-info}/top_level.txt
File without changes