mineru 2.7.1__py3-none-any.whl → 2.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/vlm/utils.py +4 -0
- mineru/backend/vlm/vlm_analyze.py +55 -0
- mineru/model/mfd/yolo_v8.py +4 -2
- mineru/model/mfr/unimernet/Unimernet.py +3 -3
- mineru/model/table/rec/unet_table/utils_table_line_rec.py +7 -1
- mineru/model/vlm/vllm_server.py +18 -0
- mineru/utils/block_sort.py +12 -0
- mineru/utils/config_reader.py +9 -1
- mineru/utils/model_utils.py +13 -1
- mineru/utils/table_merge.py +202 -40
- mineru/version.py +1 -1
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/METADATA +13 -2
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/RECORD +17 -17
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/WHEEL +1 -1
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.1.dist-info → mineru-2.7.2.dist-info}/top_level.txt +0 -0
mineru/backend/vlm/utils.py
CHANGED
|
@@ -18,6 +18,10 @@ def enable_custom_logits_processors() -> bool:
|
|
|
18
18
|
compute_capability = f"{major}.{minor}"
|
|
19
19
|
elif hasattr(torch, 'npu') and torch.npu.is_available():
|
|
20
20
|
compute_capability = "8.0"
|
|
21
|
+
elif hasattr(torch, 'gcu') and torch.gcu.is_available():
|
|
22
|
+
compute_capability = "8.0"
|
|
23
|
+
elif hasattr(torch, 'musa') and torch.musa.is_available():
|
|
24
|
+
compute_capability = "8.0"
|
|
21
25
|
else:
|
|
22
26
|
logger.info("CUDA not available, disabling custom_logits_processors")
|
|
23
27
|
return False
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
+
import json
|
|
4
5
|
|
|
5
6
|
from loguru import logger
|
|
6
7
|
|
|
@@ -99,6 +100,30 @@ class ModelSingleton:
|
|
|
99
100
|
import vllm
|
|
100
101
|
except ImportError:
|
|
101
102
|
raise ImportError("Please install vllm to use the vllm-engine backend.")
|
|
103
|
+
|
|
104
|
+
"""
|
|
105
|
+
# musa vllm v1 引擎特殊配置
|
|
106
|
+
device = get_device()
|
|
107
|
+
if device.startswith("musa"):
|
|
108
|
+
import torch
|
|
109
|
+
if torch.musa.is_available():
|
|
110
|
+
compilation_config = {
|
|
111
|
+
"cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
112
|
+
"simple_cuda_graph": True
|
|
113
|
+
}
|
|
114
|
+
block_size = 32
|
|
115
|
+
kwargs["compilation_config"] = compilation_config
|
|
116
|
+
kwargs["block_size"] = block_size
|
|
117
|
+
"""
|
|
118
|
+
|
|
119
|
+
if "compilation_config" in kwargs:
|
|
120
|
+
if isinstance(kwargs["compilation_config"], str):
|
|
121
|
+
try:
|
|
122
|
+
kwargs["compilation_config"] = json.loads(kwargs["compilation_config"])
|
|
123
|
+
except json.JSONDecodeError:
|
|
124
|
+
logger.warning(
|
|
125
|
+
f"Failed to parse compilation_config as JSON: {kwargs['compilation_config']}")
|
|
126
|
+
del kwargs["compilation_config"]
|
|
102
127
|
if "gpu_memory_utilization" not in kwargs:
|
|
103
128
|
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
|
|
104
129
|
if "model" not in kwargs:
|
|
@@ -112,8 +137,38 @@ class ModelSingleton:
|
|
|
112
137
|
try:
|
|
113
138
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
|
114
139
|
from vllm.v1.engine.async_llm import AsyncLLM
|
|
140
|
+
from vllm.config import CompilationConfig
|
|
115
141
|
except ImportError:
|
|
116
142
|
raise ImportError("Please install vllm to use the vllm-async-engine backend.")
|
|
143
|
+
|
|
144
|
+
"""
|
|
145
|
+
# musa vllm v1 引擎特殊配置
|
|
146
|
+
device = get_device()
|
|
147
|
+
if device.startswith("musa"):
|
|
148
|
+
import torch
|
|
149
|
+
if torch.musa.is_available():
|
|
150
|
+
compilation_config = CompilationConfig(
|
|
151
|
+
cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
|
|
152
|
+
simple_cuda_graph=True
|
|
153
|
+
)
|
|
154
|
+
block_size = 32
|
|
155
|
+
kwargs["compilation_config"] = compilation_config
|
|
156
|
+
kwargs["block_size"] = block_size
|
|
157
|
+
"""
|
|
158
|
+
|
|
159
|
+
if "compilation_config" in kwargs:
|
|
160
|
+
if isinstance(kwargs["compilation_config"], dict):
|
|
161
|
+
# 如果是字典,转换为 CompilationConfig 对象
|
|
162
|
+
kwargs["compilation_config"] = CompilationConfig(**kwargs["compilation_config"])
|
|
163
|
+
elif isinstance(kwargs["compilation_config"], str):
|
|
164
|
+
# 如果是 JSON 字符串,先解析再转换
|
|
165
|
+
try:
|
|
166
|
+
config_dict = json.loads(kwargs["compilation_config"])
|
|
167
|
+
kwargs["compilation_config"] = CompilationConfig(**config_dict)
|
|
168
|
+
except (json.JSONDecodeError, TypeError) as e:
|
|
169
|
+
logger.warning(
|
|
170
|
+
f"Failed to parse compilation_config: {kwargs['compilation_config']}, error: {e}")
|
|
171
|
+
del kwargs["compilation_config"]
|
|
117
172
|
if "gpu_memory_utilization" not in kwargs:
|
|
118
173
|
kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
|
|
119
174
|
if "model" not in kwargs:
|
mineru/model/mfd/yolo_v8.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import List, Union
|
|
3
|
+
|
|
4
|
+
import torch
|
|
3
5
|
from tqdm import tqdm
|
|
4
6
|
from ultralytics import YOLO
|
|
5
7
|
import numpy as np
|
|
@@ -18,8 +20,8 @@ class YOLOv8MFDModel:
|
|
|
18
20
|
conf: float = 0.25,
|
|
19
21
|
iou: float = 0.45,
|
|
20
22
|
):
|
|
21
|
-
self.
|
|
22
|
-
self.
|
|
23
|
+
self.device = torch.device(device)
|
|
24
|
+
self.model = YOLO(weight).to(self.device)
|
|
23
25
|
self.imgsz = imgsz
|
|
24
26
|
self.conf = conf
|
|
25
27
|
self.iou = iou
|
|
@@ -23,12 +23,12 @@ class MathDataset(Dataset):
|
|
|
23
23
|
class UnimernetModel(object):
|
|
24
24
|
def __init__(self, weight_dir, _device_="cpu"):
|
|
25
25
|
from .unimernet_hf import UnimernetModel
|
|
26
|
-
if _device_.startswith("mps") or _device_.startswith("npu"):
|
|
26
|
+
if _device_.startswith("mps") or _device_.startswith("npu") or _device_.startswith("musa"):
|
|
27
27
|
self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
|
|
28
28
|
else:
|
|
29
29
|
self.model = UnimernetModel.from_pretrained(weight_dir)
|
|
30
|
-
self.device = _device_
|
|
31
|
-
self.model.to(
|
|
30
|
+
self.device = torch.device(_device_)
|
|
31
|
+
self.model.to(self.device)
|
|
32
32
|
if not _device_.startswith("cpu"):
|
|
33
33
|
self.model = self.model.to(dtype=torch.float16)
|
|
34
34
|
self.model.eval()
|
|
@@ -4,6 +4,8 @@ import cv2
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
from scipy.spatial import distance as dist
|
|
6
6
|
from skimage import measure
|
|
7
|
+
from skimage import __version__ as skimage_version
|
|
8
|
+
from packaging import version
|
|
7
9
|
|
|
8
10
|
|
|
9
11
|
def transform_preds(coords, center, scale, output_size, rot=0):
|
|
@@ -295,7 +297,11 @@ def min_area_rect_box(
|
|
|
295
297
|
"""
|
|
296
298
|
boxes = []
|
|
297
299
|
for region in regions:
|
|
298
|
-
if
|
|
300
|
+
if version.parse(skimage_version) >= version.parse("0.26.0"):
|
|
301
|
+
region_bbox_area = region.area_bbox
|
|
302
|
+
else:
|
|
303
|
+
region_bbox_area = region.bbox_area
|
|
304
|
+
if region_bbox_area > H * W * 3 / 4: # 过滤大的单元格
|
|
299
305
|
continue
|
|
300
306
|
rect = cv2.minAreaRect(region.coords[:, ::-1])
|
|
301
307
|
|
mineru/model/vlm/vllm_server.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
import sys
|
|
3
3
|
|
|
4
4
|
from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
|
|
5
|
+
from mineru.utils.config_reader import get_device
|
|
5
6
|
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
|
|
6
7
|
|
|
7
8
|
from vllm.entrypoints.cli.main import main as vllm_main
|
|
@@ -13,6 +14,8 @@ def main():
|
|
|
13
14
|
has_port_arg = False
|
|
14
15
|
has_gpu_memory_utilization_arg = False
|
|
15
16
|
has_logits_processors_arg = False
|
|
17
|
+
has_block_size_arg = False
|
|
18
|
+
has_compilation_config = False
|
|
16
19
|
model_path = None
|
|
17
20
|
model_arg_indices = []
|
|
18
21
|
|
|
@@ -24,6 +27,10 @@ def main():
|
|
|
24
27
|
has_gpu_memory_utilization_arg = True
|
|
25
28
|
if arg == "--logits-processors" or arg.startswith("--logits-processors="):
|
|
26
29
|
has_logits_processors_arg = True
|
|
30
|
+
if arg == "--block-size" or arg.startswith("--block-size="):
|
|
31
|
+
has_block_size_arg = True
|
|
32
|
+
if arg == "--compilation-config" or arg.startswith("--compilation-config="):
|
|
33
|
+
has_compilation_config = True
|
|
27
34
|
if arg == "--model":
|
|
28
35
|
if i + 1 < len(args):
|
|
29
36
|
model_path = args[i + 1]
|
|
@@ -49,6 +56,17 @@ def main():
|
|
|
49
56
|
model_path = auto_download_and_get_model_root_path("/", "vlm")
|
|
50
57
|
if (not has_logits_processors_arg) and custom_logits_processors:
|
|
51
58
|
args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
|
|
59
|
+
"""
|
|
60
|
+
# musa vllm v1 引擎特殊配置
|
|
61
|
+
device = get_device()
|
|
62
|
+
if device.startswith("musa"):
|
|
63
|
+
import torch
|
|
64
|
+
if torch.musa.is_available():
|
|
65
|
+
if not has_block_size_arg:
|
|
66
|
+
args.extend(["--block-size", "32"])
|
|
67
|
+
if not has_compilation_config:
|
|
68
|
+
args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
|
|
69
|
+
"""
|
|
52
70
|
|
|
53
71
|
# 重构参数,将模型路径作为位置参数
|
|
54
72
|
sys.argv = [sys.argv[0]] + ["serve", model_path] + args
|
mineru/utils/block_sort.py
CHANGED
|
@@ -186,6 +186,18 @@ def model_init(model_name: str):
|
|
|
186
186
|
bf_16_support = True
|
|
187
187
|
elif device_name.startswith("mps"):
|
|
188
188
|
bf_16_support = True
|
|
189
|
+
elif device_name.startswith("gcu"):
|
|
190
|
+
if hasattr(torch, 'gcu') and torch.gcu.is_available():
|
|
191
|
+
if torch.gcu.is_bf16_supported():
|
|
192
|
+
bf_16_support = True
|
|
193
|
+
elif device_name.startswith("musa"):
|
|
194
|
+
if hasattr(torch, 'musa') and torch.musa.is_available():
|
|
195
|
+
if torch.musa.is_bf16_supported():
|
|
196
|
+
bf_16_support = True
|
|
197
|
+
elif device_name.startswith("npu"):
|
|
198
|
+
if hasattr(torch, 'npu') and torch.npu.is_available():
|
|
199
|
+
if torch.npu.is_bf16_supported():
|
|
200
|
+
bf_16_support = True
|
|
189
201
|
|
|
190
202
|
if model_name == 'layoutreader':
|
|
191
203
|
# 检测modelscope的缓存目录是否存在
|
mineru/utils/config_reader.py
CHANGED
|
@@ -86,7 +86,15 @@ def get_device():
|
|
|
86
86
|
if torch_npu.npu.is_available():
|
|
87
87
|
return "npu"
|
|
88
88
|
except Exception as e:
|
|
89
|
-
|
|
89
|
+
try:
|
|
90
|
+
if torch.gcu.is_available():
|
|
91
|
+
return "gcu"
|
|
92
|
+
except Exception as e:
|
|
93
|
+
try:
|
|
94
|
+
if torch.musa.is_available():
|
|
95
|
+
return "musa"
|
|
96
|
+
except Exception as e:
|
|
97
|
+
pass
|
|
90
98
|
return "cpu"
|
|
91
99
|
|
|
92
100
|
|
mineru/utils/model_utils.py
CHANGED
|
@@ -414,7 +414,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
|
|
|
414
414
|
|
|
415
415
|
|
|
416
416
|
def clean_memory(device='cuda'):
|
|
417
|
-
if device
|
|
417
|
+
if str(device).startswith("cuda"):
|
|
418
418
|
if torch.cuda.is_available():
|
|
419
419
|
torch.cuda.empty_cache()
|
|
420
420
|
torch.cuda.ipc_collect()
|
|
@@ -423,6 +423,12 @@ def clean_memory(device='cuda'):
|
|
|
423
423
|
torch_npu.npu.empty_cache()
|
|
424
424
|
elif str(device).startswith("mps"):
|
|
425
425
|
torch.mps.empty_cache()
|
|
426
|
+
elif str(device).startswith("gcu"):
|
|
427
|
+
if torch.gcu.is_available():
|
|
428
|
+
torch.gcu.empty_cache()
|
|
429
|
+
elif str(device).startswith("musa"):
|
|
430
|
+
if torch.musa.is_available():
|
|
431
|
+
torch.musa.empty_cache()
|
|
426
432
|
gc.collect()
|
|
427
433
|
|
|
428
434
|
|
|
@@ -458,5 +464,11 @@ def get_vram(device) -> int:
|
|
|
458
464
|
elif str(device).startswith("npu"):
|
|
459
465
|
if torch_npu.npu.is_available():
|
|
460
466
|
total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
467
|
+
elif str(device).startswith("gcu"):
|
|
468
|
+
if torch.gcu.is_available():
|
|
469
|
+
total_memory = round(torch.gcu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
470
|
+
elif str(device).startswith("musa"):
|
|
471
|
+
if torch.musa.is_available():
|
|
472
|
+
total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
|
|
461
473
|
|
|
462
474
|
return total_memory
|
mineru/utils/table_merge.py
CHANGED
|
@@ -9,13 +9,19 @@ from mineru.utils.char_utils import full_to_half
|
|
|
9
9
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
CONTINUATION_END_MARKERS = [
|
|
13
13
|
"(续)",
|
|
14
14
|
"(续表)",
|
|
15
15
|
"(续上表)",
|
|
16
16
|
"(continued)",
|
|
17
17
|
"(cont.)",
|
|
18
18
|
"(cont’d)",
|
|
19
|
+
"(…continued)",
|
|
20
|
+
"续表",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
CONTINUATION_INLINE_MARKERS = [
|
|
24
|
+
"(continued)",
|
|
19
25
|
]
|
|
20
26
|
|
|
21
27
|
|
|
@@ -64,6 +70,69 @@ def calculate_table_total_columns(soup):
|
|
|
64
70
|
return max_cols
|
|
65
71
|
|
|
66
72
|
|
|
73
|
+
def build_table_occupied_matrix(soup):
|
|
74
|
+
"""构建表格的占用矩阵,返回每行的有效列数
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
soup: BeautifulSoup解析的表格
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
dict: {row_idx: effective_columns} 每行的有效列数(考虑rowspan占用)
|
|
81
|
+
"""
|
|
82
|
+
rows = soup.find_all("tr")
|
|
83
|
+
if not rows:
|
|
84
|
+
return {}
|
|
85
|
+
|
|
86
|
+
occupied = {} # {row_idx: {col_idx: True}}
|
|
87
|
+
row_effective_cols = {} # {row_idx: effective_columns}
|
|
88
|
+
|
|
89
|
+
for row_idx, row in enumerate(rows):
|
|
90
|
+
col_idx = 0
|
|
91
|
+
cells = row.find_all(["td", "th"])
|
|
92
|
+
|
|
93
|
+
if row_idx not in occupied:
|
|
94
|
+
occupied[row_idx] = {}
|
|
95
|
+
|
|
96
|
+
for cell in cells:
|
|
97
|
+
# 找到下一个未被占用的列位置
|
|
98
|
+
while col_idx in occupied[row_idx]:
|
|
99
|
+
col_idx += 1
|
|
100
|
+
|
|
101
|
+
colspan = int(cell.get("colspan", 1))
|
|
102
|
+
rowspan = int(cell.get("rowspan", 1))
|
|
103
|
+
|
|
104
|
+
# 标记被这个单元格占用的所有位置
|
|
105
|
+
for r in range(row_idx, row_idx + rowspan):
|
|
106
|
+
if r not in occupied:
|
|
107
|
+
occupied[r] = {}
|
|
108
|
+
for c in range(col_idx, col_idx + colspan):
|
|
109
|
+
occupied[r][c] = True
|
|
110
|
+
|
|
111
|
+
col_idx += colspan
|
|
112
|
+
|
|
113
|
+
# 该行的有效列数为已占用的最大列索引+1
|
|
114
|
+
if occupied[row_idx]:
|
|
115
|
+
row_effective_cols[row_idx] = max(occupied[row_idx].keys()) + 1
|
|
116
|
+
else:
|
|
117
|
+
row_effective_cols[row_idx] = 0
|
|
118
|
+
|
|
119
|
+
return row_effective_cols
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def calculate_row_effective_columns(soup, row_idx):
|
|
123
|
+
"""计算指定行的有效列数(考虑rowspan占用)
|
|
124
|
+
|
|
125
|
+
Args:
|
|
126
|
+
soup: BeautifulSoup解析的表格
|
|
127
|
+
row_idx: 行索引
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
int: 该行的有效列数
|
|
131
|
+
"""
|
|
132
|
+
row_effective_cols = build_table_occupied_matrix(soup)
|
|
133
|
+
return row_effective_cols.get(row_idx, 0)
|
|
134
|
+
|
|
135
|
+
|
|
67
136
|
def calculate_row_columns(row):
|
|
68
137
|
"""
|
|
69
138
|
计算表格行的实际列数,考虑colspan属性
|
|
@@ -113,6 +182,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
|
|
|
113
182
|
rows1 = soup1.find_all("tr")
|
|
114
183
|
rows2 = soup2.find_all("tr")
|
|
115
184
|
|
|
185
|
+
# 构建两个表格的有效列数矩阵
|
|
186
|
+
effective_cols1 = build_table_occupied_matrix(soup1)
|
|
187
|
+
effective_cols2 = build_table_occupied_matrix(soup2)
|
|
188
|
+
|
|
116
189
|
min_rows = min(len(rows1), len(rows2), max_header_rows)
|
|
117
190
|
header_rows = 0
|
|
118
191
|
headers_match = True
|
|
@@ -130,20 +203,24 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
|
|
|
130
203
|
if len(cells1) != len(cells2):
|
|
131
204
|
structure_match = False
|
|
132
205
|
else:
|
|
133
|
-
#
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
206
|
+
# 检查有效列数是否一致(考虑rowspan影响)
|
|
207
|
+
if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
|
|
208
|
+
structure_match = False
|
|
209
|
+
else:
|
|
210
|
+
# 然后检查单元格的属性和内容
|
|
211
|
+
for cell1, cell2 in zip(cells1, cells2):
|
|
212
|
+
colspan1 = int(cell1.get("colspan", 1))
|
|
213
|
+
rowspan1 = int(cell1.get("rowspan", 1))
|
|
214
|
+
colspan2 = int(cell2.get("colspan", 1))
|
|
215
|
+
rowspan2 = int(cell2.get("rowspan", 1))
|
|
216
|
+
|
|
217
|
+
# 去除所有空白字符(包括空格、换行、制表符等)
|
|
218
|
+
text1 = ''.join(full_to_half(cell1.get_text()).split())
|
|
219
|
+
text2 = ''.join(full_to_half(cell2.get_text()).split())
|
|
220
|
+
|
|
221
|
+
if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
|
|
222
|
+
structure_match = False
|
|
223
|
+
break
|
|
147
224
|
|
|
148
225
|
if structure_match:
|
|
149
226
|
header_rows += 1
|
|
@@ -153,7 +230,54 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
|
|
|
153
230
|
headers_match = header_rows > 0 # 只有当至少匹配了一行时,才认为表头匹配
|
|
154
231
|
break
|
|
155
232
|
|
|
156
|
-
#
|
|
233
|
+
# 如果严格匹配失败,尝试视觉一致性匹配(只比较文本内容)
|
|
234
|
+
if header_rows == 0:
|
|
235
|
+
header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
|
|
236
|
+
|
|
237
|
+
return header_rows, headers_match, header_texts
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
|
|
241
|
+
"""
|
|
242
|
+
基于视觉一致性检测表头(只比较文本内容,忽略colspan/rowspan差异)
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
soup1: 第一个表格的BeautifulSoup对象
|
|
246
|
+
soup2: 第二个表格的BeautifulSoup对象
|
|
247
|
+
rows1: 第一个表格的行列表
|
|
248
|
+
rows2: 第二个表格的行列表
|
|
249
|
+
max_header_rows: 最大可能的表头行数
|
|
250
|
+
|
|
251
|
+
Returns:
|
|
252
|
+
tuple: (表头行数, 表头是否一致, 表头文本列表)
|
|
253
|
+
"""
|
|
254
|
+
# 构建两个表格的有效列数矩阵
|
|
255
|
+
effective_cols1 = build_table_occupied_matrix(soup1)
|
|
256
|
+
effective_cols2 = build_table_occupied_matrix(soup2)
|
|
257
|
+
|
|
258
|
+
min_rows = min(len(rows1), len(rows2), max_header_rows)
|
|
259
|
+
header_rows = 0
|
|
260
|
+
headers_match = True
|
|
261
|
+
header_texts = []
|
|
262
|
+
|
|
263
|
+
for i in range(min_rows):
|
|
264
|
+
cells1 = rows1[i].find_all(["td", "th"])
|
|
265
|
+
cells2 = rows2[i].find_all(["td", "th"])
|
|
266
|
+
|
|
267
|
+
# 提取每行的文本内容列表(去除空白字符)
|
|
268
|
+
texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
|
|
269
|
+
texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
|
|
270
|
+
|
|
271
|
+
# 检查视觉一致性:文本内容完全相同,且有效列数一致
|
|
272
|
+
effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
|
|
273
|
+
if texts1 == texts2 and effective_cols_match:
|
|
274
|
+
header_rows += 1
|
|
275
|
+
row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
|
|
276
|
+
header_texts.append(row_texts)
|
|
277
|
+
else:
|
|
278
|
+
headers_match = header_rows > 0
|
|
279
|
+
break
|
|
280
|
+
|
|
157
281
|
if header_rows == 0:
|
|
158
282
|
headers_match = False
|
|
159
283
|
|
|
@@ -163,20 +287,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
|
|
|
163
287
|
def can_merge_tables(current_table_block, previous_table_block):
|
|
164
288
|
"""判断两个表格是否可以合并"""
|
|
165
289
|
# 检查表格是否有caption和footnote
|
|
290
|
+
# 计算previous_table_block中的footnote数量
|
|
291
|
+
footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
|
|
166
292
|
# 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
|
|
167
293
|
caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
|
|
168
294
|
if caption_blocks:
|
|
169
|
-
#
|
|
295
|
+
# 检查是否至少有一个caption包含续表标识
|
|
296
|
+
has_continuation_marker = False
|
|
297
|
+
for block in caption_blocks:
|
|
298
|
+
caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
|
|
299
|
+
if (
|
|
300
|
+
any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
|
|
301
|
+
or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
|
|
302
|
+
):
|
|
303
|
+
has_continuation_marker = True
|
|
304
|
+
break
|
|
170
305
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
for marker in CONTINUATION_MARKERS)
|
|
174
|
-
for block in caption_blocks
|
|
175
|
-
):
|
|
306
|
+
# 如果所有caption都不包含续表标识,则不允许合并
|
|
307
|
+
if not has_continuation_marker:
|
|
176
308
|
return False, None, None, None, None
|
|
177
309
|
|
|
178
|
-
|
|
179
|
-
|
|
310
|
+
# 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
|
|
311
|
+
if footnote_count > 1:
|
|
312
|
+
return False, None, None, None, None
|
|
313
|
+
else:
|
|
314
|
+
if footnote_count > 0:
|
|
315
|
+
return False, None, None, None, None
|
|
180
316
|
|
|
181
317
|
# 获取两个表格的HTML内容
|
|
182
318
|
current_html = ""
|
|
@@ -226,34 +362,44 @@ def check_rows_match(soup1, soup2):
|
|
|
226
362
|
if not (rows1 and rows2):
|
|
227
363
|
return False
|
|
228
364
|
|
|
229
|
-
#
|
|
365
|
+
# 获取第一个表的最后一行数据行索引
|
|
366
|
+
last_row_idx = None
|
|
230
367
|
last_row = None
|
|
231
|
-
for
|
|
232
|
-
if
|
|
233
|
-
|
|
368
|
+
for idx in range(len(rows1) - 1, -1, -1):
|
|
369
|
+
if rows1[idx].find_all(["td", "th"]):
|
|
370
|
+
last_row_idx = idx
|
|
371
|
+
last_row = rows1[idx]
|
|
234
372
|
break
|
|
235
373
|
|
|
236
374
|
# 检测表头行数,以便获取第二个表的首个数据行
|
|
237
375
|
header_count, _, _ = detect_table_headers(soup1, soup2)
|
|
238
376
|
|
|
239
377
|
# 获取第二个表的首个数据行
|
|
378
|
+
first_data_row_idx = None
|
|
240
379
|
first_data_row = None
|
|
241
380
|
if len(rows2) > header_count:
|
|
381
|
+
first_data_row_idx = header_count
|
|
242
382
|
first_data_row = rows2[header_count] # 第一个非表头行
|
|
243
383
|
|
|
244
384
|
if not (last_row and first_data_row):
|
|
245
385
|
return False
|
|
246
386
|
|
|
247
|
-
#
|
|
387
|
+
# 计算有效列数(考虑rowspan和colspan)
|
|
388
|
+
last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
|
|
389
|
+
first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
|
|
390
|
+
|
|
391
|
+
# 计算实际列数(仅考虑colspan)和视觉列数
|
|
248
392
|
last_row_cols = calculate_row_columns(last_row)
|
|
249
393
|
first_row_cols = calculate_row_columns(first_data_row)
|
|
250
394
|
last_row_visual_cols = calculate_visual_columns(last_row)
|
|
251
395
|
first_row_visual_cols = calculate_visual_columns(first_data_row)
|
|
252
396
|
|
|
253
|
-
# logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(视觉列数:{first_row_visual_cols})")
|
|
397
|
+
# logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(有效列数:{last_row_effective_cols}, 视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(有效列数:{first_row_effective_cols}, 视觉列数:{first_row_visual_cols})")
|
|
254
398
|
|
|
255
|
-
#
|
|
256
|
-
return
|
|
399
|
+
# 同时考虑有效列数匹配、实际列数匹配和视觉列数匹配
|
|
400
|
+
return (last_row_effective_cols == first_row_effective_cols or
|
|
401
|
+
last_row_cols == first_row_cols or
|
|
402
|
+
last_row_visual_cols == first_row_visual_cols)
|
|
257
403
|
|
|
258
404
|
|
|
259
405
|
def check_row_columns_match(row1, row2):
|
|
@@ -270,12 +416,13 @@ def check_row_columns_match(row1, row2):
|
|
|
270
416
|
return True
|
|
271
417
|
|
|
272
418
|
|
|
273
|
-
def adjust_table_rows_colspan(rows, start_idx, end_idx,
|
|
419
|
+
def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
|
|
274
420
|
reference_structure, reference_visual_cols,
|
|
275
421
|
target_cols, current_cols, reference_row):
|
|
276
422
|
"""调整表格行的colspan属性以匹配目标列数
|
|
277
423
|
|
|
278
424
|
Args:
|
|
425
|
+
soup: BeautifulSoup解析的表格对象(用于计算有效列数)
|
|
279
426
|
rows: 表格行列表
|
|
280
427
|
start_idx: 起始行索引
|
|
281
428
|
end_idx: 结束行索引(不包含)
|
|
@@ -287,14 +434,21 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
|
|
|
287
434
|
"""
|
|
288
435
|
reference_row_copy = deepcopy(reference_row)
|
|
289
436
|
|
|
437
|
+
# 构建有效列数矩阵
|
|
438
|
+
effective_cols_matrix = build_table_occupied_matrix(soup)
|
|
439
|
+
|
|
290
440
|
for i in range(start_idx, end_idx):
|
|
291
441
|
row = rows[i]
|
|
292
442
|
cells = row.find_all(["td", "th"])
|
|
293
443
|
if not cells:
|
|
294
444
|
continue
|
|
295
445
|
|
|
446
|
+
# 使用有效列数(考虑rowspan)判断是否需要调整
|
|
447
|
+
current_row_effective_cols = effective_cols_matrix.get(i, 0)
|
|
296
448
|
current_row_cols = calculate_row_columns(row)
|
|
297
|
-
|
|
449
|
+
|
|
450
|
+
# 如果有效列数或实际列数已经达到目标,则跳过
|
|
451
|
+
if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
|
|
298
452
|
continue
|
|
299
453
|
|
|
300
454
|
# 检查是否与参考行结构匹配
|
|
@@ -306,9 +460,12 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
|
|
|
306
460
|
cell["colspan"] = str(reference_structure[j])
|
|
307
461
|
else:
|
|
308
462
|
# 扩展最后一个单元格以填补列数差异
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
463
|
+
# 使用有效列数来计算差异
|
|
464
|
+
cols_diff = target_cols - current_row_effective_cols
|
|
465
|
+
if cols_diff > 0:
|
|
466
|
+
last_cell = cells[-1]
|
|
467
|
+
current_last_span = int(last_cell.get("colspan", 1))
|
|
468
|
+
last_cell["colspan"] = str(current_last_span + cols_diff)
|
|
312
469
|
|
|
313
470
|
|
|
314
471
|
def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
|
|
@@ -339,7 +496,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
|
|
|
339
496
|
reference_visual_cols = calculate_visual_columns(last_row1)
|
|
340
497
|
# 以表1的最后一行为参考,调整表2的行
|
|
341
498
|
adjust_table_rows_colspan(
|
|
342
|
-
rows2, header_count, len(rows2),
|
|
499
|
+
soup2, rows2, header_count, len(rows2),
|
|
343
500
|
reference_structure, reference_visual_cols,
|
|
344
501
|
table_cols1, table_cols2, first_data_row2
|
|
345
502
|
)
|
|
@@ -349,7 +506,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
|
|
|
349
506
|
reference_visual_cols = calculate_visual_columns(first_data_row2)
|
|
350
507
|
# 以表2的第一个数据行为参考,调整表1的行
|
|
351
508
|
adjust_table_rows_colspan(
|
|
352
|
-
rows1, 0, len(rows1),
|
|
509
|
+
soup1, rows1, 0, len(rows1),
|
|
353
510
|
reference_structure, reference_visual_cols,
|
|
354
511
|
table_cols2, table_cols1, last_row1
|
|
355
512
|
)
|
|
@@ -363,6 +520,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
|
|
|
363
520
|
row.extract()
|
|
364
521
|
tbody1.append(row)
|
|
365
522
|
|
|
523
|
+
# 清空previous_table_block的footnote
|
|
524
|
+
previous_table_block["blocks"] = [
|
|
525
|
+
block for block in previous_table_block["blocks"]
|
|
526
|
+
if block["type"] != BlockType.TABLE_FOOTNOTE
|
|
527
|
+
]
|
|
366
528
|
# 添加待合并表格的footnote到前一个表格中
|
|
367
529
|
for table_footnote in wait_merge_table_footnotes:
|
|
368
530
|
temp_table_footnote = table_footnote.copy()
|
|
@@ -423,4 +585,4 @@ def merge_table(page_info_list):
|
|
|
423
585
|
# 删除当前页的table
|
|
424
586
|
for block in current_table_block["blocks"]:
|
|
425
587
|
block['lines'] = []
|
|
426
|
-
block[SplitFlag.LINES_DELETED] = True
|
|
588
|
+
block[SplitFlag.LINES_DELETED] = True
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.
|
|
1
|
+
__version__ = "2.7.2"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mineru
|
|
3
|
-
Version: 2.7.
|
|
3
|
+
Version: 2.7.2
|
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
|
5
5
|
License: AGPL-3.0
|
|
6
6
|
Project-URL: homepage, https://mineru.net/
|
|
@@ -60,7 +60,7 @@ Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
|
|
|
60
60
|
Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
|
|
61
61
|
Requires-Dist: doclayout_yolo==0.0.4; extra == "pipeline"
|
|
62
62
|
Requires-Dist: dill<1,>=0.3.8; extra == "pipeline"
|
|
63
|
-
Requires-Dist: PyYAML<7,>=6.0.
|
|
63
|
+
Requires-Dist: PyYAML<7,>=6.0.1; extra == "pipeline"
|
|
64
64
|
Requires-Dist: ftfy<7,>=6.3.1; extra == "pipeline"
|
|
65
65
|
Requires-Dist: shapely<3,>=2.0.7; extra == "pipeline"
|
|
66
66
|
Requires-Dist: pyclipper<2,>=1.3.0; extra == "pipeline"
|
|
@@ -135,6 +135,17 @@ Dynamic: license-file
|
|
|
135
135
|
|
|
136
136
|
# Changelog
|
|
137
137
|
|
|
138
|
+
- 2026/01/23 2.7.2 Release
|
|
139
|
+
- Added support for domestic computing platforms Hygon, Enflame, and Moore Threads. Currently, the officially supported domestic computing platforms include:
|
|
140
|
+
- [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
|
|
141
|
+
- [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
|
|
142
|
+
- [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
|
|
143
|
+
- [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
|
|
144
|
+
- [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
|
|
145
|
+
- [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
|
|
146
|
+
- MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
|
|
147
|
+
- Cross-page table merging optimization, improving merge success rate and merge quality
|
|
148
|
+
|
|
138
149
|
- 2026/01/06 2.7.1 Release
|
|
139
150
|
- fix bug: #4300
|
|
140
151
|
- Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=H1WLrviWKvrPzDle8EWdCYYkzljxs0mtbXigYc-xaKA,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
@@ -17,8 +17,8 @@ mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc
|
|
|
17
17
|
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
|
|
18
18
|
mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
19
19
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
20
|
-
mineru/backend/vlm/utils.py,sha256=
|
|
21
|
-
mineru/backend/vlm/vlm_analyze.py,sha256=
|
|
20
|
+
mineru/backend/vlm/utils.py,sha256=1qma_KmDjRfOckcPbriGgRhS1XMk_johsyACfwcmDr4,3844
|
|
21
|
+
mineru/backend/vlm/vlm_analyze.py,sha256=ttnQBUy1PEm9JZoF2G1_z-7gA3MgUUUBhz6OypCb4_g,14765
|
|
22
22
|
mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflbB4a-TmsU,21402
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
@@ -47,13 +47,13 @@ mineru/model/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
|
47
47
|
mineru/model/layout/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
48
48
|
mineru/model/layout/doclayoutyolo.py,sha256=DttINdulzTiYcVDl_70oDtUdfVmGc9qkKWmbPOGAeV0,3867
|
|
49
49
|
mineru/model/mfd/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
50
|
-
mineru/model/mfd/yolo_v8.py,sha256=
|
|
50
|
+
mineru/model/mfd/yolo_v8.py,sha256=OI5AxVgt3FvXp4NYk0BDXXvpDlo9YjM6byDyC_TZ8Js,3714
|
|
51
51
|
mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
52
52
|
mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
|
|
53
53
|
mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
54
|
mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=alGX_sPJxZh_7v1sOK3DJ8akfkWO-2c5I_JR7aXMTLU,5588
|
|
55
55
|
mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
|
|
56
|
-
mineru/model/mfr/unimernet/Unimernet.py,sha256=
|
|
56
|
+
mineru/model/mfr/unimernet/Unimernet.py,sha256=ZK0M9fPmZziK4D33H3YND7RnHiQkRVCS-lvNfY-N7do,7912
|
|
57
57
|
mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
mineru/model/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
|
|
59
59
|
mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=_lN3zDKxeqsW-h9tXx79DYiT5uT4P9ixG49WrSYKFxE,7551
|
|
@@ -88,7 +88,7 @@ mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGf
|
|
|
88
88
|
mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
|
|
89
89
|
mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
|
|
90
90
|
mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
|
|
91
|
-
mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=
|
|
91
|
+
mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=6z0jYO6S8wAmfHe5tAyEfzWZIQv8wrn_dRU9GC7oKro,11435
|
|
92
92
|
mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
|
|
93
93
|
mineru/model/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
94
94
|
mineru/model/utils/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
|
|
|
151
151
|
mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
|
|
152
152
|
mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
153
|
mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
|
|
154
|
-
mineru/model/vlm/vllm_server.py,sha256=
|
|
154
|
+
mineru/model/vlm/vllm_server.py,sha256=DtYRAHINYN4qkR2onVMofvANPTkSP6tE4IRY_vZgpiA,3079
|
|
155
155
|
mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
|
|
156
156
|
mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
|
157
157
|
mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
158
158
|
mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
|
|
159
|
-
mineru/utils/block_sort.py,sha256=
|
|
159
|
+
mineru/utils/block_sort.py,sha256=MmgjZBcmaWssAglzE75VixjtJ_BLNUHO0gvCNQHvlY4,13538
|
|
160
160
|
mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
|
|
161
161
|
mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
|
|
162
162
|
mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
|
|
163
163
|
mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
|
|
164
|
-
mineru/utils/config_reader.py,sha256=
|
|
164
|
+
mineru/utils/config_reader.py,sha256=mfulokOzI-33sZy7S-wEVbY3z01PdC7X3761fuhqR3s,4393
|
|
165
165
|
mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
|
|
166
166
|
mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
|
|
167
167
|
mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
|
|
@@ -172,7 +172,7 @@ mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,85
|
|
|
172
172
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
173
173
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
174
174
|
mineru/utils/magic_model_utils.py,sha256=I6vdN56aqhQBGOasoWHiJbjnXsBwUojw6xFjbWZSHaU,8656
|
|
175
|
-
mineru/utils/model_utils.py,sha256=
|
|
175
|
+
mineru/utils/model_utils.py,sha256=w-jSN7Ilh27FlMjPpKNO6MPbo_dT5Ln7zCQcXaREl_k,19605
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
178
178
|
mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
|
|
@@ -184,10 +184,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
|
|
|
184
184
|
mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
|
-
mineru/utils/table_merge.py,sha256=
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
187
|
+
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
+
mineru-2.7.2.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.2.dist-info/METADATA,sha256=w3qS7X-Wjvqz8Ra5fp0QH-Wvq_RbZHGyaVOL8WIrerw,36621
|
|
190
|
+
mineru-2.7.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
191
|
+
mineru-2.7.2.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
|
|
192
|
+
mineru-2.7.2.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|