mineru 2.7.1__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -424,7 +424,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
424
424
  # 调用通用方法
425
425
  return tie_up_category_by_index(
426
426
  get_subjects,
427
- get_objects
427
+ get_objects,
428
+ object_block_type=object_block_type
428
429
  )
429
430
 
430
431
 
@@ -18,6 +18,10 @@ def enable_custom_logits_processors() -> bool:
18
18
  compute_capability = f"{major}.{minor}"
19
19
  elif hasattr(torch, 'npu') and torch.npu.is_available():
20
20
  compute_capability = "8.0"
21
+ elif hasattr(torch, 'gcu') and torch.gcu.is_available():
22
+ compute_capability = "8.0"
23
+ elif hasattr(torch, 'musa') and torch.musa.is_available():
24
+ compute_capability = "8.0"
21
25
  else:
22
26
  logger.info("CUDA not available, disabling custom_logits_processors")
23
27
  return False
@@ -1,6 +1,7 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
2
  import os
3
3
  import time
4
+ import json
4
5
 
5
6
  from loguru import logger
6
7
 
@@ -99,6 +100,30 @@ class ModelSingleton:
99
100
  import vllm
100
101
  except ImportError:
101
102
  raise ImportError("Please install vllm to use the vllm-engine backend.")
103
+
104
+ """
105
+ # musa vllm v1 引擎特殊配置
106
+ device = get_device()
107
+ if device.startswith("musa"):
108
+ import torch
109
+ if torch.musa.is_available():
110
+ compilation_config = {
111
+ "cudagraph_capture_sizes": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
112
+ "simple_cuda_graph": True
113
+ }
114
+ block_size = 32
115
+ kwargs["compilation_config"] = compilation_config
116
+ kwargs["block_size"] = block_size
117
+ """
118
+
119
+ if "compilation_config" in kwargs:
120
+ if isinstance(kwargs["compilation_config"], str):
121
+ try:
122
+ kwargs["compilation_config"] = json.loads(kwargs["compilation_config"])
123
+ except json.JSONDecodeError:
124
+ logger.warning(
125
+ f"Failed to parse compilation_config as JSON: {kwargs['compilation_config']}")
126
+ del kwargs["compilation_config"]
102
127
  if "gpu_memory_utilization" not in kwargs:
103
128
  kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
104
129
  if "model" not in kwargs:
@@ -112,8 +137,38 @@ class ModelSingleton:
112
137
  try:
113
138
  from vllm.engine.arg_utils import AsyncEngineArgs
114
139
  from vllm.v1.engine.async_llm import AsyncLLM
140
+ from vllm.config import CompilationConfig
115
141
  except ImportError:
116
142
  raise ImportError("Please install vllm to use the vllm-async-engine backend.")
143
+
144
+ """
145
+ # musa vllm v1 引擎特殊配置
146
+ device = get_device()
147
+ if device.startswith("musa"):
148
+ import torch
149
+ if torch.musa.is_available():
150
+ compilation_config = CompilationConfig(
151
+ cudagraph_capture_sizes=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 24, 28, 30],
152
+ simple_cuda_graph=True
153
+ )
154
+ block_size = 32
155
+ kwargs["compilation_config"] = compilation_config
156
+ kwargs["block_size"] = block_size
157
+ """
158
+
159
+ if "compilation_config" in kwargs:
160
+ if isinstance(kwargs["compilation_config"], dict):
161
+ # 如果是字典,转换为 CompilationConfig 对象
162
+ kwargs["compilation_config"] = CompilationConfig(**kwargs["compilation_config"])
163
+ elif isinstance(kwargs["compilation_config"], str):
164
+ # 如果是 JSON 字符串,先解析再转换
165
+ try:
166
+ config_dict = json.loads(kwargs["compilation_config"])
167
+ kwargs["compilation_config"] = CompilationConfig(**config_dict)
168
+ except (json.JSONDecodeError, TypeError) as e:
169
+ logger.warning(
170
+ f"Failed to parse compilation_config: {kwargs['compilation_config']}, error: {e}")
171
+ del kwargs["compilation_config"]
117
172
  if "gpu_memory_utilization" not in kwargs:
118
173
  kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
119
174
  if "model" not in kwargs:
@@ -349,7 +349,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
349
349
  # 调用通用方法
350
350
  return tie_up_category_by_index(
351
351
  get_subjects,
352
- get_objects
352
+ get_objects,
353
+ object_block_type=object_block_type
353
354
  )
354
355
 
355
356
 
@@ -1,5 +1,7 @@
1
1
  import os
2
2
  from typing import List, Union
3
+
4
+ import torch
3
5
  from tqdm import tqdm
4
6
  from ultralytics import YOLO
5
7
  import numpy as np
@@ -18,8 +20,8 @@ class YOLOv8MFDModel:
18
20
  conf: float = 0.25,
19
21
  iou: float = 0.45,
20
22
  ):
21
- self.model = YOLO(weight).to(device)
22
- self.device = device
23
+ self.device = torch.device(device)
24
+ self.model = YOLO(weight).to(self.device)
23
25
  self.imgsz = imgsz
24
26
  self.conf = conf
25
27
  self.iou = iou
@@ -23,12 +23,12 @@ class MathDataset(Dataset):
23
23
  class UnimernetModel(object):
24
24
  def __init__(self, weight_dir, _device_="cpu"):
25
25
  from .unimernet_hf import UnimernetModel
26
- if _device_.startswith("mps") or _device_.startswith("npu"):
26
+ if _device_.startswith("mps") or _device_.startswith("npu") or _device_.startswith("musa"):
27
27
  self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
28
28
  else:
29
29
  self.model = UnimernetModel.from_pretrained(weight_dir)
30
- self.device = _device_
31
- self.model.to(_device_)
30
+ self.device = torch.device(_device_)
31
+ self.model.to(self.device)
32
32
  if not _device_.startswith("cpu"):
33
33
  self.model = self.model.to(dtype=torch.float16)
34
34
  self.model.eval()
@@ -4,6 +4,8 @@ import cv2
4
4
  import numpy as np
5
5
  from scipy.spatial import distance as dist
6
6
  from skimage import measure
7
+ from skimage import __version__ as skimage_version
8
+ from packaging import version
7
9
 
8
10
 
9
11
  def transform_preds(coords, center, scale, output_size, rot=0):
@@ -295,7 +297,11 @@ def min_area_rect_box(
295
297
  """
296
298
  boxes = []
297
299
  for region in regions:
298
- if region.bbox_area > H * W * 3 / 4: # 过滤大的单元格
300
+ if version.parse(skimage_version) >= version.parse("0.26.0"):
301
+ region_bbox_area = region.area_bbox
302
+ else:
303
+ region_bbox_area = region.bbox_area
304
+ if region_bbox_area > H * W * 3 / 4: # 过滤大的单元格
299
305
  continue
300
306
  rect = cv2.minAreaRect(region.coords[:, ::-1])
301
307
 
@@ -2,6 +2,7 @@ import os
2
2
  import sys
3
3
 
4
4
  from mineru.backend.vlm.utils import set_default_gpu_memory_utilization, enable_custom_logits_processors
5
+ from mineru.utils.config_reader import get_device
5
6
  from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
6
7
 
7
8
  from vllm.entrypoints.cli.main import main as vllm_main
@@ -13,6 +14,8 @@ def main():
13
14
  has_port_arg = False
14
15
  has_gpu_memory_utilization_arg = False
15
16
  has_logits_processors_arg = False
17
+ has_block_size_arg = False
18
+ has_compilation_config = False
16
19
  model_path = None
17
20
  model_arg_indices = []
18
21
 
@@ -24,6 +27,10 @@ def main():
24
27
  has_gpu_memory_utilization_arg = True
25
28
  if arg == "--logits-processors" or arg.startswith("--logits-processors="):
26
29
  has_logits_processors_arg = True
30
+ if arg == "--block-size" or arg.startswith("--block-size="):
31
+ has_block_size_arg = True
32
+ if arg == "--compilation-config" or arg.startswith("--compilation-config="):
33
+ has_compilation_config = True
27
34
  if arg == "--model":
28
35
  if i + 1 < len(args):
29
36
  model_path = args[i + 1]
@@ -49,6 +56,17 @@ def main():
49
56
  model_path = auto_download_and_get_model_root_path("/", "vlm")
50
57
  if (not has_logits_processors_arg) and custom_logits_processors:
51
58
  args.extend(["--logits-processors", "mineru_vl_utils:MinerULogitsProcessor"])
59
+ """
60
+ # musa vllm v1 引擎特殊配置
61
+ device = get_device()
62
+ if device.startswith("musa"):
63
+ import torch
64
+ if torch.musa.is_available():
65
+ if not has_block_size_arg:
66
+ args.extend(["--block-size", "32"])
67
+ if not has_compilation_config:
68
+ args.extend(["--compilation-config", '{"cudagraph_capture_sizes": [1,2,3,4,5,6,7,8,10,12,14,16,18,20,24,28,30], "simple_cuda_graph": true}'])
69
+ """
52
70
 
53
71
  # 重构参数,将模型路径作为位置参数
54
72
  sys.argv = [sys.argv[0]] + ["serve", model_path] + args
@@ -186,6 +186,18 @@ def model_init(model_name: str):
186
186
  bf_16_support = True
187
187
  elif device_name.startswith("mps"):
188
188
  bf_16_support = True
189
+ elif device_name.startswith("gcu"):
190
+ if hasattr(torch, 'gcu') and torch.gcu.is_available():
191
+ if torch.gcu.is_bf16_supported():
192
+ bf_16_support = True
193
+ elif device_name.startswith("musa"):
194
+ if hasattr(torch, 'musa') and torch.musa.is_available():
195
+ if torch.musa.is_bf16_supported():
196
+ bf_16_support = True
197
+ elif device_name.startswith("npu"):
198
+ if hasattr(torch, 'npu') and torch.npu.is_available():
199
+ if torch.npu.is_bf16_supported():
200
+ bf_16_support = True
189
201
 
190
202
  if model_name == 'layoutreader':
191
203
  # 检测modelscope的缓存目录是否存在
@@ -86,7 +86,15 @@ def get_device():
86
86
  if torch_npu.npu.is_available():
87
87
  return "npu"
88
88
  except Exception as e:
89
- pass
89
+ try:
90
+ if torch.gcu.is_available():
91
+ return "gcu"
92
+ except Exception as e:
93
+ try:
94
+ if torch.musa.is_available():
95
+ return "musa"
96
+ except Exception as e:
97
+ pass
90
98
  return "cpu"
91
99
 
92
100
 
@@ -2,6 +2,8 @@
2
2
  包含两个MagicModel类中重复使用的方法和逻辑
3
3
  """
4
4
  from typing import List, Dict, Any, Callable
5
+
6
+ from loguru import logger
5
7
  from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
6
8
 
7
9
 
@@ -172,11 +174,15 @@ def tie_up_category_by_index(
172
174
  get_subjects_func: Callable,
173
175
  get_objects_func: Callable,
174
176
  extract_subject_func: Callable = None,
175
- extract_object_func: Callable = None
177
+ extract_object_func: Callable = None,
178
+ object_block_type: str = "object",
176
179
  ):
177
180
  """
178
181
  基于index的类别关联方法,用于将主体对象与客体对象进行关联
179
- 客体优先匹配给index最接近的主体,index差值相同时使用bbox中心点距离作为tiebreaker
182
+ 客体优先匹配给index最接近的主体,匹配优先级为:
183
+ 1. index差值(最高优先级)
184
+ 2. bbox边缘距离(相邻边距离)
185
+ 3. bbox中心点距离(最低优先级,作为最终tiebreaker)
180
186
 
181
187
  参数:
182
188
  get_subjects_func: 函数,提取主体对象
@@ -207,6 +213,29 @@ def tie_up_category_by_index(
207
213
  "sub_idx": i,
208
214
  }
209
215
 
216
+ # 提取所有客体的index集合,用于计算有效index差值
217
+ object_indices = set(obj["index"] for obj in objects)
218
+
219
def calc_effective_index_diff(obj_index: int, sub_index: int) -> int:
    """Return the index distance between an object and a subject, ignoring other objects.

    The effective difference is the absolute index difference minus the number
    of object indices that lie strictly between the two indices, so that a gap
    made up only of other objects does not count against the match.

    Reads ``object_indices`` (the set of all object indices) from the
    enclosing scope.
    """
    if obj_index == sub_index:
        return 0

    low, high = sorted((obj_index, sub_index))

    # Objects located strictly inside the open interval (low, high).
    objects_between = sum(
        1 for candidate in range(low + 1, high) if candidate in object_indices
    )

    return (high - low) - objects_between
238
+
210
239
  # 为每个客体找到最匹配的主体
211
240
  for obj in objects:
212
241
  if len(subjects) == 0:
@@ -217,10 +246,10 @@ def tie_up_category_by_index(
217
246
  min_index_diff = float("inf")
218
247
  best_subject_indices = []
219
248
 
220
- # 找出index差值最小的所有主体
249
+ # 找出有效index差值最小的所有主体
221
250
  for i, subject in enumerate(subjects):
222
251
  sub_index = subject["index"]
223
- index_diff = abs(obj_index - sub_index)
252
+ index_diff = calc_effective_index_diff(obj_index, sub_index)
224
253
 
225
254
  if index_diff < min_index_diff:
226
255
  min_index_diff = index_diff
@@ -228,18 +257,37 @@ def tie_up_category_by_index(
228
257
  elif index_diff == min_index_diff:
229
258
  best_subject_indices.append(i)
230
259
 
231
- # 如果有多个主体的index差值相同,使用中心点距离作为tiebreaker
232
- if len(best_subject_indices) > 1:
233
- min_center_dist = float("inf")
260
+ if len(best_subject_indices) == 1:
234
261
  best_subject_idx = best_subject_indices[0]
235
-
236
- for idx in best_subject_indices:
237
- center_dist = bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])
238
- if center_dist < min_center_dist:
239
- min_center_dist = center_dist
240
- best_subject_idx = idx
262
+ # 如果有多个主体的index差值相同(最多两个),根据边缘距离进行筛选
263
+ elif len(best_subject_indices) == 2:
264
+ # 计算所有候选主体的边缘距离
265
+ edge_distances = [(idx, bbox_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
266
+ edge_dist_diff = abs(edge_distances[0][1] - edge_distances[1][1])
267
+
268
+ for idx, edge_dist in edge_distances:
269
+ logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Edge distance: {edge_dist}")
270
+
271
+ if edge_dist_diff > 2:
272
+ # 边缘距离差值大于2,匹配边缘距离更小的主体
273
+ best_subject_idx = min(edge_distances, key=lambda x: x[1])[0]
274
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff > 2, matching to subject with min edge distance, index: {subjects[best_subject_idx]['index']}")
275
+ elif object_block_type == "table_caption":
276
+ # 边缘距离差值<=2且为table_caption,匹配index更大的主体
277
+ best_subject_idx = max(best_subject_indices, key=lambda idx: subjects[idx]["index"])
278
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and table_caption, matching to later subject with index: {subjects[best_subject_idx]['index']}")
279
+ elif object_block_type.endswith("footnote"):
280
+ # 边缘距离差值<=2且为footnote,匹配index更小的主体
281
+ best_subject_idx = min(best_subject_indices, key=lambda idx: subjects[idx]["index"])
282
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and footnote, matching to earlier subject with index: {subjects[best_subject_idx]['index']}")
283
+ else:
284
+ # 边缘距离差值<=2 且不适用特殊匹配规则,使用中心点距离匹配
285
+ center_distances = [(idx, bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
286
+ for idx, center_dist in center_distances:
287
+ logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Center distance: {center_dist}")
288
+ best_subject_idx = min(center_distances, key=lambda x: x[1])[0]
241
289
  else:
242
- best_subject_idx = best_subject_indices[0]
290
+ raise ValueError("More than two subjects have the same minimal index difference, which is unexpected.")
243
291
 
244
292
  # 将客体添加到最佳主体的obj_bboxes中
245
293
  result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
@@ -414,7 +414,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
414
414
 
415
415
 
416
416
  def clean_memory(device='cuda'):
417
- if device == 'cuda':
417
+ if str(device).startswith("cuda"):
418
418
  if torch.cuda.is_available():
419
419
  torch.cuda.empty_cache()
420
420
  torch.cuda.ipc_collect()
@@ -423,6 +423,12 @@ def clean_memory(device='cuda'):
423
423
  torch_npu.npu.empty_cache()
424
424
  elif str(device).startswith("mps"):
425
425
  torch.mps.empty_cache()
426
+ elif str(device).startswith("gcu"):
427
+ if torch.gcu.is_available():
428
+ torch.gcu.empty_cache()
429
+ elif str(device).startswith("musa"):
430
+ if torch.musa.is_available():
431
+ torch.musa.empty_cache()
426
432
  gc.collect()
427
433
 
428
434
 
@@ -458,5 +464,11 @@ def get_vram(device) -> int:
458
464
  elif str(device).startswith("npu"):
459
465
  if torch_npu.npu.is_available():
460
466
  total_memory = round(torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
467
+ elif str(device).startswith("gcu"):
468
+ if torch.gcu.is_available():
469
+ total_memory = round(torch.gcu.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
470
+ elif str(device).startswith("musa"):
471
+ if torch.musa.is_available():
472
+ total_memory = round(torch.musa.get_device_properties(device).total_memory / (1024 ** 3)) # 转为 GB
461
473
 
462
474
  return total_memory
@@ -9,13 +9,19 @@ from mineru.utils.char_utils import full_to_half
9
9
  from mineru.utils.enum_class import BlockType, SplitFlag
10
10
 
11
11
 
12
- CONTINUATION_MARKERS = [
12
+ CONTINUATION_END_MARKERS = [
13
13
  "(续)",
14
14
  "(续表)",
15
15
  "(续上表)",
16
16
  "(continued)",
17
17
  "(cont.)",
18
18
  "(cont’d)",
19
+ "(…continued)",
20
+ "续表",
21
+ ]
22
+
23
+ CONTINUATION_INLINE_MARKERS = [
24
+ "(continued)",
19
25
  ]
20
26
 
21
27
 
@@ -64,6 +70,69 @@ def calculate_table_total_columns(soup):
64
70
  return max_cols
65
71
 
66
72
 
73
def _parse_cell_span(cell, attr):
    """Return the ``colspan``/``rowspan`` value of ``cell`` as an int >= 1.

    Missing, empty, non-numeric or non-positive values fall back to 1 so that
    every cell always occupies at least its own grid slot.
    """
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        return 1
    return max(span, 1)


def build_table_occupied_matrix(soup):
    """Build the table's occupancy grid and return each row's effective column count.

    Args:
        soup: Parsed table (BeautifulSoup object) exposing ``find_all``.

    Returns:
        dict: ``{row_idx: effective_columns}`` where the effective column count
        of a row is its highest occupied column index + 1, including slots
        taken by cells that span down from earlier rows via ``rowspan``.
        A row with no occupied slot maps to 0; a table without rows yields ``{}``.
    """
    rows = soup.find_all("tr")
    if not rows:
        return {}

    occupied = {}  # {row_idx: {col_idx: True}}
    row_effective_cols = {}  # {row_idx: effective_columns}

    for row_idx, row in enumerate(rows):
        # May already hold slots reserved by a rowspan from a previous row.
        row_slots = occupied.setdefault(row_idx, {})
        col_idx = 0

        for cell in row.find_all(["td", "th"]):
            # Advance to the next column not already taken in this row.
            while col_idx in row_slots:
                col_idx += 1

            colspan = _parse_cell_span(cell, "colspan")
            rowspan = _parse_cell_span(cell, "rowspan")

            # Mark every grid position covered by this cell.
            for r in range(row_idx, row_idx + rowspan):
                target_slots = occupied.setdefault(r, {})
                for c in range(col_idx, col_idx + colspan):
                    target_slots[c] = True

            col_idx += colspan

        # Effective width of the row: highest occupied column index + 1.
        row_effective_cols[row_idx] = max(row_slots) + 1 if row_slots else 0

    return row_effective_cols
120
+
121
+
122
def calculate_row_effective_columns(soup, row_idx):
    """Return the effective column count of a single table row.

    Args:
        soup: Parsed table (BeautifulSoup object).
        row_idx: Index of the row to look up.

    Returns:
        int: The row's effective column count as computed by
        ``build_table_occupied_matrix`` (rowspan occupancy included),
        or 0 when ``row_idx`` is not present in that result.
    """
    return build_table_occupied_matrix(soup).get(row_idx, 0)
134
+
135
+
67
136
  def calculate_row_columns(row):
68
137
  """
69
138
  计算表格行的实际列数,考虑colspan属性
@@ -113,6 +182,10 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
113
182
  rows1 = soup1.find_all("tr")
114
183
  rows2 = soup2.find_all("tr")
115
184
 
185
+ # 构建两个表格的有效列数矩阵
186
+ effective_cols1 = build_table_occupied_matrix(soup1)
187
+ effective_cols2 = build_table_occupied_matrix(soup2)
188
+
116
189
  min_rows = min(len(rows1), len(rows2), max_header_rows)
117
190
  header_rows = 0
118
191
  headers_match = True
@@ -130,20 +203,24 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
130
203
  if len(cells1) != len(cells2):
131
204
  structure_match = False
132
205
  else:
133
- # 然后检查单元格的属性和内容
134
- for cell1, cell2 in zip(cells1, cells2):
135
- colspan1 = int(cell1.get("colspan", 1))
136
- rowspan1 = int(cell1.get("rowspan", 1))
137
- colspan2 = int(cell2.get("colspan", 1))
138
- rowspan2 = int(cell2.get("rowspan", 1))
139
-
140
- # 去除所有空白字符(包括空格、换行、制表符等)
141
- text1 = ''.join(full_to_half(cell1.get_text()).split())
142
- text2 = ''.join(full_to_half(cell2.get_text()).split())
143
-
144
- if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
145
- structure_match = False
146
- break
206
+ # 检查有效列数是否一致(考虑rowspan影响)
207
+ if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
208
+ structure_match = False
209
+ else:
210
+ # 然后检查单元格的属性和内容
211
+ for cell1, cell2 in zip(cells1, cells2):
212
+ colspan1 = int(cell1.get("colspan", 1))
213
+ rowspan1 = int(cell1.get("rowspan", 1))
214
+ colspan2 = int(cell2.get("colspan", 1))
215
+ rowspan2 = int(cell2.get("rowspan", 1))
216
+
217
+ # 去除所有空白字符(包括空格、换行、制表符等)
218
+ text1 = ''.join(full_to_half(cell1.get_text()).split())
219
+ text2 = ''.join(full_to_half(cell2.get_text()).split())
220
+
221
+ if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
222
+ structure_match = False
223
+ break
147
224
 
148
225
  if structure_match:
149
226
  header_rows += 1
@@ -153,7 +230,54 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
153
230
  headers_match = header_rows > 0 # 只有当至少匹配了一行时,才认为表头匹配
154
231
  break
155
232
 
156
- # 如果没有找到匹配的表头行,则返回失败
233
+ # 如果严格匹配失败,尝试视觉一致性匹配(只比较文本内容)
234
+ if header_rows == 0:
235
+ header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
236
+
237
+ return header_rows, headers_match, header_texts
238
+
239
+
240
+ def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
241
+ """
242
+ 基于视觉一致性检测表头(只比较文本内容,忽略colspan/rowspan差异)
243
+
244
+ Args:
245
+ soup1: 第一个表格的BeautifulSoup对象
246
+ soup2: 第二个表格的BeautifulSoup对象
247
+ rows1: 第一个表格的行列表
248
+ rows2: 第二个表格的行列表
249
+ max_header_rows: 最大可能的表头行数
250
+
251
+ Returns:
252
+ tuple: (表头行数, 表头是否一致, 表头文本列表)
253
+ """
254
+ # 构建两个表格的有效列数矩阵
255
+ effective_cols1 = build_table_occupied_matrix(soup1)
256
+ effective_cols2 = build_table_occupied_matrix(soup2)
257
+
258
+ min_rows = min(len(rows1), len(rows2), max_header_rows)
259
+ header_rows = 0
260
+ headers_match = True
261
+ header_texts = []
262
+
263
+ for i in range(min_rows):
264
+ cells1 = rows1[i].find_all(["td", "th"])
265
+ cells2 = rows2[i].find_all(["td", "th"])
266
+
267
+ # 提取每行的文本内容列表(去除空白字符)
268
+ texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
269
+ texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
270
+
271
+ # 检查视觉一致性:文本内容完全相同,且有效列数一致
272
+ effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
273
+ if texts1 == texts2 and effective_cols_match:
274
+ header_rows += 1
275
+ row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
276
+ header_texts.append(row_texts)
277
+ else:
278
+ headers_match = header_rows > 0
279
+ break
280
+
157
281
  if header_rows == 0:
158
282
  headers_match = False
159
283
 
@@ -163,20 +287,32 @@ def detect_table_headers(soup1, soup2, max_header_rows=5):
163
287
  def can_merge_tables(current_table_block, previous_table_block):
164
288
  """判断两个表格是否可以合并"""
165
289
  # 检查表格是否有caption和footnote
290
+ # 计算previous_table_block中的footnote数量
291
+ footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
166
292
  # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
167
293
  caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
168
294
  if caption_blocks:
169
- # 如果所有caption都不以"(续)"、"(续表)"、"(continued)"或"(cont.)"结尾,则不合并
295
+ # 检查是否至少有一个caption包含续表标识
296
+ has_continuation_marker = False
297
+ for block in caption_blocks:
298
+ caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
299
+ if (
300
+ any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
301
+ or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
302
+ ):
303
+ has_continuation_marker = True
304
+ break
170
305
 
171
- if not any(
172
- any(full_to_half(merge_para_with_text(block).strip()).lower().endswith(marker.lower())
173
- for marker in CONTINUATION_MARKERS)
174
- for block in caption_blocks
175
- ):
306
+ # 如果所有caption都不包含续表标识,则不允许合并
307
+ if not has_continuation_marker:
176
308
  return False, None, None, None, None
177
309
 
178
- if any(block["type"] == BlockType.TABLE_FOOTNOTE for block in previous_table_block["blocks"]):
179
- return False, None, None, None, None
310
+ # 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
311
+ if footnote_count > 1:
312
+ return False, None, None, None, None
313
+ else:
314
+ if footnote_count > 0:
315
+ return False, None, None, None, None
180
316
 
181
317
  # 获取两个表格的HTML内容
182
318
  current_html = ""
@@ -226,34 +362,44 @@ def check_rows_match(soup1, soup2):
226
362
  if not (rows1 and rows2):
227
363
  return False
228
364
 
229
- # 获取第一个表的最后一行数据行
365
+ # 获取第一个表的最后一行数据行索引
366
+ last_row_idx = None
230
367
  last_row = None
231
- for row in reversed(rows1):
232
- if row.find_all(["td", "th"]):
233
- last_row = row
368
+ for idx in range(len(rows1) - 1, -1, -1):
369
+ if rows1[idx].find_all(["td", "th"]):
370
+ last_row_idx = idx
371
+ last_row = rows1[idx]
234
372
  break
235
373
 
236
374
  # 检测表头行数,以便获取第二个表的首个数据行
237
375
  header_count, _, _ = detect_table_headers(soup1, soup2)
238
376
 
239
377
  # 获取第二个表的首个数据行
378
+ first_data_row_idx = None
240
379
  first_data_row = None
241
380
  if len(rows2) > header_count:
381
+ first_data_row_idx = header_count
242
382
  first_data_row = rows2[header_count] # 第一个非表头行
243
383
 
244
384
  if not (last_row and first_data_row):
245
385
  return False
246
386
 
247
- # 计算实际列数(考虑colspan)和视觉列数
387
+ # 计算有效列数(考虑rowspan和colspan
388
+ last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
389
+ first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
390
+
391
+ # 计算实际列数(仅考虑colspan)和视觉列数
248
392
  last_row_cols = calculate_row_columns(last_row)
249
393
  first_row_cols = calculate_row_columns(first_data_row)
250
394
  last_row_visual_cols = calculate_visual_columns(last_row)
251
395
  first_row_visual_cols = calculate_visual_columns(first_data_row)
252
396
 
253
- # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(视觉列数:{first_row_visual_cols})")
397
+ # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(有效列数:{last_row_effective_cols}, 视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(有效列数:{first_row_effective_cols}, 视觉列数:{first_row_visual_cols})")
254
398
 
255
- # 同时考虑实际列数匹配和视觉列数匹配
256
- return last_row_cols == first_row_cols or last_row_visual_cols == first_row_visual_cols
399
+ # 同时考虑有效列数匹配、实际列数匹配和视觉列数匹配
400
+ return (last_row_effective_cols == first_row_effective_cols or
401
+ last_row_cols == first_row_cols or
402
+ last_row_visual_cols == first_row_visual_cols)
257
403
 
258
404
 
259
405
  def check_row_columns_match(row1, row2):
@@ -270,12 +416,13 @@ def check_row_columns_match(row1, row2):
270
416
  return True
271
417
 
272
418
 
273
- def adjust_table_rows_colspan(rows, start_idx, end_idx,
419
+ def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
274
420
  reference_structure, reference_visual_cols,
275
421
  target_cols, current_cols, reference_row):
276
422
  """调整表格行的colspan属性以匹配目标列数
277
423
 
278
424
  Args:
425
+ soup: BeautifulSoup解析的表格对象(用于计算有效列数)
279
426
  rows: 表格行列表
280
427
  start_idx: 起始行索引
281
428
  end_idx: 结束行索引(不包含)
@@ -287,14 +434,21 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
287
434
  """
288
435
  reference_row_copy = deepcopy(reference_row)
289
436
 
437
+ # 构建有效列数矩阵
438
+ effective_cols_matrix = build_table_occupied_matrix(soup)
439
+
290
440
  for i in range(start_idx, end_idx):
291
441
  row = rows[i]
292
442
  cells = row.find_all(["td", "th"])
293
443
  if not cells:
294
444
  continue
295
445
 
446
+ # 使用有效列数(考虑rowspan)判断是否需要调整
447
+ current_row_effective_cols = effective_cols_matrix.get(i, 0)
296
448
  current_row_cols = calculate_row_columns(row)
297
- if current_row_cols >= target_cols:
449
+
450
+ # 如果有效列数或实际列数已经达到目标,则跳过
451
+ if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
298
452
  continue
299
453
 
300
454
  # 检查是否与参考行结构匹配
@@ -306,9 +460,12 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
306
460
  cell["colspan"] = str(reference_structure[j])
307
461
  else:
308
462
  # 扩展最后一个单元格以填补列数差异
309
- last_cell = cells[-1]
310
- current_last_span = int(last_cell.get("colspan", 1))
311
- last_cell["colspan"] = str(current_last_span + (target_cols - current_cols))
463
+ # 使用有效列数来计算差异
464
+ cols_diff = target_cols - current_row_effective_cols
465
+ if cols_diff > 0:
466
+ last_cell = cells[-1]
467
+ current_last_span = int(last_cell.get("colspan", 1))
468
+ last_cell["colspan"] = str(current_last_span + cols_diff)
312
469
 
313
470
 
314
471
  def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
@@ -339,7 +496,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
339
496
  reference_visual_cols = calculate_visual_columns(last_row1)
340
497
  # 以表1的最后一行为参考,调整表2的行
341
498
  adjust_table_rows_colspan(
342
- rows2, header_count, len(rows2),
499
+ soup2, rows2, header_count, len(rows2),
343
500
  reference_structure, reference_visual_cols,
344
501
  table_cols1, table_cols2, first_data_row2
345
502
  )
@@ -349,7 +506,7 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
349
506
  reference_visual_cols = calculate_visual_columns(first_data_row2)
350
507
  # 以表2的第一个数据行为参考,调整表1的行
351
508
  adjust_table_rows_colspan(
352
- rows1, 0, len(rows1),
509
+ soup1, rows1, 0, len(rows1),
353
510
  reference_structure, reference_visual_cols,
354
511
  table_cols2, table_cols1, last_row1
355
512
  )
@@ -363,6 +520,11 @@ def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_foo
363
520
  row.extract()
364
521
  tbody1.append(row)
365
522
 
523
+ # 清空previous_table_block的footnote
524
+ previous_table_block["blocks"] = [
525
+ block for block in previous_table_block["blocks"]
526
+ if block["type"] != BlockType.TABLE_FOOTNOTE
527
+ ]
366
528
  # 添加待合并表格的footnote到前一个表格中
367
529
  for table_footnote in wait_merge_table_footnotes:
368
530
  temp_table_footnote = table_footnote.copy()
@@ -423,4 +585,4 @@ def merge_table(page_info_list):
423
585
  # 删除当前页的table
424
586
  for block in current_table_block["blocks"]:
425
587
  block['lines'] = []
426
- block[SplitFlag.LINES_DELETED] = True
588
+ block[SplitFlag.LINES_DELETED] = True
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.7.1"
1
+ __version__ = "2.7.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.7.1
3
+ Version: 2.7.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -60,7 +60,7 @@ Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
60
60
  Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
61
61
  Requires-Dist: doclayout_yolo==0.0.4; extra == "pipeline"
62
62
  Requires-Dist: dill<1,>=0.3.8; extra == "pipeline"
63
- Requires-Dist: PyYAML<7,>=6.0.2; extra == "pipeline"
63
+ Requires-Dist: PyYAML<7,>=6.0.1; extra == "pipeline"
64
64
  Requires-Dist: ftfy<7,>=6.3.1; extra == "pipeline"
65
65
  Requires-Dist: shapely<3,>=2.0.7; extra == "pipeline"
66
66
  Requires-Dist: pyclipper<2,>=1.3.0; extra == "pipeline"
@@ -135,6 +135,17 @@ Dynamic: license-file
135
135
 
136
136
  # Changelog
137
137
 
138
+ - 2026/01/23 2.7.2 Release
139
+ - Added support for domestic computing platforms Hygon, Enflame, and Moore Threads. Currently, the officially supported domestic computing platforms include:
140
+ - [Ascend](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Ascend/)
141
+ - [T-Head](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/THead/)
142
+ - [METAX](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/METAX/)
143
+ - [Hygon](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Hygon/)
144
+ - [Enflame](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/Enflame/)
145
+ - [MooreThreads](https://opendatalab.github.io/MinerU/zh/usage/acceleration_cards/MooreThreads/)
146
+ - MinerU continues to ensure compatibility with domestic hardware platforms, supporting mainstream chip architectures. With secure and reliable technology, we empower researchers, government, and enterprises to reach new heights in document digitization!
147
+ - Cross-page table merging optimization, improving merge success rate and merge quality
148
+
138
149
  - 2026/01/06 2.7.1 Release
139
150
  - fix bug: #4300
140
151
  - Updated pdfminer.six dependency version to resolve [CVE-2025-64512](https://github.com/advisories/GHSA-wf5f-4jwr-ppcp)
@@ -1,10 +1,10 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=yRpSH6mBb4BJgbFlT7rt8MSjCUW17Ycx0RziLf-lQLA,22
2
+ mineru/version.py,sha256=uf6cgtzZWaYn5QApMyykHXMzWM_oEqWLhYTsWSWu2_k,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
6
6
  mineru/backend/hybrid/hybrid_analyze.py,sha256=Sckw6T-pvMv3V_nqZkBeW8kY4zNIBlWxqeS2vXqNqtY,20939
7
- mineru/backend/hybrid/hybrid_magic_model.py,sha256=39ByeZh54KBbPe77bzGCqZrZ5RNwNxGYttcoisgDOrc,24668
7
+ mineru/backend/hybrid/hybrid_magic_model.py,sha256=_DvBq5WP_UZvmHfhZloxqv-MKoWWe_ye1kNLv6RA5rU,24713
8
8
  mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
9
9
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
10
10
  mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
@@ -17,9 +17,9 @@ mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc
17
17
  mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=NJCLGKE7BqM24bRdpXCfTalyiqozowFZjpdzpIUy5aA,14672
18
18
  mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
19
19
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
20
- mineru/backend/vlm/utils.py,sha256=sqcS4WVCcxVL1aElKII1zNYMu2yH5tRpVqRb4lXVm38,3650
21
- mineru/backend/vlm/vlm_analyze.py,sha256=EdfEmkroA3lafRZLqN4uOaLWx9oxVnUocqFsWZyS82c,11666
22
- mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflbB4a-TmsU,21402
20
+ mineru/backend/vlm/utils.py,sha256=1qma_KmDjRfOckcPbriGgRhS1XMk_johsyACfwcmDr4,3844
21
+ mineru/backend/vlm/vlm_analyze.py,sha256=ttnQBUy1PEm9JZoF2G1_z-7gA3MgUUUBhz6OypCb4_g,14765
22
+ mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
23
23
  mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
24
24
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
25
25
  mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
@@ -47,13 +47,13 @@ mineru/model/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
47
47
  mineru/model/layout/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
48
48
  mineru/model/layout/doclayoutyolo.py,sha256=DttINdulzTiYcVDl_70oDtUdfVmGc9qkKWmbPOGAeV0,3867
49
49
  mineru/model/mfd/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
50
- mineru/model/mfd/yolo_v8.py,sha256=t7ptmShHoZCW9WkqLNCL1KRChxpa2E7j5g5fibXlUvY,3681
50
+ mineru/model/mfd/yolo_v8.py,sha256=OI5AxVgt3FvXp4NYk0BDXXvpDlo9YjM6byDyC_TZ8Js,3714
51
51
  mineru/model/mfr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
52
52
  mineru/model/mfr/utils.py,sha256=pAi1HnkTuO0R6251Hdl-o50m0wH0Ce89PAf74WCsXPU,11499
53
53
  mineru/model/mfr/pp_formulanet_plus_m/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
54
54
  mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py,sha256=alGX_sPJxZh_7v1sOK3DJ8akfkWO-2c5I_JR7aXMTLU,5588
55
55
  mineru/model/mfr/pp_formulanet_plus_m/processors.py,sha256=MSKyanxiDDjgDQHBov-GjKtPnMx9tSmxBC9GIkM3ft8,23832
56
- mineru/model/mfr/unimernet/Unimernet.py,sha256=1SGLSQ2rc6oslnEwP4Ti7JxaNlyCSGge0js-Tr1VikE,7864
56
+ mineru/model/mfr/unimernet/Unimernet.py,sha256=ZK0M9fPmZziK4D33H3YND7RnHiQkRVCS-lvNfY-N7do,7912
57
57
  mineru/model/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
58
58
  mineru/model/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
59
59
  mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=_lN3zDKxeqsW-h9tXx79DYiT5uT4P9ixG49WrSYKFxE,7551
@@ -88,7 +88,7 @@ mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGf
88
88
  mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
89
89
  mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
90
90
  mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
91
- mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=zrCdPwI4M8nu0FEfd7lRJAe0z8kYq3KFbzwElM82USE,11174
91
+ mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=6z0jYO6S8wAmfHe5tAyEfzWZIQv8wrn_dRU9GC7oKro,11435
92
92
  mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
93
93
  mineru/model/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
94
94
  mineru/model/utils/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,17 +151,17 @@ mineru/model/utils/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-
151
151
  mineru/model/utils/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
152
152
  mineru/model/vlm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
153
  mineru/model/vlm/lmdeploy_server.py,sha256=PvxJNcUIKB8VzWMDXeV1t0SHSgz_ULO36ZAzJbppz90,3262
154
- mineru/model/vlm/vllm_server.py,sha256=w5ddusPbcVaEoWAo_BRjmwv_Ywxrc_bCMRhxihoyykY,2263
154
+ mineru/model/vlm/vllm_server.py,sha256=DtYRAHINYN4qkR2onVMofvANPTkSP6tE4IRY_vZgpiA,3079
155
155
  mineru/resources/header.html,sha256=7xrf6bGloR-3ZeTDyA-JvavE_NeRuUDe3p07cEKUXSI,4769
156
156
  mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
157
157
  mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
158
158
  mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
159
- mineru/utils/block_sort.py,sha256=5e1mOLB3W7xu5Y1hmhvGSHPL_aQ41R_4VXcP4vjYAOU,12976
159
+ mineru/utils/block_sort.py,sha256=MmgjZBcmaWssAglzE75VixjtJ_BLNUHO0gvCNQHvlY4,13538
160
160
  mineru/utils/boxbase.py,sha256=xnGA1k7hVtTQrreqlJmK-SA3y9edTHgLmGiqGrSXckE,7568
161
161
  mineru/utils/char_utils.py,sha256=74T5Ylr5mi1uddAIuJku9Z6sH7vhR7t595_H7qmbu4c,1777
162
162
  mineru/utils/check_sys_env.py,sha256=TRjzg4xWyoSGrgv4KaP225A-99xBgLAfZ1cPcGqrBAA,1191
163
163
  mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
164
- mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
164
+ mineru/utils/config_reader.py,sha256=mfulokOzI-33sZy7S-wEVbY3z01PdC7X3761fuhqR3s,4393
165
165
  mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
166
166
  mineru/utils/draw_bbox.py,sha256=FkgppjUzRhN-uxvChdkhHXcDavJEaApMD6qC6qoRwfQ,20292
167
167
  mineru/utils/engine_utils.py,sha256=Jmao9-O-sZDzH7vANKEDaY6NJ8tuthKsTr23LFIeBLU,2203
@@ -171,8 +171,8 @@ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6o
171
171
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
172
172
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
173
173
  mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
174
- mineru/utils/magic_model_utils.py,sha256=I6vdN56aqhQBGOasoWHiJbjnXsBwUojw6xFjbWZSHaU,8656
175
- mineru/utils/model_utils.py,sha256=6moOQqE5ShHaJKkENXP8BXJA7RCWtOGlYHZ3nidwmZs,18977
174
+ mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
175
+ mineru/utils/model_utils.py,sha256=w-jSN7Ilh27FlMjPpKNO6MPbo_dT5Ln7zCQcXaREl_k,19605
176
176
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
177
177
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
178
178
  mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
@@ -184,10 +184,10 @@ mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ
184
184
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
185
185
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
186
186
  mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
187
- mineru/utils/table_merge.py,sha256=X2vQCCKx8hG9Iipn4UEP8pXHc9jeNmYNYvl5zxaTS2E,15185
188
- mineru-2.7.1.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
- mineru-2.7.1.dist-info/METADATA,sha256=gtaeoZmMvmHA8JDW1QnpszDa0-cTwogQ-5BOPTdikWA,35540
190
- mineru-2.7.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
191
- mineru-2.7.1.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
- mineru-2.7.1.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
- mineru-2.7.1.dist-info/RECORD,,
187
+ mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
188
+ mineru-2.7.3.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
+ mineru-2.7.3.dist-info/METADATA,sha256=XDUBoY78vVkmR2TFpXk_frncPD6D_Ev067KuoRUJR2U,36621
190
+ mineru-2.7.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
191
+ mineru-2.7.3.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
+ mineru-2.7.3.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
+ mineru-2.7.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5