mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. mineru/backend/hybrid/__init__.py +1 -0
  2. mineru/backend/hybrid/hybrid_analyze.py +526 -0
  3. mineru/backend/hybrid/hybrid_magic_model.py +617 -0
  4. mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
  5. mineru/backend/pipeline/batch_analyze.py +9 -1
  6. mineru/backend/pipeline/model_init.py +96 -1
  7. mineru/backend/pipeline/pipeline_analyze.py +6 -4
  8. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
  9. mineru/backend/vlm/utils.py +3 -1
  10. mineru/backend/vlm/vlm_analyze.py +12 -12
  11. mineru/backend/vlm/vlm_magic_model.py +24 -89
  12. mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
  13. mineru/cli/client.py +17 -17
  14. mineru/cli/common.py +170 -20
  15. mineru/cli/fast_api.py +39 -13
  16. mineru/cli/gradio_app.py +232 -206
  17. mineru/model/mfd/yolo_v8.py +12 -6
  18. mineru/model/mfr/unimernet/Unimernet.py +71 -3
  19. mineru/resources/header.html +5 -1
  20. mineru/utils/boxbase.py +23 -0
  21. mineru/utils/char_utils.py +55 -0
  22. mineru/utils/engine_utils.py +74 -0
  23. mineru/utils/enum_class.py +18 -1
  24. mineru/utils/magic_model_utils.py +85 -2
  25. mineru/utils/span_pre_proc.py +5 -3
  26. mineru/utils/table_merge.py +5 -21
  27. mineru/version.py +1 -1
  28. mineru-2.7.0.dist-info/METADATA +433 -0
  29. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
  30. mineru-2.6.8.dist-info/METADATA +0 -954
  31. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
  32. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
  33. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
  34. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
@@ -2,6 +2,8 @@ import torch
2
2
  from torch.utils.data import DataLoader, Dataset
3
3
  from tqdm import tqdm
4
4
 
5
+ from mineru.utils.boxbase import calculate_iou
6
+
5
7
 
6
8
  class MathDataset(Dataset):
7
9
  def __init__(self, image_paths, transform=None):
@@ -31,11 +33,64 @@ class UnimernetModel(object):
31
33
  self.model = self.model.to(dtype=torch.float16)
32
34
  self.model.eval()
33
35
 
36
+ @staticmethod
37
+ def _filter_boxes_by_iou(xyxy, conf, cla, iou_threshold=0.8):
38
+ """过滤IOU超过阈值的重叠框,保留置信度较高的框。
39
+
40
+ Args:
41
+ xyxy: 框坐标张量,shape为(N, 4)
42
+ conf: 置信度张量,shape为(N,)
43
+ cla: 类别张量,shape为(N,)
44
+ iou_threshold: IOU阈值,默认0.9
45
+
46
+ Returns:
47
+ 过滤后的xyxy, conf, cla张量
48
+ """
49
+ if len(xyxy) == 0:
50
+ return xyxy, conf, cla
51
+
52
+ # 转换为CPU进行处理
53
+ xyxy_cpu = xyxy.cpu()
54
+ conf_cpu = conf.cpu()
55
+
56
+ n = len(xyxy_cpu)
57
+ keep = [True] * n
58
+
59
+ for i in range(n):
60
+ if not keep[i]:
61
+ continue
62
+ bbox1 = xyxy_cpu[i].tolist()
63
+ for j in range(i + 1, n):
64
+ if not keep[j]:
65
+ continue
66
+ bbox2 = xyxy_cpu[j].tolist()
67
+ iou = calculate_iou(bbox1, bbox2)
68
+ if iou > iou_threshold:
69
+ # 保留置信度较高的框
70
+ if conf_cpu[i] >= conf_cpu[j]:
71
+ keep[j] = False
72
+ else:
73
+ keep[i] = False
74
+ break # i被删除,跳出内循环
75
+
76
+ keep_indices = [i for i in range(n) if keep[i]]
77
+ if len(keep_indices) == n:
78
+ return xyxy, conf, cla
79
+
80
+ keep_indices = torch.tensor(keep_indices, dtype=torch.long)
81
+ return xyxy[keep_indices], conf[keep_indices], cla[keep_indices]
82
+
34
83
  def predict(self, mfd_res, image):
35
84
  formula_list = []
36
85
  mf_image_list = []
86
+
87
+ # 对检测框进行IOU去重,保留置信度较高的框
88
+ xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
89
+ mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
90
+ )
91
+
37
92
  for xyxy, conf, cla in zip(
38
- mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()
93
+ xyxy_filtered.cpu(), conf_filtered.cpu(), cla_filtered.cpu()
39
94
  ):
40
95
  xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
41
96
  new_item = {
@@ -61,7 +116,13 @@ class UnimernetModel(object):
61
116
  res["latex"] = latex
62
117
  return formula_list
63
118
 
64
- def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
119
+ def batch_predict(
120
+ self,
121
+ images_mfd_res: list,
122
+ images: list,
123
+ batch_size: int = 64,
124
+ interline_enable: bool = True,
125
+ ) -> list:
65
126
  images_formula_list = []
66
127
  mf_image_list = []
67
128
  backfill_list = []
@@ -73,9 +134,16 @@ class UnimernetModel(object):
73
134
  image = images[image_index]
74
135
  formula_list = []
75
136
 
137
+ # 对检测框进行IOU去重,保留置信度较高的框
138
+ xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
139
+ mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
140
+ )
141
+
76
142
  for idx, (xyxy, conf, cla) in enumerate(zip(
77
- mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
143
+ xyxy_filtered, conf_filtered, cla_filtered
78
144
  )):
145
+ if not interline_enable and cla.item() == 1:
146
+ continue # Skip interline regions if not enabled
79
147
  xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
80
148
  new_item = {
81
149
  "category_id": 13 + int(cla.item()),
@@ -66,7 +66,11 @@
66
66
  color: #fafafa;
67
67
  opacity: 0.8;
68
68
  ">
69
- A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
69
+ A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
70
+ If you found our project helpful, please give us a ⭐️ to support us!
71
+ <a href="https://github.com/opendatalab/MinerU" style="display: inline-flex; align-items: center;">
72
+ <img src="https://img.shields.io/github/stars/opendatalab/MinerU.svg" alt="stars" style="vertical-align: middle; position: relative; top: 5px;">
73
+ </a>
70
74
  </p>
71
75
  <style>
72
76
  .link-block {
mineru/utils/boxbase.py CHANGED
@@ -74,6 +74,29 @@ def bbox_distance(bbox1, bbox2):
74
74
  return 0.0
75
75
 
76
76
 
77
+ def bbox_center_distance(bbox1, bbox2):
78
+ """计算两个矩形框中心点之间的欧氏距离。
79
+
80
+ Args:
81
+ bbox1 (tuple): 第一个矩形框的坐标,格式为 (x1, y1, x2, y2)
82
+ bbox2 (tuple): 第二个矩形框的坐标,格式为 (x1, y1, x2, y2)
83
+
84
+ Returns:
85
+ float: 两个矩形框中心点之间的距离
86
+ """
87
+ x1, y1, x1b, y1b = bbox1
88
+ x2, y2, x2b, y2b = bbox2
89
+
90
+ # 计算中心点
91
+ center1_x = (x1 + x1b) / 2
92
+ center1_y = (y1 + y1b) / 2
93
+ center2_x = (x2 + x2b) / 2
94
+ center2_y = (y2 + y2b) / 2
95
+
96
+ # 计算欧氏距离
97
+ return math.sqrt((center1_x - center2_x) ** 2 + (center1_y - center2_y) ** 2)
98
+
99
+
77
100
  def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
78
101
  """通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例
79
102
  如果比例大于ratio,则返回小的那个bbox, 否则返回None."""
@@ -0,0 +1,55 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ import re
3
+
4
+
5
+ def is_hyphen_at_line_end(line):
6
+ """Check if a line ends with one or more letters followed by a hyphen.
7
+
8
+ Args:
9
+ line (str): The line of text to check.
10
+
11
+ Returns:
12
+ bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
13
+ """
14
+ # Use regex to check if the line ends with one or more letters followed by a hyphen
15
+ return bool(re.search(r'[A-Za-z]+-\s*$', line))
16
+
17
+
18
+ def full_to_half_exclude_marks(text: str) -> str:
19
+ """Convert full-width characters to half-width characters using code point manipulation.
20
+
21
+ Args:
22
+ text: String containing full-width characters
23
+
24
+ Returns:
25
+ String with full-width characters converted to half-width
26
+ """
27
+ result = []
28
+ for char in text:
29
+ code = ord(char)
30
+ # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
31
+ if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
32
+ result.append(chr(code - 0xFEE0)) # Shift to ASCII range
33
+ else:
34
+ result.append(char)
35
+ return ''.join(result)
36
+
37
+
38
+ def full_to_half(text: str) -> str:
39
+ """Convert full-width characters to half-width characters using code point manipulation.
40
+
41
+ Args:
42
+ text: String containing full-width characters
43
+
44
+ Returns:
45
+ String with full-width characters converted to half-width
46
+ """
47
+ result = []
48
+ for char in text:
49
+ code = ord(char)
50
+ # Full-width letters, numbers and punctuation (FF01-FF5E)
51
+ if 0xFF01 <= code <= 0xFF5E:
52
+ result.append(chr(code - 0xFEE0)) # Shift to ASCII range
53
+ else:
54
+ result.append(char)
55
+ return ''.join(result)
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ from loguru import logger
3
+
4
+ from mineru.utils.check_sys_env import is_mac_os_version_supported, is_windows_environment, is_mac_environment, \
5
+ is_linux_environment
6
+
7
+
8
+ def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
9
+ """
10
+ 自动选择或验证 VLM 推理引擎
11
+
12
+ Args:
13
+ inference_engine: 指定的引擎名称或 'auto' 进行自动选择
14
+ is_async: 是否使用异步引擎(仅对 vllm 有效)
15
+
16
+ Returns:
17
+ 最终选择的引擎名称
18
+ """
19
+ if inference_engine == 'auto':
20
+ # 根据操作系统自动选择引擎
21
+ if is_windows_environment():
22
+ inference_engine = _select_windows_engine()
23
+ elif is_linux_environment():
24
+ inference_engine = _select_linux_engine(is_async)
25
+ elif is_mac_environment():
26
+ inference_engine = _select_mac_engine()
27
+ else:
28
+ logger.warning("Unknown operating system, falling back to transformers")
29
+ inference_engine = 'transformers'
30
+
31
+ formatted_engine = _format_engine_name(inference_engine)
32
+ logger.info(f"Using {formatted_engine} as the inference engine for VLM.")
33
+ return formatted_engine
34
+
35
+
36
+ def _select_windows_engine() -> str:
37
+ """Windows 平台引擎选择"""
38
+ try:
39
+ import lmdeploy
40
+ return 'lmdeploy'
41
+ except ImportError:
42
+ return 'transformers'
43
+
44
+
45
+ def _select_linux_engine(is_async: bool) -> str:
46
+ """Linux 平台引擎选择"""
47
+ try:
48
+ import vllm
49
+ return 'vllm-async' if is_async else 'vllm'
50
+ except ImportError:
51
+ try:
52
+ import lmdeploy
53
+ return 'lmdeploy'
54
+ except ImportError:
55
+ return 'transformers'
56
+
57
+
58
+ def _select_mac_engine() -> str:
59
+ """macOS 平台引擎选择"""
60
+ try:
61
+ from mlx_vlm import load as mlx_load
62
+ if is_mac_os_version_supported():
63
+ return 'mlx'
64
+ else:
65
+ return 'transformers'
66
+ except ImportError:
67
+ return 'transformers'
68
+
69
+
70
+ def _format_engine_name(engine: str) -> str:
71
+ """统一格式化引擎名称"""
72
+ if engine != 'transformers':
73
+ return f"{engine}-engine"
74
+ return engine
@@ -1,3 +1,5 @@
1
+ from enum import Enum
2
+
1
3
  class BlockType:
2
4
  IMAGE = 'image'
3
5
  TABLE = 'table'
@@ -112,4 +114,19 @@ class SplitFlag:
112
114
 
113
115
  class ImageType:
114
116
  PIL = 'pil_img'
115
- BASE64 = 'base64_img'
117
+ BASE64 = 'base64_img'
118
+
119
+
120
+ class NotExtractType(Enum):
121
+ TEXT = BlockType.TEXT
122
+ TITLE = BlockType.TITLE
123
+ HEADER = BlockType.HEADER
124
+ FOOTER = BlockType.FOOTER
125
+ PAGE_NUMBER = BlockType.PAGE_NUMBER
126
+ PAGE_FOOTNOTE = BlockType.PAGE_FOOTNOTE
127
+ REF_TEXT = BlockType.REF_TEXT
128
+ TABLE_CAPTION = BlockType.TABLE_CAPTION
129
+ IMAGE_CAPTION = BlockType.IMAGE_CAPTION
130
+ TABLE_FOOTNOTE = BlockType.TABLE_FOOTNOTE
131
+ IMAGE_FOOTNOTE = BlockType.IMAGE_FOOTNOTE
132
+ CODE_CAPTION = BlockType.CODE_CAPTION
@@ -2,7 +2,7 @@
2
2
  包含两个MagicModel类中重复使用的方法和逻辑
3
3
  """
4
4
  from typing import List, Dict, Any, Callable
5
- from mineru.utils.boxbase import bbox_distance, is_in
5
+ from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
6
6
 
7
7
 
8
8
  def reduct_overlap(bboxes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -165,4 +165,87 @@ def tie_up_category_by_distance_v3(
165
165
  }
166
166
  )
167
167
 
168
- return ret
168
+ return ret
169
+
170
+
171
+ def tie_up_category_by_index(
172
+ get_subjects_func: Callable,
173
+ get_objects_func: Callable,
174
+ extract_subject_func: Callable = None,
175
+ extract_object_func: Callable = None
176
+ ):
177
+ """
178
+ 基于index的类别关联方法,用于将主体对象与客体对象进行关联
179
+ 客体优先匹配给index最接近的主体,index差值相同时使用bbox中心点距离作为tiebreaker
180
+
181
+ 参数:
182
+ get_subjects_func: 函数,提取主体对象
183
+ get_objects_func: 函数,提取客体对象
184
+ extract_subject_func: 函数,自定义提取主体属性(默认使用bbox和其他属性)
185
+ extract_object_func: 函数,自定义提取客体属性(默认使用bbox和其他属性)
186
+
187
+ 返回:
188
+ 关联后的对象列表,按主体index升序排列
189
+ """
190
+ subjects = get_subjects_func()
191
+ objects = get_objects_func()
192
+
193
+ # 如果没有提供自定义提取函数,使用默认函数
194
+ if extract_subject_func is None:
195
+ extract_subject_func = lambda x: x
196
+ if extract_object_func is None:
197
+ extract_object_func = lambda x: x
198
+
199
+ # 初始化结果字典,key为主体索引,value为关联信息
200
+ result_dict = {}
201
+
202
+ # 初始化所有主体
203
+ for i, subject in enumerate(subjects):
204
+ result_dict[i] = {
205
+ "sub_bbox": extract_subject_func(subject),
206
+ "obj_bboxes": [],
207
+ "sub_idx": i,
208
+ }
209
+
210
+ # 为每个客体找到最匹配的主体
211
+ for obj in objects:
212
+ if len(subjects) == 0:
213
+ # 如果没有主体,跳过客体
214
+ continue
215
+
216
+ obj_index = obj["index"]
217
+ min_index_diff = float("inf")
218
+ best_subject_indices = []
219
+
220
+ # 找出index差值最小的所有主体
221
+ for i, subject in enumerate(subjects):
222
+ sub_index = subject["index"]
223
+ index_diff = abs(obj_index - sub_index)
224
+
225
+ if index_diff < min_index_diff:
226
+ min_index_diff = index_diff
227
+ best_subject_indices = [i]
228
+ elif index_diff == min_index_diff:
229
+ best_subject_indices.append(i)
230
+
231
+ # 如果有多个主体的index差值相同,使用中心点距离作为tiebreaker
232
+ if len(best_subject_indices) > 1:
233
+ min_center_dist = float("inf")
234
+ best_subject_idx = best_subject_indices[0]
235
+
236
+ for idx in best_subject_indices:
237
+ center_dist = bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])
238
+ if center_dist < min_center_dist:
239
+ min_center_dist = center_dist
240
+ best_subject_idx = idx
241
+ else:
242
+ best_subject_idx = best_subject_indices[0]
243
+
244
+ # 将客体添加到最佳主体的obj_bboxes中
245
+ result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
246
+
247
+ # 转换为列表并按主体index排序
248
+ ret = list(result_dict.values())
249
+ ret.sort(key=lambda x: x["sub_idx"])
250
+
251
+ return ret
@@ -1,5 +1,6 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
2
  import collections
3
+ import math
3
4
  import re
4
5
  import statistics
5
6
 
@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
128
129
  page_all_lines = []
129
130
  for block in page_dict['blocks']:
130
131
  for line in block['lines']:
131
- if 0 < abs(line['rotation']) < 90:
132
- # 旋转角度在0-90度之间的行,直接跳过
132
+ rotation_degrees = math.degrees(line['rotation'])
133
+ # 旋转角度不为0, 90, 180, 270的行,直接跳过(rotation_degrees的值可能不为整数)
134
+ if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
133
135
  continue
134
136
  page_all_lines.append(line)
135
137
  for span in line['spans']:
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
159
161
  if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
160
162
  continue
161
163
  if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
162
- if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
164
+ if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
163
165
  vertical_spans.append(span)
164
166
  elif block in all_bboxes:
165
167
  useful_spans.append(span)
@@ -1,35 +1,17 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
+ from copy import deepcopy
2
3
 
3
4
  from loguru import logger
4
5
  from bs4 import BeautifulSoup
5
6
 
6
7
  from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
8
+ from mineru.utils.char_utils import full_to_half
7
9
  from mineru.utils.enum_class import BlockType, SplitFlag
8
10
 
9
11
 
10
12
  CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
11
13
 
12
14
 
13
- def full_to_half(text: str) -> str:
14
- """Convert full-width characters to half-width characters using code point manipulation.
15
-
16
- Args:
17
- text: String containing full-width characters
18
-
19
- Returns:
20
- String with full-width characters converted to half-width
21
- """
22
- result = []
23
- for char in text:
24
- code = ord(char)
25
- # Full-width letters, numbers and punctuation (FF01-FF5E)
26
- if 0xFF01 <= code <= 0xFF5E:
27
- result.append(chr(code - 0xFEE0)) # Shift to ASCII range
28
- else:
29
- result.append(char)
30
- return ''.join(result)
31
-
32
-
33
15
  def calculate_table_total_columns(soup):
34
16
  """计算表格的总列数,通过分析整个表格结构来处理rowspan和colspan
35
17
 
@@ -296,6 +278,8 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
296
278
  current_cols: 当前总列数
297
279
  reference_row: 参考行对象
298
280
  """
281
+ reference_row_copy = deepcopy(reference_row)
282
+
299
283
  for i in range(start_idx, end_idx):
300
284
  row = rows[i]
301
285
  cells = row.find_all(["td", "th"])
@@ -307,7 +291,7 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
307
291
  continue
308
292
 
309
293
  # 检查是否与参考行结构匹配
310
- if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row):
294
+ if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row_copy):
311
295
  # 尝试应用参考结构
312
296
  if len(cells) <= len(reference_structure):
313
297
  for j, cell in enumerate(cells):
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.6.8"
1
+ __version__ = "2.7.0"