mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/__init__.py +1 -0
- mineru/backend/hybrid/hybrid_analyze.py +526 -0
- mineru/backend/hybrid/hybrid_magic_model.py +617 -0
- mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
- mineru/backend/pipeline/batch_analyze.py +9 -1
- mineru/backend/pipeline/model_init.py +96 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -4
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
- mineru/backend/vlm/utils.py +3 -1
- mineru/backend/vlm/vlm_analyze.py +12 -12
- mineru/backend/vlm/vlm_magic_model.py +24 -89
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
- mineru/cli/client.py +17 -17
- mineru/cli/common.py +169 -20
- mineru/cli/fast_api.py +39 -13
- mineru/cli/gradio_app.py +232 -206
- mineru/model/mfd/yolo_v8.py +12 -6
- mineru/model/mfr/unimernet/Unimernet.py +71 -3
- mineru/resources/header.html +5 -1
- mineru/utils/boxbase.py +23 -0
- mineru/utils/char_utils.py +55 -0
- mineru/utils/engine_utils.py +74 -0
- mineru/utils/enum_class.py +18 -1
- mineru/utils/magic_model_utils.py +85 -2
- mineru/utils/pdf_image_tools.py +37 -17
- mineru/utils/span_pre_proc.py +5 -3
- mineru/utils/table_merge.py +13 -22
- mineru/version.py +1 -1
- mineru-2.7.1.dist-info/METADATA +438 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/RECORD +34 -28
- mineru-2.6.8.dist-info/METADATA +0 -954
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0
mineru/model/mfd/yolo_v8.py
CHANGED
|
@@ -27,31 +27,37 @@ class YOLOv8MFDModel:
|
|
|
27
27
|
def _run_predict(
|
|
28
28
|
self,
|
|
29
29
|
inputs: Union[np.ndarray, Image.Image, List],
|
|
30
|
-
is_batch: bool = False
|
|
30
|
+
is_batch: bool = False,
|
|
31
|
+
conf: float = None,
|
|
31
32
|
) -> List:
|
|
32
33
|
preds = self.model.predict(
|
|
33
34
|
inputs,
|
|
34
35
|
imgsz=self.imgsz,
|
|
35
|
-
conf=self.conf,
|
|
36
|
+
conf=conf if conf is not None else self.conf,
|
|
36
37
|
iou=self.iou,
|
|
37
38
|
verbose=False,
|
|
38
39
|
device=self.device
|
|
39
40
|
)
|
|
40
41
|
return [pred.cpu() for pred in preds] if is_batch else preds[0].cpu()
|
|
41
42
|
|
|
42
|
-
def predict(
|
|
43
|
-
|
|
43
|
+
def predict(
|
|
44
|
+
self,
|
|
45
|
+
image: Union[np.ndarray, Image.Image],
|
|
46
|
+
conf: float = None,
|
|
47
|
+
):
|
|
48
|
+
return self._run_predict(image, is_batch=False, conf=conf)
|
|
44
49
|
|
|
45
50
|
def batch_predict(
|
|
46
51
|
self,
|
|
47
52
|
images: List[Union[np.ndarray, Image.Image]],
|
|
48
|
-
batch_size: int = 4
|
|
53
|
+
batch_size: int = 4,
|
|
54
|
+
conf: float = None,
|
|
49
55
|
) -> List:
|
|
50
56
|
results = []
|
|
51
57
|
with tqdm(total=len(images), desc="MFD Predict") as pbar:
|
|
52
58
|
for idx in range(0, len(images), batch_size):
|
|
53
59
|
batch = images[idx: idx + batch_size]
|
|
54
|
-
batch_preds = self._run_predict(batch, is_batch=True)
|
|
60
|
+
batch_preds = self._run_predict(batch, is_batch=True, conf=conf)
|
|
55
61
|
results.extend(batch_preds)
|
|
56
62
|
pbar.update(len(batch))
|
|
57
63
|
return results
|
|
@@ -2,6 +2,8 @@ import torch
|
|
|
2
2
|
from torch.utils.data import DataLoader, Dataset
|
|
3
3
|
from tqdm import tqdm
|
|
4
4
|
|
|
5
|
+
from mineru.utils.boxbase import calculate_iou
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class MathDataset(Dataset):
|
|
7
9
|
def __init__(self, image_paths, transform=None):
|
|
@@ -31,11 +33,64 @@ class UnimernetModel(object):
|
|
|
31
33
|
self.model = self.model.to(dtype=torch.float16)
|
|
32
34
|
self.model.eval()
|
|
33
35
|
|
|
36
|
+
@staticmethod
def _filter_boxes_by_iou(xyxy, conf, cla, iou_threshold=0.8):
    """Drop near-duplicate detection boxes, keeping the more confident one.

    Every pair of boxes that is still marked as kept is compared with
    ``calculate_iou``. When the IoU is strictly greater than
    ``iou_threshold`` the box with the lower confidence is discarded; on a
    confidence tie the earlier box wins.

    Args:
        xyxy: Tensor of box coordinates, one row per box (documented by the
            original author as shape (N, 4)).
        conf: Tensor of per-box confidences, aligned with ``xyxy``.
        cla: Tensor of per-box class ids, aligned with ``xyxy``.
        iou_threshold: IoU above which two boxes count as duplicates.
            Defaults to 0.8.

    Returns:
        Tuple ``(xyxy, conf, cla)`` restricted to the surviving boxes. The
        input tensors are returned as-is when nothing is removed.
    """
    if len(xyxy) == 0:
        return xyxy, conf, cla

    # Convert to plain Python lists once, so the O(n^2) pair loop below does
    # not rebuild a list from a tensor row on every single comparison.
    boxes = xyxy.cpu().tolist()
    scores = conf.cpu().tolist()

    n = len(boxes)
    keep = [True] * n

    for i in range(n):
        if not keep[i]:
            continue
        for j in range(i + 1, n):
            if not keep[j]:
                continue
            if calculate_iou(boxes[i], boxes[j]) > iou_threshold:
                if scores[i] >= scores[j]:
                    keep[j] = False
                else:
                    keep[i] = False
                    break  # box i was dropped; nothing left to compare it with

    keep_indices = [i for i, kept in enumerate(keep) if kept]
    if len(keep_indices) == n:
        return xyxy, conf, cla

    keep_indices = torch.tensor(keep_indices, dtype=torch.long)
    return xyxy[keep_indices], conf[keep_indices], cla[keep_indices]
|
|
82
|
+
|
|
34
83
|
def predict(self, mfd_res, image):
|
|
35
84
|
formula_list = []
|
|
36
85
|
mf_image_list = []
|
|
86
|
+
|
|
87
|
+
# 对检测框进行IOU去重,保留置信度较高的框
|
|
88
|
+
xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
|
|
89
|
+
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
|
|
90
|
+
)
|
|
91
|
+
|
|
37
92
|
for xyxy, conf, cla in zip(
|
|
38
|
-
|
|
93
|
+
xyxy_filtered.cpu(), conf_filtered.cpu(), cla_filtered.cpu()
|
|
39
94
|
):
|
|
40
95
|
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
|
|
41
96
|
new_item = {
|
|
@@ -61,7 +116,13 @@ class UnimernetModel(object):
|
|
|
61
116
|
res["latex"] = latex
|
|
62
117
|
return formula_list
|
|
63
118
|
|
|
64
|
-
def batch_predict(
|
|
119
|
+
def batch_predict(
|
|
120
|
+
self,
|
|
121
|
+
images_mfd_res: list,
|
|
122
|
+
images: list,
|
|
123
|
+
batch_size: int = 64,
|
|
124
|
+
interline_enable: bool = True,
|
|
125
|
+
) -> list:
|
|
65
126
|
images_formula_list = []
|
|
66
127
|
mf_image_list = []
|
|
67
128
|
backfill_list = []
|
|
@@ -73,9 +134,16 @@ class UnimernetModel(object):
|
|
|
73
134
|
image = images[image_index]
|
|
74
135
|
formula_list = []
|
|
75
136
|
|
|
137
|
+
# 对检测框进行IOU去重,保留置信度较高的框
|
|
138
|
+
xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
|
|
139
|
+
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
|
|
140
|
+
)
|
|
141
|
+
|
|
76
142
|
for idx, (xyxy, conf, cla) in enumerate(zip(
|
|
77
|
-
|
|
143
|
+
xyxy_filtered, conf_filtered, cla_filtered
|
|
78
144
|
)):
|
|
145
|
+
if not interline_enable and cla.item() == 1:
|
|
146
|
+
continue # Skip interline regions if not enabled
|
|
79
147
|
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
|
|
80
148
|
new_item = {
|
|
81
149
|
"category_id": 13 + int(cla.item()),
|
mineru/resources/header.html
CHANGED
|
@@ -66,7 +66,11 @@
|
|
|
66
66
|
color: #fafafa;
|
|
67
67
|
opacity: 0.8;
|
|
68
68
|
">
|
|
69
|
-
|
|
69
|
+
A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
|
|
70
|
+
If you found our project helpful, please give us a ⭐️ to support us!
|
|
71
|
+
<a href="https://github.com/opendatalab/MinerU" style="display: inline-flex; align-items: center;">
|
|
72
|
+
<img src="https://img.shields.io/github/stars/opendatalab/MinerU.svg" alt="stars" style="vertical-align: middle; position: relative; top: 5px;">
|
|
73
|
+
</a>
|
|
70
74
|
</p>
|
|
71
75
|
<style>
|
|
72
76
|
.link-block {
|
mineru/utils/boxbase.py
CHANGED
|
@@ -74,6 +74,29 @@ def bbox_distance(bbox1, bbox2):
|
|
|
74
74
|
return 0.0
|
|
75
75
|
|
|
76
76
|
|
|
77
|
+
def bbox_center_distance(bbox1, bbox2):
    """Return the Euclidean distance between the centers of two boxes.

    Args:
        bbox1 (tuple): First box as (x1, y1, x2, y2).
        bbox2 (tuple): Second box as (x1, y1, x2, y2).

    Returns:
        float: Distance between the two center points.
    """
    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2

    delta_x = (ax0 + ax1) / 2 - (bx0 + bx1) / 2
    delta_y = (ay0 + ay1) / 2 - (by0 + by1) / 2

    # math.hypot avoids squaring the deltas explicitly, so very large
    # coordinates no longer raise OverflowError the way `** 2` does.
    return math.hypot(delta_x, delta_y)
|
|
98
|
+
|
|
99
|
+
|
|
77
100
|
def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
|
|
78
101
|
"""通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例
|
|
79
102
|
如果比例大于ratio,则返回小的那个bbox, 否则返回None."""
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def is_hyphen_at_line_end(line):
    """Check whether a line ends with an ASCII letter followed by a hyphen.

    Trailing whitespace after the hyphen is ignored.

    Args:
        line (str): The line of text to check.

    Returns:
        bool: True if the line ends with a letter immediately followed by a
            hyphen (optionally followed by whitespace), False otherwise.
    """
    # One letter before the hyphen is enough: a run of letters before "-"
    # always contains a final letter adjacent to it. Dropping the `+`
    # quantifier keeps the result identical while avoiding quadratic
    # backtracking on long runs of letters.
    return bool(re.search(r'[A-Za-z]-\s*$', line))
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Translation table for full-width digits (FF10-FF19), upper-case letters
# (FF21-FF3A) and lower-case letters (FF41-FF5A); each maps to the ASCII
# character 0xFEE0 code points below it.
_FULLWIDTH_ALNUM_TO_ASCII = {}
for _span in (range(0xFF10, 0xFF1A), range(0xFF21, 0xFF3B), range(0xFF41, 0xFF5B)):
    _FULLWIDTH_ALNUM_TO_ASCII.update({_code: _code - 0xFEE0 for _code in _span})


def full_to_half_exclude_marks(text: str) -> str:
    """Convert full-width letters and digits to half-width, leaving marks alone.

    Only full-width A-Z, a-z and 0-9 are converted. Full-width punctuation
    and every other character are returned unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        String with full-width letters and digits replaced by their ASCII
        equivalents.
    """
    # A single C-level pass instead of a per-character Python loop.
    return text.translate(_FULLWIDTH_ALNUM_TO_ASCII)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Translation table for the full-width block FF01-FF5E (letters, digits and
# punctuation); each code point maps to the ASCII character 0xFEE0 below it.
_FULLWIDTH_TO_ASCII = {_code: _code - 0xFEE0 for _code in range(0xFF01, 0xFF5F)}


def full_to_half(text: str) -> str:
    """Convert full-width letters, digits and punctuation to half-width.

    Characters in the range FF01-FF5E are shifted down into the ASCII range;
    every other character is returned unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        String with full-width characters replaced by their ASCII
        equivalents.
    """
    # A single C-level pass instead of a per-character Python loop.
    return text.translate(_FULLWIDTH_TO_ASCII)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
from loguru import logger
|
|
3
|
+
|
|
4
|
+
from mineru.utils.check_sys_env import is_mac_os_version_supported, is_windows_environment, is_mac_environment, \
|
|
5
|
+
is_linux_environment
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
    """Resolve the VLM inference engine name, auto-selecting it on request.

    Args:
        inference_engine: Engine name, or 'auto' to pick one based on the
            operating system.
        is_async: Whether the async engine variant is wanted; only passed on
            to the Linux selector.

    Returns:
        The chosen engine name after formatting by ``_format_engine_name``.
    """
    chosen = inference_engine

    if chosen == 'auto':
        # Platforms are probed in a fixed order: Windows, Linux, then macOS.
        if is_windows_environment():
            chosen = _select_windows_engine()
        elif is_linux_environment():
            chosen = _select_linux_engine(is_async)
        elif is_mac_environment():
            chosen = _select_mac_engine()
        else:
            logger.warning("Unknown operating system, falling back to transformers")
            chosen = 'transformers'

    engine_name = _format_engine_name(chosen)
    logger.info(f"Using {engine_name} as the inference engine for VLM.")
    return engine_name
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _select_windows_engine() -> str:
    """Pick the engine on Windows: lmdeploy if importable, else transformers."""
    try:
        import lmdeploy  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return 'transformers'
    return 'lmdeploy'
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _select_linux_engine(is_async: bool) -> str:
    """Pick the engine on Linux: vllm first, then lmdeploy, then transformers.

    Args:
        is_async: When vllm is importable, selects 'vllm-async' instead of
            'vllm'. It has no effect on the other outcomes.
    """
    try:
        import vllm  # noqa: F401 -- imported only to probe availability
    except ImportError:
        pass
    else:
        return 'vllm-async' if is_async else 'vllm'

    try:
        import lmdeploy  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return 'transformers'
    return 'lmdeploy'
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _select_mac_engine() -> str:
    """Pick the engine on macOS: mlx if usable, else transformers.

    'mlx' is returned only when ``mlx_vlm`` can be imported and
    ``is_mac_os_version_supported()`` is truthy.
    """
    try:
        from mlx_vlm import load as mlx_load  # noqa: F401 -- availability probe
        # Kept inside the try so an ImportError raised by the version check
        # still falls back to transformers.
        version_ok = is_mac_os_version_supported()
    except ImportError:
        return 'transformers'
    return 'mlx' if version_ok else 'transformers'
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _format_engine_name(engine: str) -> str:
    """Normalise an engine name to its '<name>-engine' form.

    'transformers' is returned unchanged. A name that already ends with
    '-engine' is also returned unchanged, so formatting twice cannot produce
    '<name>-engine-engine'.

    Args:
        engine: Raw engine name, e.g. 'vllm', 'vllm-async' or 'transformers'.

    Returns:
        The formatted engine name.
    """
    if engine == 'transformers' or engine.endswith('-engine'):
        return engine
    return f"{engine}-engine"
|
mineru/utils/enum_class.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
1
3
|
class BlockType:
|
|
2
4
|
IMAGE = 'image'
|
|
3
5
|
TABLE = 'table'
|
|
@@ -112,4 +114,19 @@ class SplitFlag:
|
|
|
112
114
|
|
|
113
115
|
class ImageType:
|
|
114
116
|
PIL = 'pil_img'
|
|
115
|
-
BASE64 = 'base64_img'
|
|
117
|
+
BASE64 = 'base64_img'
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class NotExtractType(Enum):
    """Enum whose member values reuse the matching ``BlockType`` constants.

    NOTE(review): judging by the name, these look like the block types that
    are excluded from some extraction step -- confirm against the callers.
    """
    TEXT = BlockType.TEXT
    TITLE = BlockType.TITLE
    HEADER = BlockType.HEADER
    FOOTER = BlockType.FOOTER
    PAGE_NUMBER = BlockType.PAGE_NUMBER
    PAGE_FOOTNOTE = BlockType.PAGE_FOOTNOTE
    REF_TEXT = BlockType.REF_TEXT
    TABLE_CAPTION = BlockType.TABLE_CAPTION
    IMAGE_CAPTION = BlockType.IMAGE_CAPTION
    TABLE_FOOTNOTE = BlockType.TABLE_FOOTNOTE
    IMAGE_FOOTNOTE = BlockType.IMAGE_FOOTNOTE
    CODE_CAPTION = BlockType.CODE_CAPTION
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
包含两个MagicModel类中重复使用的方法和逻辑
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Dict, Any, Callable
|
|
5
|
-
from mineru.utils.boxbase import bbox_distance, is_in
|
|
5
|
+
from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def reduct_overlap(bboxes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
@@ -165,4 +165,87 @@ def tie_up_category_by_distance_v3(
|
|
|
165
165
|
}
|
|
166
166
|
)
|
|
167
167
|
|
|
168
|
-
return ret
|
|
168
|
+
return ret
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def tie_up_category_by_index(
    get_subjects_func: Callable,
    get_objects_func: Callable,
    extract_subject_func: Callable = None,
    extract_object_func: Callable = None
):
    """Attach each object to the subject whose ``index`` is closest to its own.

    When several subjects are equally close by index, the one whose bbox
    center is nearest to the object's bbox center wins; if that is also a
    tie, the earliest subject is chosen.

    Args:
        get_subjects_func: Zero-argument callable returning the subjects.
            Each subject is read via ``subject["index"]`` and, for
            tie-breaking only, ``subject["bbox"]``.
        get_objects_func: Zero-argument callable returning the objects. Each
            object is read via ``obj["index"]`` and, for tie-breaking only,
            ``obj["bbox"]``.
        extract_subject_func: Optional callable mapping a subject to the
            value stored under ``"sub_bbox"``. Defaults to the identity.
        extract_object_func: Optional callable mapping an object to the value
            appended to ``"obj_bboxes"``. Defaults to the identity.

    Returns:
        One dict per subject, in subject order, with keys ``"sub_bbox"``,
        ``"obj_bboxes"`` and ``"sub_idx"``. Objects are dropped when there
        are no subjects, so the result is then an empty list.
    """
    subjects = get_subjects_func()
    objects = get_objects_func()

    def _identity(item):
        return item

    if extract_subject_func is None:
        extract_subject_func = _identity
    if extract_object_func is None:
        extract_object_func = _identity

    # One result entry per subject; list position equals "sub_idx", so the
    # list is already in ascending sub_idx order and needs no final sort.
    ret = [
        {
            "sub_bbox": extract_subject_func(subject),
            "obj_bboxes": [],
            "sub_idx": i,
        }
        for i, subject in enumerate(subjects)
    ]

    # Without subjects there is nothing to attach objects to.
    if not ret:
        return ret

    for obj in objects:
        obj_index = obj["index"]
        index_diffs = [abs(obj_index - subject["index"]) for subject in subjects]
        min_index_diff = min(index_diffs)
        candidates = [i for i, diff in enumerate(index_diffs) if diff == min_index_diff]

        if len(candidates) > 1:
            # Tie on index distance: fall back to bbox center distance.
            # min() keeps the first candidate when distances are equal too.
            best_subject_idx = min(
                candidates,
                key=lambda i: bbox_center_distance(obj["bbox"], subjects[i]["bbox"]),
            )
        else:
            best_subject_idx = candidates[0]

        ret[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))

    return ret
|
mineru/utils/pdf_image_tools.py
CHANGED
|
@@ -5,7 +5,7 @@ from io import BytesIO
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pypdfium2 as pdfium
|
|
7
7
|
from loguru import logger
|
|
8
|
-
from PIL import Image
|
|
8
|
+
from PIL import Image, ImageOps
|
|
9
9
|
|
|
10
10
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
11
11
|
from mineru.utils.check_sys_env import is_windows_environment
|
|
@@ -41,19 +41,23 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
|
|
|
41
41
|
return image_dict
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
def _load_images_from_pdf_worker(
|
|
44
|
+
def _load_images_from_pdf_worker(
|
|
45
|
+
pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
46
|
+
):
|
|
45
47
|
"""用于进程池的包装函数"""
|
|
46
|
-
return load_images_from_pdf_core(
|
|
48
|
+
return load_images_from_pdf_core(
|
|
49
|
+
pdf_bytes, dpi, start_page_id, end_page_id, image_type
|
|
50
|
+
)
|
|
47
51
|
|
|
48
52
|
|
|
49
53
|
def load_images_from_pdf(
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
54
|
+
pdf_bytes: bytes,
|
|
55
|
+
dpi=200,
|
|
56
|
+
start_page_id=0,
|
|
57
|
+
end_page_id=None,
|
|
58
|
+
image_type=ImageType.PIL,
|
|
59
|
+
timeout=None,
|
|
60
|
+
threads=4,
|
|
57
61
|
):
|
|
58
62
|
"""带超时控制的 PDF 转图片函数,支持多进程加速
|
|
59
63
|
|
|
@@ -77,7 +81,7 @@ def load_images_from_pdf(
|
|
|
77
81
|
dpi,
|
|
78
82
|
start_page_id,
|
|
79
83
|
get_end_page_id(end_page_id, len(pdf_doc)),
|
|
80
|
-
image_type
|
|
84
|
+
image_type,
|
|
81
85
|
), pdf_doc
|
|
82
86
|
else:
|
|
83
87
|
if timeout is None:
|
|
@@ -116,7 +120,7 @@ def load_images_from_pdf(
|
|
|
116
120
|
dpi,
|
|
117
121
|
range_start,
|
|
118
122
|
range_end,
|
|
119
|
-
image_type
|
|
123
|
+
image_type,
|
|
120
124
|
)
|
|
121
125
|
futures.append((range_start, future))
|
|
122
126
|
|
|
@@ -163,7 +167,14 @@ def load_images_from_pdf_core(
|
|
|
163
167
|
return images_list
|
|
164
168
|
|
|
165
169
|
|
|
166
|
-
def cut_image(
|
|
170
|
+
def cut_image(
|
|
171
|
+
bbox: tuple,
|
|
172
|
+
page_num: int,
|
|
173
|
+
page_pil_img,
|
|
174
|
+
return_path,
|
|
175
|
+
image_writer: FileBasedDataWriter,
|
|
176
|
+
scale=2,
|
|
177
|
+
):
|
|
167
178
|
"""从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
|
|
168
179
|
图片存放在save_path下,文件名是:
|
|
169
180
|
{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
|
|
@@ -197,7 +208,6 @@ def get_crop_img(bbox: tuple, pil_img, scale=2):
|
|
|
197
208
|
|
|
198
209
|
|
|
199
210
|
def get_crop_np_img(bbox: tuple, input_img, scale=2):
|
|
200
|
-
|
|
201
211
|
if isinstance(input_img, Image.Image):
|
|
202
212
|
np_img = np.asarray(input_img)
|
|
203
213
|
elif isinstance(input_img, np.ndarray):
|
|
@@ -212,17 +222,27 @@ def get_crop_np_img(bbox: tuple, input_img, scale=2):
|
|
|
212
222
|
int(bbox[3] * scale),
|
|
213
223
|
)
|
|
214
224
|
|
|
215
|
-
return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
|
|
225
|
+
return np_img[scale_bbox[1] : scale_bbox[3], scale_bbox[0] : scale_bbox[2]]
|
|
226
|
+
|
|
216
227
|
|
|
217
228
|
def images_bytes_to_pdf_bytes(image_bytes):
|
|
218
229
|
# 内存缓冲区
|
|
219
230
|
pdf_buffer = BytesIO()
|
|
220
231
|
|
|
221
232
|
# 载入并转换所有图像为 RGB 模式
|
|
222
|
-
image = Image.open(BytesIO(image_bytes))
|
|
233
|
+
image = Image.open(BytesIO(image_bytes))
|
|
234
|
+
# 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
|
|
235
|
+
image = ImageOps.exif_transpose(image) or image
|
|
236
|
+
# 只在必要时转换
|
|
237
|
+
if image.mode != "RGB":
|
|
238
|
+
image = image.convert("RGB")
|
|
223
239
|
|
|
224
240
|
# 第一张图保存为 PDF,其余追加
|
|
225
|
-
image.save(
|
|
241
|
+
image.save(
|
|
242
|
+
pdf_buffer,
|
|
243
|
+
format="PDF",
|
|
244
|
+
# save_all=True
|
|
245
|
+
)
|
|
226
246
|
|
|
227
247
|
# 获取 PDF bytes 并重置指针(可选)
|
|
228
248
|
pdf_bytes = pdf_buffer.getvalue()
|
mineru/utils/span_pre_proc.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import collections
|
|
3
|
+
import math
|
|
3
4
|
import re
|
|
4
5
|
import statistics
|
|
5
6
|
|
|
@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
|
|
|
128
129
|
page_all_lines = []
|
|
129
130
|
for block in page_dict['blocks']:
|
|
130
131
|
for line in block['lines']:
|
|
131
|
-
|
|
132
|
-
|
|
132
|
+
rotation_degrees = math.degrees(line['rotation'])
|
|
133
|
+
# 旋转角度不为0, 90, 180, 270的行,直接跳过(rotation_degrees的值可能不为整数)
|
|
134
|
+
if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
|
|
133
135
|
continue
|
|
134
136
|
page_all_lines.append(line)
|
|
135
137
|
for span in line['spans']:
|
|
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
|
|
|
159
161
|
if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
|
|
160
162
|
continue
|
|
161
163
|
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
|
162
|
-
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
|
|
164
|
+
if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
|
|
163
165
|
vertical_spans.append(span)
|
|
164
166
|
elif block in all_bboxes:
|
|
165
167
|
useful_spans.append(span)
|
mineru/utils/table_merge.py
CHANGED
|
@@ -1,33 +1,22 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
|
+
from copy import deepcopy
|
|
2
3
|
|
|
3
4
|
from loguru import logger
|
|
4
5
|
from bs4 import BeautifulSoup
|
|
5
6
|
|
|
6
7
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
|
|
8
|
+
from mineru.utils.char_utils import full_to_half
|
|
7
9
|
from mineru.utils.enum_class import BlockType, SplitFlag
|
|
8
10
|
|
|
9
11
|
|
|
10
|
-
CONTINUATION_MARKERS = [
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
""
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
Returns:
|
|
20
|
-
String with full-width characters converted to half-width
|
|
21
|
-
"""
|
|
22
|
-
result = []
|
|
23
|
-
for char in text:
|
|
24
|
-
code = ord(char)
|
|
25
|
-
# Full-width letters, numbers and punctuation (FF01-FF5E)
|
|
26
|
-
if 0xFF01 <= code <= 0xFF5E:
|
|
27
|
-
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
|
|
28
|
-
else:
|
|
29
|
-
result.append(char)
|
|
30
|
-
return ''.join(result)
|
|
12
|
+
CONTINUATION_MARKERS = [
|
|
13
|
+
"(续)",
|
|
14
|
+
"(续表)",
|
|
15
|
+
"(续上表)",
|
|
16
|
+
"(continued)",
|
|
17
|
+
"(cont.)",
|
|
18
|
+
"(cont’d)",
|
|
19
|
+
]
|
|
31
20
|
|
|
32
21
|
|
|
33
22
|
def calculate_table_total_columns(soup):
|
|
@@ -296,6 +285,8 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
|
|
|
296
285
|
current_cols: 当前总列数
|
|
297
286
|
reference_row: 参考行对象
|
|
298
287
|
"""
|
|
288
|
+
reference_row_copy = deepcopy(reference_row)
|
|
289
|
+
|
|
299
290
|
for i in range(start_idx, end_idx):
|
|
300
291
|
row = rows[i]
|
|
301
292
|
cells = row.find_all(["td", "th"])
|
|
@@ -307,7 +298,7 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
|
|
|
307
298
|
continue
|
|
308
299
|
|
|
309
300
|
# 检查是否与参考行结构匹配
|
|
310
|
-
if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row,
|
|
301
|
+
if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row_copy):
|
|
311
302
|
# 尝试应用参考结构
|
|
312
303
|
if len(cells) <= len(reference_structure):
|
|
313
304
|
for j, cell in enumerate(cells):
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.6.8"
|
|
1
|
+
__version__ = "2.7.1"
|