magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +20 -7
- magic_pdf/libs/config_reader.py +28 -10
- magic_pdf/libs/language.py +12 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
- magic_pdf/model/magic_model.py +49 -41
- magic_pdf/model/pdf_extract_kit.py +155 -60
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
- magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
- magic_pdf/model/pp_structure_v2.py +1 -1
- magic_pdf/pdf_parse_union_core.py +4 -2
- magic_pdf/pre_proc/citationmarker_remove.py +5 -1
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
- magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
- magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
- magic_pdf/resources/model_config/model_configs.yaml +4 -0
- magic_pdf/rw/AbsReaderWriter.py +1 -18
- magic_pdf/rw/DiskReaderWriter.py +32 -24
- magic_pdf/rw/S3ReaderWriter.py +83 -48
- magic_pdf/tools/cli.py +79 -0
- magic_pdf/tools/cli_dev.py +156 -0
- magic_pdf/tools/common.py +119 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
- magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
- magic_pdf/cli/magicpdf.py +0 -337
- magic_pdf/pdf_parse_for_train.py +0 -685
- magic_pdf/train_utils/convert_to_train_format.py +0 -65
- magic_pdf/train_utils/extract_caption.py +0 -59
- magic_pdf/train_utils/remove_footer_header.py +0 -159
- magic_pdf/train_utils/vis_utils.py +0 -327
- magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
- /magic_pdf/libs/{math.py → local_math.py} +0 -0
- /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
- /magic_pdf/{train_utils → tools}/__init__.py +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
@@ -120,15 +120,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
120
120
|
if mode == 'nlp':
|
121
121
|
continue
|
122
122
|
elif mode == 'mm':
|
123
|
+
table_caption = ''
|
123
124
|
for block in para_block['blocks']: # 1st.拼table_caption
|
124
125
|
if block['type'] == BlockType.TableCaption:
|
125
|
-
|
126
|
+
table_caption = merge_para_with_text(block)
|
126
127
|
for block in para_block['blocks']: # 2nd.拼table_body
|
127
128
|
if block['type'] == BlockType.TableBody:
|
128
129
|
for line in block['lines']:
|
129
130
|
for span in line['spans']:
|
130
131
|
if span['type'] == ContentType.Table:
|
131
|
-
|
132
|
+
# if processed by table model
|
133
|
+
if span.get('latex', ''):
|
134
|
+
para_text += f"\n\n$\n {span['latex']}\n$\n\n"
|
135
|
+
else:
|
136
|
+
para_text += f"\n}) \n"
|
132
137
|
for block in para_block['blocks']: # 3rd.拼table_footnote
|
133
138
|
if block['type'] == BlockType.TableFootnote:
|
134
139
|
para_text += merge_para_with_text(block)
|
@@ -163,7 +168,7 @@ def merge_para_with_text(para_block):
|
|
163
168
|
else:
|
164
169
|
content = ocr_escape_special_markdown_char(content)
|
165
170
|
elif span_type == ContentType.InlineEquation:
|
166
|
-
content = f"${span['content']}$"
|
171
|
+
content = f" ${span['content']}$ "
|
167
172
|
elif span_type == ContentType.InterlineEquation:
|
168
173
|
content = f"\n$$\n{span['content']}\n$$\n"
|
169
174
|
|
@@ -210,28 +215,32 @@ def para_to_standard_format(para, img_buket_path):
|
|
210
215
|
return para_content
|
211
216
|
|
212
217
|
|
213
|
-
def para_to_standard_format_v2(para_block, img_buket_path):
|
218
|
+
def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
|
214
219
|
para_type = para_block['type']
|
215
220
|
if para_type == BlockType.Text:
|
216
221
|
para_content = {
|
217
222
|
'type': 'text',
|
218
223
|
'text': merge_para_with_text(para_block),
|
224
|
+
'page_idx': page_idx
|
219
225
|
}
|
220
226
|
elif para_type == BlockType.Title:
|
221
227
|
para_content = {
|
222
228
|
'type': 'text',
|
223
229
|
'text': merge_para_with_text(para_block),
|
224
|
-
'text_level': 1
|
230
|
+
'text_level': 1,
|
231
|
+
'page_idx': page_idx
|
225
232
|
}
|
226
233
|
elif para_type == BlockType.InterlineEquation:
|
227
234
|
para_content = {
|
228
235
|
'type': 'equation',
|
229
236
|
'text': merge_para_with_text(para_block),
|
230
|
-
'text_format': "latex"
|
237
|
+
'text_format': "latex",
|
238
|
+
'page_idx': page_idx
|
231
239
|
}
|
232
240
|
elif para_type == BlockType.Image:
|
233
241
|
para_content = {
|
234
242
|
'type': 'image',
|
243
|
+
'page_idx': page_idx
|
235
244
|
}
|
236
245
|
for block in para_block['blocks']:
|
237
246
|
if block['type'] == BlockType.ImageBody:
|
@@ -241,9 +250,12 @@ def para_to_standard_format_v2(para_block, img_buket_path):
|
|
241
250
|
elif para_type == BlockType.Table:
|
242
251
|
para_content = {
|
243
252
|
'type': 'table',
|
253
|
+
'page_idx': page_idx
|
244
254
|
}
|
245
255
|
for block in para_block['blocks']:
|
246
256
|
if block['type'] == BlockType.TableBody:
|
257
|
+
if block["lines"][0]["spans"][0].get('latex', ''):
|
258
|
+
para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
|
247
259
|
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
|
248
260
|
if block['type'] == BlockType.TableCaption:
|
249
261
|
para_content['table_caption'] = merge_para_with_text(block)
|
@@ -345,6 +357,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
|
|
345
357
|
raise Exception(f"drop_mode can not be null")
|
346
358
|
|
347
359
|
paras_of_layout = page_info.get("para_blocks")
|
360
|
+
page_idx = page_info.get("page_idx")
|
348
361
|
if not paras_of_layout:
|
349
362
|
continue
|
350
363
|
if make_mode == MakeMode.MM_MD:
|
@@ -355,7 +368,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
|
|
355
368
|
output_content.extend(page_markdown)
|
356
369
|
elif make_mode == MakeMode.STANDARD_FORMAT:
|
357
370
|
for para_block in paras_of_layout:
|
358
|
-
para_content = para_to_standard_format_v2(para_block, img_buket_path)
|
371
|
+
para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
|
359
372
|
output_content.append(para_content)
|
360
373
|
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
|
361
374
|
return '\n\n'.join(output_content)
|
magic_pdf/libs/config_reader.py
CHANGED
@@ -10,16 +10,19 @@ from loguru import logger
|
|
10
10
|
|
11
11
|
from magic_pdf.libs.commons import parse_bucket_key
|
12
12
|
|
13
|
+
# 定义配置文件名常量
|
14
|
+
CONFIG_FILE_NAME = "magic-pdf.json"
|
15
|
+
|
13
16
|
|
14
17
|
def read_config():
|
15
18
|
home_dir = os.path.expanduser("~")
|
16
19
|
|
17
|
-
config_file = os.path.join(home_dir,
|
20
|
+
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
|
18
21
|
|
19
22
|
if not os.path.exists(config_file):
|
20
|
-
raise
|
23
|
+
raise FileNotFoundError(f"{config_file} not found")
|
21
24
|
|
22
|
-
with open(config_file, "r") as f:
|
25
|
+
with open(config_file, "r", encoding="utf-8") as f:
|
23
26
|
config = json.load(f)
|
24
27
|
return config
|
25
28
|
|
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
|
|
37
40
|
access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
|
38
41
|
|
39
42
|
if access_key is None or secret_key is None or storage_endpoint is None:
|
40
|
-
raise Exception("ak, sk or endpoint not found in
|
43
|
+
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
|
41
44
|
|
42
45
|
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
|
43
46
|
|
@@ -54,19 +57,34 @@ def get_bucket_name(path):
|
|
54
57
|
return bucket
|
55
58
|
|
56
59
|
|
57
|
-
def
|
60
|
+
def get_local_models_dir():
|
58
61
|
config = read_config()
|
59
|
-
|
62
|
+
models_dir = config.get("models-dir")
|
63
|
+
if models_dir is None:
|
64
|
+
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
|
65
|
+
return "/tmp/models"
|
66
|
+
else:
|
67
|
+
return models_dir
|
60
68
|
|
61
69
|
|
62
|
-
def
|
70
|
+
def get_device():
|
63
71
|
config = read_config()
|
64
|
-
|
72
|
+
device = config.get("device-mode")
|
73
|
+
if device is None:
|
74
|
+
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
|
75
|
+
return "cpu"
|
76
|
+
else:
|
77
|
+
return device
|
65
78
|
|
66
79
|
|
67
|
-
def
|
80
|
+
def get_table_recog_config():
|
68
81
|
config = read_config()
|
69
|
-
|
82
|
+
table_config = config.get("table-config")
|
83
|
+
if table_config is None:
|
84
|
+
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
|
85
|
+
return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
|
86
|
+
else:
|
87
|
+
return table_config
|
70
88
|
|
71
89
|
|
72
90
|
if __name__ == "__main__":
|
magic_pdf/libs/language.py
CHANGED
@@ -1,8 +1,19 @@
|
|
1
|
+
import os
|
1
2
|
import unicodedata
|
3
|
+
|
4
|
+
if not os.getenv("FTLANG_CACHE"):
|
5
|
+
current_file_path = os.path.abspath(__file__)
|
6
|
+
current_dir = os.path.dirname(current_file_path)
|
7
|
+
root_dir = os.path.dirname(current_dir)
|
8
|
+
ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
|
9
|
+
os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
|
10
|
+
# print(os.getenv("FTLANG_CACHE"))
|
11
|
+
|
2
12
|
from fast_langdetect import detect_language
|
3
13
|
|
4
14
|
|
5
15
|
def detect_lang(text: str) -> str:
|
16
|
+
|
6
17
|
if len(text) == 0:
|
7
18
|
return ""
|
8
19
|
try:
|
@@ -18,6 +29,7 @@ def detect_lang(text: str) -> str:
|
|
18
29
|
|
19
30
|
|
20
31
|
if __name__ == '__main__':
|
32
|
+
print(os.getenv("FTLANG_CACHE"))
|
21
33
|
print(detect_lang("This is a test."))
|
22
34
|
print(detect_lang("<html>This is a test</html>"))
|
23
35
|
print(detect_lang("这个是中文测试。"))
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.
|
1
|
+
__version__ = "0.7.0a1"
|
magic_pdf/model/__init__.py
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
__use_inside_model__ =
|
1
|
+
__use_inside_model__ = True
|
2
2
|
__model_mode__ = "full"
|
@@ -4,7 +4,7 @@ import fitz
|
|
4
4
|
import numpy as np
|
5
5
|
from loguru import logger
|
6
6
|
|
7
|
-
from magic_pdf.libs.config_reader import get_local_models_dir, get_device
|
7
|
+
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
|
8
8
|
from magic_pdf.model.model_list import MODEL
|
9
9
|
import magic_pdf.model as model_config
|
10
10
|
|
@@ -48,10 +48,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
|
48
48
|
return images
|
49
49
|
|
50
50
|
|
51
|
-
|
51
|
+
class ModelSingleton:
|
52
|
+
_instance = None
|
53
|
+
_models = {}
|
54
|
+
|
55
|
+
def __new__(cls, *args, **kwargs):
|
56
|
+
if cls._instance is None:
|
57
|
+
cls._instance = super().__new__(cls)
|
58
|
+
return cls._instance
|
59
|
+
|
60
|
+
def get_model(self, ocr: bool, show_log: bool):
|
61
|
+
key = (ocr, show_log)
|
62
|
+
if key not in self._models:
|
63
|
+
self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
|
64
|
+
return self._models[key]
|
65
|
+
|
66
|
+
|
67
|
+
def custom_model_init(ocr: bool = False, show_log: bool = False):
|
52
68
|
model = None
|
53
69
|
|
54
70
|
if model_config.__model_mode__ == "lite":
|
71
|
+
logger.warning("The Lite mode is provided for developers to conduct testing only, and the output quality is "
|
72
|
+
"not guaranteed to be reliable.")
|
55
73
|
model = MODEL.Paddle
|
56
74
|
elif model_config.__model_mode__ == "full":
|
57
75
|
model = MODEL.PEK
|
@@ -66,7 +84,13 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
|
|
66
84
|
# 从配置文件读取model-dir和device
|
67
85
|
local_models_dir = get_local_models_dir()
|
68
86
|
device = get_device()
|
69
|
-
|
87
|
+
table_config = get_table_recog_config()
|
88
|
+
model_input = {"ocr": ocr,
|
89
|
+
"show_log": show_log,
|
90
|
+
"models_dir": local_models_dir,
|
91
|
+
"device": device,
|
92
|
+
"table_config": table_config}
|
93
|
+
custom_model = CustomPEKModel(**model_input)
|
70
94
|
else:
|
71
95
|
logger.error("Not allow model_name!")
|
72
96
|
exit(1)
|
@@ -76,6 +100,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
|
|
76
100
|
logger.error("use_inside_model is False, not allow to use inside model")
|
77
101
|
exit(1)
|
78
102
|
|
103
|
+
return custom_model
|
104
|
+
|
105
|
+
|
106
|
+
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
|
107
|
+
|
108
|
+
model_manager = ModelSingleton()
|
109
|
+
custom_model = model_manager.get_model(ocr, show_log)
|
110
|
+
|
79
111
|
images = load_images_from_pdf(pdf_bytes)
|
80
112
|
|
81
113
|
model_json = []
|
magic_pdf/model/magic_model.py
CHANGED
@@ -9,13 +9,14 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
|
9
9
|
from magic_pdf.libs.ocr_content_type import ContentType
|
10
10
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
11
11
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
12
|
-
from magic_pdf.libs.
|
12
|
+
from magic_pdf.libs.local_math import float_gt
|
13
13
|
from magic_pdf.libs.boxbase import (
|
14
14
|
_is_in,
|
15
15
|
bbox_relative_pos,
|
16
16
|
bbox_distance,
|
17
17
|
_is_part_overlap,
|
18
|
-
calculate_overlap_area_in_bbox1_area_ratio,
|
18
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
19
|
+
calculate_iou,
|
19
20
|
)
|
20
21
|
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
|
21
22
|
|
@@ -78,9 +79,23 @@ class MagicModel:
|
|
78
79
|
for layout_det2 in layout_dets:
|
79
80
|
if layout_det1 == layout_det2:
|
80
81
|
continue
|
81
|
-
if layout_det1["category_id"] in [
|
82
|
-
|
83
|
-
|
82
|
+
if layout_det1["category_id"] in [
|
83
|
+
0,
|
84
|
+
1,
|
85
|
+
2,
|
86
|
+
3,
|
87
|
+
4,
|
88
|
+
5,
|
89
|
+
6,
|
90
|
+
7,
|
91
|
+
8,
|
92
|
+
9,
|
93
|
+
] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
|
94
|
+
if (
|
95
|
+
calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
|
96
|
+
> 0.9
|
97
|
+
):
|
98
|
+
if layout_det1["score"] < layout_det2["score"]:
|
84
99
|
layout_det_need_remove = layout_det1
|
85
100
|
else:
|
86
101
|
layout_det_need_remove = layout_det2
|
@@ -97,11 +112,11 @@ class MagicModel:
|
|
97
112
|
def __init__(self, model_list: list, docs: fitz.Document):
|
98
113
|
self.__model_list = model_list
|
99
114
|
self.__docs = docs
|
100
|
-
|
115
|
+
"""为所有模型数据添加bbox信息(缩放,poly->bbox)"""
|
101
116
|
self.__fix_axis()
|
102
|
-
|
117
|
+
"""删除置信度特别低的模型数据(<0.05),提高质量"""
|
103
118
|
self.__fix_by_remove_low_confidence()
|
104
|
-
|
119
|
+
"""删除高iou(>0.9)数据中置信度较低的那个"""
|
105
120
|
self.__fix_by_remove_high_iou_and_low_confidence()
|
106
121
|
|
107
122
|
def __reduct_overlap(self, bboxes):
|
@@ -125,16 +140,6 @@ class MagicModel:
|
|
125
140
|
ret = []
|
126
141
|
MAX_DIS_OF_POINT = 10**9 + 7
|
127
142
|
|
128
|
-
def expand_bbox(bbox1, bbox2):
|
129
|
-
x0 = min(bbox1[0], bbox2[0])
|
130
|
-
y0 = min(bbox1[1], bbox2[1])
|
131
|
-
x1 = max(bbox1[2], bbox2[2])
|
132
|
-
y1 = max(bbox1[3], bbox2[3])
|
133
|
-
return [x0, y0, x1, y1]
|
134
|
-
|
135
|
-
def get_bbox_area(bbox):
|
136
|
-
return abs(bbox[2] - bbox[0]) * abs(bbox[3] - bbox[1])
|
137
|
-
|
138
143
|
# subject 和 object 的 bbox 会合并成一个大的 bbox (named: merged bbox)。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
139
144
|
# 再求出筛选出的 subjects 和 object 的最短距离!
|
140
145
|
def may_find_other_nearest_bbox(subject_idx, object_idx):
|
@@ -177,6 +182,13 @@ class MagicModel:
|
|
177
182
|
|
178
183
|
return ret
|
179
184
|
|
185
|
+
def expand_bbbox(idxes):
|
186
|
+
x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
|
187
|
+
y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
|
188
|
+
x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
|
189
|
+
y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
|
190
|
+
return min(x0s), min(y0s), max(x1s), max(y1s)
|
191
|
+
|
180
192
|
subjects = self.__reduct_overlap(
|
181
193
|
list(
|
182
194
|
map(
|
@@ -268,7 +280,9 @@ class MagicModel:
|
|
268
280
|
or dis[i][j] == MAX_DIS_OF_POINT
|
269
281
|
):
|
270
282
|
continue
|
271
|
-
left, right, _, _ = bbox_relative_pos(
|
283
|
+
left, right, _, _ = bbox_relative_pos(
|
284
|
+
all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
|
285
|
+
) # 由 pos_flag_count 相关逻辑保证本段逻辑准确性
|
272
286
|
if left or right:
|
273
287
|
one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
|
274
288
|
else:
|
@@ -322,6 +336,10 @@ class MagicModel:
|
|
322
336
|
break
|
323
337
|
|
324
338
|
if is_nearest:
|
339
|
+
nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
|
340
|
+
n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
|
341
|
+
if float_gt(dis[i][j], n_dis):
|
342
|
+
continue
|
325
343
|
tmp.append(k)
|
326
344
|
seen.add(k)
|
327
345
|
|
@@ -331,20 +349,7 @@ class MagicModel:
|
|
331
349
|
|
332
350
|
# 已经获取到某个 figure 下所有的最靠近的 captions,以及最靠近这些 captions 的 captions 。
|
333
351
|
# 先扩一下 bbox,
|
334
|
-
|
335
|
-
all_bboxes[i]["bbox"][0]
|
336
|
-
]
|
337
|
-
y0s = [all_bboxes[idx]["bbox"][1] for idx in seen] + [
|
338
|
-
all_bboxes[i]["bbox"][1]
|
339
|
-
]
|
340
|
-
x1s = [all_bboxes[idx]["bbox"][2] for idx in seen] + [
|
341
|
-
all_bboxes[i]["bbox"][2]
|
342
|
-
]
|
343
|
-
y1s = [all_bboxes[idx]["bbox"][3] for idx in seen] + [
|
344
|
-
all_bboxes[i]["bbox"][3]
|
345
|
-
]
|
346
|
-
|
347
|
-
ox0, oy0, ox1, oy1 = min(x0s), min(y0s), max(x1s), max(y1s)
|
352
|
+
ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
|
348
353
|
ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
|
349
354
|
|
350
355
|
# 分成了 4 个截取空间,需要计算落在每个截取空间下 objects 合并后占据的矩形面积
|
@@ -455,8 +460,10 @@ class MagicModel:
|
|
455
460
|
with_caption_subject.add(j)
|
456
461
|
return ret, total_subject_object_dis
|
457
462
|
|
458
|
-
def get_imgs(self, page_no: int):
|
459
|
-
|
463
|
+
def get_imgs(self, page_no: int):
|
464
|
+
figure_captions, _ = self.__tie_up_category_by_distance(
|
465
|
+
page_no, 3, 4
|
466
|
+
)
|
460
467
|
return [
|
461
468
|
{
|
462
469
|
"bbox": record["all"],
|
@@ -464,7 +471,7 @@ class MagicModel:
|
|
464
471
|
"img_caption_bbox": record.get("object_body", None),
|
465
472
|
"score": record["score"],
|
466
473
|
}
|
467
|
-
for record in
|
474
|
+
for record in figure_captions
|
468
475
|
]
|
469
476
|
|
470
477
|
def get_tables(
|
@@ -535,6 +542,7 @@ class MagicModel:
|
|
535
542
|
if not any(span == existing_span for existing_span in new_spans):
|
536
543
|
new_spans.append(span)
|
537
544
|
return new_spans
|
545
|
+
|
538
546
|
all_spans = []
|
539
547
|
model_page_info = self.__model_list[page_no]
|
540
548
|
layout_dets = model_page_info["layout_dets"]
|
@@ -548,13 +556,14 @@ class MagicModel:
|
|
548
556
|
for layout_det in layout_dets:
|
549
557
|
category_id = layout_det["category_id"]
|
550
558
|
if category_id in allow_category_id_list:
|
551
|
-
span = {
|
552
|
-
"bbox": layout_det["bbox"],
|
553
|
-
"score": layout_det["score"]
|
554
|
-
}
|
559
|
+
span = {"bbox": layout_det["bbox"], "score": layout_det["score"]}
|
555
560
|
if category_id == 3:
|
556
561
|
span["type"] = ContentType.Image
|
557
562
|
elif category_id == 5:
|
563
|
+
# 获取table模型结果
|
564
|
+
latex = layout_det.get("latex", None)
|
565
|
+
if latex:
|
566
|
+
span["latex"] = latex
|
558
567
|
span["type"] = ContentType.Table
|
559
568
|
elif category_id == 13:
|
560
569
|
span["content"] = layout_det["latex"]
|
@@ -604,7 +613,6 @@ class MagicModel:
|
|
604
613
|
return self.__model_list[page_no]
|
605
614
|
|
606
615
|
|
607
|
-
|
608
616
|
if __name__ == "__main__":
|
609
617
|
drw = DiskReaderWriter(r"D:/project/20231108code-clean")
|
610
618
|
if 0:
|