magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +54 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +418 -51
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +19 -9
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- magic_pdf-0.9.0.dist-info/METADATA +507 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- magic_pdf-0.8.0.dist-info/METADATA +0 -459
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/magic_model.py
CHANGED
@@ -1,17 +1,30 @@
|
|
1
|
+
import enum
|
1
2
|
import json
|
2
3
|
|
4
|
+
from magic_pdf.data.dataset import Dataset
|
3
5
|
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
4
|
-
bbox_relative_pos, calculate_iou,
|
5
|
-
calculate_overlap_area_in_bbox1_area_ratio
|
6
|
+
bbox_relative_pos, box_area, calculate_iou,
|
7
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
8
|
+
get_overlap_area)
|
6
9
|
from magic_pdf.libs.commons import fitz, join_path
|
7
10
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
8
11
|
from magic_pdf.libs.local_math import float_gt
|
9
12
|
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
|
10
13
|
from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
|
14
|
+
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
|
11
15
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
12
16
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
13
17
|
|
14
18
|
CAPATION_OVERLAP_AREA_RATIO = 0.6
|
19
|
+
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
|
20
|
+
|
21
|
+
|
22
|
+
class PosRelationEnum(enum.Enum):
|
23
|
+
LEFT = 'left'
|
24
|
+
RIGHT = 'right'
|
25
|
+
UP = 'up'
|
26
|
+
BOTTOM = 'bottom'
|
27
|
+
ALL = 'all'
|
15
28
|
|
16
29
|
|
17
30
|
class MagicModel:
|
@@ -22,7 +35,7 @@ class MagicModel:
|
|
22
35
|
need_remove_list = []
|
23
36
|
page_no = model_page_info['page_info']['page_no']
|
24
37
|
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
|
25
|
-
model_page_info, self.__docs
|
38
|
+
model_page_info, self.__docs.get_page(page_no)
|
26
39
|
)
|
27
40
|
layout_dets = model_page_info['layout_dets']
|
28
41
|
for layout_det in layout_dets:
|
@@ -97,7 +110,7 @@ class MagicModel:
|
|
97
110
|
for need_remove in need_remove_list:
|
98
111
|
layout_dets.remove(need_remove)
|
99
112
|
|
100
|
-
def __init__(self, model_list: list, docs:
|
113
|
+
def __init__(self, model_list: list, docs: Dataset):
|
101
114
|
self.__model_list = model_list
|
102
115
|
self.__docs = docs
|
103
116
|
"""为所有模型数据添加bbox信息(缩放,poly->bbox)"""
|
@@ -108,6 +121,24 @@ class MagicModel:
|
|
108
121
|
self.__fix_by_remove_high_iou_and_low_confidence()
|
109
122
|
self.__fix_footnote()
|
110
123
|
|
124
|
+
def _bbox_distance(self, bbox1, bbox2):
|
125
|
+
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
|
126
|
+
flags = [left, right, bottom, top]
|
127
|
+
count = sum([1 if v else 0 for v in flags])
|
128
|
+
if count > 1:
|
129
|
+
return float('inf')
|
130
|
+
if left or right:
|
131
|
+
l1 = bbox1[3] - bbox1[1]
|
132
|
+
l2 = bbox2[3] - bbox2[1]
|
133
|
+
else:
|
134
|
+
l1 = bbox1[2] - bbox1[0]
|
135
|
+
l2 = bbox2[2] - bbox2[0]
|
136
|
+
|
137
|
+
if l2 > l1 and (l2 - l1) / l1 > 0.3:
|
138
|
+
return float('inf')
|
139
|
+
|
140
|
+
return bbox_distance(bbox1, bbox2)
|
141
|
+
|
111
142
|
def __fix_footnote(self):
|
112
143
|
# 3: figure, 5: table, 7: footnote
|
113
144
|
for model_page_info in self.__model_list:
|
@@ -124,49 +155,51 @@ class MagicModel:
|
|
124
155
|
tables.append(obj)
|
125
156
|
if len(footnotes) * len(figures) == 0:
|
126
157
|
continue
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
)
|
158
|
+
dis_figure_footnote = {}
|
159
|
+
dis_table_footnote = {}
|
160
|
+
|
161
|
+
for i in range(len(footnotes)):
|
162
|
+
for j in range(len(figures)):
|
163
|
+
pos_flag_count = sum(
|
164
|
+
list(
|
165
|
+
map(
|
166
|
+
lambda x: 1 if x else 0,
|
167
|
+
bbox_relative_pos(
|
168
|
+
footnotes[i]['bbox'], figures[j]['bbox']
|
169
|
+
),
|
140
170
|
)
|
141
171
|
)
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
)
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
)
|
172
|
+
)
|
173
|
+
if pos_flag_count > 1:
|
174
|
+
continue
|
175
|
+
dis_figure_footnote[i] = min(
|
176
|
+
self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
|
177
|
+
dis_figure_footnote.get(i, float('inf')),
|
178
|
+
)
|
179
|
+
for i in range(len(footnotes)):
|
180
|
+
for j in range(len(tables)):
|
181
|
+
pos_flag_count = sum(
|
182
|
+
list(
|
183
|
+
map(
|
184
|
+
lambda x: 1 if x else 0,
|
185
|
+
bbox_relative_pos(
|
186
|
+
footnotes[i]['bbox'], tables[j]['bbox']
|
187
|
+
),
|
158
188
|
)
|
159
189
|
)
|
160
|
-
|
161
|
-
|
190
|
+
)
|
191
|
+
if pos_flag_count > 1:
|
192
|
+
continue
|
162
193
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
194
|
+
dis_table_footnote[i] = min(
|
195
|
+
self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
|
196
|
+
dis_table_footnote.get(i, float('inf')),
|
197
|
+
)
|
198
|
+
for i in range(len(footnotes)):
|
199
|
+
if i not in dis_figure_footnote:
|
200
|
+
continue
|
201
|
+
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
|
202
|
+
footnotes[i]['category_id'] = CategoryId.ImageFootnote
|
170
203
|
|
171
204
|
def __reduct_overlap(self, bboxes):
|
172
205
|
N = len(bboxes)
|
@@ -192,6 +225,43 @@ class MagicModel:
|
|
192
225
|
再求出筛选出的 subjects 和 object 的最短距离
|
193
226
|
"""
|
194
227
|
|
228
|
+
def search_overlap_between_boxes(subject_idx, object_idx):
|
229
|
+
idxes = [subject_idx, object_idx]
|
230
|
+
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
231
|
+
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
232
|
+
x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
|
233
|
+
y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
|
234
|
+
|
235
|
+
merged_bbox = [
|
236
|
+
min(x0s),
|
237
|
+
min(y0s),
|
238
|
+
max(x1s),
|
239
|
+
max(y1s),
|
240
|
+
]
|
241
|
+
ratio = 0
|
242
|
+
|
243
|
+
other_objects = list(
|
244
|
+
map(
|
245
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
246
|
+
filter(
|
247
|
+
lambda x: x['category_id']
|
248
|
+
not in (object_category_id, subject_category_id),
|
249
|
+
self.__model_list[page_no]['layout_dets'],
|
250
|
+
),
|
251
|
+
)
|
252
|
+
)
|
253
|
+
for other_object in other_objects:
|
254
|
+
ratio = max(
|
255
|
+
ratio,
|
256
|
+
get_overlap_area(merged_bbox, other_object['bbox'])
|
257
|
+
* 1.0
|
258
|
+
/ box_area(all_bboxes[object_idx]['bbox']),
|
259
|
+
)
|
260
|
+
if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
261
|
+
break
|
262
|
+
|
263
|
+
return ratio
|
264
|
+
|
195
265
|
def may_find_other_nearest_bbox(subject_idx, object_idx):
|
196
266
|
ret = float('inf')
|
197
267
|
|
@@ -299,7 +369,21 @@ class MagicModel:
|
|
299
369
|
):
|
300
370
|
continue
|
301
371
|
|
302
|
-
|
372
|
+
subject_idx, object_idx = i, j
|
373
|
+
if all_bboxes[j]['category_id'] == subject_category_id:
|
374
|
+
subject_idx, object_idx = j, i
|
375
|
+
|
376
|
+
if (
|
377
|
+
search_overlap_between_boxes(subject_idx, object_idx)
|
378
|
+
>= MERGE_BOX_OVERLAP_AREA_RATIO
|
379
|
+
):
|
380
|
+
dis[i][j] = float('inf')
|
381
|
+
dis[j][i] = dis[i][j]
|
382
|
+
continue
|
383
|
+
|
384
|
+
dis[i][j] = self._bbox_distance(
|
385
|
+
all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
|
386
|
+
)
|
303
387
|
dis[j][i] = dis[i][j]
|
304
388
|
|
305
389
|
used = set()
|
@@ -515,6 +599,289 @@ class MagicModel:
|
|
515
599
|
with_caption_subject.add(j)
|
516
600
|
return ret, total_subject_object_dis
|
517
601
|
|
602
|
+
def __tie_up_category_by_distance_v2(
|
603
|
+
self,
|
604
|
+
page_no: int,
|
605
|
+
subject_category_id: int,
|
606
|
+
object_category_id: int,
|
607
|
+
priority_pos: PosRelationEnum,
|
608
|
+
):
|
609
|
+
"""_summary_
|
610
|
+
|
611
|
+
Args:
|
612
|
+
page_no (int): _description_
|
613
|
+
subject_category_id (int): _description_
|
614
|
+
object_category_id (int): _description_
|
615
|
+
priority_pos (PosRelationEnum): _description_
|
616
|
+
|
617
|
+
Returns:
|
618
|
+
_type_: _description_
|
619
|
+
"""
|
620
|
+
AXIS_MULPLICITY = 0.5
|
621
|
+
subjects = self.__reduct_overlap(
|
622
|
+
list(
|
623
|
+
map(
|
624
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
625
|
+
filter(
|
626
|
+
lambda x: x['category_id'] == subject_category_id,
|
627
|
+
self.__model_list[page_no]['layout_dets'],
|
628
|
+
),
|
629
|
+
)
|
630
|
+
)
|
631
|
+
)
|
632
|
+
|
633
|
+
objects = self.__reduct_overlap(
|
634
|
+
list(
|
635
|
+
map(
|
636
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
637
|
+
filter(
|
638
|
+
lambda x: x['category_id'] == object_category_id,
|
639
|
+
self.__model_list[page_no]['layout_dets'],
|
640
|
+
),
|
641
|
+
)
|
642
|
+
)
|
643
|
+
)
|
644
|
+
M = len(objects)
|
645
|
+
|
646
|
+
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
647
|
+
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
648
|
+
|
649
|
+
sub_obj_map_h = {i: [] for i in range(len(subjects))}
|
650
|
+
|
651
|
+
dis_by_directions = {
|
652
|
+
'top': [[-1, float('inf')]] * M,
|
653
|
+
'bottom': [[-1, float('inf')]] * M,
|
654
|
+
'left': [[-1, float('inf')]] * M,
|
655
|
+
'right': [[-1, float('inf')]] * M,
|
656
|
+
}
|
657
|
+
|
658
|
+
for i, obj in enumerate(objects):
|
659
|
+
l_x_axis, l_y_axis = (
|
660
|
+
obj['bbox'][2] - obj['bbox'][0],
|
661
|
+
obj['bbox'][3] - obj['bbox'][1],
|
662
|
+
)
|
663
|
+
axis_unit = min(l_x_axis, l_y_axis)
|
664
|
+
for j, sub in enumerate(subjects):
|
665
|
+
|
666
|
+
bbox1, bbox2, _ = _remove_overlap_between_bbox(
|
667
|
+
objects[i]['bbox'], subjects[j]['bbox']
|
668
|
+
)
|
669
|
+
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
|
670
|
+
flags = [left, right, bottom, top]
|
671
|
+
if sum([1 if v else 0 for v in flags]) > 1:
|
672
|
+
continue
|
673
|
+
|
674
|
+
if left:
|
675
|
+
if dis_by_directions['left'][i][1] > bbox_distance(
|
676
|
+
obj['bbox'], sub['bbox']
|
677
|
+
):
|
678
|
+
dis_by_directions['left'][i] = [
|
679
|
+
j,
|
680
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
681
|
+
]
|
682
|
+
if right:
|
683
|
+
if dis_by_directions['right'][i][1] > bbox_distance(
|
684
|
+
obj['bbox'], sub['bbox']
|
685
|
+
):
|
686
|
+
dis_by_directions['right'][i] = [
|
687
|
+
j,
|
688
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
689
|
+
]
|
690
|
+
if bottom:
|
691
|
+
if dis_by_directions['bottom'][i][1] > bbox_distance(
|
692
|
+
obj['bbox'], sub['bbox']
|
693
|
+
):
|
694
|
+
dis_by_directions['bottom'][i] = [
|
695
|
+
j,
|
696
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
697
|
+
]
|
698
|
+
if top:
|
699
|
+
if dis_by_directions['top'][i][1] > bbox_distance(
|
700
|
+
obj['bbox'], sub['bbox']
|
701
|
+
):
|
702
|
+
dis_by_directions['top'][i] = [
|
703
|
+
j,
|
704
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
705
|
+
]
|
706
|
+
|
707
|
+
if (
|
708
|
+
dis_by_directions['top'][i][1] != float('inf')
|
709
|
+
and dis_by_directions['bottom'][i][1] != float('inf')
|
710
|
+
and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
|
711
|
+
):
|
712
|
+
RATIO = 3
|
713
|
+
if (
|
714
|
+
abs(
|
715
|
+
dis_by_directions['top'][i][1]
|
716
|
+
- dis_by_directions['bottom'][i][1]
|
717
|
+
)
|
718
|
+
< RATIO * axis_unit
|
719
|
+
):
|
720
|
+
|
721
|
+
if priority_pos == PosRelationEnum.BOTTOM:
|
722
|
+
sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i)
|
723
|
+
else:
|
724
|
+
sub_obj_map_h[dis_by_directions['top'][i][0]].append(i)
|
725
|
+
continue
|
726
|
+
|
727
|
+
if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[
|
728
|
+
'right'
|
729
|
+
][i][1] != float('inf'):
|
730
|
+
if dis_by_directions['left'][i][1] != float(
|
731
|
+
'inf'
|
732
|
+
) and dis_by_directions['right'][i][1] != float('inf'):
|
733
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
734
|
+
dis_by_directions['left'][i][1]
|
735
|
+
- dis_by_directions['right'][i][1]
|
736
|
+
):
|
737
|
+
left_sub_bbox = subjects[dis_by_directions['left'][i][0]][
|
738
|
+
'bbox'
|
739
|
+
]
|
740
|
+
right_sub_bbox = subjects[dis_by_directions['right'][i][0]][
|
741
|
+
'bbox'
|
742
|
+
]
|
743
|
+
|
744
|
+
left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1]
|
745
|
+
right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1]
|
746
|
+
|
747
|
+
if (
|
748
|
+
abs(left_sub_bbox_y_axis - l_y_axis)
|
749
|
+
+ dis_by_directions['left'][i][0]
|
750
|
+
> abs(right_sub_bbox_y_axis - l_y_axis)
|
751
|
+
+ dis_by_directions['right'][i][0]
|
752
|
+
):
|
753
|
+
left_or_right = dis_by_directions['right'][i]
|
754
|
+
else:
|
755
|
+
left_or_right = dis_by_directions['left'][i]
|
756
|
+
else:
|
757
|
+
left_or_right = dis_by_directions['left'][i]
|
758
|
+
if left_or_right[1] > dis_by_directions['right'][i][1]:
|
759
|
+
left_or_right = dis_by_directions['right'][i]
|
760
|
+
else:
|
761
|
+
left_or_right = dis_by_directions['left'][i]
|
762
|
+
if left_or_right[1] == float('inf'):
|
763
|
+
left_or_right = dis_by_directions['right'][i]
|
764
|
+
else:
|
765
|
+
left_or_right = [-1, float('inf')]
|
766
|
+
|
767
|
+
if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[
|
768
|
+
'bottom'
|
769
|
+
][i][1] != float('inf'):
|
770
|
+
if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[
|
771
|
+
'bottom'
|
772
|
+
][i][1] != float('inf'):
|
773
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
774
|
+
dis_by_directions['top'][i][1]
|
775
|
+
- dis_by_directions['bottom'][i][1]
|
776
|
+
):
|
777
|
+
top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox']
|
778
|
+
bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox']
|
779
|
+
|
780
|
+
top_bottom_x_axis = top_bottom[2] - top_bottom[0]
|
781
|
+
bottom_top_x_axis = bottom_top[2] - bottom_top[0]
|
782
|
+
if (
|
783
|
+
abs(top_bottom_x_axis - l_x_axis)
|
784
|
+
+ dis_by_directions['bottom'][i][1]
|
785
|
+
> abs(bottom_top_x_axis - l_x_axis)
|
786
|
+
+ dis_by_directions['top'][i][1]
|
787
|
+
):
|
788
|
+
top_or_bottom = dis_by_directions['top'][i]
|
789
|
+
else:
|
790
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
791
|
+
else:
|
792
|
+
top_or_bottom = dis_by_directions['top'][i]
|
793
|
+
if top_or_bottom[1] > dis_by_directions['bottom'][i][1]:
|
794
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
795
|
+
else:
|
796
|
+
top_or_bottom = dis_by_directions['top'][i]
|
797
|
+
if top_or_bottom[1] == float('inf'):
|
798
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
799
|
+
else:
|
800
|
+
top_or_bottom = [-1, float('inf')]
|
801
|
+
|
802
|
+
if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'):
|
803
|
+
if left_or_right[1] != float('inf') and top_or_bottom[1] != float(
|
804
|
+
'inf'
|
805
|
+
):
|
806
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
807
|
+
left_or_right[1] - top_or_bottom[1]
|
808
|
+
):
|
809
|
+
y_axis_bbox = subjects[left_or_right[0]]['bbox']
|
810
|
+
x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
|
811
|
+
|
812
|
+
if (
|
813
|
+
abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis
|
814
|
+
> abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
|
815
|
+
/ l_y_axis
|
816
|
+
):
|
817
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
818
|
+
else:
|
819
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
820
|
+
else:
|
821
|
+
if left_or_right[1] > top_or_bottom[1]:
|
822
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
823
|
+
else:
|
824
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
825
|
+
else:
|
826
|
+
if left_or_right[1] != float('inf'):
|
827
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
828
|
+
else:
|
829
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
830
|
+
ret = []
|
831
|
+
for i in sub_obj_map_h.keys():
|
832
|
+
ret.append(
|
833
|
+
{
|
834
|
+
'sub_bbox': {
|
835
|
+
'bbox': subjects[i]['bbox'],
|
836
|
+
'score': subjects[i]['score'],
|
837
|
+
},
|
838
|
+
'obj_bboxes': [
|
839
|
+
{'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
|
840
|
+
for j in sub_obj_map_h[i]
|
841
|
+
],
|
842
|
+
'sub_idx': i,
|
843
|
+
}
|
844
|
+
)
|
845
|
+
return ret
|
846
|
+
|
847
|
+
def get_imgs_v2(self, page_no: int):
|
848
|
+
with_captions = self.__tie_up_category_by_distance_v2(
|
849
|
+
page_no, 3, 4, PosRelationEnum.BOTTOM
|
850
|
+
)
|
851
|
+
with_footnotes = self.__tie_up_category_by_distance_v2(
|
852
|
+
page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
|
853
|
+
)
|
854
|
+
ret = []
|
855
|
+
for v in with_captions:
|
856
|
+
record = {
|
857
|
+
'image_body': v['sub_bbox'],
|
858
|
+
'image_caption_list': v['obj_bboxes'],
|
859
|
+
}
|
860
|
+
filter_idx = v['sub_idx']
|
861
|
+
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
|
862
|
+
record['image_footnote_list'] = d['obj_bboxes']
|
863
|
+
ret.append(record)
|
864
|
+
return ret
|
865
|
+
|
866
|
+
def get_tables_v2(self, page_no: int) -> list:
|
867
|
+
with_captions = self.__tie_up_category_by_distance_v2(
|
868
|
+
page_no, 5, 6, PosRelationEnum.UP
|
869
|
+
)
|
870
|
+
with_footnotes = self.__tie_up_category_by_distance_v2(
|
871
|
+
page_no, 5, 7, PosRelationEnum.ALL
|
872
|
+
)
|
873
|
+
ret = []
|
874
|
+
for v in with_captions:
|
875
|
+
record = {
|
876
|
+
'table_body': v['sub_bbox'],
|
877
|
+
'table_caption_list': v['obj_bboxes'],
|
878
|
+
}
|
879
|
+
filter_idx = v['sub_idx']
|
880
|
+
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
|
881
|
+
record['table_footnote_list'] = d['obj_bboxes']
|
882
|
+
ret.append(record)
|
883
|
+
return ret
|
884
|
+
|
518
885
|
def get_imgs(self, page_no: int):
|
519
886
|
with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
|
520
887
|
with_footnotes, _ = self.__tie_up_category_by_distance(
|
@@ -627,13 +994,13 @@ class MagicModel:
|
|
627
994
|
span['type'] = ContentType.Image
|
628
995
|
elif category_id == 5:
|
629
996
|
# 获取table模型结果
|
630
|
-
latex = layout_det.get(
|
631
|
-
html = layout_det.get(
|
997
|
+
latex = layout_det.get('latex', None)
|
998
|
+
html = layout_det.get('html', None)
|
632
999
|
if latex:
|
633
|
-
span[
|
1000
|
+
span['latex'] = latex
|
634
1001
|
elif html:
|
635
|
-
span[
|
636
|
-
span[
|
1002
|
+
span['html'] = html
|
1003
|
+
span['type'] = ContentType.Table
|
637
1004
|
elif category_id == 13:
|
638
1005
|
span['content'] = layout_det['latex']
|
639
1006
|
span['type'] = ContentType.InlineEquation
|
@@ -648,10 +1015,10 @@ class MagicModel:
|
|
648
1015
|
|
649
1016
|
def get_page_size(self, page_no: int): # 获取页面宽高
|
650
1017
|
# 获取当前页的page对象
|
651
|
-
page = self.__docs
|
1018
|
+
page = self.__docs.get_page(page_no).get_page_info()
|
652
1019
|
# 获取当前页的宽高
|
653
|
-
page_w = page.
|
654
|
-
page_h = page.
|
1020
|
+
page_w = page.w
|
1021
|
+
page_h = page.h
|
655
1022
|
return page_w, page_h
|
656
1023
|
|
657
1024
|
def __get_blocks_by_type(
|