magic-pdf 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/__init__.py +0 -0
- magic_pdf/config/enums.py +7 -0
- magic_pdf/config/exceptions.py +32 -0
- magic_pdf/data/__init__.py +0 -0
- magic_pdf/data/data_reader_writer/__init__.py +12 -0
- magic_pdf/data/data_reader_writer/base.py +51 -0
- magic_pdf/data/data_reader_writer/filebase.py +59 -0
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
- magic_pdf/data/data_reader_writer/s3.py +69 -0
- magic_pdf/data/dataset.py +194 -0
- magic_pdf/data/io/__init__.py +0 -0
- magic_pdf/data/io/base.py +42 -0
- magic_pdf/data/io/http.py +37 -0
- magic_pdf/data/io/s3.py +114 -0
- magic_pdf/data/read_api.py +95 -0
- magic_pdf/data/schemas.py +15 -0
- magic_pdf/data/utils.py +32 -0
- magic_pdf/dict2md/ocr_mkcontent.py +74 -234
- magic_pdf/libs/Constants.py +21 -8
- magic_pdf/libs/MakeContentConfig.py +1 -0
- magic_pdf/libs/boxbase.py +35 -0
- magic_pdf/libs/clean_memory.py +10 -0
- magic_pdf/libs/config_reader.py +53 -23
- magic_pdf/libs/draw_bbox.py +150 -65
- magic_pdf/libs/ocr_content_type.py +2 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
- magic_pdf/model/magic_model.py +331 -15
- magic_pdf/model/pdf_extract_kit.py +164 -80
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
- magic_pdf/model/ppTableModel.py +2 -2
- magic_pdf/model/pp_structure_v2.py +5 -2
- magic_pdf/model/v3/__init__.py +0 -0
- magic_pdf/model/v3/helpers.py +125 -0
- magic_pdf/para/para_split_v3.py +296 -0
- magic_pdf/pdf_parse_by_ocr.py +6 -3
- magic_pdf/pdf_parse_by_txt.py +6 -3
- magic_pdf/pdf_parse_union_core_v2.py +644 -0
- magic_pdf/pipe/AbsPipe.py +5 -1
- magic_pdf/pipe/OCRPipe.py +10 -4
- magic_pdf/pipe/TXTPipe.py +10 -4
- magic_pdf/pipe/UNIPipe.py +16 -7
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
- magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
- magic_pdf/resources/model_config/model_configs.yaml +5 -13
- magic_pdf/tools/cli.py +14 -1
- magic_pdf/tools/common.py +18 -8
- magic_pdf/user_api.py +25 -6
- magic_pdf/utils/__init__.py +0 -0
- magic_pdf/utils/annotations.py +11 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/METADATA +120 -75
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/magic_model.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
|
+
import enum
|
1
2
|
import json
|
2
3
|
|
4
|
+
from magic_pdf.data.dataset import Dataset
|
3
5
|
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
4
6
|
bbox_relative_pos, box_area, calculate_iou,
|
5
7
|
calculate_overlap_area_in_bbox1_area_ratio,
|
@@ -9,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
|
9
11
|
from magic_pdf.libs.local_math import float_gt
|
10
12
|
from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
|
11
13
|
from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
|
14
|
+
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
|
12
15
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
16
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
14
17
|
|
@@ -16,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
|
|
16
19
|
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
|
17
20
|
|
18
21
|
|
22
|
+
class PosRelationEnum(enum.Enum):
|
23
|
+
LEFT = 'left'
|
24
|
+
RIGHT = 'right'
|
25
|
+
UP = 'up'
|
26
|
+
BOTTOM = 'bottom'
|
27
|
+
ALL = 'all'
|
28
|
+
|
29
|
+
|
19
30
|
class MagicModel:
|
20
31
|
"""每个函数没有得到元素的时候返回空list."""
|
21
32
|
|
@@ -24,7 +35,7 @@ class MagicModel:
|
|
24
35
|
need_remove_list = []
|
25
36
|
page_no = model_page_info['page_info']['page_no']
|
26
37
|
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
|
27
|
-
model_page_info, self.__docs
|
38
|
+
model_page_info, self.__docs.get_page(page_no)
|
28
39
|
)
|
29
40
|
layout_dets = model_page_info['layout_dets']
|
30
41
|
for layout_det in layout_dets:
|
@@ -99,7 +110,7 @@ class MagicModel:
|
|
99
110
|
for need_remove in need_remove_list:
|
100
111
|
layout_dets.remove(need_remove)
|
101
112
|
|
102
|
-
def __init__(self, model_list: list, docs:
|
113
|
+
def __init__(self, model_list: list, docs: Dataset):
|
103
114
|
self.__model_list = model_list
|
104
115
|
self.__docs = docs
|
105
116
|
"""为所有模型数据添加bbox信息(缩放,poly->bbox)"""
|
@@ -110,6 +121,24 @@ class MagicModel:
|
|
110
121
|
self.__fix_by_remove_high_iou_and_low_confidence()
|
111
122
|
self.__fix_footnote()
|
112
123
|
|
124
|
+
def _bbox_distance(self, bbox1, bbox2):
|
125
|
+
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
|
126
|
+
flags = [left, right, bottom, top]
|
127
|
+
count = sum([1 if v else 0 for v in flags])
|
128
|
+
if count > 1:
|
129
|
+
return float('inf')
|
130
|
+
if left or right:
|
131
|
+
l1 = bbox1[3] - bbox1[1]
|
132
|
+
l2 = bbox2[3] - bbox2[1]
|
133
|
+
else:
|
134
|
+
l1 = bbox1[2] - bbox1[0]
|
135
|
+
l2 = bbox2[2] - bbox2[0]
|
136
|
+
|
137
|
+
if l2 > l1 and (l2 - l1) / l1 > 0.3:
|
138
|
+
return float('inf')
|
139
|
+
|
140
|
+
return bbox_distance(bbox1, bbox2)
|
141
|
+
|
113
142
|
def __fix_footnote(self):
|
114
143
|
# 3: figure, 5: table, 7: footnote
|
115
144
|
for model_page_info in self.__model_list:
|
@@ -144,7 +173,7 @@ class MagicModel:
|
|
144
173
|
if pos_flag_count > 1:
|
145
174
|
continue
|
146
175
|
dis_figure_footnote[i] = min(
|
147
|
-
|
176
|
+
self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
|
148
177
|
dis_figure_footnote.get(i, float('inf')),
|
149
178
|
)
|
150
179
|
for i in range(len(footnotes)):
|
@@ -163,7 +192,7 @@ class MagicModel:
|
|
163
192
|
continue
|
164
193
|
|
165
194
|
dis_table_footnote[i] = min(
|
166
|
-
|
195
|
+
self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
|
167
196
|
dis_table_footnote.get(i, float('inf')),
|
168
197
|
)
|
169
198
|
for i in range(len(footnotes)):
|
@@ -195,9 +224,8 @@ class MagicModel:
|
|
195
224
|
筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
196
225
|
再求出筛选出的 subjects 和 object 的最短距离
|
197
226
|
"""
|
198
|
-
|
199
|
-
|
200
|
-
):
|
227
|
+
|
228
|
+
def search_overlap_between_boxes(subject_idx, object_idx):
|
201
229
|
idxes = [subject_idx, object_idx]
|
202
230
|
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
203
231
|
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
@@ -225,9 +253,9 @@ class MagicModel:
|
|
225
253
|
for other_object in other_objects:
|
226
254
|
ratio = max(
|
227
255
|
ratio,
|
228
|
-
get_overlap_area(
|
229
|
-
|
230
|
-
|
256
|
+
get_overlap_area(merged_bbox, other_object['bbox'])
|
257
|
+
* 1.0
|
258
|
+
/ box_area(all_bboxes[object_idx]['bbox']),
|
231
259
|
)
|
232
260
|
if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
233
261
|
break
|
@@ -345,12 +373,17 @@ class MagicModel:
|
|
345
373
|
if all_bboxes[j]['category_id'] == subject_category_id:
|
346
374
|
subject_idx, object_idx = j, i
|
347
375
|
|
348
|
-
if
|
376
|
+
if (
|
377
|
+
search_overlap_between_boxes(subject_idx, object_idx)
|
378
|
+
>= MERGE_BOX_OVERLAP_AREA_RATIO
|
379
|
+
):
|
349
380
|
dis[i][j] = float('inf')
|
350
381
|
dis[j][i] = dis[i][j]
|
351
382
|
continue
|
352
383
|
|
353
|
-
dis[i][j] =
|
384
|
+
dis[i][j] = self._bbox_distance(
|
385
|
+
all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
|
386
|
+
)
|
354
387
|
dis[j][i] = dis[i][j]
|
355
388
|
|
356
389
|
used = set()
|
@@ -566,6 +599,289 @@ class MagicModel:
|
|
566
599
|
with_caption_subject.add(j)
|
567
600
|
return ret, total_subject_object_dis
|
568
601
|
|
602
|
+
def __tie_up_category_by_distance_v2(
|
603
|
+
self,
|
604
|
+
page_no: int,
|
605
|
+
subject_category_id: int,
|
606
|
+
object_category_id: int,
|
607
|
+
priority_pos: PosRelationEnum,
|
608
|
+
):
|
609
|
+
"""_summary_
|
610
|
+
|
611
|
+
Args:
|
612
|
+
page_no (int): _description_
|
613
|
+
subject_category_id (int): _description_
|
614
|
+
object_category_id (int): _description_
|
615
|
+
priority_pos (PosRelationEnum): _description_
|
616
|
+
|
617
|
+
Returns:
|
618
|
+
_type_: _description_
|
619
|
+
"""
|
620
|
+
AXIS_MULPLICITY = 0.5
|
621
|
+
subjects = self.__reduct_overlap(
|
622
|
+
list(
|
623
|
+
map(
|
624
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
625
|
+
filter(
|
626
|
+
lambda x: x['category_id'] == subject_category_id,
|
627
|
+
self.__model_list[page_no]['layout_dets'],
|
628
|
+
),
|
629
|
+
)
|
630
|
+
)
|
631
|
+
)
|
632
|
+
|
633
|
+
objects = self.__reduct_overlap(
|
634
|
+
list(
|
635
|
+
map(
|
636
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
637
|
+
filter(
|
638
|
+
lambda x: x['category_id'] == object_category_id,
|
639
|
+
self.__model_list[page_no]['layout_dets'],
|
640
|
+
),
|
641
|
+
)
|
642
|
+
)
|
643
|
+
)
|
644
|
+
M = len(objects)
|
645
|
+
|
646
|
+
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
647
|
+
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
648
|
+
|
649
|
+
sub_obj_map_h = {i: [] for i in range(len(subjects))}
|
650
|
+
|
651
|
+
dis_by_directions = {
|
652
|
+
'top': [[-1, float('inf')]] * M,
|
653
|
+
'bottom': [[-1, float('inf')]] * M,
|
654
|
+
'left': [[-1, float('inf')]] * M,
|
655
|
+
'right': [[-1, float('inf')]] * M,
|
656
|
+
}
|
657
|
+
|
658
|
+
for i, obj in enumerate(objects):
|
659
|
+
l_x_axis, l_y_axis = (
|
660
|
+
obj['bbox'][2] - obj['bbox'][0],
|
661
|
+
obj['bbox'][3] - obj['bbox'][1],
|
662
|
+
)
|
663
|
+
axis_unit = min(l_x_axis, l_y_axis)
|
664
|
+
for j, sub in enumerate(subjects):
|
665
|
+
|
666
|
+
bbox1, bbox2, _ = _remove_overlap_between_bbox(
|
667
|
+
objects[i]['bbox'], subjects[j]['bbox']
|
668
|
+
)
|
669
|
+
left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
|
670
|
+
flags = [left, right, bottom, top]
|
671
|
+
if sum([1 if v else 0 for v in flags]) > 1:
|
672
|
+
continue
|
673
|
+
|
674
|
+
if left:
|
675
|
+
if dis_by_directions['left'][i][1] > bbox_distance(
|
676
|
+
obj['bbox'], sub['bbox']
|
677
|
+
):
|
678
|
+
dis_by_directions['left'][i] = [
|
679
|
+
j,
|
680
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
681
|
+
]
|
682
|
+
if right:
|
683
|
+
if dis_by_directions['right'][i][1] > bbox_distance(
|
684
|
+
obj['bbox'], sub['bbox']
|
685
|
+
):
|
686
|
+
dis_by_directions['right'][i] = [
|
687
|
+
j,
|
688
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
689
|
+
]
|
690
|
+
if bottom:
|
691
|
+
if dis_by_directions['bottom'][i][1] > bbox_distance(
|
692
|
+
obj['bbox'], sub['bbox']
|
693
|
+
):
|
694
|
+
dis_by_directions['bottom'][i] = [
|
695
|
+
j,
|
696
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
697
|
+
]
|
698
|
+
if top:
|
699
|
+
if dis_by_directions['top'][i][1] > bbox_distance(
|
700
|
+
obj['bbox'], sub['bbox']
|
701
|
+
):
|
702
|
+
dis_by_directions['top'][i] = [
|
703
|
+
j,
|
704
|
+
bbox_distance(obj['bbox'], sub['bbox']),
|
705
|
+
]
|
706
|
+
|
707
|
+
if (
|
708
|
+
dis_by_directions['top'][i][1] != float('inf')
|
709
|
+
and dis_by_directions['bottom'][i][1] != float('inf')
|
710
|
+
and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
|
711
|
+
):
|
712
|
+
RATIO = 3
|
713
|
+
if (
|
714
|
+
abs(
|
715
|
+
dis_by_directions['top'][i][1]
|
716
|
+
- dis_by_directions['bottom'][i][1]
|
717
|
+
)
|
718
|
+
< RATIO * axis_unit
|
719
|
+
):
|
720
|
+
|
721
|
+
if priority_pos == PosRelationEnum.BOTTOM:
|
722
|
+
sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i)
|
723
|
+
else:
|
724
|
+
sub_obj_map_h[dis_by_directions['top'][i][0]].append(i)
|
725
|
+
continue
|
726
|
+
|
727
|
+
if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[
|
728
|
+
'right'
|
729
|
+
][i][1] != float('inf'):
|
730
|
+
if dis_by_directions['left'][i][1] != float(
|
731
|
+
'inf'
|
732
|
+
) and dis_by_directions['right'][i][1] != float('inf'):
|
733
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
734
|
+
dis_by_directions['left'][i][1]
|
735
|
+
- dis_by_directions['right'][i][1]
|
736
|
+
):
|
737
|
+
left_sub_bbox = subjects[dis_by_directions['left'][i][0]][
|
738
|
+
'bbox'
|
739
|
+
]
|
740
|
+
right_sub_bbox = subjects[dis_by_directions['right'][i][0]][
|
741
|
+
'bbox'
|
742
|
+
]
|
743
|
+
|
744
|
+
left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1]
|
745
|
+
right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1]
|
746
|
+
|
747
|
+
if (
|
748
|
+
abs(left_sub_bbox_y_axis - l_y_axis)
|
749
|
+
+ dis_by_directions['left'][i][0]
|
750
|
+
> abs(right_sub_bbox_y_axis - l_y_axis)
|
751
|
+
+ dis_by_directions['right'][i][0]
|
752
|
+
):
|
753
|
+
left_or_right = dis_by_directions['right'][i]
|
754
|
+
else:
|
755
|
+
left_or_right = dis_by_directions['left'][i]
|
756
|
+
else:
|
757
|
+
left_or_right = dis_by_directions['left'][i]
|
758
|
+
if left_or_right[1] > dis_by_directions['right'][i][1]:
|
759
|
+
left_or_right = dis_by_directions['right'][i]
|
760
|
+
else:
|
761
|
+
left_or_right = dis_by_directions['left'][i]
|
762
|
+
if left_or_right[1] == float('inf'):
|
763
|
+
left_or_right = dis_by_directions['right'][i]
|
764
|
+
else:
|
765
|
+
left_or_right = [-1, float('inf')]
|
766
|
+
|
767
|
+
if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[
|
768
|
+
'bottom'
|
769
|
+
][i][1] != float('inf'):
|
770
|
+
if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[
|
771
|
+
'bottom'
|
772
|
+
][i][1] != float('inf'):
|
773
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
774
|
+
dis_by_directions['top'][i][1]
|
775
|
+
- dis_by_directions['bottom'][i][1]
|
776
|
+
):
|
777
|
+
top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox']
|
778
|
+
bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox']
|
779
|
+
|
780
|
+
top_bottom_x_axis = top_bottom[2] - top_bottom[0]
|
781
|
+
bottom_top_x_axis = bottom_top[2] - bottom_top[0]
|
782
|
+
if (
|
783
|
+
abs(top_bottom_x_axis - l_x_axis)
|
784
|
+
+ dis_by_directions['bottom'][i][1]
|
785
|
+
> abs(bottom_top_x_axis - l_x_axis)
|
786
|
+
+ dis_by_directions['top'][i][1]
|
787
|
+
):
|
788
|
+
top_or_bottom = dis_by_directions['top'][i]
|
789
|
+
else:
|
790
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
791
|
+
else:
|
792
|
+
top_or_bottom = dis_by_directions['top'][i]
|
793
|
+
if top_or_bottom[1] > dis_by_directions['bottom'][i][1]:
|
794
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
795
|
+
else:
|
796
|
+
top_or_bottom = dis_by_directions['top'][i]
|
797
|
+
if top_or_bottom[1] == float('inf'):
|
798
|
+
top_or_bottom = dis_by_directions['bottom'][i]
|
799
|
+
else:
|
800
|
+
top_or_bottom = [-1, float('inf')]
|
801
|
+
|
802
|
+
if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'):
|
803
|
+
if left_or_right[1] != float('inf') and top_or_bottom[1] != float(
|
804
|
+
'inf'
|
805
|
+
):
|
806
|
+
if AXIS_MULPLICITY * axis_unit >= abs(
|
807
|
+
left_or_right[1] - top_or_bottom[1]
|
808
|
+
):
|
809
|
+
y_axis_bbox = subjects[left_or_right[0]]['bbox']
|
810
|
+
x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
|
811
|
+
|
812
|
+
if (
|
813
|
+
abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis
|
814
|
+
> abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
|
815
|
+
/ l_y_axis
|
816
|
+
):
|
817
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
818
|
+
else:
|
819
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
820
|
+
else:
|
821
|
+
if left_or_right[1] > top_or_bottom[1]:
|
822
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
823
|
+
else:
|
824
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
825
|
+
else:
|
826
|
+
if left_or_right[1] != float('inf'):
|
827
|
+
sub_obj_map_h[left_or_right[0]].append(i)
|
828
|
+
else:
|
829
|
+
sub_obj_map_h[top_or_bottom[0]].append(i)
|
830
|
+
ret = []
|
831
|
+
for i in sub_obj_map_h.keys():
|
832
|
+
ret.append(
|
833
|
+
{
|
834
|
+
'sub_bbox': {
|
835
|
+
'bbox': subjects[i]['bbox'],
|
836
|
+
'score': subjects[i]['score'],
|
837
|
+
},
|
838
|
+
'obj_bboxes': [
|
839
|
+
{'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
|
840
|
+
for j in sub_obj_map_h[i]
|
841
|
+
],
|
842
|
+
'sub_idx': i,
|
843
|
+
}
|
844
|
+
)
|
845
|
+
return ret
|
846
|
+
|
847
|
+
def get_imgs_v2(self, page_no: int):
|
848
|
+
with_captions = self.__tie_up_category_by_distance_v2(
|
849
|
+
page_no, 3, 4, PosRelationEnum.BOTTOM
|
850
|
+
)
|
851
|
+
with_footnotes = self.__tie_up_category_by_distance_v2(
|
852
|
+
page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
|
853
|
+
)
|
854
|
+
ret = []
|
855
|
+
for v in with_captions:
|
856
|
+
record = {
|
857
|
+
'image_body': v['sub_bbox'],
|
858
|
+
'image_caption_list': v['obj_bboxes'],
|
859
|
+
}
|
860
|
+
filter_idx = v['sub_idx']
|
861
|
+
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
|
862
|
+
record['image_footnote_list'] = d['obj_bboxes']
|
863
|
+
ret.append(record)
|
864
|
+
return ret
|
865
|
+
|
866
|
+
def get_tables_v2(self, page_no: int) -> list:
|
867
|
+
with_captions = self.__tie_up_category_by_distance_v2(
|
868
|
+
page_no, 5, 6, PosRelationEnum.UP
|
869
|
+
)
|
870
|
+
with_footnotes = self.__tie_up_category_by_distance_v2(
|
871
|
+
page_no, 5, 7, PosRelationEnum.ALL
|
872
|
+
)
|
873
|
+
ret = []
|
874
|
+
for v in with_captions:
|
875
|
+
record = {
|
876
|
+
'table_body': v['sub_bbox'],
|
877
|
+
'table_caption_list': v['obj_bboxes'],
|
878
|
+
}
|
879
|
+
filter_idx = v['sub_idx']
|
880
|
+
d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
|
881
|
+
record['table_footnote_list'] = d['obj_bboxes']
|
882
|
+
ret.append(record)
|
883
|
+
return ret
|
884
|
+
|
569
885
|
def get_imgs(self, page_no: int):
|
570
886
|
with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
|
571
887
|
with_footnotes, _ = self.__tie_up_category_by_distance(
|
@@ -699,10 +1015,10 @@ class MagicModel:
|
|
699
1015
|
|
700
1016
|
def get_page_size(self, page_no: int): # 获取页面宽高
|
701
1017
|
# 获取当前页的page对象
|
702
|
-
page = self.__docs
|
1018
|
+
page = self.__docs.get_page(page_no).get_page_info()
|
703
1019
|
# 获取当前页的宽高
|
704
|
-
page_w = page.
|
705
|
-
page_h = page.
|
1020
|
+
page_w = page.w
|
1021
|
+
page_h = page.h
|
706
1022
|
return page_w, page_h
|
707
1023
|
|
708
1024
|
def __get_blocks_by_type(
|