magic-pdf 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
  9. magic_pdf/data/data_reader_writer/s3.py +69 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +0 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +15 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +74 -234
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +54 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +418 -51
  29. magic_pdf/model/pdf_extract_kit.py +164 -80
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
  31. magic_pdf/model/ppTableModel.py +2 -2
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +296 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +19 -9
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
  53. magic_pdf-0.9.0.dist-info/METADATA +507 -0
  54. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
  55. magic_pdf-0.8.0.dist-info/METADATA +0 -459
  56. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
  57. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
  58. {magic_pdf-0.8.0.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,30 @@
1
+ import enum
1
2
  import json
2
3
 
4
+ from magic_pdf.data.dataset import Dataset
3
5
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
4
- bbox_relative_pos, calculate_iou,
5
- calculate_overlap_area_in_bbox1_area_ratio)
6
+ bbox_relative_pos, box_area, calculate_iou,
7
+ calculate_overlap_area_in_bbox1_area_ratio,
8
+ get_overlap_area)
6
9
  from magic_pdf.libs.commons import fitz, join_path
7
10
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
8
11
  from magic_pdf.libs.local_math import float_gt
9
12
  from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
10
13
  from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
14
+ from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
11
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
12
16
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
13
17
 
14
18
  CAPATION_OVERLAP_AREA_RATIO = 0.6
19
+ MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
20
+
21
+
22
@enum.unique
class PosRelationEnum(enum.Enum):
    """Side of a subject box that is preferred when attaching an object to it.

    Used as the ``priority_pos`` argument of
    ``MagicModel.__tie_up_category_by_distance_v2``. Only ``BOTTOM`` and ``UP``
    take part in that method's priority rule: ``BOTTOM`` picks the candidate
    recorded under the ``'bottom'`` direction and ``UP`` picks the one recorded
    under ``'top'``. Any other member leaves the choice to distance alone.

    ``enum.unique`` makes a duplicated value fail at import time instead of
    silently becoming an alias of an earlier member.
    """

    LEFT = 'left'
    RIGHT = 'right'
    UP = 'up'
    BOTTOM = 'bottom'
    ALL = 'all'
15
28
 
16
29
 
17
30
  class MagicModel:
@@ -22,7 +35,7 @@ class MagicModel:
22
35
  need_remove_list = []
23
36
  page_no = model_page_info['page_info']['page_no']
24
37
  horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
25
- model_page_info, self.__docs[page_no]
38
+ model_page_info, self.__docs.get_page(page_no)
26
39
  )
27
40
  layout_dets = model_page_info['layout_dets']
28
41
  for layout_det in layout_dets:
@@ -97,7 +110,7 @@ class MagicModel:
97
110
  for need_remove in need_remove_list:
98
111
  layout_dets.remove(need_remove)
99
112
 
100
- def __init__(self, model_list: list, docs: fitz.Document):
113
+ def __init__(self, model_list: list, docs: Dataset):
101
114
  self.__model_list = model_list
102
115
  self.__docs = docs
103
116
  """为所有模型数据添加bbox信息(缩放,poly->bbox)"""
@@ -108,6 +121,24 @@ class MagicModel:
108
121
  self.__fix_by_remove_high_iou_and_low_confidence()
109
122
  self.__fix_footnote()
110
123
 
124
def _bbox_distance(self, bbox1, bbox2):
    """Return the distance between two boxes, or infinity to veto the pair.

    Boxes are indexed as ``[x0, y0, x1, y1]``.

    The pair is vetoed (``float('inf')`` is returned) when:

    * ``bbox_relative_pos`` sets more than one of its four positional flags
      (presumably a diagonal arrangement - confirm against ``boxbase``), or
    * ``bbox2`` is more than 30% longer than ``bbox1`` along the compared
      side: the height (``bbox[3] - bbox[1]``) when the left or right flag is
      set, the width (``bbox[2] - bbox[0]``) otherwise.

    In every other case the result of ``bbox_distance(bbox1, bbox2)`` is
    returned unchanged.
    """
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
    if sum(1 for flag in (left, right, bottom, top) if flag) > 1:
        return float('inf')

    if left or right:
        # Side-by-side boxes are compared by height.
        l1 = bbox1[3] - bbox1[1]
        l2 = bbox2[3] - bbox2[1]
    else:
        # Stacked (or overlapping) boxes are compared by width.
        l1 = bbox1[2] - bbox1[0]
        l2 = bbox2[2] - bbox2[0]

    # A zero-length bbox1 used to raise ZeroDivisionError here. Any longer
    # bbox2 is infinitely larger in relative terms, so it is vetoed as well.
    if l2 > l1 and (l1 == 0 or (l2 - l1) / l1 > 0.3):
        return float('inf')

    return bbox_distance(bbox1, bbox2)
141
+
111
142
  def __fix_footnote(self):
112
143
  # 3: figure, 5: table, 7: footnote
113
144
  for model_page_info in self.__model_list:
@@ -124,49 +155,51 @@ class MagicModel:
124
155
  tables.append(obj)
125
156
  if len(footnotes) * len(figures) == 0:
126
157
  continue
127
- dis_figure_footnote = {}
128
- dis_table_footnote = {}
129
-
130
- for i in range(len(footnotes)):
131
- for j in range(len(figures)):
132
- pos_flag_count = sum(
133
- list(
134
- map(
135
- lambda x: 1 if x else 0,
136
- bbox_relative_pos(
137
- footnotes[i]['bbox'], figures[j]['bbox']
138
- ),
139
- )
158
+ dis_figure_footnote = {}
159
+ dis_table_footnote = {}
160
+
161
+ for i in range(len(footnotes)):
162
+ for j in range(len(figures)):
163
+ pos_flag_count = sum(
164
+ list(
165
+ map(
166
+ lambda x: 1 if x else 0,
167
+ bbox_relative_pos(
168
+ footnotes[i]['bbox'], figures[j]['bbox']
169
+ ),
140
170
  )
141
171
  )
142
- if pos_flag_count > 1:
143
- continue
144
- dis_figure_footnote[i] = min(
145
- bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
146
- dis_figure_footnote.get(i, float('inf')),
147
- )
148
- for i in range(len(footnotes)):
149
- for j in range(len(tables)):
150
- pos_flag_count = sum(
151
- list(
152
- map(
153
- lambda x: 1 if x else 0,
154
- bbox_relative_pos(
155
- footnotes[i]['bbox'], tables[j]['bbox']
156
- ),
157
- )
172
+ )
173
+ if pos_flag_count > 1:
174
+ continue
175
+ dis_figure_footnote[i] = min(
176
+ self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
177
+ dis_figure_footnote.get(i, float('inf')),
178
+ )
179
+ for i in range(len(footnotes)):
180
+ for j in range(len(tables)):
181
+ pos_flag_count = sum(
182
+ list(
183
+ map(
184
+ lambda x: 1 if x else 0,
185
+ bbox_relative_pos(
186
+ footnotes[i]['bbox'], tables[j]['bbox']
187
+ ),
158
188
  )
159
189
  )
160
- if pos_flag_count > 1:
161
- continue
190
+ )
191
+ if pos_flag_count > 1:
192
+ continue
162
193
 
163
- dis_table_footnote[i] = min(
164
- bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
165
- dis_table_footnote.get(i, float('inf')),
166
- )
167
- for i in range(len(footnotes)):
168
- if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
169
- footnotes[i]['category_id'] = CategoryId.ImageFootnote
194
+ dis_table_footnote[i] = min(
195
+ self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
196
+ dis_table_footnote.get(i, float('inf')),
197
+ )
198
+ for i in range(len(footnotes)):
199
+ if i not in dis_figure_footnote:
200
+ continue
201
+ if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
202
+ footnotes[i]['category_id'] = CategoryId.ImageFootnote
170
203
 
171
204
  def __reduct_overlap(self, bboxes):
172
205
  N = len(bboxes)
@@ -192,6 +225,43 @@ class MagicModel:
192
225
  再求出筛选出的 subjects 和 object 的最短距离
193
226
  """
194
227
 
228
def search_overlap_between_boxes(subject_idx, object_idx):
    """Measure how much unrelated content sits inside a subject/object pair.

    The two boxes are merged into their common bounding rectangle. Every
    layout detection on the page whose category is neither the subject nor
    the object category is intersected with that rectangle, and the overlap
    area is divided by the area of the object box. The largest such ratio is
    returned; scanning stops early once it reaches
    ``MERGE_BOX_OVERLAP_AREA_RATIO``.
    """
    subject_box = all_bboxes[subject_idx]['bbox']
    object_box = all_bboxes[object_idx]['bbox']

    # Smallest rectangle that contains both boxes ([x0, y0, x1, y1]).
    merged_bbox = [
        min(subject_box[0], object_box[0]),
        min(subject_box[1], object_box[1]),
        max(subject_box[2], object_box[2]),
        max(subject_box[3], object_box[3]),
    ]

    paired_categories = (object_category_id, subject_category_id)
    bystanders = [
        {'bbox': det['bbox'], 'score': det['score']}
        for det in self.__model_list[page_no]['layout_dets']
        if det['category_id'] not in paired_categories
    ]

    worst_ratio = 0
    for bystander in bystanders:
        intruding_ratio = (
            get_overlap_area(merged_bbox, bystander['bbox'])
            * 1.0
            / box_area(all_bboxes[object_idx]['bbox'])
        )
        worst_ratio = max(worst_ratio, intruding_ratio)
        if worst_ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
            # Already at the rejection threshold; no need to look further.
            break

    return worst_ratio
264
+
195
265
  def may_find_other_nearest_bbox(subject_idx, object_idx):
196
266
  ret = float('inf')
197
267
 
@@ -299,7 +369,21 @@ class MagicModel:
299
369
  ):
300
370
  continue
301
371
 
302
- dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
372
+ subject_idx, object_idx = i, j
373
+ if all_bboxes[j]['category_id'] == subject_category_id:
374
+ subject_idx, object_idx = j, i
375
+
376
+ if (
377
+ search_overlap_between_boxes(subject_idx, object_idx)
378
+ >= MERGE_BOX_OVERLAP_AREA_RATIO
379
+ ):
380
+ dis[i][j] = float('inf')
381
+ dis[j][i] = dis[i][j]
382
+ continue
383
+
384
+ dis[i][j] = self._bbox_distance(
385
+ all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
386
+ )
303
387
  dis[j][i] = dis[i][j]
304
388
 
305
389
  used = set()
@@ -515,6 +599,289 @@ class MagicModel:
515
599
  with_caption_subject.add(j)
516
600
  return ret, total_subject_object_dis
517
601
 
602
def __tie_up_category_by_distance_v2(
    self,
    page_no: int,
    subject_category_id: int,
    object_category_id: int,
    priority_pos: PosRelationEnum,
):
    """Attach each object box on a page to at most one subject box.

    Subjects and objects are the layout detections of ``page_no`` with the
    given category ids, de-duplicated by ``__reduct_overlap`` and sorted by
    ``x0 ** 2 + y0 ** 2``. For every object the nearest subject in each of
    the four directions is recorded, then one of them is chosen:

    1. If both a top and a bottom candidate exist, ``priority_pos`` is
       ``BOTTOM`` or ``UP`` and the two distances differ by less than three
       times the object's shorter side, the prioritised side wins.
    2. Otherwise a left/right winner and a top/bottom winner are picked, and
       then one of those two. At each step, candidates whose distances are
       within ``AXIS_MULPLICITY`` times the object's shorter side count as
       tied and the tie is broken by how well the subject's size matches
       the object's size; otherwise the nearer candidate wins.

    Objects with no candidate in any direction are left unattached.

    Args:
        page_no: Index of the page in the model list.
        subject_category_id: Category id of the boxes that own objects.
        object_category_id: Category id of the boxes to be attached.
        priority_pos: Preferred side for rule 1; members other than
            ``BOTTOM`` and ``UP`` disable that rule.

    Returns:
        One dict per subject, in sorted subject order, with keys
        ``'sub_bbox'`` (``{'bbox', 'score'}``), ``'obj_bboxes'`` (list of
        ``{'score', 'bbox'}``) and ``'sub_idx'`` (position of the subject in
        the sorted list).
    """
    AXIS_MULPLICITY = 0.5
    RATIO = 3
    INF = float('inf')

    layout_dets = self.__model_list[page_no]['layout_dets']
    subjects = self.__reduct_overlap(
        [
            {'bbox': det['bbox'], 'score': det['score']}
            for det in layout_dets
            if det['category_id'] == subject_category_id
        ]
    )
    objects = self.__reduct_overlap(
        [
            {'bbox': det['bbox'], 'score': det['score']}
            for det in layout_dets
            if det['category_id'] == object_category_id
        ]
    )

    subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
    objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)

    sub_obj_map_h = {i: [] for i in range(len(subjects))}

    for i, obj in enumerate(objects):
        l_x_axis = obj['bbox'][2] - obj['bbox'][0]
        l_y_axis = obj['bbox'][3] - obj['bbox'][1]
        axis_unit = min(l_x_axis, l_y_axis)

        # direction -> [subject index, distance]. A fresh table per object
        # avoids the shared inner list that ``[[-1, inf]] * M`` would create.
        nearest = {
            'top': [-1, INF],
            'bottom': [-1, INF],
            'left': [-1, INF],
            'right': [-1, INF],
        }

        for j, sub in enumerate(subjects):
            # The direction is judged on the boxes with their mutual overlap
            # removed, but the distance is measured on the original boxes.
            bbox1, bbox2, _ = _remove_overlap_between_bbox(
                obj['bbox'], sub['bbox']
            )
            left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
            directions = [
                name
                for name, flagged in (
                    ('left', left),
                    ('right', right),
                    ('bottom', bottom),
                    ('top', top),
                )
                if flagged
            ]
            # Only a relation along exactly one direction is usable.
            if len(directions) != 1:
                continue

            distance = bbox_distance(obj['bbox'], sub['bbox'])
            if nearest[directions[0]][1] > distance:
                nearest[directions[0]] = [j, distance]

        top_hit, bottom_hit = nearest['top'], nearest['bottom']
        left_hit, right_hit = nearest['left'], nearest['right']

        # Rule 1: the caller's preferred vertical side wins a near-tie.
        if (
            top_hit[1] != INF
            and bottom_hit[1] != INF
            and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
            and abs(top_hit[1] - bottom_hit[1]) < RATIO * axis_unit
        ):
            if priority_pos == PosRelationEnum.BOTTOM:
                sub_obj_map_h[bottom_hit[0]].append(i)
            else:
                sub_obj_map_h[top_hit[0]].append(i)
            continue

        # Horizontal winner.
        if left_hit[1] != INF and right_hit[1] != INF:
            if AXIS_MULPLICITY * axis_unit >= abs(left_hit[1] - right_hit[1]):
                left_sub_bbox = subjects[left_hit[0]]['bbox']
                right_sub_bbox = subjects[right_hit[0]]['bbox']
                # Cost = height mismatch + distance. The previous code added
                # element [0] (the subject index) here, while the parallel
                # top/bottom branch adds element [1] (the distance).
                left_cost = (
                    abs((left_sub_bbox[3] - left_sub_bbox[1]) - l_y_axis)
                    + left_hit[1]
                )
                right_cost = (
                    abs((right_sub_bbox[3] - right_sub_bbox[1]) - l_y_axis)
                    + right_hit[1]
                )
                left_or_right = right_hit if left_cost > right_cost else left_hit
            else:
                left_or_right = (
                    right_hit if left_hit[1] > right_hit[1] else left_hit
                )
        elif left_hit[1] != INF:
            left_or_right = left_hit
        else:
            # Only the right side matched, or neither did; in the latter
            # case right_hit is still the [-1, inf] placeholder.
            left_or_right = right_hit

        # Vertical winner.
        if top_hit[1] != INF and bottom_hit[1] != INF:
            if AXIS_MULPLICITY * axis_unit >= abs(top_hit[1] - bottom_hit[1]):
                bottom_sub_bbox = subjects[bottom_hit[0]]['bbox']
                top_sub_bbox = subjects[top_hit[0]]['bbox']
                # Cost = width mismatch + distance.
                bottom_cost = (
                    abs((bottom_sub_bbox[2] - bottom_sub_bbox[0]) - l_x_axis)
                    + bottom_hit[1]
                )
                top_cost = (
                    abs((top_sub_bbox[2] - top_sub_bbox[0]) - l_x_axis)
                    + top_hit[1]
                )
                top_or_bottom = top_hit if bottom_cost > top_cost else bottom_hit
            else:
                top_or_bottom = (
                    bottom_hit if top_hit[1] > bottom_hit[1] else top_hit
                )
        elif top_hit[1] != INF:
            top_or_bottom = top_hit
        else:
            # Only the bottom side matched, or neither did (placeholder).
            top_or_bottom = bottom_hit

        # Final choice between the horizontal and the vertical winner.
        horizontal_found = left_or_right[1] != INF
        vertical_found = top_or_bottom[1] != INF
        if horizontal_found and vertical_found:
            if AXIS_MULPLICITY * axis_unit >= abs(
                left_or_right[1] - top_or_bottom[1]
            ):
                y_axis_bbox = subjects[left_or_right[0]]['bbox']
                x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
                width_gap = abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis)
                height_gap = abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
                # Cross-multiplied form of
                #   width_gap / l_x_axis > height_gap / l_y_axis
                # so a zero-width or zero-height object cannot divide by
                # zero. Both lengths are non-negative in this branch because
                # the guard above implies axis_unit >= 0.
                if width_gap * l_y_axis > height_gap * l_x_axis:
                    chosen = left_or_right
                else:
                    chosen = top_or_bottom
            elif left_or_right[1] > top_or_bottom[1]:
                chosen = top_or_bottom
            else:
                chosen = left_or_right
        elif horizontal_found:
            chosen = left_or_right
        elif vertical_found:
            chosen = top_or_bottom
        else:
            # No subject lies in a single direction: leave unattached.
            continue

        sub_obj_map_h[chosen[0]].append(i)

    return [
        {
            'sub_bbox': {
                'bbox': subjects[i]['bbox'],
                'score': subjects[i]['score'],
            },
            'obj_bboxes': [
                {'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
                for j in attached
            ],
            'sub_idx': i,
        }
        for i, attached in sub_obj_map_h.items()
    ]
846
+
847
def get_imgs_v2(self, page_no: int):
    """Group each image on a page with its captions and footnotes.

    Category 3 boxes are the subjects. Objects of category 4 (presumably
    image captions - confirm against the category table) are attached with
    the bottom side preferred; objects of ``CategoryId.ImageFootnote`` are
    attached with no side preference.

    Returns:
        One dict per image with keys ``'image_body'``,
        ``'image_caption_list'`` and ``'image_footnote_list'``.
    """
    caption_groups = self.__tie_up_category_by_distance_v2(
        page_no, 3, 4, PosRelationEnum.BOTTOM
    )
    footnote_groups = self.__tie_up_category_by_distance_v2(
        page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
    )

    def footnotes_for(sub_idx):
        # First footnote group that belongs to the same subject.
        group = next(g for g in footnote_groups if g['sub_idx'] == sub_idx)
        return group['obj_bboxes']

    return [
        {
            'image_body': group['sub_bbox'],
            'image_caption_list': group['obj_bboxes'],
            'image_footnote_list': footnotes_for(group['sub_idx']),
        }
        for group in caption_groups
    ]
865
+
866
def get_tables_v2(self, page_no: int) -> list:
    """Group each table on a page with its captions and footnotes.

    Category 5 boxes are the subjects. Objects of category 6 (presumably
    table captions - confirm against the category table) are attached with
    the upper side preferred; objects of category 7 are attached with no
    side preference.

    Returns:
        One dict per table with keys ``'table_body'``,
        ``'table_caption_list'`` and ``'table_footnote_list'``.
    """
    caption_groups = self.__tie_up_category_by_distance_v2(
        page_no, 5, 6, PosRelationEnum.UP
    )
    footnote_groups = self.__tie_up_category_by_distance_v2(
        page_no, 5, 7, PosRelationEnum.ALL
    )

    def footnotes_for(sub_idx):
        # First footnote group that belongs to the same subject.
        group = next(g for g in footnote_groups if g['sub_idx'] == sub_idx)
        return group['obj_bboxes']

    return [
        {
            'table_body': group['sub_bbox'],
            'table_caption_list': group['obj_bboxes'],
            'table_footnote_list': footnotes_for(group['sub_idx']),
        }
        for group in caption_groups
    ]
884
+
518
885
  def get_imgs(self, page_no: int):
519
886
  with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
520
887
  with_footnotes, _ = self.__tie_up_category_by_distance(
@@ -627,13 +994,13 @@ class MagicModel:
627
994
  span['type'] = ContentType.Image
628
995
  elif category_id == 5:
629
996
  # 获取table模型结果
630
- latex = layout_det.get("latex", None)
631
- html = layout_det.get("html", None)
997
+ latex = layout_det.get('latex', None)
998
+ html = layout_det.get('html', None)
632
999
  if latex:
633
- span["latex"] = latex
1000
+ span['latex'] = latex
634
1001
  elif html:
635
- span["html"] = html
636
- span["type"] = ContentType.Table
1002
+ span['html'] = html
1003
+ span['type'] = ContentType.Table
637
1004
  elif category_id == 13:
638
1005
  span['content'] = layout_det['latex']
639
1006
  span['type'] = ContentType.InlineEquation
@@ -648,10 +1015,10 @@ class MagicModel:
648
1015
 
649
1016
  def get_page_size(self, page_no: int): # 获取页面宽高
650
1017
  # 获取当前页的page对象
651
- page = self.__docs[page_no]
1018
+ page = self.__docs.get_page(page_no).get_page_info()
652
1019
  # 获取当前页的宽高
653
- page_w = page.rect.width
654
- page_h = page.rect.height
1020
+ page_w = page.w
1021
+ page_h = page.h
655
1022
  return page_w, page_h
656
1023
 
657
1024
  def __get_blocks_by_type(