magic-pdf 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
  9. magic_pdf/data/data_reader_writer/s3.py +69 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +0 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +15 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +74 -234
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +35 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +331 -15
  29. magic_pdf/model/pdf_extract_kit.py +164 -80
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
  31. magic_pdf/model/ppTableModel.py +2 -2
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +296 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +18 -8
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
  53. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/METADATA +120 -75
  54. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
  55. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
  56. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
  57. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,7 @@
1
+ import enum
1
2
  import json
2
3
 
4
+ from magic_pdf.data.dataset import Dataset
3
5
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
4
6
  bbox_relative_pos, box_area, calculate_iou,
5
7
  calculate_overlap_area_in_bbox1_area_ratio,
@@ -9,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
9
11
  from magic_pdf.libs.local_math import float_gt
10
12
  from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
11
13
  from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
14
+ from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
12
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
16
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
17
 
@@ -16,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
16
19
  MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
17
20
 
18
21
 
22
+ class PosRelationEnum(enum.Enum):
23
+ LEFT = 'left'
24
+ RIGHT = 'right'
25
+ UP = 'up'
26
+ BOTTOM = 'bottom'
27
+ ALL = 'all'
28
+
29
+
19
30
  class MagicModel:
20
31
  """每个函数没有得到元素的时候返回空list."""
21
32
 
@@ -24,7 +35,7 @@ class MagicModel:
24
35
  need_remove_list = []
25
36
  page_no = model_page_info['page_info']['page_no']
26
37
  horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
27
- model_page_info, self.__docs[page_no]
38
+ model_page_info, self.__docs.get_page(page_no)
28
39
  )
29
40
  layout_dets = model_page_info['layout_dets']
30
41
  for layout_det in layout_dets:
@@ -99,7 +110,7 @@ class MagicModel:
99
110
  for need_remove in need_remove_list:
100
111
  layout_dets.remove(need_remove)
101
112
 
102
- def __init__(self, model_list: list, docs: fitz.Document):
113
+ def __init__(self, model_list: list, docs: Dataset):
103
114
  self.__model_list = model_list
104
115
  self.__docs = docs
105
116
  """为所有模型数据添加bbox信息(缩放,poly->bbox)"""
@@ -110,6 +121,24 @@ class MagicModel:
110
121
  self.__fix_by_remove_high_iou_and_low_confidence()
111
122
  self.__fix_footnote()
112
123
 
124
+ def _bbox_distance(self, bbox1, bbox2):
125
+ left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
126
+ flags = [left, right, bottom, top]
127
+ count = sum([1 if v else 0 for v in flags])
128
+ if count > 1:
129
+ return float('inf')
130
+ if left or right:
131
+ l1 = bbox1[3] - bbox1[1]
132
+ l2 = bbox2[3] - bbox2[1]
133
+ else:
134
+ l1 = bbox1[2] - bbox1[0]
135
+ l2 = bbox2[2] - bbox2[0]
136
+
137
+ if l2 > l1 and (l2 - l1) / l1 > 0.3:
138
+ return float('inf')
139
+
140
+ return bbox_distance(bbox1, bbox2)
141
+
113
142
  def __fix_footnote(self):
114
143
  # 3: figure, 5: table, 7: footnote
115
144
  for model_page_info in self.__model_list:
@@ -144,7 +173,7 @@ class MagicModel:
144
173
  if pos_flag_count > 1:
145
174
  continue
146
175
  dis_figure_footnote[i] = min(
147
- bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
176
+ self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
148
177
  dis_figure_footnote.get(i, float('inf')),
149
178
  )
150
179
  for i in range(len(footnotes)):
@@ -163,7 +192,7 @@ class MagicModel:
163
192
  continue
164
193
 
165
194
  dis_table_footnote[i] = min(
166
- bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
195
+ self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
167
196
  dis_table_footnote.get(i, float('inf')),
168
197
  )
169
198
  for i in range(len(footnotes)):
@@ -195,9 +224,8 @@ class MagicModel:
195
224
  筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
196
225
  再求出筛选出的 subjects 和 object 的最短距离
197
226
  """
198
- def search_overlap_between_boxes(
199
- subject_idx, object_idx
200
- ):
227
+
228
+ def search_overlap_between_boxes(subject_idx, object_idx):
201
229
  idxes = [subject_idx, object_idx]
202
230
  x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
203
231
  y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
@@ -225,9 +253,9 @@ class MagicModel:
225
253
  for other_object in other_objects:
226
254
  ratio = max(
227
255
  ratio,
228
- get_overlap_area(
229
- merged_bbox, other_object['bbox']
230
- ) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
256
+ get_overlap_area(merged_bbox, other_object['bbox'])
257
+ * 1.0
258
+ / box_area(all_bboxes[object_idx]['bbox']),
231
259
  )
232
260
  if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
233
261
  break
@@ -345,12 +373,17 @@ class MagicModel:
345
373
  if all_bboxes[j]['category_id'] == subject_category_id:
346
374
  subject_idx, object_idx = j, i
347
375
 
348
- if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
376
+ if (
377
+ search_overlap_between_boxes(subject_idx, object_idx)
378
+ >= MERGE_BOX_OVERLAP_AREA_RATIO
379
+ ):
349
380
  dis[i][j] = float('inf')
350
381
  dis[j][i] = dis[i][j]
351
382
  continue
352
383
 
353
- dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
384
+ dis[i][j] = self._bbox_distance(
385
+ all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
386
+ )
354
387
  dis[j][i] = dis[i][j]
355
388
 
356
389
  used = set()
@@ -566,6 +599,289 @@ class MagicModel:
566
599
  with_caption_subject.add(j)
567
600
  return ret, total_subject_object_dis
568
601
 
602
+ def __tie_up_category_by_distance_v2(
603
+ self,
604
+ page_no: int,
605
+ subject_category_id: int,
606
+ object_category_id: int,
607
+ priority_pos: PosRelationEnum,
608
+ ):
609
+ """_summary_
610
+
611
+ Args:
612
+ page_no (int): _description_
613
+ subject_category_id (int): _description_
614
+ object_category_id (int): _description_
615
+ priority_pos (PosRelationEnum): _description_
616
+
617
+ Returns:
618
+ _type_: _description_
619
+ """
620
+ AXIS_MULPLICITY = 0.5
621
+ subjects = self.__reduct_overlap(
622
+ list(
623
+ map(
624
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
625
+ filter(
626
+ lambda x: x['category_id'] == subject_category_id,
627
+ self.__model_list[page_no]['layout_dets'],
628
+ ),
629
+ )
630
+ )
631
+ )
632
+
633
+ objects = self.__reduct_overlap(
634
+ list(
635
+ map(
636
+ lambda x: {'bbox': x['bbox'], 'score': x['score']},
637
+ filter(
638
+ lambda x: x['category_id'] == object_category_id,
639
+ self.__model_list[page_no]['layout_dets'],
640
+ ),
641
+ )
642
+ )
643
+ )
644
+ M = len(objects)
645
+
646
+ subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
647
+ objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
648
+
649
+ sub_obj_map_h = {i: [] for i in range(len(subjects))}
650
+
651
+ dis_by_directions = {
652
+ 'top': [[-1, float('inf')]] * M,
653
+ 'bottom': [[-1, float('inf')]] * M,
654
+ 'left': [[-1, float('inf')]] * M,
655
+ 'right': [[-1, float('inf')]] * M,
656
+ }
657
+
658
+ for i, obj in enumerate(objects):
659
+ l_x_axis, l_y_axis = (
660
+ obj['bbox'][2] - obj['bbox'][0],
661
+ obj['bbox'][3] - obj['bbox'][1],
662
+ )
663
+ axis_unit = min(l_x_axis, l_y_axis)
664
+ for j, sub in enumerate(subjects):
665
+
666
+ bbox1, bbox2, _ = _remove_overlap_between_bbox(
667
+ objects[i]['bbox'], subjects[j]['bbox']
668
+ )
669
+ left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
670
+ flags = [left, right, bottom, top]
671
+ if sum([1 if v else 0 for v in flags]) > 1:
672
+ continue
673
+
674
+ if left:
675
+ if dis_by_directions['left'][i][1] > bbox_distance(
676
+ obj['bbox'], sub['bbox']
677
+ ):
678
+ dis_by_directions['left'][i] = [
679
+ j,
680
+ bbox_distance(obj['bbox'], sub['bbox']),
681
+ ]
682
+ if right:
683
+ if dis_by_directions['right'][i][1] > bbox_distance(
684
+ obj['bbox'], sub['bbox']
685
+ ):
686
+ dis_by_directions['right'][i] = [
687
+ j,
688
+ bbox_distance(obj['bbox'], sub['bbox']),
689
+ ]
690
+ if bottom:
691
+ if dis_by_directions['bottom'][i][1] > bbox_distance(
692
+ obj['bbox'], sub['bbox']
693
+ ):
694
+ dis_by_directions['bottom'][i] = [
695
+ j,
696
+ bbox_distance(obj['bbox'], sub['bbox']),
697
+ ]
698
+ if top:
699
+ if dis_by_directions['top'][i][1] > bbox_distance(
700
+ obj['bbox'], sub['bbox']
701
+ ):
702
+ dis_by_directions['top'][i] = [
703
+ j,
704
+ bbox_distance(obj['bbox'], sub['bbox']),
705
+ ]
706
+
707
+ if (
708
+ dis_by_directions['top'][i][1] != float('inf')
709
+ and dis_by_directions['bottom'][i][1] != float('inf')
710
+ and priority_pos in (PosRelationEnum.BOTTOM, PosRelationEnum.UP)
711
+ ):
712
+ RATIO = 3
713
+ if (
714
+ abs(
715
+ dis_by_directions['top'][i][1]
716
+ - dis_by_directions['bottom'][i][1]
717
+ )
718
+ < RATIO * axis_unit
719
+ ):
720
+
721
+ if priority_pos == PosRelationEnum.BOTTOM:
722
+ sub_obj_map_h[dis_by_directions['bottom'][i][0]].append(i)
723
+ else:
724
+ sub_obj_map_h[dis_by_directions['top'][i][0]].append(i)
725
+ continue
726
+
727
+ if dis_by_directions['left'][i][1] != float('inf') or dis_by_directions[
728
+ 'right'
729
+ ][i][1] != float('inf'):
730
+ if dis_by_directions['left'][i][1] != float(
731
+ 'inf'
732
+ ) and dis_by_directions['right'][i][1] != float('inf'):
733
+ if AXIS_MULPLICITY * axis_unit >= abs(
734
+ dis_by_directions['left'][i][1]
735
+ - dis_by_directions['right'][i][1]
736
+ ):
737
+ left_sub_bbox = subjects[dis_by_directions['left'][i][0]][
738
+ 'bbox'
739
+ ]
740
+ right_sub_bbox = subjects[dis_by_directions['right'][i][0]][
741
+ 'bbox'
742
+ ]
743
+
744
+ left_sub_bbox_y_axis = left_sub_bbox[3] - left_sub_bbox[1]
745
+ right_sub_bbox_y_axis = right_sub_bbox[3] - right_sub_bbox[1]
746
+
747
+ if (
748
+ abs(left_sub_bbox_y_axis - l_y_axis)
749
+ + dis_by_directions['left'][i][0]
750
+ > abs(right_sub_bbox_y_axis - l_y_axis)
751
+ + dis_by_directions['right'][i][0]
752
+ ):
753
+ left_or_right = dis_by_directions['right'][i]
754
+ else:
755
+ left_or_right = dis_by_directions['left'][i]
756
+ else:
757
+ left_or_right = dis_by_directions['left'][i]
758
+ if left_or_right[1] > dis_by_directions['right'][i][1]:
759
+ left_or_right = dis_by_directions['right'][i]
760
+ else:
761
+ left_or_right = dis_by_directions['left'][i]
762
+ if left_or_right[1] == float('inf'):
763
+ left_or_right = dis_by_directions['right'][i]
764
+ else:
765
+ left_or_right = [-1, float('inf')]
766
+
767
+ if dis_by_directions['top'][i][1] != float('inf') or dis_by_directions[
768
+ 'bottom'
769
+ ][i][1] != float('inf'):
770
+ if dis_by_directions['top'][i][1] != float('inf') and dis_by_directions[
771
+ 'bottom'
772
+ ][i][1] != float('inf'):
773
+ if AXIS_MULPLICITY * axis_unit >= abs(
774
+ dis_by_directions['top'][i][1]
775
+ - dis_by_directions['bottom'][i][1]
776
+ ):
777
+ top_bottom = subjects[dis_by_directions['bottom'][i][0]]['bbox']
778
+ bottom_top = subjects[dis_by_directions['top'][i][0]]['bbox']
779
+
780
+ top_bottom_x_axis = top_bottom[2] - top_bottom[0]
781
+ bottom_top_x_axis = bottom_top[2] - bottom_top[0]
782
+ if (
783
+ abs(top_bottom_x_axis - l_x_axis)
784
+ + dis_by_directions['bottom'][i][1]
785
+ > abs(bottom_top_x_axis - l_x_axis)
786
+ + dis_by_directions['top'][i][1]
787
+ ):
788
+ top_or_bottom = dis_by_directions['top'][i]
789
+ else:
790
+ top_or_bottom = dis_by_directions['bottom'][i]
791
+ else:
792
+ top_or_bottom = dis_by_directions['top'][i]
793
+ if top_or_bottom[1] > dis_by_directions['bottom'][i][1]:
794
+ top_or_bottom = dis_by_directions['bottom'][i]
795
+ else:
796
+ top_or_bottom = dis_by_directions['top'][i]
797
+ if top_or_bottom[1] == float('inf'):
798
+ top_or_bottom = dis_by_directions['bottom'][i]
799
+ else:
800
+ top_or_bottom = [-1, float('inf')]
801
+
802
+ if left_or_right[1] != float('inf') or top_or_bottom[1] != float('inf'):
803
+ if left_or_right[1] != float('inf') and top_or_bottom[1] != float(
804
+ 'inf'
805
+ ):
806
+ if AXIS_MULPLICITY * axis_unit >= abs(
807
+ left_or_right[1] - top_or_bottom[1]
808
+ ):
809
+ y_axis_bbox = subjects[left_or_right[0]]['bbox']
810
+ x_axis_bbox = subjects[top_or_bottom[0]]['bbox']
811
+
812
+ if (
813
+ abs((x_axis_bbox[2] - x_axis_bbox[0]) - l_x_axis) / l_x_axis
814
+ > abs((y_axis_bbox[3] - y_axis_bbox[1]) - l_y_axis)
815
+ / l_y_axis
816
+ ):
817
+ sub_obj_map_h[left_or_right[0]].append(i)
818
+ else:
819
+ sub_obj_map_h[top_or_bottom[0]].append(i)
820
+ else:
821
+ if left_or_right[1] > top_or_bottom[1]:
822
+ sub_obj_map_h[top_or_bottom[0]].append(i)
823
+ else:
824
+ sub_obj_map_h[left_or_right[0]].append(i)
825
+ else:
826
+ if left_or_right[1] != float('inf'):
827
+ sub_obj_map_h[left_or_right[0]].append(i)
828
+ else:
829
+ sub_obj_map_h[top_or_bottom[0]].append(i)
830
+ ret = []
831
+ for i in sub_obj_map_h.keys():
832
+ ret.append(
833
+ {
834
+ 'sub_bbox': {
835
+ 'bbox': subjects[i]['bbox'],
836
+ 'score': subjects[i]['score'],
837
+ },
838
+ 'obj_bboxes': [
839
+ {'score': objects[j]['score'], 'bbox': objects[j]['bbox']}
840
+ for j in sub_obj_map_h[i]
841
+ ],
842
+ 'sub_idx': i,
843
+ }
844
+ )
845
+ return ret
846
+
847
+ def get_imgs_v2(self, page_no: int):
848
+ with_captions = self.__tie_up_category_by_distance_v2(
849
+ page_no, 3, 4, PosRelationEnum.BOTTOM
850
+ )
851
+ with_footnotes = self.__tie_up_category_by_distance_v2(
852
+ page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
853
+ )
854
+ ret = []
855
+ for v in with_captions:
856
+ record = {
857
+ 'image_body': v['sub_bbox'],
858
+ 'image_caption_list': v['obj_bboxes'],
859
+ }
860
+ filter_idx = v['sub_idx']
861
+ d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
862
+ record['image_footnote_list'] = d['obj_bboxes']
863
+ ret.append(record)
864
+ return ret
865
+
866
+ def get_tables_v2(self, page_no: int) -> list:
867
+ with_captions = self.__tie_up_category_by_distance_v2(
868
+ page_no, 5, 6, PosRelationEnum.UP
869
+ )
870
+ with_footnotes = self.__tie_up_category_by_distance_v2(
871
+ page_no, 5, 7, PosRelationEnum.ALL
872
+ )
873
+ ret = []
874
+ for v in with_captions:
875
+ record = {
876
+ 'table_body': v['sub_bbox'],
877
+ 'table_caption_list': v['obj_bboxes'],
878
+ }
879
+ filter_idx = v['sub_idx']
880
+ d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes))
881
+ record['table_footnote_list'] = d['obj_bboxes']
882
+ ret.append(record)
883
+ return ret
884
+
569
885
  def get_imgs(self, page_no: int):
570
886
  with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
571
887
  with_footnotes, _ = self.__tie_up_category_by_distance(
@@ -699,10 +1015,10 @@ class MagicModel:
699
1015
 
700
1016
  def get_page_size(self, page_no: int): # 获取页面宽高
701
1017
  # 获取当前页的page对象
702
- page = self.__docs[page_no]
1018
+ page = self.__docs.get_page(page_no).get_page_info()
703
1019
  # 获取当前页的宽高
704
- page_w = page.rect.width
705
- page_h = page.rect.height
1020
+ page_w = page.w
1021
+ page_h = page.h
706
1022
  return page_w, page_h
707
1023
 
708
1024
  def __get_blocks_by_type(