magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +16 -22
  2. magic_pdf/filter/pdf_meta_scan.py +5 -19
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_check.py +52 -25
  7. magic_pdf/libs/pdf_image_tools.py +2 -1
  8. magic_pdf/libs/version.py +1 -1
  9. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  10. magic_pdf/model/magic_model.py +0 -30
  11. magic_pdf/model/pp_structure_v2.py +23 -3
  12. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
  13. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
  14. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
  15. magic_pdf/para/para_split_v3.py +21 -7
  16. magic_pdf/pdf_parse_union_core_v2.py +134 -146
  17. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  18. magic_pdf/pre_proc/cut_image.py +0 -37
  19. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  20. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  21. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  22. magic_pdf/rw/S3ReaderWriter.py +1 -1
  23. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
  24. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
  25. magic_pdf/dict2md/mkcontent.py +0 -438
  26. magic_pdf/layout/__init__.py +0 -0
  27. magic_pdf/layout/bbox_sort.py +0 -681
  28. magic_pdf/layout/layout_det_utils.py +0 -182
  29. magic_pdf/layout/layout_sort.py +0 -921
  30. magic_pdf/layout/layout_spiler_recog.py +0 -101
  31. magic_pdf/layout/mcol_sort.py +0 -336
  32. magic_pdf/libs/calc_span_stats.py +0 -239
  33. magic_pdf/libs/detect_language_from_model.py +0 -21
  34. magic_pdf/libs/nlp_utils.py +0 -203
  35. magic_pdf/libs/textbase.py +0 -33
  36. magic_pdf/libs/vis_utils.py +0 -308
  37. magic_pdf/para/block_continuation_processor.py +0 -562
  38. magic_pdf/para/block_termination_processor.py +0 -480
  39. magic_pdf/para/commons.py +0 -222
  40. magic_pdf/para/denoise.py +0 -246
  41. magic_pdf/para/draw.py +0 -121
  42. magic_pdf/para/exceptions.py +0 -198
  43. magic_pdf/para/layout_match_processor.py +0 -40
  44. magic_pdf/para/para_split.py +0 -807
  45. magic_pdf/para/para_split_v2.py +0 -959
  46. magic_pdf/para/raw_processor.py +0 -207
  47. magic_pdf/para/stats.py +0 -268
  48. magic_pdf/para/title_processor.py +0 -1014
  49. magic_pdf/pdf_parse_union_core.py +0 -345
  50. magic_pdf/post_proc/__init__.py +0 -0
  51. magic_pdf/post_proc/detect_para.py +0 -3472
  52. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  53. magic_pdf/post_proc/remove_footnote.py +0 -153
  54. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  55. magic_pdf/pre_proc/detect_equation.py +0 -134
  56. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  57. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  58. magic_pdf/pre_proc/detect_footnote.py +0 -170
  59. magic_pdf/pre_proc/detect_header.py +0 -64
  60. magic_pdf/pre_proc/detect_images.py +0 -647
  61. magic_pdf/pre_proc/detect_page_number.py +0 -64
  62. magic_pdf/pre_proc/detect_tables.py +0 -62
  63. magic_pdf/pre_proc/equations_replace.py +0 -550
  64. magic_pdf/pre_proc/fix_image.py +0 -244
  65. magic_pdf/pre_proc/fix_table.py +0 -270
  66. magic_pdf/pre_proc/main_text_font.py +0 -23
  67. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  68. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  69. magic_pdf/pre_proc/post_layout_split.py +0 -0
  70. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  71. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  72. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  73. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  74. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  75. magic_pdf/pre_proc/statistics.py +0 -12
  76. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
  77. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
  78. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
  79. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,182 +0,0 @@
1
- from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
2
- from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
3
-
4
-
5
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Find the bbox that sits directly to the left of ``this_bbox``.

    A box qualifies when its right edge is at or left of ``this_bbox``'s left
    edge and the two overlap vertically (no extension lines are used), or
    when ``_left_intersect(box, this_bbox)`` is truthy. The latter covers two
    boxes that intersect horizontally, in which case the right-hand box is
    not considered the leftmost one.

    Returns the qualifying box closest to ``this_bbox`` horizontally, i.e.
    the one with the largest X1 (the extended X1 is used when it is truthy).
    Returns ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _blocks_from_left(box):
        # Explicit form of the original "(A and overlap) or intersect" test.
        if box[X1_IDX] <= this_bbox[X0_IDX]:
            top, bottom = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
            box_top, box_bottom = box[Y0_IDX], box[Y1_IDX]
            if (
                box_top < top < box_bottom
                or box_top < bottom < box_bottom
                or top < box_top < bottom
                or top < box_bottom < bottom
                or (box_top == top and box_bottom == bottom)
            ):
                return True
        return _left_intersect(box[:4], this_bbox[:4])

    candidates = [box for box in all_bboxes if _blocks_from_left(box)]
    if not candidates:
        return None
    # max() keeps the first maximal element, matching the stable reverse sort
    # followed by [0] that it replaces.
    return max(candidates, key=lambda box: box[X1_EXT_IDX] or box[X1_IDX])
23
-
24
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Find the bbox that sits directly to the right of ``this_bbox``.

    A box qualifies when its left edge is at or right of ``this_bbox``'s
    right edge and the two overlap vertically (it must directly face
    ``this_bbox``), or when ``_right_intersect(this_bbox, box)`` is truthy.

    Returns the qualifying box closest to ``this_bbox``, i.e. the one with
    the smallest X0 (the extended X0 is used when it is truthy). Returns
    ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _blocks_from_right(box):
        # Explicit form of the original "(A and overlap) or intersect" test.
        if box[X0_IDX] >= this_bbox[X1_IDX]:
            top, bottom = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
            box_top, box_bottom = box[Y0_IDX], box[Y1_IDX]
            if (
                top < box_top < bottom
                or top < box_bottom < bottom
                or box_top < top < box_bottom
                or box_top < bottom < box_bottom
                or (box_top == top and box_bottom == bottom)
            ):
                return True
        return _right_intersect(this_bbox[:4], box[:4])

    candidates = [box for box in all_bboxes if _blocks_from_right(box)]
    if not candidates:
        return None
    # min() keeps the first minimal element, matching the stable ascending
    # sort followed by [0] that it replaces.
    return min(candidates, key=lambda box: box[X0_EXT_IDX] or box[X0_IDX])
40
-
41
def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Find the bbox that sits directly above ``this_bbox``.

    A box qualifies when its bottom edge is at or above ``this_bbox``'s top
    edge and the two overlap horizontally, so that it directly covers
    ``this_bbox`` from above.

    Returns the qualifying box nearest to ``this_bbox``, i.e. the one with
    the largest Y1 (the extended Y1 is used when it is truthy). Returns
    ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_above(box):
        if not box[Y1_IDX] <= this_bbox[Y0_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            box_left < left < box_right
            or box_left < right < box_right
            or left < box_left < right
            or left < box_right < right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_above(box)]
    if not candidates:
        return None
    # max() keeps the first maximal element, matching the stable reverse sort
    # followed by [0] that it replaces.
    return max(candidates, key=lambda box: box[Y1_EXT_IDX] or box[Y1_IDX])
56
-
57
def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Find the bbox that sits directly below ``this_bbox``.

    A box qualifies when its top edge is at or below ``this_bbox``'s bottom
    edge and the two overlap horizontally, so that it directly faces
    ``this_bbox`` from below.

    Returns the qualifying box nearest to ``this_bbox``, i.e. the one with
    the smallest Y0, or ``None`` when nothing qualifies. NOTE: despite the
    ``-> list`` annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_below(box):
        if not box[Y0_IDX] >= this_bbox[Y1_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            left < box_left < right
            or left < box_right < right
            or box_left < left < box_right
            or box_left < right < box_right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_below(box)]
    if not candidates:
        return None
    # min() keeps the first minimal element, matching the stable ascending
    # sort followed by [0] that it replaces.
    return min(candidates, key=lambda box: box[Y0_IDX])
72
-
73
- # ===================================================================================================================
74
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    Find the nearest bbox directly below ``this_bbox``, preferring the rightmost.

    A box qualifies when its top edge is at or below ``this_bbox``'s bottom
    edge and the two overlap horizontally. Among the qualifying boxes, those
    with the smallest Y0 (top edge closest to ``this_bbox``) are kept, and of
    those the one with the largest X1 is returned.

    Returns ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_below(box):
        if not box[Y0_IDX] >= this_bbox[Y1_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            left < box_left < right
            or left < box_right < right
            or box_left < left < box_right
            or box_left < right < box_right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_below(box)]
    if not candidates:
        return None
    # First level: the top edge closest to this_bbox (smallest Y0).
    nearest_y0 = min(box[Y0_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y0_IDX] == nearest_y0]
    # Second level: among boxes sharing that Y0, the one reaching furthest right.
    return max(nearest, key=lambda box: box[X1_IDX])
93
-
94
def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    Find the nearest bbox directly below ``this_bbox``, preferring the leftmost.

    A box qualifies when its top edge is at or below ``this_bbox``'s bottom
    edge and the two overlap horizontally. Among the qualifying boxes, those
    with the smallest Y0 are kept, and of those the one with the smallest X0
    is returned.

    Returns ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_below(box):
        if not box[Y0_IDX] >= this_bbox[Y1_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            left < box_left < right
            or left < box_right < right
            or box_left < left < box_right
            or box_left < right < box_right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_below(box)]
    if not candidates:
        return None
    # First level: the top edge closest to this_bbox (smallest Y0).
    nearest_y0 = min(box[Y0_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y0_IDX] == nearest_y0]
    # Second level: among boxes sharing that Y0, the one starting furthest left.
    return min(nearest, key=lambda box: box[X0_IDX])
113
-
114
def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    Find the nearest bbox directly above ``this_bbox``, preferring the leftmost.

    A box qualifies when its bottom edge is at or above ``this_bbox``'s top
    edge and the two overlap horizontally. Among the qualifying boxes, those
    with the largest Y1 (bottom edge closest to ``this_bbox``) are kept, and
    of those the one with the smallest X0 is returned.

    Returns ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_above(box):
        if not box[Y1_IDX] <= this_bbox[Y0_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            box_left < left < box_right
            or box_left < right < box_right
            or left < box_left < right
            or left < box_right < right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_above(box)]
    if not candidates:
        return None
    # First level: the bottom edge closest to this_bbox (largest Y1).
    nearest_y1 = max(box[Y1_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y1_IDX] == nearest_y1]
    # Second level: among boxes sharing that Y1, the one starting furthest left.
    return min(nearest, key=lambda box: box[X0_IDX])
133
-
134
def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    Find the nearest bbox directly above ``this_bbox``, preferring the rightmost.

    A box qualifies when its bottom edge is at or above ``this_bbox``'s top
    edge and the two overlap horizontally. Among the qualifying boxes, those
    with the largest Y1 (bottom edge closest to ``this_bbox``) are kept, and
    of those the one with the largest X1 is returned.

    Returns ``None`` when nothing qualifies. NOTE: despite the ``-> list``
    annotation, the result is a single bbox, not a list of bboxes.
    """
    def _covers_from_above(box):
        if not box[Y1_IDX] <= this_bbox[Y0_IDX]:
            return False
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        return (
            box_left < left < box_right
            or box_left < right < box_right
            or left < box_left < right
            or left < box_right < right
            or (box_left == left and box_right == right)
        )

    candidates = [box for box in all_bboxes if _covers_from_above(box)]
    if not candidates:
        return None
    # First level: the bottom edge closest to this_bbox (largest Y1).
    nearest_y1 = max(box[Y1_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y1_IDX] == nearest_y1]
    # Second level: among boxes sharing that Y1, the one reaching furthest right.
    return max(nearest, key=lambda box: box[X1_IDX])
153
-
154
- # ===================================================================================================================
155
-
156
def get_left_edge_bboxes(all_bboxes) -> list:
    """
    Return the leftmost bboxes: those that have no bbox directly to their left.
    """
    edge_boxes = []
    for candidate in all_bboxes:
        # A box is on the left edge when the direct-left search finds nothing.
        if find_all_left_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
162
-
163
def get_right_edge_bboxes(all_bboxes) -> list:
    """
    Return the rightmost bboxes: those that have no bbox directly to their right.
    """
    edge_boxes = []
    for candidate in all_bboxes:
        # A box is on the right edge when the direct-right search finds nothing.
        if find_all_right_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
169
-
170
def fix_vertical_bbox_pos(bboxes: list):
    """
    Push slightly overlapping bboxes downward so they no longer overlap vertically.

    The list is sorted in place from top to bottom (by Y0). For each box, the
    first later box for which ``_is_bottom_full_overlap`` is truthy gets its
    Y0 moved to just below the upper box's Y1. Per the original note, the
    pair must contain one another or coincide in the x direction; a partial
    horizontal overlap does not count. That check presumably lives in
    ``_is_bottom_full_overlap`` (TODO confirm).

    Returns the same (sorted, mutated) list object.
    """
    bboxes.sort(key=lambda box: box[Y0_IDX])  # order from top to bottom
    for idx, upper in enumerate(bboxes):
        for lower in bboxes[idx + 1:]:
            if _is_bottom_full_overlap(upper[:4], lower[:4]):
                # Shift the lower box down a little; 2 is an empirical value.
                lower[Y0_IDX] = upper[Y1_IDX] + 2
                break
    return bboxes