magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,182 +0,0 @@
1
- from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
2
- from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
3
-
4
-
5
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Pick the box from all_bboxes that lies nearest to the left of this_bbox.

    A box is a candidate when either
      * its x1 is not greater than the x0 of this_bbox and the two boxes
        share part of their vertical extent (strict inequalities, or exactly
        identical y0/y1), or
      * `_left_intersect(box[:4], this_bbox[:4])` is truthy.
    Only the plain coordinates are used for this filter.

    Among the candidates the one with the largest right edge wins. The right
    edge is read from X1_EXT_IDX when that entry is truthy, otherwise from
    X1_IDX. Ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_vertical_span(box):
        top, bottom = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
        box_top, box_bottom = box[Y0_IDX], box[Y1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            box_top < top < box_bottom,
            box_top < bottom < box_bottom,
            top < box_top < bottom,
            top < box_bottom < bottom,
            box_top == top and box_bottom == bottom,
        ]
        return any(checks)

    def is_candidate(box):
        if box[X1_IDX] <= this_bbox[X0_IDX] and shares_vertical_span(box):
            return True
        # Boxes that cross this_bbox horizontally are decided by the helper.
        return _left_intersect(box[:4], this_bbox[:4])

    def right_edge(box):
        return box[X1_EXT_IDX] if box[X1_EXT_IDX] else box[X1_IDX]

    candidates = [box for box in all_bboxes if is_candidate(box)]
    if not candidates:
        return None
    # Horizontally closest candidate == the one whose right edge is largest.
    return max(candidates, key=right_edge)
23
-
24
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Pick the box from all_bboxes that lies nearest to the right of this_bbox.

    A box is a candidate when either
      * its x0 is not smaller than the x1 of this_bbox and the two boxes
        share part of their vertical extent (strict inequalities, or exactly
        identical y0/y1), or
      * `_right_intersect(this_bbox[:4], box[:4])` is truthy.

    Among the candidates the one with the smallest left edge wins. The left
    edge is read from X0_EXT_IDX when that entry is truthy, otherwise from
    X0_IDX. Ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_vertical_span(box):
        top, bottom = this_bbox[Y0_IDX], this_bbox[Y1_IDX]
        box_top, box_bottom = box[Y0_IDX], box[Y1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            top < box_top < bottom,
            top < box_bottom < bottom,
            box_top < top < box_bottom,
            box_top < bottom < box_bottom,
            box_top == top and box_bottom == bottom,
        ]
        return any(checks)

    def is_candidate(box):
        if box[X0_IDX] >= this_bbox[X1_IDX] and shares_vertical_span(box):
            return True
        # Boxes that cross this_bbox horizontally are decided by the helper.
        return _right_intersect(this_bbox[:4], box[:4])

    def left_edge(box):
        return box[X0_EXT_IDX] if box[X0_EXT_IDX] else box[X0_IDX]

    candidates = [box for box in all_bboxes if is_candidate(box)]
    if not candidates:
        return None
    # Horizontally closest candidate == the one whose left edge is smallest.
    return min(candidates, key=left_edge)
40
-
41
def find_all_top_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Pick the box from all_bboxes that lies nearest above this_bbox.

    A box is a candidate when its y1 is not greater than the y0 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Among the candidates the one with the largest bottom edge wins. The
    bottom edge is read from Y1_EXT_IDX when that entry is truthy, otherwise
    from Y1_IDX. Ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            box_left < left < box_right,
            box_left < right < box_right,
            left < box_left < right,
            left < box_right < right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    def bottom_edge(box):
        return box[Y1_EXT_IDX] if box[Y1_EXT_IDX] else box[Y1_IDX]

    above = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and shares_horizontal_span(box)
    ]
    if not above:
        return None
    # Vertically closest candidate == the one whose bottom edge is largest.
    return max(above, key=bottom_edge)
56
-
57
def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list:
    """
    Pick the box from all_bboxes that lies nearest below this_bbox.

    A box is a candidate when its y0 is not smaller than the y1 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Among the candidates the one with the smallest y0 wins; ties go to the
    candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            left < box_left < right,
            left < box_right < right,
            box_left < left < box_right,
            box_left < right < box_right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    below = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and shares_horizontal_span(box)
    ]
    if not below:
        return None
    # Vertically closest candidate == the one whose top edge is smallest.
    return min(below, key=lambda box: box[Y0_IDX])
72
-
73
- # ===================================================================================================================
74
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    Pick the nearest box below this_bbox, preferring the right-most one.

    A box is a candidate when its y0 is not smaller than the y1 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Selection is done in two steps: keep the candidates whose y0 equals the
    smallest y0 found, then take the one with the largest x1 among them.
    Remaining ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            left < box_left < right,
            left < box_right < right,
            box_left < left < box_right,
            box_left < right < box_right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    below = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and shares_horizontal_span(box)
    ]
    if not below:
        return None

    # Step 1: the top edge closest to this_bbox.
    nearest_top = min(box[Y0_IDX] for box in below)
    same_row = [box for box in below if box[Y0_IDX] == nearest_top]
    # Step 2: with y0 equal, the box reaching furthest to the right.
    return max(same_row, key=lambda box: box[X1_IDX])
93
-
94
def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    Pick the nearest box below this_bbox, preferring the left-most one.

    A box is a candidate when its y0 is not smaller than the y1 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Selection is done in two steps: keep the candidates whose y0 equals the
    smallest y0 found, then take the one with the smallest x0 among them.
    Remaining ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            left < box_left < right,
            left < box_right < right,
            box_left < left < box_right,
            box_left < right < box_right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    below = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX] and shares_horizontal_span(box)
    ]
    if not below:
        return None

    # Step 1: the top edge closest to this_bbox.
    nearest_top = min(box[Y0_IDX] for box in below)
    same_row = [box for box in below if box[Y0_IDX] == nearest_top]
    # Step 2: with y0 equal, the box starting furthest to the left.
    return min(same_row, key=lambda box: box[X0_IDX])
113
-
114
def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list:
    """
    Pick the nearest box above this_bbox, preferring the left-most one.

    A box is a candidate when its y1 is not greater than the y0 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Selection is done in two steps: keep the candidates whose y1 equals the
    largest y1 found, then take the one with the smallest x0 among them.
    Remaining ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            box_left < left < box_right,
            box_left < right < box_right,
            left < box_left < right,
            left < box_right < right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    above = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and shares_horizontal_span(box)
    ]
    if not above:
        return None

    # Step 1: the bottom edge closest to this_bbox.
    nearest_bottom = max(box[Y1_IDX] for box in above)
    same_row = [box for box in above if box[Y1_IDX] == nearest_bottom]
    # Step 2: with y1 equal, the box starting furthest to the left.
    return min(same_row, key=lambda box: box[X0_IDX])
133
-
134
def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list:
    """
    Pick the nearest box above this_bbox, preferring the right-most one.

    A box is a candidate when its y1 is not greater than the y0 of this_bbox
    and the two boxes share part of their horizontal extent (strict
    inequalities, or exactly identical x0/x1).

    Selection is done in two steps: keep the candidates whose y1 equals the
    largest y1 found, then take the one with the largest x1 among them.
    Remaining ties go to the candidate that appears first in all_bboxes.

    Returns that single box, or None when there is no candidate (the `list`
    return annotation is kept as in the original signature).
    """
    def shares_horizontal_span(box):
        left, right = this_bbox[X0_IDX], this_bbox[X1_IDX]
        box_left, box_right = box[X0_IDX], box[X1_IDX]
        # Kept as a list so every comparison is evaluated before any() runs.
        checks = [
            box_left < left < box_right,
            box_left < right < box_right,
            left < box_left < right,
            left < box_right < right,
            box_left == left and box_right == right,
        ]
        return any(checks)

    above = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX] and shares_horizontal_span(box)
    ]
    if not above:
        return None

    # Step 1: the bottom edge closest to this_bbox.
    nearest_bottom = max(box[Y1_IDX] for box in above)
    same_row = [box for box in above if box[Y1_IDX] == nearest_bottom]
    # Step 2: with y1 equal, the box reaching furthest to the right.
    return max(same_row, key=lambda box: box[X1_IDX])
153
-
154
- # ===================================================================================================================
155
-
156
def get_left_edge_bboxes(all_bboxes) -> list:
    """
    Collect the boxes that have no neighbour on their left.

    A box is kept when `find_all_left_bbox_direct(box, all_bboxes)` returns
    None for it. The input order is preserved in the returned list.
    """
    edge_boxes = []
    for candidate in all_bboxes:
        if find_all_left_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
162
-
163
def get_right_edge_bboxes(all_bboxes) -> list:
    """
    Collect the boxes that have no neighbour on their right.

    A box is kept when `find_all_right_bbox_direct(box, all_bboxes)` returns
    None for it. The input order is preserved in the returned list.
    """
    edge_boxes = []
    for candidate in all_bboxes:
        if find_all_right_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
169
-
170
def fix_vertical_bbox_pos(bboxes:list):
    """
    Push boxes down when they slightly overlap the box above them.

    The list is sorted in place by y0 (top to bottom). Then, for each box,
    the later boxes are scanned in order; the first one for which
    `_is_bottom_full_overlap(upper[:4], lower[:4])` is truthy gets its y0
    replaced by the upper box's y1 plus 2, and the scan for that upper box
    stops. Only y0 of the lower box is rewritten, and the list is not
    re-sorted afterwards.

    NOTE(review): the original comment states that the x ranges must be
    contained in one another or coincide, not merely overlap partially.
    That rule presumably lives in `_is_bottom_full_overlap`; confirm there.

    Returns the same list object, with its elements modified in place.
    """
    bboxes.sort(key=lambda box: box[Y0_IDX])  # top-to-bottom order
    for position, upper in enumerate(bboxes):
        for lower in bboxes[position + 1:]:
            if not _is_bottom_full_overlap(upper[:4], lower[:4]):
                continue
            # Move the lower box just below the upper one; 2 is an empirical gap.
            lower[Y0_IDX] = upper[Y1_IDX] + 2
            break
    return bboxes