magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,182 +0,0 @@
|
|
1
|
-
from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX
|
2
|
-
from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect
|
3
|
-
|
4
|
-
|
5
|
-
def find_all_left_bbox_direct(this_bbox, all_bboxes) -> "list | None":
    """
    Return the bbox that sits directly to the LEFT of ``this_bbox`` and is
    horizontally closest to it.

    A box from ``all_bboxes`` is a candidate when either
      * its right edge (X1) is at or left of ``this_bbox``'s left edge (X0) and
        the two boxes overlap in the vertical direction (no extension lines
        are used for this check), or
      * ``_left_intersect(box[:4], this_bbox[:4])`` is truthy. Per the original
        note, this covers two boxes that intersect horizontally: in that case
        the right-hand box must not be treated as the left-most one.

    Among the candidates the one with the largest right edge wins. The value at
    X1_EXT_IDX is used for ranking when it is truthy, otherwise the plain X1
    (presumably X1_EXT_IDX holds an extended edge -- confirm in bbox_sort).

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        # Parentheses make the original `A and B or C` precedence explicit.
        if (
            box[X1_IDX] <= this_bbox[X0_IDX]
            and (
                box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX]
                or box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX]
                or this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX]
                or this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX]
                or (box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX])
            )
        )
        or _left_intersect(box[:4], this_bbox[:4])
    ]
    if not candidates:
        return None

    # Horizontally nearest == largest right edge. max() returns the first
    # maximal element, the same tie-break as a stable descending sort.
    return max(candidates, key=lambda box: box[X1_EXT_IDX] if box[X1_EXT_IDX] else box[X1_IDX])
|
23
|
-
|
24
|
-
def find_all_right_bbox_direct(this_bbox, all_bboxes) -> "list | None":
    """
    Return the bbox that sits directly to the RIGHT of ``this_bbox`` and is
    horizontally closest to it (it must directly face ``this_bbox``).

    A box from ``all_bboxes`` is a candidate when either
      * its left edge (X0) is at or right of ``this_bbox``'s right edge (X1)
        and the two boxes overlap in the vertical direction, or
      * ``_right_intersect(this_bbox[:4], box[:4])`` is truthy.

    Among the candidates the one with the smallest left edge wins. The value at
    X0_EXT_IDX is used for ranking when it is truthy, otherwise the plain X0
    (presumably X0_EXT_IDX holds an extended edge -- confirm in bbox_sort).

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        # Parentheses make the original `A and B or C` precedence explicit.
        if (
            box[X0_IDX] >= this_bbox[X1_IDX]
            and (
                this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX]
                or this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX]
                or box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX]
                or box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX]
                or (box[Y0_IDX] == this_bbox[Y0_IDX] and box[Y1_IDX] == this_bbox[Y1_IDX])
            )
        )
        or _right_intersect(this_bbox[:4], box[:4])
    ]
    if not candidates:
        return None

    # Horizontally nearest == smallest left edge. min() returns the first
    # minimal element, the same tie-break as a stable ascending sort.
    return min(candidates, key=lambda box: box[X0_EXT_IDX] if box[X0_EXT_IDX] else box[X0_IDX])
|
40
|
-
|
41
|
-
def find_all_top_bbox_direct(this_bbox, all_bboxes) -> "list | None":
    """
    Return the bbox that sits directly ABOVE ``this_bbox`` and is vertically
    closest to it (it must directly face ``this_bbox``).

    A box from ``all_bboxes`` is a candidate when its bottom edge (Y1) is at or
    above ``this_bbox``'s top edge (Y0) and the two boxes overlap in the
    horizontal direction.

    Among the candidates the one with the largest bottom edge wins. The value
    at Y1_EXT_IDX is used for ranking when it is truthy, otherwise the plain Y1
    (presumably Y1_EXT_IDX holds an extended edge -- confirm in bbox_sort).

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX]
        and (
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Vertically nearest == largest bottom edge. max() returns the first
    # maximal element, the same tie-break as a stable descending sort.
    return max(candidates, key=lambda box: box[Y1_EXT_IDX] if box[Y1_EXT_IDX] else box[Y1_IDX])
|
56
|
-
|
57
|
-
def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> "list | None":
    """
    Return the bbox that sits directly BELOW ``this_bbox`` and is vertically
    closest to it (it must directly face ``this_bbox``).

    A box from ``all_bboxes`` is a candidate when its top edge (Y0) is at or
    below ``this_bbox``'s bottom edge (Y1) and the two boxes overlap in the
    horizontal direction.

    Among the candidates the one with the smallest top edge (Y0) wins. Unlike
    the left/right/top variants, no extended-edge index is consulted here.

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX]
        and (
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Vertically nearest == smallest top edge. min() returns the first
    # minimal element, the same tie-break as a stable ascending sort.
    return min(candidates, key=lambda box: box[Y0_IDX])
|
72
|
-
|
73
|
-
# ===================================================================================================================
|
74
|
-
def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> "list | None":
    """
    Return the nearest bbox directly BELOW ``this_bbox``, preferring the
    right-most one when several are equally near.

    A box from ``all_bboxes`` is a candidate when its top edge (Y0) is at or
    below ``this_bbox``'s bottom edge (Y1) and the two boxes overlap in the
    horizontal direction.

    Selection: keep the candidates with the smallest Y0 (top edge closest to
    ``this_bbox``), then among those with the same Y0 pick the largest X1.

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX]
        and (
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Smallest Y0 first: the boxes whose top edge is closest to this_bbox.
    nearest_y0 = min(box[Y0_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y0_IDX] == nearest_y0]
    # With Y0 tied, take the right-most box (largest X1).
    return max(nearest, key=lambda box: box[X1_IDX])
|
93
|
-
|
94
|
-
def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> "list | None":
    """
    Return the nearest bbox directly BELOW ``this_bbox``, preferring the
    left-most one when several are equally near.

    A box from ``all_bboxes`` is a candidate when its top edge (Y0) is at or
    below ``this_bbox``'s bottom edge (Y1) and the two boxes overlap in the
    horizontal direction.

    Selection: keep the candidates with the smallest Y0, then among those with
    the same Y0 pick the smallest X0.

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y0_IDX] >= this_bbox[Y1_IDX]
        and (
            this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Smallest Y0 first: the boxes whose top edge is closest to this_bbox.
    nearest_y0 = min(box[Y0_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y0_IDX] == nearest_y0]
    # With Y0 tied, take the left-most box (smallest X0).
    return min(nearest, key=lambda box: box[X0_IDX])
|
113
|
-
|
114
|
-
def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> "list | None":
    """
    Return the nearest bbox directly ABOVE ``this_bbox``, preferring the
    left-most one when several are equally near.

    A box from ``all_bboxes`` is a candidate when its bottom edge (Y1) is at or
    above ``this_bbox``'s top edge (Y0) and the two boxes overlap in the
    horizontal direction.

    Selection: keep the candidates with the largest Y1, then among those with
    the same Y1 pick the smallest X0.

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX]
        and (
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Largest Y1 first: the boxes whose bottom edge is closest to this_bbox.
    nearest_y1 = max(box[Y1_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y1_IDX] == nearest_y1]
    # With Y1 tied, take the left-most box (smallest X0).
    return min(nearest, key=lambda box: box[X0_IDX])
|
133
|
-
|
134
|
-
def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> "list | None":
    """
    Return the nearest bbox directly ABOVE ``this_bbox``, preferring the
    right-most one when several are equally near.

    A box from ``all_bboxes`` is a candidate when its bottom edge (Y1) is at or
    above ``this_bbox``'s top edge (Y0) and the two boxes overlap in the
    horizontal direction.

    Selection: keep the candidates with the largest Y1, then among those with
    the same Y1 pick the largest X1.

    :param this_bbox: the reference bbox (indexable by the *_IDX constants).
    :param all_bboxes: iterable of bboxes to search.
    :return: a single bbox (not a list of bboxes), or None when nothing
        qualifies.
    """
    candidates = [
        box for box in all_bboxes
        if box[Y1_IDX] <= this_bbox[Y0_IDX]
        and (
            box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX]
            or box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX]
            or this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX]
            or this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX]
            or (box[X0_IDX] == this_bbox[X0_IDX] and box[X1_IDX] == this_bbox[X1_IDX])
        )
    ]
    if not candidates:
        return None

    # Largest Y1 first: the boxes whose bottom edge is closest to this_bbox.
    nearest_y1 = max(box[Y1_IDX] for box in candidates)
    nearest = [box for box in candidates if box[Y1_IDX] == nearest_y1]
    # With Y1 tied, take the right-most box (largest X1).
    return max(nearest, key=lambda box: box[X1_IDX])
|
153
|
-
|
154
|
-
# ===================================================================================================================
|
155
|
-
|
156
|
-
def get_left_edge_bboxes(all_bboxes) -> list:
    """
    Return the left-most bboxes: every bbox in ``all_bboxes`` for which
    ``find_all_left_bbox_direct`` finds no neighbour (returns None).
    """
    edge_boxes = []
    for candidate in all_bboxes:
        # No direct left neighbour means the box lies on the left edge.
        if find_all_left_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
|
162
|
-
|
163
|
-
def get_right_edge_bboxes(all_bboxes) -> list:
    """
    Return the right-most bboxes: every bbox in ``all_bboxes`` for which
    ``find_all_right_bbox_direct`` finds no neighbour (returns None).
    """
    edge_boxes = []
    for candidate in all_bboxes:
        # No direct right neighbour means the box lies on the right edge.
        if find_all_right_bbox_direct(candidate, all_bboxes) is None:
            edge_boxes.append(candidate)
    return edge_boxes
|
169
|
-
|
170
|
-
def fix_vertical_bbox_pos(bboxes:list):
    """
    Check whether the given bboxes overlap slightly in the vertical direction
    and, where they do, push the lower bbox down a little.

    Overlap is decided by ``_is_bottom_full_overlap``. Per the original note,
    in the x direction one box must contain the other (or coincide with it);
    a merely partial horizontal overlap does not count.

    ``bboxes`` is sorted in place by Y0 and its items are modified in place;
    the same list is returned.
    """
    bboxes.sort(key=lambda item: item[Y0_IDX])  # order top to bottom
    for upper_pos, upper in enumerate(bboxes):
        for lower in bboxes[upper_pos + 1:]:
            if not _is_bottom_full_overlap(upper[:4], lower[:4]):
                continue
            # The two boxes overlap: move the lower one's top edge just below
            # the upper one's bottom edge. The gap of 2 is an empirical value.
            lower[Y0_IDX] = upper[Y1_IDX] + 2
            # Only the first overlapping box below `upper` is adjusted.
            break
    return bboxes
|