magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. magic_pdf/filter/pdf_meta_scan.py +3 -17
  2. magic_pdf/libs/commons.py +0 -161
  3. magic_pdf/libs/draw_bbox.py +2 -3
  4. magic_pdf/libs/markdown_utils.py +0 -21
  5. magic_pdf/libs/pdf_image_tools.py +2 -1
  6. magic_pdf/libs/version.py +1 -1
  7. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  8. magic_pdf/model/magic_model.py +0 -30
  9. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  11. magic_pdf/para/para_split_v3.py +7 -2
  12. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  13. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  14. magic_pdf/pre_proc/cut_image.py +0 -37
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  16. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  17. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  18. magic_pdf/rw/S3ReaderWriter.py +1 -1
  19. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  20. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
  21. magic_pdf/dict2md/mkcontent.py +0 -438
  22. magic_pdf/layout/__init__.py +0 -0
  23. magic_pdf/layout/bbox_sort.py +0 -681
  24. magic_pdf/layout/layout_det_utils.py +0 -182
  25. magic_pdf/layout/layout_sort.py +0 -921
  26. magic_pdf/layout/layout_spiler_recog.py +0 -101
  27. magic_pdf/layout/mcol_sort.py +0 -336
  28. magic_pdf/libs/calc_span_stats.py +0 -239
  29. magic_pdf/libs/detect_language_from_model.py +0 -21
  30. magic_pdf/libs/nlp_utils.py +0 -203
  31. magic_pdf/libs/textbase.py +0 -33
  32. magic_pdf/libs/vis_utils.py +0 -308
  33. magic_pdf/para/block_continuation_processor.py +0 -562
  34. magic_pdf/para/block_termination_processor.py +0 -480
  35. magic_pdf/para/commons.py +0 -222
  36. magic_pdf/para/denoise.py +0 -246
  37. magic_pdf/para/draw.py +0 -121
  38. magic_pdf/para/exceptions.py +0 -198
  39. magic_pdf/para/layout_match_processor.py +0 -40
  40. magic_pdf/para/para_split.py +0 -807
  41. magic_pdf/para/para_split_v2.py +0 -959
  42. magic_pdf/para/raw_processor.py +0 -207
  43. magic_pdf/para/stats.py +0 -268
  44. magic_pdf/para/title_processor.py +0 -1014
  45. magic_pdf/pdf_parse_union_core.py +0 -345
  46. magic_pdf/post_proc/__init__.py +0 -0
  47. magic_pdf/post_proc/detect_para.py +0 -3472
  48. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  49. magic_pdf/post_proc/remove_footnote.py +0 -153
  50. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  51. magic_pdf/pre_proc/detect_equation.py +0 -134
  52. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  53. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  54. magic_pdf/pre_proc/detect_footnote.py +0 -170
  55. magic_pdf/pre_proc/detect_header.py +0 -64
  56. magic_pdf/pre_proc/detect_images.py +0 -647
  57. magic_pdf/pre_proc/detect_page_number.py +0 -64
  58. magic_pdf/pre_proc/detect_tables.py +0 -62
  59. magic_pdf/pre_proc/equations_replace.py +0 -550
  60. magic_pdf/pre_proc/fix_image.py +0 -244
  61. magic_pdf/pre_proc/fix_table.py +0 -270
  62. magic_pdf/pre_proc/main_text_font.py +0 -23
  63. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  64. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  65. magic_pdf/pre_proc/post_layout_split.py +0 -0
  66. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  67. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  68. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  69. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  70. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  71. magic_pdf/pre_proc/statistics.py +0 -12
  72. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  73. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
  74. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  75. {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,681 +0,0 @@
1
- # 定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None
2
- # 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。
3
-
4
-
5
-
6
- from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page
7
- from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap
8
- from magic_pdf.libs.commons import mymax
9
-
10
- X0_IDX = 0
11
- Y0_IDX = 1
12
- X1_IDX = 2
13
- Y1_IDX = 3
14
- CONTENT_IDX = 4
15
- IDX_X = 5
16
- IDX_Y = 6
17
- CONTENT_TYPE_IDX = 7
18
-
19
- X0_EXT_IDX = 8
20
- Y0_EXT_IDX = 9
21
- X1_EXT_IDX = 10
22
- Y1_EXT_IDX = 11
23
-
24
-
25
- def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page):
26
- """
27
- text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json
28
- 把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构
29
- """
30
- all_bboxes = []
31
-
32
- for image in image_info:
33
- box = image['bbox']
34
- # 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1
35
- # 把长宽都小于50的去掉
36
- if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50:
37
- continue
38
- all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None])
39
-
40
- for table in table_info:
41
- box = table['bbox']
42
- all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None])
43
-
44
- """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes"""
45
- # 加入文本block
46
- text_block_temp = []
47
- for block in text_raw_blocks:
48
- bbox = block['bbox']
49
- text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
50
-
51
- text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp)
52
- text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环
53
-
54
-
55
- """找出会影响layout的色块、横向分割线"""
56
- spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], )
57
- # 还要去掉存在于spilter_bboxes里的text_block
58
- if len(spilter_bboxes) > 0:
59
- text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])]
60
-
61
- for bbox in text_block_new:
62
- all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None])
63
-
64
- for bbox in spilter_bboxes:
65
- all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None])
66
-
67
-
68
- return all_bboxes
69
-
70
- def resolve_bbox_overlap_for_layout_det(bboxes:list):
71
- """
72
- 1. 去掉bbox互相包含的,去掉被包含的
73
- 2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box
74
- """
75
- def _is_in_other_bbox(i:int):
76
- """
77
- 判断i个box是否被其他box有所包含
78
- """
79
- for j in range(0, len(bboxes)):
80
- if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]):
81
- return True
82
- # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]):
83
- # return True
84
-
85
- return False
86
-
87
- # 首先去掉被包含的bbox
88
- new_bbox_1 = []
89
- for i in range(0, len(bboxes)):
90
- if not _is_in_other_bbox(i):
91
- new_bbox_1.append(bboxes[i])
92
-
93
- # 其次扩展大的box
94
- new_box = []
95
- new_bbox_2 = []
96
- len_1 = len(new_bbox_2)
97
- while True:
98
- merged_idx = []
99
- for i in range(0, len(new_bbox_1)):
100
- if i in merged_idx:
101
- continue
102
- for j in range(i+1, len(new_bbox_1)):
103
- if j in merged_idx:
104
- continue
105
- bx1 = new_bbox_1[i]
106
- bx2 = new_bbox_1[j]
107
- if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]):
108
- merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]])
109
- new_bbox_2.append(merged_box)
110
- merged_idx.append(i)
111
- merged_idx.append(j)
112
-
113
- for i in range(0, len(new_bbox_1)): # 没有合并的加入进来
114
- if i not in merged_idx:
115
- new_bbox_2.append(new_bbox_1[i])
116
-
117
- if len(new_bbox_2)==0 or len_1==len(new_bbox_2):
118
- break
119
- else:
120
- len_1 = len(new_bbox_2)
121
- new_box = new_bbox_2
122
- new_bbox_1, new_bbox_2 = new_bbox_2, []
123
-
124
- return new_box
125
-
126
-
127
- def filter_lines_bbox(bboxes: list):
128
- """
129
- 过滤掉bbox为空的行
130
- """
131
- new_box = []
132
- for box in bboxes:
133
- x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
134
- if abs(x0-x1)<=1 or abs(y0-y1)<=1:
135
- continue
136
- else:
137
- new_box.append(box)
138
- return new_box
139
-
140
-
141
- ################################################################################
142
- # 第一种排序算法
143
- # 以下是基于延长线遮挡做的一个算法
144
- #
145
- ################################################################################
146
- def find_all_left_bbox(this_bbox, all_bboxes) -> list:
147
- """
148
- 寻找this_bbox左边的所有bbox
149
- """
150
- left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
151
- return left_boxes
152
-
153
-
154
- def find_all_top_bbox(this_bbox, all_bboxes) -> list:
155
- """
156
- 寻找this_bbox上面的所有bbox
157
- """
158
- top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]]
159
- return top_boxes
160
-
161
-
162
- def get_and_set_idx_x(this_bbox, all_bboxes) -> int:
163
- """
164
- 寻找this_bbox在all_bboxes中的遮挡深度 idx_x
165
- """
166
- if this_bbox[IDX_X] is not None:
167
- return this_bbox[IDX_X]
168
- else:
169
- all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes)
170
- if len(all_left_bboxes) == 0:
171
- this_bbox[IDX_X] = 0
172
- else:
173
- all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes]
174
- max_idx_x = mymax(all_left_bboxes_idx)
175
- this_bbox[IDX_X] = max_idx_x + 1
176
- return this_bbox[IDX_X]
177
-
178
-
179
- def get_and_set_idx_y(this_bbox, all_bboxes) -> int:
180
- """
181
- 寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y
182
- """
183
- if this_bbox[IDX_Y] is not None:
184
- return this_bbox[IDX_Y]
185
- else:
186
- all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes)
187
- if len(all_top_bboxes) == 0:
188
- this_bbox[IDX_Y] = 0
189
- else:
190
- all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes]
191
- max_idx_y = mymax(all_top_bboxes_idx)
192
- this_bbox[IDX_Y] = max_idx_y + 1
193
- return this_bbox[IDX_Y]
194
-
195
-
196
- def bbox_sort(all_bboxes: list):
197
- """
198
- 排序
199
- """
200
- all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_bboxes]
201
- all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes]
202
- all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
203
-
204
- all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
205
- all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
206
- all_bboxes_idx.sort(key=lambda x: x[0])
207
- sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
208
- return sorted_bboxes
209
-
210
-
211
- ################################################################################
212
- # 第二种排序算法
213
- # 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况
214
- #
215
- ################################################################################
216
-
217
- def find_left_nearest_bbox(this_bbox, all_bboxes) -> list:
218
- """
219
- 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
220
- """
221
- left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
222
- box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
223
- this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
224
- box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
225
-
226
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个
227
- if len(left_boxes) > 0:
228
- left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
229
- left_boxes = [left_boxes[0]]
230
- else:
231
- left_boxes = []
232
- return left_boxes
233
-
234
-
235
- def get_and_set_idx_x_2(this_bbox, all_bboxes):
236
- """
237
- 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
238
- 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
239
- """
240
- if this_bbox[IDX_X] is not None:
241
- return this_bbox[IDX_X]
242
- else:
243
- left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes)
244
- if len(left_nearest_bbox) == 0:
245
- this_bbox[IDX_X] = 0
246
- else:
247
- left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes)
248
- this_bbox[IDX_X] = left_idx_x + 1
249
- return this_bbox[IDX_X]
250
-
251
-
252
- def find_top_nearest_bbox(this_bbox, all_bboxes) -> list:
253
- """
254
- 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
255
- """
256
- top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
257
- box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
258
- this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
259
- box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
260
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个
261
- if len(top_boxes) > 0:
262
- top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
263
- top_boxes = [top_boxes[0]]
264
- else:
265
- top_boxes = []
266
- return top_boxes
267
-
268
-
269
- def get_and_set_idx_y_2(this_bbox, all_bboxes):
270
- """
271
- 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
272
- 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
273
- """
274
- if this_bbox[IDX_Y] is not None:
275
- return this_bbox[IDX_Y]
276
- else:
277
- top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes)
278
- if len(top_nearest_bbox) == 0:
279
- this_bbox[IDX_Y] = 0
280
- else:
281
- top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes)
282
- this_bbox[IDX_Y] = top_idx_y + 1
283
- return this_bbox[IDX_Y]
284
-
285
-
286
- def paper_bbox_sort(all_bboxes: list, page_width, page_height):
287
- all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes]
288
- all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes]
289
- all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
290
-
291
- all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
292
- all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
293
- all_bboxes_idx.sort(key=lambda x: x[0])
294
- sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
295
- return sorted_bboxes
296
-
297
- ################################################################################
298
- """
299
- 第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1
300
- 这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下:
301
- 1. 首先在水平方向上对bbox进行扩展。扩展方法是:
302
- - 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。
303
- - 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。
304
- - 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1]
305
-
306
- 2. 合并所有的连续水平方向的bbox, 合并方法是:
307
- - 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。
308
-
309
- 3. 然后在垂直方向上对bbox进行扩展。扩展方法是:
310
- - 首先从page上切割掉合并后的水平bbox, 得到几个新的block
311
- 针对每个block
312
- - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0
313
- - x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1
314
- 随后在垂直方向上合并所有的连续的block,方法如下:
315
- - 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。
316
- 如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’
317
- 如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。
318
- 至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。
319
- (完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去)
320
- """
321
- ################################################################################
322
- def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list:
323
- """
324
- 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox
325
- 这里使用扩展之后的bbox
326
- """
327
- left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([
328
- box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX],
329
- this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX],
330
- box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])]
331
-
332
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个
333
- if len(left_boxes) > 0:
334
- left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True)
335
- left_boxes = left_boxes
336
- else:
337
- left_boxes = []
338
- return left_boxes
339
-
340
- def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list:
341
- """
342
- 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox
343
- 这里使用扩展之后的bbox
344
- """
345
- top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([
346
- box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX],
347
- this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX],
348
- box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])]
349
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个
350
- if len(top_boxes) > 0:
351
- top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True)
352
- top_boxes = top_boxes
353
- else:
354
- top_boxes = []
355
- return top_boxes
356
-
357
- def get_and_set_idx_x_2_ext(this_bbox, all_bboxes):
358
- """
359
- 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x
360
- 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
361
- """
362
- if this_bbox[IDX_X] is not None:
363
- return this_bbox[IDX_X]
364
- else:
365
- left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes)
366
- if len(left_nearest_bbox) == 0:
367
- this_bbox[IDX_X] = 0
368
- else:
369
- left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox]
370
- this_bbox[IDX_X] = mymax(left_idx_x) + 1
371
- return this_bbox[IDX_X]
372
-
373
- def get_and_set_idx_y_2_ext(this_bbox, all_bboxes):
374
- """
375
- 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y
376
- 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况
377
- """
378
- if this_bbox[IDX_Y] is not None:
379
- return this_bbox[IDX_Y]
380
- else:
381
- top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes)
382
- if len(top_nearest_bbox) == 0:
383
- this_bbox[IDX_Y] = 0
384
- else:
385
- top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox]
386
- this_bbox[IDX_Y] = mymax(top_idx_y) + 1
387
- return this_bbox[IDX_Y]
388
-
389
- def _paper_bbox_sort_ext(all_bboxes: list):
390
- all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
391
- all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes]
392
- all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)]
393
-
394
- all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序
395
- all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes))
396
- all_bboxes_idx.sort(key=lambda x: x[0])
397
- sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx]
398
- return sorted_bboxes
399
-
400
- # ===============================================================================================
401
- def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list:
402
- """
403
- 寻找this_bbox左边的所有bbox, 使用延长线
404
- """
405
- left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]]
406
- if len(left_boxes):
407
- left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True)
408
- left_boxes = left_boxes[0]
409
- else:
410
- left_boxes = None
411
-
412
- return left_boxes
413
-
414
- def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list:
415
- """
416
- 寻找this_bbox右边的所有bbox, 使用延长线
417
- """
418
- right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]]
419
- if len(right_boxes):
420
- right_boxes.sort(key=lambda x: x[X0_IDX])
421
- right_boxes = right_boxes[0]
422
- else:
423
- right_boxes = None
424
- return right_boxes
425
-
426
- # =============================================================================================
427
-
428
- def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list:
429
- """
430
- 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox, 不用延长线并且不能像
431
- """
432
- left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([
433
- box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
434
- this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
435
- box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
436
-
437
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个
438
- if len(left_boxes) > 0:
439
- left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True)
440
- left_boxes = left_boxes[0]
441
- else:
442
- left_boxes = None
443
- return left_boxes
444
-
445
- def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list:
446
- """
447
- 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
448
- """
449
- right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([
450
- this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX],
451
- box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX],
452
- box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])]
453
-
454
- if len(right_bboxes)>0:
455
- right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX])
456
- right_bboxes = right_bboxes[0]
457
- else:
458
- right_bboxes = None
459
- return right_bboxes
460
-
461
- def reset_idx_x_y(all_boxes:list)->list:
462
- for box in all_boxes:
463
- box[IDX_X] = None
464
- box[IDX_Y] = None
465
-
466
- return all_boxes
467
-
468
- # ===================================================================================================
469
- def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
470
- """
471
- 找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
472
- """
473
- top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([
474
- box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
475
- this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
476
- box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
477
- # 然后再过滤一下,找到上方距离this_bbox最近的那个
478
- if len(top_bboxes) > 0:
479
- top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True)
480
- top_bboxes = top_bboxes[0]
481
- else:
482
- top_bboxes = None
483
- return top_bboxes
484
-
485
- def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list:
486
- """
487
- 找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种
488
- """
489
- bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([
490
- box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX],
491
- this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX],
492
- box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])]
493
- # 然后再过滤一下,找到水平上距离this_bbox最近的那个
494
- if len(bottom_bboxes) > 0:
495
- bottom_bboxes.sort(key=lambda x: x[Y0_IDX])
496
- bottom_bboxes = bottom_bboxes[0]
497
- else:
498
- bottom_bboxes = None
499
- return bottom_bboxes
500
-
501
- def find_boundry_bboxes(bboxes:list) -> tuple:
502
- """
503
- 找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1)
504
- """
505
- x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX]
506
- for box in bboxes:
507
- x0 = min(box[X0_IDX], x0)
508
- y0 = min(box[Y0_IDX], y0)
509
- x1 = max(box[X1_IDX], x1)
510
- y1 = max(box[Y1_IDX], y1)
511
-
512
- return x0, y0, x1, y1
513
-
514
-
515
- def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list:
516
- """
517
- 在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox
518
- """
519
- for box in bboxes:
520
- top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes)
521
- bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes)
522
- if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列
523
- box[X0_EXT_IDX] = box[X0_IDX]
524
- box[Y0_EXT_IDX] = boundry_y0
525
- box[X1_EXT_IDX] = box[X1_IDX]
526
- box[Y1_EXT_IDX] = boundry_y1
527
- # else:
528
- # if top_nearest_bbox is None:
529
- # box[Y0_EXT_IDX] = boundry_y0
530
- # else:
531
- # box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1
532
- # if bottom_nearest_bbox is None:
533
- # box[Y1_EXT_IDX] = boundry_y1
534
- # else:
535
- # box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1
536
- # box[X0_EXT_IDX] = box[X0_IDX]
537
- # box[X1_EXT_IDX] = box[X1_IDX]
538
- return bboxes
539
-
540
-
541
- # ===================================================================================================
542
-
543
- def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int):
544
- """
545
- 增加预处理行为的排序:
546
- return:
547
- [
548
- {
549
- "layout_bbox": [x0, y0, x1, y1],
550
- "layout_label":"GOOD_LAYOUT/BAD_LAYOUT",
551
- "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序
552
- }
553
- ]
554
- """
555
- sorted_layouts = [] # 最后的返回结果
556
- page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1
557
-
558
- all_bboxes = paper_bbox_sort(all_bboxes) # 大致拍下序
559
- # 首先在水平方向上扩展独占一行的bbox
560
- for bbox in all_bboxes:
561
- left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线
562
- right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes)
563
- if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行
564
- bbox[X0_EXT_IDX] = page_x0
565
- bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
566
- bbox[X1_EXT_IDX] = page_x1
567
- bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
568
-
569
- # 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group
570
- if len(all_bboxes)==1:
571
- return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}]
572
- if len(all_bboxes)==0:
573
- return []
574
-
575
- """
576
- 然后合并所有连续水平方向的bbox.
577
-
578
- """
579
- all_bboxes.sort(key=lambda x: x[Y0_IDX])
580
- h_bboxes = []
581
- h_bbox_group = []
582
- v_boxes = []
583
-
584
- for bbox in all_bboxes:
585
- if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1:
586
- h_bbox_group.append(bbox)
587
- else:
588
- if len(h_bbox_group)>0:
589
- h_bboxes.append(h_bbox_group)
590
- h_bbox_group = []
591
- # 最后一个group
592
- if len(h_bbox_group)>0:
593
- h_bboxes.append(h_bbox_group)
594
-
595
- """
596
- 现在h_bboxes里面是所有的group了,每个group都是一个list
597
- 对h_bboxes里的每个group进行计算放回到sorted_layouts里
598
- """
599
- for gp in h_bboxes:
600
- gp.sort(key=lambda x: x[Y0_IDX])
601
- block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
602
- # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
603
- x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
604
- block_info["layout_bbox"] = [x0, y0, x1, y1]
605
- sorted_layouts.append(block_info)
606
-
607
- # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分
608
- h_split_lines = [page_y0]
609
- for gp in h_bboxes:
610
- layout_bbox = gp['layout_bbox']
611
- y0, y1 = layout_bbox[1], layout_bbox[3]
612
- h_split_lines.append(y0)
613
- h_split_lines.append(y1)
614
- h_split_lines.append(page_y1)
615
-
616
- unsplited_bboxes = []
617
- for i in range(0, len(h_split_lines), 2):
618
- start_y0, start_y1 = h_split_lines[i:i+2]
619
- # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块
620
- bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1]
621
- unsplited_bboxes.append(bboxes_in_block)
622
- # ================== 至此,水平方向的 已经切分排序完毕====================================
623
- """
624
- 接下来针对每个非水平的部分切分垂直方向的
625
- 此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分.
626
- 分3步:
627
- 1. 先把能完全垂直打通的隔离出去当做一个layout
628
- 2. 其余的先垂直切分
629
- 3. 垂直切分之后的部分再尝试水平切分
630
- 4. 剩下的不能被切分的各个部分当成一个layout
631
- """
632
- # 对每部分进行垂直切分
633
- for bboxes_in_block in unsplited_bboxes:
634
- # 首先对这个block的bbox进行垂直方向上的扩展
635
- boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block)
636
- # 进行垂直方向上的扩展
637
- extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1)
638
- # 然后对这个block进行垂直方向上的切分
639
- extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大,代表了从左到右读取
640
- v_boxes_group = []
641
- for bbox in extended_vertical_bboxes:
642
- if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1:
643
- v_boxes_group.append(bbox)
644
- else:
645
- if len(v_boxes_group)>0:
646
- v_boxes.append(v_boxes_group)
647
- v_boxes_group = []
648
-
649
- if len(v_boxes_group)>0:
650
-
651
- v_boxes.append(v_boxes_group)
652
-
653
- # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了,因为上面已经做了
654
- for gp in v_boxes:
655
- gp.sort(key=lambda x: x[X0_IDX])
656
- block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp}
657
- # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1
658
- x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX]
659
- block_info["layout_bbox"] = [x0, y0, x1, y1]
660
- sorted_layouts.append(block_info)
661
-
662
- # 在垂直方向上,划分子块,也就是用贯通的垂直线进行切分。这些被切分出来的块,极大可能是可被垂直切分的,如果不能完全的垂直切分,那么尝试水平切分。都不能的则当成一个layout
663
- v_split_lines = [boundry_x0]
664
- for gp in v_boxes:
665
- layout_bbox = gp['layout_bbox']
666
- x0, x1 = layout_bbox[0], layout_bbox[2]
667
- v_split_lines.append(x0)
668
- v_split_lines.append(x1)
669
- v_split_lines.append(boundry_x1)
670
-
671
- reset_idx_x_y(all_bboxes)
672
- all_boxes = _paper_bbox_sort_ext(all_bboxes)
673
- return all_boxes
674
-
675
-
676
-
677
-
678
-
679
-
680
-
681
-