magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
magic_pdf/pre_proc/fix_image.py (deleted)
@@ -1,244 +0,0 @@
1
-
2
-
3
-
4
- import re
5
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
6
-
7
- from magic_pdf.libs.textbase import get_text_block_base_info
8
-
9
- def fix_image_vertical(image_bboxes:list, text_blocks:list):
10
- """
11
- 修正图片的位置
12
- 如果图片与文字block发生一定重叠(也就是图片切到了一部分文字),那么减少图片边缘,让文字和图片不再重叠。
13
- 只对垂直方向进行。
14
- """
15
- for image_bbox in image_bboxes:
16
- for text_block in text_blocks:
17
- text_bbox = text_block["bbox"]
18
- if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]):
19
- if text_bbox[1] < image_bbox[1]:#在图片上方
20
- image_bbox[1] = text_bbox[3]+1
21
- elif text_bbox[3]>image_bbox[3]:#在图片下方
22
- image_bbox[3] = text_bbox[1]-1
23
-
24
- return image_bboxes
25
-
26
- def __merge_if_common_edge(bbox1, bbox2):
27
- x_min_1, y_min_1, x_max_1, y_max_1 = bbox1
28
- x_min_2, y_min_2, x_max_2, y_max_2 = bbox2
29
-
30
- # 检查是否有公共的水平边
31
- if y_min_1 == y_min_2 or y_max_1 == y_max_2:
32
- # 确保一个框的x范围在另一个框的x范围内
33
- if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2):
34
- return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
35
-
36
- # 检查是否有公共的垂直边
37
- if x_min_1 == x_min_2 or x_max_1 == x_max_2:
38
- # 确保一个框的y范围在另一个框的y范围内
39
- if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2):
40
- return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
41
-
42
- # 如果没有公共边
43
- return None
44
-
45
- def fix_seperated_image(image_bboxes:list):
46
- """
47
- 如果2个图片有一个边重叠,那么合并2个图片
48
- """
49
- new_images = []
50
- droped_img_idx = []
51
-
52
- for i in range(0, len(image_bboxes)):
53
- for j in range(i+1, len(image_bboxes)):
54
- new_img = __merge_if_common_edge(image_bboxes[i], image_bboxes[j])
55
- if new_img is not None:
56
- new_images.append(new_img)
57
- droped_img_idx.append(i)
58
- droped_img_idx.append(j)
59
- break
60
-
61
- for i in range(0, len(image_bboxes)):
62
- if i not in droped_img_idx:
63
- new_images.append(image_bboxes[i])
64
-
65
- return new_images
66
-
67
-
68
- def __check_img_title_pattern(text):
69
- """
70
- 检查文本段是否是表格的标题
71
- """
72
- patterns = [r"^(fig|figure).*", r"^(scheme).*"]
73
- text = text.strip()
74
- for pattern in patterns:
75
- match = re.match(pattern, text, re.IGNORECASE)
76
- if match:
77
- return True
78
- return False
79
-
80
- def __get_fig_caption_text(text_block):
81
- txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
82
- line_cnt = len(text_block['lines'])
83
- txt = txt.replace("Ž . ", '')
84
- return txt, line_cnt
85
-
86
-
87
- def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
88
- """
89
- 继续向下方寻找和图片caption字号,字体,颜色一样的文字框,合并入caption。
90
- text_block是已经找到的图片catpion(这个caption可能不全,多行被划分到多个pymu block里了)
91
- """
92
- combined_image_caption_text_block = list(text_block.copy()['bbox'])
93
- base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block)
94
- while True:
95
- tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block)
96
- if not tb_add:
97
- break
98
- tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add)
99
- if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type:
100
- combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0])
101
- combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2])
102
- combined_image_caption_text_block[3] = tb_add['bbox'][3]
103
- else:
104
- break
105
-
106
- image_box[0] = min(image_box[0], combined_image_caption_text_block[0])
107
- image_box[1] = min(image_box[1], combined_image_caption_text_block[1])
108
- image_box[2] = max(image_box[2], combined_image_caption_text_block[2])
109
- image_box[3] = max(image_box[3], combined_image_caption_text_block[3])
110
- text_block['_image_caption'] = True
111
-
112
-
113
- def include_img_title(pymu_blocks, image_bboxes: list):
114
- """
115
- 向上方和下方寻找符合图片title的文本block,合并到图片里
116
- 如果图片上下都有fig的情况怎么办?寻找标题距离最近的那个。
117
- ---
118
- 增加对左侧和右侧图片标题的寻找
119
- """
120
-
121
-
122
- for tb in image_bboxes:
123
- # 优先找下方的
124
- max_find_cnt = 3 # 向上,向下最多找3个就停止
125
- temp_box = tb.copy()
126
- while max_find_cnt>0:
127
- text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
128
- if text_block_btn:
129
- txt, line_cnt = __get_fig_caption_text(text_block_btn)
130
- if len(txt.strip())>0:
131
- if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # 设置line_cnt<=2目的是为了跳过子标题,或者有时候图片下方文字没有被图片识别模型放入图片里
132
- max_find_cnt = max_find_cnt - 1
133
- temp_box[3] = text_block_btn['bbox'][3]
134
- continue
135
- else:
136
- break
137
- else:
138
- temp_box[3] = text_block_btn['bbox'][3] # 宽度不变,扩大
139
- max_find_cnt = max_find_cnt - 1
140
- else:
141
- break
142
-
143
- max_find_cnt = 3 # 向上,向下最多找3个就停止
144
- temp_box = tb.copy()
145
- while max_find_cnt>0:
146
- text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
147
- if text_block_top:
148
- txt, line_cnt = __get_fig_caption_text(text_block_top)
149
- if len(txt.strip())>0:
150
- if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
151
- max_find_cnt = max_find_cnt - 1
152
- temp_box[1] = text_block_top['bbox'][1]
153
- continue
154
- else:
155
- break
156
- else:
157
- b = text_block_top['bbox']
158
- temp_box[1] = b[1] # 宽度不变,扩大
159
- max_find_cnt = max_find_cnt - 1
160
- else:
161
- break
162
-
163
- if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
164
- btn_text, _ = __get_fig_caption_text(text_block_btn)
165
- top_text, _ = __get_fig_caption_text(text_block_top)
166
- if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
167
- # 取距离图片最近的
168
- btn_text_distance = text_block_btn['bbox'][1] - tb[3]
169
- top_text_distance = tb[1] - text_block_top['bbox'][3]
170
- if btn_text_distance<top_text_distance: # caption在下方
171
- __find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
172
- else:
173
- text_block = text_block_top
174
- tb[0] = min(tb[0], text_block['bbox'][0])
175
- tb[1] = min(tb[1], text_block['bbox'][1])
176
- tb[2] = max(tb[2], text_block['bbox'][2])
177
- tb[3] = max(tb[3], text_block['bbox'][3])
178
- text_block_btn['_image_caption'] = True
179
- continue
180
-
181
- text_block = text_block_btn # find_bottom_nearest_text_bbox(pymu_blocks, tb)
182
- if text_block and text_block.get("_image_caption", False) is False:
183
- first_text_line, _ = __get_fig_caption_text(text_block)
184
- if __check_img_title_pattern(first_text_line):
185
- # 发现特征之后,继续向相同方向寻找(想同颜色,想同大小,想同字体)的textblock
186
- __find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
187
- continue
188
-
189
- text_block = text_block_top # find_top_nearest_text_bbox(pymu_blocks, tb)
190
- if text_block and text_block.get("_image_caption", False) is False:
191
- first_text_line, _ = __get_fig_caption_text(text_block)
192
- if __check_img_title_pattern(first_text_line):
193
- tb[0] = min(tb[0], text_block['bbox'][0])
194
- tb[1] = min(tb[1], text_block['bbox'][1])
195
- tb[2] = max(tb[2], text_block['bbox'][2])
196
- tb[3] = max(tb[3], text_block['bbox'][3])
197
- text_block['_image_caption'] = True
198
- continue
199
-
200
- """向左、向右寻找,暂时只寻找一次"""
201
- left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
202
- if left_text_block and left_text_block.get("_image_caption", False) is False:
203
- first_text_line, _ = __get_fig_caption_text(left_text_block)
204
- if __check_img_title_pattern(first_text_line):
205
- tb[0] = min(tb[0], left_text_block['bbox'][0])
206
- tb[1] = min(tb[1], left_text_block['bbox'][1])
207
- tb[2] = max(tb[2], left_text_block['bbox'][2])
208
- tb[3] = max(tb[3], left_text_block['bbox'][3])
209
- left_text_block['_image_caption'] = True
210
- continue
211
-
212
- right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
213
- if right_text_block and right_text_block.get("_image_caption", False) is False:
214
- first_text_line, _ = __get_fig_caption_text(right_text_block)
215
- if __check_img_title_pattern(first_text_line):
216
- tb[0] = min(tb[0], right_text_block['bbox'][0])
217
- tb[1] = min(tb[1], right_text_block['bbox'][1])
218
- tb[2] = max(tb[2], right_text_block['bbox'][2])
219
- tb[3] = max(tb[3], right_text_block['bbox'][3])
220
- right_text_block['_image_caption'] = True
221
- continue
222
-
223
- return image_bboxes
224
-
225
-
226
- def combine_images(image_bboxes:list):
227
- """
228
- 合并图片,如果图片有重叠,那么合并
229
- """
230
- new_images = []
231
- droped_img_idx = []
232
-
233
- for i in range(0, len(image_bboxes)):
234
- for j in range(i+1, len(image_bboxes)):
235
- if j not in droped_img_idx and _is_in_or_part_overlap(image_bboxes[i], image_bboxes[j]):
236
- # 合并
237
- image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
238
- droped_img_idx.append(j)
239
-
240
- for i in range(0, len(image_bboxes)):
241
- if i not in droped_img_idx:
242
- new_images.append(image_bboxes[i])
243
-
244
- return new_images
magic_pdf/pre_proc/fix_table.py (deleted)
@@ -1,270 +0,0 @@
1
- from magic_pdf.libs.commons import fitz # pyMuPDF库
2
- import re
3
-
4
- from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox # json
5
-
6
-
7
- ## version 2
8
- def get_merged_line(page):
9
- """
10
- 这个函数是为了从pymuPDF中提取出的矢量里筛出水平的横线,并且将断开的线段进行了合并。
11
- :param page :fitz读取的当前页的内容
12
- """
13
- drawings_bbox = []
14
- drawings_line = []
15
- drawings = page.get_drawings() # 提取所有的矢量
16
- for p in drawings:
17
- drawings_bbox.append(p["rect"].irect) # (L, U, R, D)
18
-
19
- lines = []
20
- for L, U, R, D in drawings_bbox:
21
- if abs(D - U) <= 3: # 筛出水平的横线
22
- lines.append((L, U, R, D))
23
- U_groups = []
24
- visited = [False for _ in range(len(lines))]
25
- for i, (L1, U1, R1, D1) in enumerate(lines):
26
- if visited[i] == True:
27
- continue
28
- tmp_g = [(L1, U1, R1, D1)]
29
- for j, (L2, U2, R2, D2) in enumerate(lines):
30
- if i == j:
31
- continue
32
- if visited[j] == True:
33
- continue
34
- if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5: # 把高度一致的线放进一个group
35
- tmp_g.append((L2, U2, R2, D2))
36
- visited[j] = True
37
- U_groups.append(tmp_g)
38
-
39
- res = []
40
- for group in U_groups:
41
- group.sort(key = lambda LURD: (LURD[0], LURD[2]))
42
- LL, UU, RR, DD = group[0]
43
- for i, (L1, U1, R1, D1) in enumerate(group):
44
- if (L1 - RR) >= 5:
45
- cur_line = (LL, UU, RR, DD)
46
- res.append(cur_line)
47
- LL = L1
48
- else:
49
- RR = max(RR, R1)
50
- cur_line = (LL, UU, RR, DD)
51
- res.append(cur_line)
52
- return res
53
-
54
- def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
55
- """
56
- :param page :fitz读取的当前页的内容
57
- :param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D)
58
- :param include_table_title: 是否将表格的标题也圈进来
59
- :param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题
60
- """
61
-
62
- drawings_lines = get_merged_line(page)
63
- fix_table_bboxes = []
64
-
65
- for table in table_bboxes:
66
- (L, U, R, D) = table
67
- fix_table_L = []
68
- fix_table_U = []
69
- fix_table_R = []
70
- fix_table_D = []
71
- width = R - L
72
- width_range = width * 0.1 # 只看距离表格整体宽度10%之内偏差的线
73
- height = D - U
74
- height_range = height * 0.1 # 只看距离表格整体高度10%之内偏差的线
75
- for line in drawings_lines:
76
- if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # 相近的宽度
77
- if (U - height_range) < line[1] < (U + height_range): # 上边界,在一定的高度范围内
78
- fix_table_U.append(line[1])
79
- fix_table_L.append(line[0])
80
- fix_table_R.append(line[2])
81
- elif (D - height_range) < line[1] < (D + height_range): # 下边界,在一定的高度范围内
82
- fix_table_D.append(line[1])
83
- fix_table_L.append(line[0])
84
- fix_table_R.append(line[2])
85
-
86
- if fix_table_U:
87
- U = min(fix_table_U)
88
- if fix_table_D:
89
- D = max(fix_table_D)
90
- if fix_table_L:
91
- L = min(fix_table_L)
92
- if fix_table_R:
93
- R = max(fix_table_R)
94
-
95
- if include_table_title: # 需要将表格标题包括
96
- text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # 所有的text的block
97
- incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # 将与表格完全没有任何遮挡的文字筛除掉(比如另一栏的文字)
98
- upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # 将在表格线以上的text block筛选出来
99
- sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # 按照text block的下边界距离表格上边界的距离升序排序,如果是同一个高度,则先左再右
100
-
101
- for idx in range(scan_line_num):
102
- if idx+1 <= len(sorted_filtered_text_blocks):
103
- line_temp = sorted_filtered_text_blocks[idx]['lines']
104
- if line_temp:
105
- text = line_temp[0]['spans'][0]['text'] # 提取出第一个span里的text内容
106
- check_en = re.match('Table', text) # 检查是否有Table开头的(英文)
107
- check_ch = re.match('表', text) # 检查是否有Table开头的(中文)
108
- if check_en or check_ch:
109
- if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # 以防出现负的bbox
110
- U = sorted_filtered_text_blocks[idx]['bbox'][1]
111
-
112
- fix_table_bboxes.append([L-2, U-2, R+2, D+2])
113
-
114
- return fix_table_bboxes
115
-
116
- def __check_table_title_pattern(text):
117
- """
118
- 检查文本段是否是表格的标题
119
- """
120
- patterns = [r'^table\s\d+']
121
-
122
- for pattern in patterns:
123
- match = re.match(pattern, text, re.IGNORECASE)
124
- if match:
125
- return True
126
- else:
127
- return False
128
-
129
-
130
- def fix_table_text_block(pymu_blocks, table_bboxes: list):
131
- """
132
- 调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界
133
- 例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
134
- """
135
- for tb in table_bboxes:
136
- (L, U, R, D) = tb
137
- for block in pymu_blocks:
138
- if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
139
- txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
140
- if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。
141
- tb[0] = min(tb[0], block['bbox'][0])
142
- tb[1] = min(tb[1], block['bbox'][1])
143
- tb[2] = max(tb[2], block['bbox'][2])
144
- tb[3] = max(tb[3], block['bbox'][3])
145
- block['_table'] = True # 占位,防止其他table再次占用
146
-
147
- """如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠"""
148
- if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
149
- block['bbox'] = list(block['bbox'])
150
- if block['bbox'][3] > U:
151
- block['bbox'][3] = U-1
152
- if block['bbox'][1] < D:
153
- block['bbox'][1] = D+1
154
-
155
-
156
- return table_bboxes
157
-
158
-
159
- def __get_table_caption_text(text_block):
160
- txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
161
- line_cnt = len(text_block['lines'])
162
- txt = txt.replace("Ž . ", '')
163
- return txt, line_cnt
164
-
165
-
166
- def include_table_title(pymu_blocks, table_bboxes: list):
167
- """
168
- 把表格的title也包含进来,扩展到table_bbox上
169
- """
170
- for tb in table_bboxes:
171
- max_find_cnt = 3 # 上上最多找3次
172
- temp_box = tb.copy()
173
- while max_find_cnt>0:
174
- text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
175
- if text_block_top:
176
- txt, line_cnt = __get_table_caption_text(text_block_top)
177
- if len(txt.strip())>0:
178
- if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
179
- max_find_cnt = max_find_cnt -1
180
- temp_box[1] = text_block_top['bbox'][1]
181
- continue
182
- else:
183
- break
184
- else:
185
- temp_box[1] = text_block_top['bbox'][1] # 宽度不变,扩大
186
- max_find_cnt = max_find_cnt - 1
187
- else:
188
- break
189
-
190
- max_find_cnt = 3 # 向下找
191
- temp_box = tb.copy()
192
- while max_find_cnt>0:
193
- text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
194
- if text_block_bottom:
195
- txt, line_cnt = __get_table_caption_text(text_block_bottom)
196
- if len(txt.strip())>0:
197
- if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
198
- max_find_cnt = max_find_cnt - 1
199
- temp_box[3] = text_block_bottom['bbox'][3]
200
- continue
201
- else:
202
- break
203
- else:
204
- temp_box[3] = text_block_bottom['bbox'][3]
205
- max_find_cnt = max_find_cnt - 1
206
- else:
207
- break
208
-
209
- if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
210
- btn_text, _ = __get_table_caption_text(text_block_bottom)
211
- top_text, _ = __get_table_caption_text(text_block_top)
212
- if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # 上下都有一个tbale的caption
213
- # 取距离最近的
214
- btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
215
- top_text_distance = tb[1] - text_block_top['bbox'][3]
216
- text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
217
- tb[0] = min(tb[0], text_block['bbox'][0])
218
- tb[1] = min(tb[1], text_block['bbox'][1])
219
- tb[2] = max(tb[2], text_block['bbox'][2])
220
- tb[3] = max(tb[3], text_block['bbox'][3])
221
- text_block_bottom['_table_caption'] = True
222
- continue
223
-
224
- # 如果以上条件都不满足,那么就向下找
225
- text_block = text_block_top
226
- if text_block and text_block.get("_table_caption", False) is False:
227
- first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
228
- if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
229
- tb[0] = min(tb[0], text_block['bbox'][0])
230
- tb[1] = min(tb[1], text_block['bbox'][1])
231
- tb[2] = max(tb[2], text_block['bbox'][2])
232
- tb[3] = max(tb[3], text_block['bbox'][3])
233
- text_block['_table_caption'] = True
234
- continue
235
-
236
- text_block = text_block_bottom
237
- if text_block and text_block.get("_table_caption", False) is False:
238
- first_text_line, _ = __get_table_caption_text(text_block)
239
- if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
240
- tb[0] = min(tb[0], text_block['bbox'][0])
241
- tb[1] = min(tb[1], text_block['bbox'][1])
242
- tb[2] = max(tb[2], text_block['bbox'][2])
243
- tb[3] = max(tb[3], text_block['bbox'][3])
244
- text_block['_table_caption'] = True
245
- continue
246
-
247
- """向左、向右寻找,暂时只寻找一次"""
248
- left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
249
- if left_text_block and left_text_block.get("_image_caption", False) is False:
250
- first_text_line, _ = __get_table_caption_text(left_text_block)
251
- if __check_table_title_pattern(first_text_line):
252
- tb[0] = min(tb[0], left_text_block['bbox'][0])
253
- tb[1] = min(tb[1], left_text_block['bbox'][1])
254
- tb[2] = max(tb[2], left_text_block['bbox'][2])
255
- tb[3] = max(tb[3], left_text_block['bbox'][3])
256
- left_text_block['_image_caption'] = True
257
- continue
258
-
259
- right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
260
- if right_text_block and right_text_block.get("_image_caption", False) is False:
261
- first_text_line, _ = __get_table_caption_text(right_text_block)
262
- if __check_table_title_pattern(first_text_line):
263
- tb[0] = min(tb[0], right_text_block['bbox'][0])
264
- tb[1] = min(tb[1], right_text_block['bbox'][1])
265
- tb[2] = max(tb[2], right_text_block['bbox'][2])
266
- tb[3] = max(tb[3], right_text_block['bbox'][3])
267
- right_text_block['_image_caption'] = True
268
- continue
269
-
270
- return table_bboxes
magic_pdf/pre_proc/main_text_font.py (deleted)
@@ -1,23 +0,0 @@
1
- import collections
2
-
3
-
4
- def get_main_text_font(pdf_docs):
5
- font_names = collections.Counter()
6
- for page in pdf_docs:
7
- blocks = page.get_text('dict')['blocks']
8
- if blocks is not None:
9
- for block in blocks:
10
- lines = block.get('lines')
11
- if lines is not None:
12
- for line in lines:
13
- span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
14
- 'font' in span and len(span['text']) > 0]
15
- if span_font:
16
- # main_text_font应该用基于字数最多的字体而不是span级别的统计
17
- # font_names.append(font_name for font_name in span_font)
18
- # block_fonts.append(font_name for font_name in span_font)
19
- for font, count in span_font:
20
- font_names[font] += count
21
- main_text_font = font_names.most_common(1)[0][0]
22
- return main_text_font
23
-
magic_pdf/pre_proc/ocr_detect_layout.py (deleted)
@@ -1,133 +0,0 @@
1
- import fitz
2
-
3
- from magic_pdf.layout.layout_sort import get_bboxes_layout
4
- from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
5
- from magic_pdf.libs.coordinate_transform import get_scale_ratio
6
-
7
-
8
- def get_center_point(bbox):
9
- """
10
- 根据边界框坐标信息,计算出该边界框的中心点坐标。
11
- Args:
12
- bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。
13
- Returns:
14
- list: 中心点坐标信息,包含两个元素,分别为x坐标和y坐标。
15
- """
16
- return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
17
-
18
-
19
- def get_area(bbox):
20
- """
21
- 根据边界框坐标信息,计算出该边界框的面积。
22
- Args:
23
- bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。
24
- Returns:
25
- float: 该边界框的面积。
26
- """
27
- return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
28
-
29
-
30
- def adjust_layouts(layout_bboxes, page_boundry, page_id):
31
- # 遍历所有布局框
32
- for i in range(len(layout_bboxes)):
33
- # 遍历当前布局框之后的布局框
34
- for j in range(i + 1, len(layout_bboxes)):
35
- # 判断两个布局框是否重叠
36
- if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]):
37
- # 计算每个布局框的中心点坐标和面积
38
- area_i = get_area(layout_bboxes[i])
39
- area_j = get_area(layout_bboxes[j])
40
-
41
- # 较大布局框和较小布局框的赋值
42
- if area_i > area_j:
43
- larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j]
44
- else:
45
- larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
46
-
47
- center_large = get_center_point(larger_layout)
48
- center_small = get_center_point(smaller_layout)
49
- # 计算横向和纵向的距离差
50
- distance_x = center_large[0] - center_small[0]
51
- distance_y = center_large[1] - center_small[1]
52
-
53
- # 根据距离差判断重叠方向并修正边界
54
- if abs(distance_x) > abs(distance_y): # 左右重叠
55
- if distance_x > 0 and larger_layout[0] < smaller_layout[2]:
56
- larger_layout[0] = smaller_layout[2]+1
57
- if distance_x < 0 and larger_layout[2] > smaller_layout[0]:
58
- larger_layout[2] = smaller_layout[0]-1
59
- else: # 上下重叠
60
- if distance_y > 0 and larger_layout[1] < smaller_layout[3]:
61
- larger_layout[1] = smaller_layout[3]+1
62
- if distance_y < 0 and larger_layout[3] > smaller_layout[1]:
63
- larger_layout[3] = smaller_layout[1]-1
64
- # 排序调整布局边界框列表
65
- new_bboxes = []
66
- for layout_bbox in layout_bboxes:
67
- new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None])
68
-
69
- layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
70
-
71
- # 返回排序调整后的布局边界框列表
72
- return layout_bboxes, layout_tree
73
-
74
-
75
- def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
76
- """
77
- 对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
78
-
79
- Args:
80
- layout_info (list): 包含子布局信息的列表,每个子布局信息为字典类型,包含'poly'字段,表示子布局的边界框坐标信息。
81
-
82
- Returns:
83
- list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
84
-
85
- """
86
- page_id = ocr_page_info['page_info']['page_no']-1
87
- horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
88
- # 初始化布局边界框列表
89
- layout_bboxes = []
90
- # 遍历每个子布局
91
- for sub_layout in layout_info:
92
- # 提取子布局的边界框坐标信息
93
- x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
94
- bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio),
95
- int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)]
96
-
97
- # 将子布局的边界框添加到列表中
98
- layout_bboxes.append(bbox)
99
-
100
- # 初始化新的布局边界框列表
101
- new_layout_bboxes = []
102
- # 遍历每个布局边界框
103
- for i in range(len(layout_bboxes)):
104
- # 初始化标记变量,用于判断当前边界框是否需要保留
105
- keep = True
106
- # 获取当前边界框的坐标信息
107
- box_i = layout_bboxes[i]
108
-
109
- # 遍历其他边界框
110
- for j in range(len(layout_bboxes)):
111
- # 排除当前边界框自身
112
- if i != j:
113
- # 获取其他边界框的坐标信息
114
- box_j = layout_bboxes[j]
115
- # 检测box_i是否被box_j包含
116
- if _is_in(box_i, box_j):
117
- # 如果当前边界框被其他边界框包含,则标记为不需要保留
118
- keep = False
119
- # 跳出内层循环
120
- break
121
-
122
- # 如果当前边界框需要保留,则添加到新的布局边界框列表中
123
- if keep:
124
- new_layout_bboxes.append(layout_bboxes[i])
125
-
126
- # 对新的布局边界框列表进行排序调整
127
- page_width = page.rect.width
128
- page_height = page.rect.height
129
- page_boundry = [0, 0, page_width, page_height]
130
- layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id)
131
-
132
- # 返回排序调整后的布局边界框列表
133
- return layout_bboxes, layout_tree