magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,284 +0,0 @@
1
- from collections import defaultdict
2
-
3
- from magic_pdf.libs.boxbase import calculate_iou
4
-
5
-
6
def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
    """
    Tell whether ``bbox`` approximately matches any entry of ``bbox_list``.

    Two boxes match when every pair of corresponding coordinates differs by
    strictly less than ``tolerance``. Coordinates are paired with ``zip``, so
    the comparison stops at the shorter of the two sequences.
    """
    for candidate in bbox_list:
        for own, other in zip(bbox, candidate):
            if not abs(own - other) < tolerance:
                # One coordinate is too far away; try the next candidate.
                break
        else:
            # Loop finished without a break: every coordinate was close enough.
            return True
    return False
8
-
9
def is_single_line_block(block):
    """
    Decide from its dimensions whether ``block`` looks like a single text line.

    The width comes from the block's "X0"/"X1" entries and the height from
    the vertical extent of its "bbox". A block counts as single-line when it
    is at most three average character heights tall and more than three
    average character widths wide.
    """
    width = block["X1"] - block["X0"]
    height = block["bbox"][3] - block["bbox"][1]

    # Too tall to be one line: no need to look at the width at all.
    if not height <= block["avg_char_height"] * 3:
        return False
    return width > block["avg_char_width"] * 3
16
-
17
def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
    """
    Return the bboxes that repeat most often in the top or bottom page band.

    Parameters
    ----------
    bboxes : list
        Bounding boxes, each indexable as (x0, y0, x1, y1).
    page_height : float
        Height of the page.
    position : str, optional
        "top" selects boxes whose y0 lies above ``page_height * threshold``;
        any other value selects boxes whose y1 lies below
        ``page_height * (1 - threshold)``. By default "top".
    threshold : float, optional
        Fraction of the page height that forms the band, by default 0.25.
    num_bboxes : int, optional
        Maximum number of bboxes to return, by default 3.
    min_frequency : int, optional
        A bbox is kept only if it occurs at least this many times,
        by default 2.

    Returns
    -------
    common_bboxes : list
        Up to ``num_bboxes`` bboxes (as tuples), most frequent first.
    """
    def in_band(box):
        if position == "top":
            return box[1] < page_height * threshold
        return box[3] > page_height * (1 - threshold)

    # Count exact repetitions of each bbox that falls inside the band.
    occurrences = defaultdict(int)
    for box in bboxes:
        if in_band(box):
            occurrences[tuple(box)] += 1

    # Stable sort: boxes with the same count keep their first-seen order.
    ranked = sorted(occurrences.items(), key=lambda item: item[1], reverse=True)
    frequent = [box for box, seen in ranked if seen >= min_frequency]
    return frequent[:num_bboxes]
57
-
58
def detect_footer_header2(result_dict, similarity_threshold=0.5):
    """
    Flag the blocks of a document that look like repeated headers or footers.

    Only entries whose page key starts with "page_" and whose block key starts
    with "block_" are considered. Each such block receives integer
    "is_header" and "is_footer" entries (0 or 1), set in place.

    Parameters
    ----------
    result_dict : dict
        Mapping of page keys to dicts of blocks. Each block must provide
        "bbox" plus the keys read by ``is_single_line_block``.
    similarity_threshold : float, optional
        Accepted for interface compatibility; not used by the detection.

    Returns
    -------
    result_dict : dict
        The same dictionary that was passed in. It is returned untouched when
        it holds no blocks or when more than half of its blocks are
        single-line.
    """
    # Gather every block once; the later steps all walk this same set.
    blocks = [
        block
        for page_id, page_blocks in result_dict.items()
        if page_id.startswith("page_")
        for block_key, block in page_blocks.items()
        if block_key.startswith("block_")
    ]

    # If there are no blocks, skip the header and footer detection
    if not blocks:
        print("No blocks found. Skipping header/footer detection.")
        return result_dict

    # If most of the blocks are single-line, skip the header and footer detection
    single_line_blocks = sum(1 for block in blocks if is_single_line_block(block))
    if single_line_blocks / len(blocks) > 0.5:
        return result_dict

    all_bboxes = [block["bbox"] for block in blocks]

    # The lowest bottom edge seen anywhere stands in for the page height.
    page_height = max(bbox[3] for bbox in all_bboxes)

    common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top")
    common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom")

    # Mark each block; its text is not needed, so blocks without a "text"
    # entry are handled as well.
    for block in blocks:
        bbox = block["bbox"]
        block["is_header"] = int(compare_bbox_with_list(bbox, common_header_bboxes))
        block["is_footer"] = int(compare_bbox_with_list(bbox, common_footer_bboxes))

    return result_dict
126
-
127
-
128
def __get_page_size(page_sizes: list):
    """
    Return the mean (width, height) over ``page_sizes``.

    Pages may differ in size, so the average over all (w, h) pairs is used.
    """
    page_total = len(page_sizes)
    width_sum = 0
    height_sum = 0
    for width, height in page_sizes:
        width_sum += width
        height_sum += height
    return width_sum / page_total, height_sum / page_total
135
-
136
def __calculate_iou(bbox1, bbox2):
    """Return the result of ``calculate_iou`` for the two boxes."""
    return calculate_iou(bbox1, bbox2)
139
-
140
def __is_same_pos(box1, box2, iou_threshold):
    """Return True when the IoU of the two boxes reaches ``iou_threshold``."""
    overlap = __calculate_iou(box1, box2)
    if overlap >= iou_threshold:
        return True
    return False
143
-
144
-
145
def get_most_common_bbox(bboxes: list, page_size: list, page_cnt: int, page_range_threshold=0.2, iou_threshold=0.9):
    """
    Derive the header and footer regions from boxes that recur across pages.

    Boxes lying entirely inside the top band (bottom edge above
    ``page_h * page_range_threshold``) or entirely inside the bottom band
    (top edge below ``page_h * (1 - page_range_threshold)``) are grouped by
    position; two boxes share a position when ``__is_same_pos`` accepts them
    at ``iou_threshold``. A group only counts when it has at least
    ``max(3, page_cnt // 4)`` members.

    :param bboxes: boxes from all pages, each indexable as (x0, y0, x1, y1)
    :param page_size: list of (width, height) pairs, one per page
    :param page_cnt: number of pages in the document
    :param page_range_threshold: fraction of the page height forming each band
    :param iou_threshold: minimum IoU for two boxes to share a position
    :return: ``(header_bbox, footer_bbox, page_w, page_h)`` where the header is
        ``[0, 0, page_w, hdr_y]`` and the footer is ``[0, btn_y, page_w, page_h]``;
        ``hdr_y`` is 0 and ``btn_y`` is ``page_h`` when nothing recurring was found
    """
    min_occurance_cnt = max(3, page_cnt // 4)

    page_w, page_h = __get_page_size(page_size)
    # Both limits are y-coordinates, so both are measured against the page
    # height (the top limit used to be derived from the page width).
    top_y = page_h * page_range_threshold
    bottom_y = page_h * (1 - page_range_threshold)

    top_bbox = [b for b in bboxes if b[3] < top_y]
    bottom_bbox = [b for b in bboxes if b[1] > bottom_y]

    def recurring_boxes(candidates):
        """Flatten every group of same-position boxes that is large enough."""
        kept = []
        for idx, anchor in enumerate(candidates):
            # Each box anchors a group made of itself and the later boxes
            # that sit at the same position.
            group = [anchor]
            group.extend(
                other for other in candidates[idx + 1:]
                if __is_same_pos(anchor, other, iou_threshold)
            )
            if len(group) >= min_occurance_cnt:
                kept.extend(group)
        return kept

    hdr_same_pos_group = recurring_boxes(top_bbox)
    btn_same_pos_group = recurring_boxes(bottom_bbox)

    # The header ends at the lowest recurring top box; the footer starts at
    # the highest recurring bottom box.
    hdr_y = max((b[3] for b in hdr_same_pos_group), default=0)
    btn_y = min((b[1] for b in btn_same_pos_group), default=page_h)

    header_det_bbox = [0, 0, page_w, hdr_y]
    footer_det_bbox = [0, btn_y, page_w, page_h]
    return header_det_bbox, footer_det_bbox, page_w, page_h
194
-
195
-
196
def drop_footer_header(pdf_info_dict: dict):
    """
    Remove header and footer content using document-wide statistics.

    The header/footer regions are obtained from ``get_most_common_bbox`` over
    all text-block and image boxes of the document. On every page, text blocks,
    images and backup images lying inside a region get a 'tag' of "header" or
    "footer", are appended to the page's 'droped_text_block' or
    'droped_image_block' list, and are removed from their original list.

    :param pdf_info_dict: mapping of page keys to page-info dicts providing
        'preproc_blocks', 'images', 'image_backup', 'page_size',
        'droped_text_block' and 'droped_image_block'
    :return: ``(header, footer)`` regions as ``[x0, y0, x1, y1]`` lists
    """
    pages = list(pdf_info_dict.values())

    text_bboxes = [blk['bbox'] for page in pages for blk in page['preproc_blocks']]
    image_bboxes = [img['bbox'] for page in pages for img in page['images']]
    image_bboxes += [img['bbox'] for page in pages for img in page['image_backup']]
    page_size = [page['page_size'] for page in pages]
    page_cnt = len(pdf_info_dict)  # total number of pages
    header, footer, page_w, page_h = get_most_common_bbox(text_bboxes + image_bboxes, page_size, page_cnt)

    # Stretch both regions across the full page width, with a margin of one
    # unit on the inner edge.
    if header:
        header = [0, 0, page_w, header[3] + 1]
    if footer:
        footer = [0, footer[1] - 1, page_w, page_h]

    def move_region_items(page_info, source_key, dropped_key):
        """Tag items of one list that fall in a region and move them out."""
        in_header = []
        in_footer = []
        for item in page_info[source_key]:
            item_bbox = item['bbox']
            if header and item_bbox[3] <= header[3]:
                item['tag'] = "header"
                in_header.append(item)
            elif footer and item_bbox[1] >= footer[1]:
                item['tag'] = "footer"
                in_footer.append(item)

        page_info[dropped_key].extend(in_header)
        page_info[dropped_key].extend(in_footer)

        for item in in_header + in_footer:
            page_info[source_key].remove(item)

    # With the regions known, strip them from each page: text blocks first,
    # then regular images, then backup images.
    for page_info in pages:
        move_region_items(page_info, 'preproc_blocks', 'droped_text_block')
        move_region_items(page_info, 'images', 'droped_image_block')
        move_region_items(page_info, 'image_backup', 'droped_image_block')

    return header, footer
@@ -1,170 +0,0 @@
1
- from collections import Counter
2
- from magic_pdf.libs.commons import fitz # pyMuPDF库
3
- from magic_pdf.libs.coordinate_transform import get_scale_ratio
4
-
5
-
6
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
    """
    Collect the footnote boxes reported by the layout model for one page.

    :param page_ID: int, index of the current page within the PDF document
    :param page: the current page as read by fitz; passed on to get_scale_ratio
    :param json_from_DocXchain_obj: dict with the layout model output for this
        page; its 'layout_dets' entries must provide 'poly', 'category_id'
        and 'score'
    :param md_bookname_save_path: accepted for interface compatibility; unused
    :param debug_mode: accepted for interface compatibility; it does not
        influence the returned value
    :return: list of (L, U, R, D) tuples sorted by top edge, then left edge
    """
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(json_from_DocXchain_obj, page)

    # Layout model category ids:
    #   0 title, 1 figure, 2 plain text, 3 header, 4 page number,
    #   5 footnote, 6 footer, 7 table, 8 table caption, 9 figure caption,
    #   10 equation, 11 full column, 12 sub column,
    #   13 embedding (inline formula), 14 isolated (display formula)
    footnote_category = 5
    min_score = 0.43  # current footnote threshold (an earlier version used 0.3)

    footnote_bboxes = []
    for det in json_from_DocXchain_obj['layout_dets']:
        poly = det['poly']
        # Scale the model coordinates back to page coordinates.
        x_a = poly[0] / horizontal_scale_ratio
        y_a = poly[1] / vertical_scale_ratio
        x_b = poly[2] / horizontal_scale_ratio
        y_b = poly[5] / vertical_scale_ratio
        # Normalise so that left <= right and top <= bottom.
        left, right = min(x_a, x_b), max(x_a, x_b)
        top, bottom = min(y_a, y_b), max(y_a, y_b)
        if det['category_id'] == footnote_category and det['score'] >= min_score:
            footnote_bboxes.append((left, top, right, bottom))

    return sorted(footnote_bboxes, key=lambda box: (box[1], box[0]))
67
-
68
-
69
def need_remove(block):
    """
    Tell whether ``block`` should be taken out of the footnote candidates.

    Returns True when the block consists of a single line with a single span
    whose text is all upper case or whose font name contains 'SB', 'bold' or
    'Bold', or when any span text contains "keyword" (case-insensitive).
    Blocks without lines yield False.
    """
    if 'lines' not in block or len(block['lines']) == 0:
        return False
    lines = block['lines']

    # A lone one-span line in capitals or in a bold font is rescued.
    if len(lines) == 1:
        only_line = lines[0]
        if 'spans' in only_line and len(only_line['spans']) == 1:
            only_span = only_line['spans'][0]
            if only_span['text'].isupper():
                return True
            for marker in ('SB', 'bold', 'Bold'):
                if marker in only_span['font']:
                    return True

    # Any span mentioning "keyword" (ignoring case) also rescues the block.
    for line in lines:
        if 'spans' not in line or len(line['spans']) == 0:
            continue
        if any("keyword" in span['text'].lower() for span in line['spans']):
            return True
    return False
84
-
85
def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
    """
    Pick out the text blocks that look like footnotes and return their boxes.

    A block qualifies when it starts below 60% of the page height, its average
    line size is smaller than the most common line size on the page, and it
    either has fewer than 5 lines or its dominant font differs from
    ``main_text_font``. Blocks accepted by ``need_remove`` are excluded.

    Args:
        remain_text_blocks (list): text blocks to examine; each provides
            'bbox' and 'lines', and each line provides 'spans'.
        page_height (float): height of the page.
        page_id (int): id of the page; pages with an id above 2 are skipped.
        main_text_font: font of the document's main text.

    Returns:
        list: bounding boxes of the blocks judged to be footnotes. Empty when
        the page is skipped, when there are no blocks, or when no span on the
        page carries a 'size'.
    """
    # To keep precision high, only the first three pages are screened.
    if page_id > 2:
        return []
    if len(remain_text_blocks) == 0:
        return []

    line_sizes = []   # size of every line on the page
    block_stats = []  # (block, average line size, dominant font) per block

    for block in remain_text_blocks:
        block_line_sizes = []
        block_fonts = Counter()
        for line in block['lines']:
            span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
            if not span_sizes:
                continue
            line_size = sum(span_sizes) / len(span_sizes)
            line_sizes.append(line_size)
            block_line_sizes.append(line_size)
            # Weight fonts by character count rather than by span count.
            for span in line['spans']:
                if 'font' in span and len(span['text']) > 0:
                    block_fonts[span['font']] += len(span['text'])
        if block_line_sizes:
            block_size = sum(block_line_sizes) / len(block_line_sizes)
            # A block whose spans carry no usable font gets None instead of
            # failing on an empty counter.
            dominant = block_fonts.most_common(1)
            block_font = dominant[0][0] if dominant else None
            block_stats.append((block, block_size, block_font))

    # Without any sized line there is no main text size to compare against.
    if not line_sizes:
        return []
    main_text_size = Counter(line_sizes).most_common(1)[0][0]

    return [
        block['bbox']
        for block, block_size, block_font in block_stats
        # Drop blocks that would otherwise be mistaken for footnotes.
        if not need_remove(block)
        and block['bbox'][1] > page_height * 0.6
        and block_size < main_text_size
        and (len(block['lines']) < 5 or block_font != main_text_font)
    ]
168
-
169
-
170
-
@@ -1,64 +0,0 @@
1
- from magic_pdf.libs.commons import fitz # pyMuPDF库
2
- from magic_pdf.libs.coordinate_transform import get_scale_ratio
3
-
4
-
5
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect the header boxes reported by the layout model for one page.

    :param page_ID: int, index of the current page within the PDF document;
        it does not influence the returned value
    :param page: the current page as read by fitz; passed on to get_scale_ratio
    :param json_from_DocXchain_obj: dict with the layout model output for this
        page; its 'layout_dets' entries must provide 'poly', 'category_id'
        and 'score'
    :return: list of (L, U, R, D) tuples sorted by top edge, then left edge
    """
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(json_from_DocXchain_obj, page)

    # Layout model category ids:
    #   0 title, 1 figure, 2 plain text, 3 header, 4 page number,
    #   5 footnote, 6 footer, 7 table, 8 table caption, 9 figure caption,
    #   10 equation, 11 full column, 12 sub column,
    #   13 embedding (inline formula), 14 isolated (display formula)
    header_category = 3
    min_score = 0.3

    header_bboxes = []
    for det in json_from_DocXchain_obj['layout_dets']:
        poly = det['poly']
        # Scale the model coordinates back to page coordinates.
        x_a = poly[0] / horizontal_scale_ratio
        y_a = poly[1] / vertical_scale_ratio
        x_b = poly[2] / horizontal_scale_ratio
        y_b = poly[5] / vertical_scale_ratio
        # Normalise so that left <= right and top <= bottom.
        left, right = min(x_a, x_b), max(x_a, x_b)
        top, bottom = min(y_a, y_b), max(y_a, y_b)
        if det['category_id'] == header_category and det['score'] >= min_score:
            header_bboxes.append((left, top, right, bottom))

    return sorted(header_bboxes, key=lambda box: (box[1], box[0]))
64
-