magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
--- magic_pdf/pre_proc/detect_footer_header_by_statistics.py
+++ /dev/null
@@ -1,284 +0,0 @@
-from collections import defaultdict
-
-from magic_pdf.libs.boxbase import calculate_iou
-
-
-def compare_bbox_with_list(bbox, bbox_list, tolerance=1):
-    return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list)
-
-def is_single_line_block(block):
-    # Determine based on the width and height of the block
-    block_width = block["X1"] - block["X0"]
-    block_height = block["bbox"][3] - block["bbox"][1]
-
-    # If the height of the block is close to the average character height and the width is large, it is considered a single line
-    return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3
-
-def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2):
-    """
-    This function gets the most common bboxes from the bboxes
-
-    Parameters
-    ----------
-    bboxes : list
-        bboxes
-    page_height : float
-        height of the page
-    position : str, optional
-        "top" or "bottom", by default "top"
-    threshold : float, optional
-        threshold, by default 0.25
-    num_bboxes : int, optional
-        number of bboxes to return, by default 3
-    min_frequency : int, optional
-        minimum frequency of the bbox, by default 2
-
-    Returns
-    -------
-    common_bboxes : list
-        common bboxes
-    """
-    # Filter bbox by position
-    if position == "top":
-        filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold]
-    else:
-        filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)]
-
-    # Find the most common bbox
-    bbox_count = defaultdict(int)
-    for bbox in filtered_bboxes:
-        bbox_count[tuple(bbox)] += 1
-
-    # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency
-    common_bboxes = [
-        bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency
-    ][:num_bboxes]
-    return common_bboxes
-
-def detect_footer_header2(result_dict, similarity_threshold=0.5):
-    """
-    This function detects the header and footer of the document.
-
-    Parameters
-    ----------
-    result_dict : dict
-        result dictionary
-
-    Returns
-    -------
-    result_dict : dict
-        result dictionary
-    """
-    # Traverse all blocks in the document
-    single_line_blocks = 0
-    total_blocks = 0
-    single_line_blocks = 0
-
-    for page_id, blocks in result_dict.items():
-        if page_id.startswith("page_"):
-            for block_key, block in blocks.items():
-                if block_key.startswith("block_"):
-                    total_blocks += 1
-                    if is_single_line_block(block):
-                        single_line_blocks += 1
-
-    # If there are no blocks, skip the header and footer detection
-    if total_blocks == 0:
-        print("No blocks found. Skipping header/footer detection.")
-        return result_dict
-
-    # If most of the blocks are single-line, skip the header and footer detection
-    if single_line_blocks / total_blocks > 0.5:  # 50% of the blocks are single-line
-        # print("Skipping header/footer detection for text-dense document.")
-        return result_dict
-
-    # Collect the bounding boxes of all blocks
-    all_bboxes = []
-    all_texts = []
-
-    for page_id, blocks in result_dict.items():
-        if page_id.startswith("page_"):
-            for block_key, block in blocks.items():
-                if block_key.startswith("block_"):
-                    all_bboxes.append(block["bbox"])
-
-    # Get the height of the page
-    page_height = max(bbox[3] for bbox in all_bboxes)
-
-    # Get the most common bbox lists for headers and footers
-    common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else []
-    common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else []
-
-    # Detect and mark headers and footers
-    for page_id, blocks in result_dict.items():
-        if page_id.startswith("page_"):
-            for block_key, block in blocks.items():
-                if block_key.startswith("block_"):
-                    bbox = block["bbox"]
-                    text = block["text"]
-
-                    is_header = compare_bbox_with_list(bbox, common_header_bboxes)
-                    is_footer = compare_bbox_with_list(bbox, common_footer_bboxes)
-                    block["is_header"] = int(is_header)
-                    block["is_footer"] = int(is_footer)
-
-    return result_dict
-
-
-def __get_page_size(page_sizes:list):
-    """
-    Page sizes may differ from page to page.
-    """
-    w = sum([w for w,h in page_sizes])/len(page_sizes)
-    h = sum([h for w,h in page_sizes])/len(page_sizes)
-    return w, h
-
-def __calculate_iou(bbox1, bbox2):
-    iou = calculate_iou(bbox1, bbox2)
-    return iou
-
-def __is_same_pos(box1, box2, iou_threshold):
-    iou = __calculate_iou(box1, box2)
-    return iou >= iou_threshold
-
-
-def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9):
-    """
-    A common bbox must occur on more than 1/3 of page_cnt pages.
-    """
-    min_occurance_cnt = max(3, page_cnt//4)
-    header_det_bbox = []
-    footer_det_bbox = []
-
-    hdr_same_pos_group = []
-    btn_same_pos_group = []
-
-    page_w, page_h = __get_page_size(page_size)
-    top_y, bottom_y = page_w*page_range_threshold, page_h*(1-page_range_threshold)
-
-    top_bbox = [b for b in bboxes if b[3]<top_y]
-    bottom_bbox = [b for b in bboxes if b[1]>bottom_y]
-    # Sort and look for the most frequently occurring bbox; while searching, boxes with IOU > iou_threshold count as the same one
-    for i in range(0, len(top_bbox)):
-        hdr_same_pos_group.append([top_bbox[i]])
-        for j in range(i+1, len(top_bbox)):
-            if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold):
-                #header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])]
-                hdr_same_pos_group[i].append(top_bbox[j])
-
-    for i in range(0, len(bottom_bbox)):
-        btn_same_pos_group.append([bottom_bbox[i]])
-        for j in range(i+1, len(bottom_bbox)):
-            if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold):
-                #footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])]
-                btn_same_pos_group[i].append(bottom_bbox[j])
-
-    # Check whether each group of bboxes covers at least the required fraction of page_cnt
-    hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt]
-    btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt]
-
-    # Flatten the two list[list]s
-    hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g]
-    btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g]
-    # Find the maximum box[3] in hdr_same_pos_group and the minimum box[1] in btn_same_pos_group
-    hdr_same_pos_group.sort(key=lambda b:b[3])
-    btn_same_pos_group.sort(key=lambda b:b[1])
-
-    hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0
-    btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h
-
-    header_det_bbox = [0, 0, page_w, hdr_y]
-    footer_det_bbox = [0, btn_y, page_w, page_h]
-    # logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}")
-    return header_det_bbox, footer_det_bbox, page_w, page_h
-
-
-def drop_footer_header(pdf_info_dict:dict):
-    """
-    Rule-based detection: a statistical approach applied from a document-wide perspective.
-    """
-    header = []
-    footer = []
-
-    all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']]
-    image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']]
-    page_size = [val['page_size'] for _, val in pdf_info_dict.items()]
-    page_cnt = len(pdf_info_dict.keys())  # total number of pages
-    header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt)
-
-    """
-    Extend the detected ranges across the full horizontal width of the page.
-    """
-    if header:
-        header = [0, 0, page_w, header[3]+1]
-
-    if footer:
-        footer = [0, footer[1]-1, page_w, page_h]
-
-    # Once the footer/header ranges are known, remove the text and image content inside them on every page.
-    # Remove text blocks first.
-
-    for _, page_info in pdf_info_dict.items():
-        header_text_blk = []
-        footer_text_blk = []
-        for blk in page_info['preproc_blocks']:
-            blk_bbox = blk['bbox']
-            if header and blk_bbox[3]<=header[3]:
-                blk['tag'] = "header"
-                header_text_blk.append(blk)
-            elif footer and blk_bbox[1]>=footer[1]:
-                blk['tag'] = "footer"
-                footer_text_blk.append(blk)
-
-        # Move them into droped_text_block
-        page_info['droped_text_block'].extend(header_text_blk)
-        page_info['droped_text_block'].extend(footer_text_blk)
-
-        for blk in header_text_blk:
-            page_info['preproc_blocks'].remove(blk)
-        for blk in footer_text_blk:
-            page_info['preproc_blocks'].remove(blk)
-
-        """Next, also drop the images that fall in the footer/header areas, covering both regular and backup images."""
-        header_image = []
-        footer_image = []
-
-        for image_info in page_info['images']:
-            img_bbox = image_info['bbox']
-            if header and img_bbox[3]<=header[3]:
-                image_info['tag'] = "header"
-                header_image.append(image_info)
-            elif footer and img_bbox[1]>=footer[1]:
-                image_info['tag'] = "footer"
-                footer_image.append(image_info)
-
-        page_info['droped_image_block'].extend(header_image)
-        page_info['droped_image_block'].extend(footer_image)
-
-        for img in header_image:
-            page_info['images'].remove(img)
-        for img in footer_image:
-            page_info['images'].remove(img)
-
-        """Then drop the backup images as well."""
-        header_image = []
-        footer_image = []
-
-        for image_info in page_info['image_backup']:
-            img_bbox = image_info['bbox']
-            if header and img_bbox[3]<=header[3]:
-                image_info['tag'] = "header"
-                header_image.append(image_info)
-            elif footer and img_bbox[1]>=footer[1]:
-                image_info['tag'] = "footer"
-                footer_image.append(image_info)
-
-        page_info['droped_image_block'].extend(header_image)
-        page_info['droped_image_block'].extend(footer_image)
-
-        for img in header_image:
-            page_info['image_backup'].remove(img)
-        for img in footer_image:
-            page_info['image_backup'].remove(img)
-
-    return header, footer
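For context, the removed drop_footer_header entry point above works purely on the in-memory pdf_info_dict built earlier in the pipeline, so it can be exercised in isolation. Below is a minimal sketch (not part of this release or the diff), assuming the per-page layout the code expects: 'preproc_blocks', 'images', 'image_backup', 'page_size', plus the 'droped_*' accumulators; all bbox values are invented for illustration.

    # Hypothetical usage sketch: a tiny pdf_info_dict with a repeated running
    # header and footer on every page. All coordinates are made up.
    pdf_info_dict = {
        f"page_{i}": {
            "preproc_blocks": [
                {"bbox": [72, 20, 520, 35]},    # running header, same position on each page
                {"bbox": [72, 100, 520, 700]},  # body text
                {"bbox": [72, 800, 520, 815]},  # footer / page number
            ],
            "images": [],
            "image_backup": [],
            "page_size": (595, 842),
            "droped_text_block": [],
            "droped_image_block": [],
        }
        for i in range(12)
    }

    header_bbox, footer_bbox = drop_footer_header(pdf_info_dict)
    # Blocks falling inside header_bbox / footer_bbox are tagged "header"/"footer",
    # moved into 'droped_text_block', and removed from 'preproc_blocks'.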
--- magic_pdf/pre_proc/detect_footnote.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from collections import Counter
-from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.coordinate_transform import get_scale_ratio
-
-
-def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path=None, debug_mode=False):
-    """
-    :param page_ID: int; the current page is the page_ID-th page of the current PDF document.
-    :param page: the content of the current page as read by fitz.
-    :param res_dir_path: str; for every PDF document a folder with the same name as the PDF is created under the directory of this .py file, and res_dir_path is that folder's path.
-    :param json_from_DocXchain_obj: dict; after the PDF is fed through the DocXChain model, the extracted bboxes are saved to page_ID.json in the folder named after the PDF, and json_from_DocXchain_obj is that file loaded as a dict.
-    """
-
-    # --------- get footnotes from json_from_DocXchain ---------#
-    footnote_bbox_from_DocXChain = []
-
-    xf_json = json_from_DocXchain_obj
-    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
-
-    # {0: 'title',           # title
-    #  1: 'figure',          # figure
-    #  2: 'plain text',      # body text
-    #  3: 'header',          # page header
-    #  4: 'page number',     # page number
-    #  5: 'footnote',        # footnote
-    #  6: 'footer',          # page footer
-    #  7: 'table',           # table
-    #  8: 'table caption',   # table caption
-    #  9: 'figure caption',  # figure caption
-    #  10: 'equation',       # equation
-    #  11: 'full column',    # single column
-    #  12: 'sub column',     # multi column
-    #  13: 'embedding',      # inline equation
-    #  14: 'isolated'}       # isolated (display) equation
-    for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / horizontal_scale_ratio
-        U = xf['poly'][1] / vertical_scale_ratio
-        R = xf['poly'][2] / horizontal_scale_ratio
-        D = xf['poly'][5] / vertical_scale_ratio
-        # L += pageL  # on some pages the artBox is offset and does not start at (0, 0)
-        # R += pageL
-        # U += pageU
-        # D += pageU
-        L, R = min(L, R), max(L, R)
-        U, D = min(U, D), max(U, D)
-        # if xf['category_id'] == 5 and xf['score'] >= 0.3:
-        if xf['category_id'] == 5 and xf['score'] >= 0.43:  # new footnote threshold
-            footnote_bbox_from_DocXChain.append((L, U, R, D))
-
-
-    footnote_final_names = []
-    footnote_final_bboxs = []
-    footnote_ID = 0
-    for L, U, R, D in footnote_bbox_from_DocXChain:
-        if debug_mode:
-            # cur_footnote = page.get_pixmap(clip=(L,U,R,D))
-            new_footnote_name = "footnote_{}_{}.png".format(page_ID, footnote_ID)  # footnote name
-            # cur_footnote.save(md_bookname_save_path + '/' + new_footnote_name)  # save the footnote image into the newly created folder under this name
-            footnote_final_names.append(new_footnote_name)  # keep the footnote names in a list
-        footnote_final_bboxs.append((L, U, R, D))
-        footnote_ID += 1
-
-
-    footnote_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    curPage_all_footnote_bboxs = footnote_final_bboxs
-    return curPage_all_footnote_bboxs
-
-
-def need_remove(block):
-    if 'lines' in block and len(block['lines']) > 0:
-        # If the block has a single line whose text is all uppercase, or whose font name contains a bold keyword ('bold'/'Bold') or 'SB', pull this block back (exclude it from the footnote candidates)
-        if len(block['lines']) == 1:
-            if 'spans' in block['lines'][0] and len(block['lines'][0]['spans']) == 1:
-                font_keywords = ['SB', 'bold', 'Bold']
-                if block['lines'][0]['spans'][0]['text'].isupper() or any(keyword in block['lines'][0]['spans'][0]['font'] for keyword in font_keywords):
-                    return True
-        for line in block['lines']:
-            if 'spans' in line and len(line['spans']) > 0:
-                for span in line['spans']:
-                    # Check whether "keyword" occurs in the span text, ignoring case
-                    if "keyword" in span['text'].lower():
-                        return True
-    return False
-
-def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
-    """
-    Given the text blocks, the page height and the page number, detect the text blocks that match the footnote rules and return their bounding boxes.
-
-    Args:
-        remain_text_blocks (list): list of all text blocks to process.
-        page_height (float): height of the page.
-        page_id (int): ID of the page.
-
-    Returns:
-        list: bounding boxes of the text blocks that match the footnote rules.
-
-    """
-    # if page_id > 20:
-    if page_id > 2:  # to keep precision high, only the first 3 pages are screened for now
-        return []
-    else:
-        # list of per-line text sizes
-        line_sizes = []
-        # average line size of each text block
-        block_sizes = []
-        # font information of each line
-        # font_names = []
-        font_names = Counter()
-        if len(remain_text_blocks) > 0:
-            for block in remain_text_blocks:
-                block_line_sizes = []
-                # block_fonts = []
-                block_fonts = Counter()
-                for line in block['lines']:
-                    # Take each span's size attribute and compute the line size
-                    span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
-                    if span_sizes:
-                        line_size = sum(span_sizes) / len(span_sizes)
-                        line_sizes.append(line_size)
-                        block_line_sizes.append(line_size)
-                    span_font = [(span['font'], len(span['text'])) for span in line['spans'] if 'font' in span and len(span['text']) > 0]
-                    if span_font:
-                        # main_text_font should be based on the font covering the most characters, not on span-level statistics
-                        # font_names.append(font_name for font_name in span_font)
-                        # block_fonts.append(font_name for font_name in span_font)
-                        for font, count in span_font:
-                            # font_names.extend([font] * count)
-                            # block_fonts.extend([font] * count)
-                            font_names[font] += count
-                            block_fonts[font] += count
-                if block_line_sizes:
-                    # Compute the average line size of the block
-                    block_size = sum(block_line_sizes) / len(block_line_sizes)
-                    # block_font = collections.Counter(block_fonts).most_common(1)[0][0]
-                    block_font = block_fonts.most_common(1)[0][0]
-                    block_sizes.append((block, block_size, block_font))
-
-            # Compute main_text_size
-            main_text_size = Counter(line_sizes).most_common(1)[0][0]
-            # Compute main_text_font
-            # main_text_font = collections.Counter(font_names).most_common(1)[0][0]
-            # main_text_font = font_names.most_common(1)[0][0]
-            # Drop some text blocks that are easily mistaken for footnotes
-            block_sizes = [(block, block_size, block_font) for block, block_size, block_font in block_sizes if not need_remove(block)]
-
-            # Detect footnote blocks and return footnote_bboxes
-            # footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes if
-            #                    block['bbox'][1] > page_height * 0.6 and block_size < main_text_size
-            #                    and (len(block['lines']) < 5 or block_font != main_text_font)]
-            #                    and len(block['lines']) < 5]
-            footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes if
-                               block['bbox'][1] > page_height * 0.6 and
-                               # relatively strict rule
-                               block_size < main_text_size and
-                               (len(block['lines']) < 5 or
-                                block_font != main_text_font)]
-
-            # looser rule
-            # sum([block_size < main_text_size,
-            #      len(block['lines']) < 5,
-            #      block_font != main_text_font])
-            # >= 2]
-
-
-            return footnote_bboxes
-        else:
-            return []
-
-
-
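The rule-based detector removed here (parse_footnotes_by_rule above) operates on plain block/line/span dictionaries, so its behaviour can be illustrated with synthetic data. A minimal sketch (not part of the package); font names and sizes below are invented for the example.

    main_font = "TimesNewRoman"  # hypothetical dominant body font
    blocks = [
        {   # body paragraph: dominant font, 10pt, upper part of the page
            "bbox": [72, 120, 520, 400],
            "lines": [{"spans": [{"text": "Body text.", "font": main_font, "size": 10.0}]}
                      for _ in range(8)],
        },
        {   # footnote candidate: smaller size, bottom of the page, a single line
            "bbox": [72, 760, 520, 790],
            "lines": [{"spans": [{"text": "1. A footnote.", "font": main_font, "size": 7.5}]}],
        },
    ]

    print(parse_footnotes_by_rule(blocks, page_height=842, page_id=0, main_text_font=main_font))
    # -> [[72, 760, 520, 790]]: the block starts in the bottom 40% of the page, its
    #    average size is below the dominant line size, and it has fewer than 5 lines,
    #    so the strict rule keeps it as a footnote.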
--- magic_pdf/pre_proc/detect_header.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.coordinate_transform import get_scale_ratio
-
-
-def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
-    """
-    :param page_ID: int; the current page is the page_ID-th page of the current PDF document.
-    :param page: the content of the current page as read by fitz.
-    :param res_dir_path: str; for every PDF document a folder with the same name as the PDF is created under the directory of this .py file, and res_dir_path is that folder's path.
-    :param json_from_DocXchain_obj: dict; after the PDF is fed through the DocXChain model, the extracted bboxes are saved to page_ID.json in the folder named after the PDF, and json_from_DocXchain_obj is that file loaded as a dict.
-    """
-
-    # --------- get page headers from json_from_DocXchain ---------#
-    header_bbox_from_DocXChain = []
-
-    xf_json = json_from_DocXchain_obj
-    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
-
-    # {0: 'title',           # title
-    #  1: 'figure',          # figure
-    #  2: 'plain text',      # body text
-    #  3: 'header',          # page header
-    #  4: 'page number',     # page number
-    #  5: 'footnote',        # footnote
-    #  6: 'footer',          # page footer
-    #  7: 'table',           # table
-    #  8: 'table caption',   # table caption
-    #  9: 'figure caption',  # figure caption
-    #  10: 'equation',       # equation
-    #  11: 'full column',    # single column
-    #  12: 'sub column',     # multi column
-    #  13: 'embedding',      # inline equation
-    #  14: 'isolated'}       # isolated (display) equation
-    for xf in xf_json['layout_dets']:
-        L = xf['poly'][0] / horizontal_scale_ratio
-        U = xf['poly'][1] / vertical_scale_ratio
-        R = xf['poly'][2] / horizontal_scale_ratio
-        D = xf['poly'][5] / vertical_scale_ratio
-        # L += pageL  # on some pages the artBox is offset and does not start at (0, 0)
-        # R += pageL
-        # U += pageU
-        # D += pageU
-        L, R = min(L, R), max(L, R)
-        U, D = min(U, D), max(U, D)
-        if xf['category_id'] == 3 and xf['score'] >= 0.3:
-            header_bbox_from_DocXChain.append((L, U, R, D))
-
-
-    header_final_names = []
-    header_final_bboxs = []
-    header_ID = 0
-    for L, U, R, D in header_bbox_from_DocXChain:
-        # cur_header = page.get_pixmap(clip=(L,U,R,D))
-        new_header_name = "header_{}_{}.png".format(page_ID, header_ID)  # header name
-        # cur_header.save(res_dir_path + '/' + new_header_name)  # save the header image into the newly created folder under this name
-        header_final_names.append(new_header_name)  # keep the header names in a list
-        header_final_bboxs.append((L, U, R, D))
-        header_ID += 1
-
-
-    header_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
-    curPage_all_header_bboxs = header_final_bboxs
-    return curPage_all_header_bboxs
-