magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,258 @@
|
|
1
|
+
from loguru import logger
|
2
|
+
|
3
|
+
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
|
4
|
+
__is_overlaps_y_exceeds_threshold, calculate_iou
|
5
|
+
from magic_pdf.libs.drop_tag import DropTag
|
6
|
+
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
7
|
+
|
8
|
+
def remove_overlaps_low_confidence_spans(spans):
    """Drop the lower-scored span of every pair of near-identical spans.

    Two spans are considered duplicates when ``calculate_iou`` of their
    ``bbox`` values is greater than 0.9. The span with the lower ``score``
    is removed from ``spans`` (in place) and tagged ``DropTag.SPAN_OVERLAP``.

    Each unordered pair is inspected exactly once. Visiting a pair in both
    orders (as an all-against-all double loop does) drops *both* spans when
    their scores are equal, because each ordering picks its own "second"
    span as the loser.

    Args:
        spans: list of span dicts; each needs ``bbox`` and ``score`` keys.

    Returns:
        tuple: ``(spans, dropped_spans)`` where ``spans`` is the same list
        object that was passed in, minus the dropped spans.
    """
    dropped_spans = []
    for index, span1 in enumerate(spans):
        for span2 in spans[index + 1:]:
            if span1 is span2:
                # The same object listed twice is not an overlap with itself.
                continue
            if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                # On a score tie the later span loses, so exactly one survives.
                loser = span1 if span1['score'] < span2['score'] else span2
                if not any(loser is seen for seen in dropped_spans):
                    dropped_spans.append(loser)

    if dropped_spans:
        # Remove by identity so that an equal-looking but distinct span is
        # never removed in place of the one that was actually selected.
        dropped_ids = {id(span) for span in dropped_spans}
        spans[:] = [span for span in spans if id(span) not in dropped_ids]
        for span in dropped_spans:
            span['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
|
28
|
+
|
29
|
+
|
30
|
+
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every pair of strongly overlapping spans.

    For each unordered pair of spans, ``get_minbox_if_overlap_by_ratio`` is
    called with a ratio of 0.65. When it returns a box, the span of the pair
    that owns that box is removed from ``spans`` (in place) and tagged
    ``DropTag.SPAN_OVERLAP``.

    The victim is resolved from the pair itself rather than by scanning the
    whole list for an equal bbox: the scan could select an unrelated third
    span that merely shares the same coordinates, and made the routine cubic.
    Each pair is inspected once so that a pair cannot lose both members.

    NOTE(review): this assumes the helper returns one of the two boxes it was
    given (presumably the smaller one) -- confirm against
    ``magic_pdf.libs.boxbase``. If it returns something else the original
    list-wide lookup is used as a fallback.

    Args:
        spans: list of span dicts; each needs a ``bbox`` key.

    Returns:
        tuple: ``(spans, dropped_spans)`` where ``spans`` is the same list
        object that was passed in, minus the dropped spans.
    """
    dropped_spans = []
    for index, span1 in enumerate(spans):
        for span2 in spans[index + 1:]:
            if span1 is span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue

            if span1['bbox'] == overlap_box:
                loser = span1
            elif span2['bbox'] == overlap_box:
                loser = span2
            else:
                # Fallback: keep the original lookup for an unexpected box.
                loser = next((span for span in spans if span['bbox'] == overlap_box), None)

            if loser is not None and not any(loser is seen for seen in dropped_spans):
                dropped_spans.append(loser)

    if dropped_spans:
        dropped_ids = {id(span) for span in dropped_spans}
        spans[:] = [span for span in spans if id(span) not in dropped_ids]
        for span in dropped_spans:
            span['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans
|
48
|
+
|
49
|
+
|
50
|
+
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove every span that lies mostly inside one of the given boxes.

    A span is removed when ``calculate_overlap_area_in_bbox1_area_ratio`` of
    its ``bbox`` against any box in ``need_remove_spans_bboxes`` is greater
    than 0.5. ``spans`` is modified in place.

    Args:
        spans: list of span dicts; each needs a ``bbox`` key.
        need_remove_spans_bboxes: iterable of boxes to clear spans from.

    Returns:
        The same ``spans`` list, after removal.
    """
    doomed = []
    for candidate in spans:
        covered = any(
            calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], box) > 0.5
            for box in need_remove_spans_bboxes
        )
        # Equality-based de-duplication, as in the list-membership test.
        if covered and candidate not in doomed:
            doomed.append(candidate)

    for candidate in doomed:
        spans.remove(candidate)

    return spans
|
66
|
+
|
67
|
+
|
68
|
+
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Remove spans covered by tagged drop regions and tag them accordingly.

    ``need_remove_spans_bboxes_dict`` maps a drop tag to a list of boxes. For
    each tag in turn, a span is dropped when more than half of its area lies
    inside one of the boxes. For ``DropTag.FOOTNOTE`` a span is also dropped
    when its vertical centre is below the box's bottom edge and its
    horizontal centre lies strictly between the box's left and right edges.

    Dropped spans are removed from ``spans`` in place and get ``span['tag']``
    set to the drop tag.

    Returns:
        tuple: ``(spans, dropped_spans)``.
    """

    def _hits(span, removed_bbox, drop_tag):
        # First rule: the span is mostly inside the removed box.
        if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
            return True
        # Second rule applies to footnotes only: the span sits underneath the
        # footnote box, within its horizontal extent.
        if not drop_tag == DropTag.FOOTNOTE:
            return False
        center_y = (span['bbox'][1] + span['bbox'][3]) / 2
        if not center_y > removed_bbox[3]:
            return False
        center_x = (span['bbox'][0] + span['bbox'][2]) / 2
        return removed_bbox[0] < center_x < removed_bbox[2]

    dropped_spans = []
    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
        matched = [
            span for span in spans
            if any(_hits(span, removed_bbox, drop_tag) for removed_bbox in removed_bboxes)
        ]
        for span in matched:
            spans.remove(span)
            span['tag'] = drop_tag
            dropped_spans.append(span)

    return spans, dropped_spans
|
91
|
+
|
92
|
+
|
93
|
+
def adjust_bbox_for_standalone_block(spans):
    """Lower the top edge of standalone spans that have text to their left.

    For every interline-equation, image or table span, each text or
    inline-equation span is checked. If the standalone span's vertical range
    strictly contains the text span's range and the text span starts further
    left, the standalone span's ``bbox[1]`` (y0) is set to the text span's y0.
    The bbox lists are modified in place, so a later text span is compared
    against the already adjusted box.

    Returns:
        The same ``spans`` list.
    """
    for sb_span in spans:
        if sb_span['type'] not in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for text_span in spans:
            if text_span['type'] not in [ContentType.Text, ContentType.InlineEquation]:
                continue
            standalone_box = sb_span['bbox']
            text_box = text_span['bbox']
            # The text line must lie strictly inside the standalone box's height ...
            encloses_vertically = standalone_box[1] < text_box[1] and standalone_box[3] > text_box[3]
            # ... and begin to the left of it.
            if encloses_vertically and text_box[0] < standalone_box[0]:
                standalone_box[1] = text_box[1]
    return spans
|
106
|
+
|
107
|
+
|
108
|
+
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and give every span of a text line one y-range.

    ``spans`` is sorted in place by ``bbox[1]`` and then walked top to bottom:

    * An interline-equation, image or table span always starts a line of its
      own and is appended to ``displayed_list``; the span that follows it
      starts a new line as well.
    * Any other span joins the current line when
      ``__is_overlaps_y_exceeds_threshold`` says it overlaps the line's last
      span vertically; otherwise it starts a new line.
    * The y-range recorded for a line is taken from the span that opened it
      and is overwritten by each span of type ``"text"`` that joins it.

    Finished lines are appended to ``text_inline_lines`` as
    ``(line_spans, (y0, y1))``. Afterwards every line in
    ``text_inline_lines`` is sorted by ``bbox[0]`` and each of its spans gets
    ``bbox[1]``/``bbox[3]`` overwritten with the line's recorded range.

    Nothing is returned; results are delivered through the mutated arguments.
    An empty ``spans`` list is a no-op.
    """
    if len(spans) == 0:
        return

    spans.sort(key=lambda span: span['bbox'][1])

    standalone_types = [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
    inline_types = [ContentType.Text, ContentType.InlineEquation]

    current_line = [spans[0]]
    if spans[0]["type"] in standalone_types:
        displayed_list.append(spans[0])

    line_first_y0 = spans[0]["bbox"][1]
    line_first_y = spans[0]["bbox"][3]

    for span in spans[1:]:
        # A standalone span, or any span following one, forces a line break.
        if span['type'] in standalone_types or any(s['type'] in standalone_types for s in current_line):
            if span["type"] in standalone_types:
                displayed_list.append(span)
            # Only lines that hold inline content are recorded as text lines.
            if len(current_line) > 1 or current_line[0]["type"] in inline_types:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # Same line: a plain text span redefines the line's y-range.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            # No vertical overlap: close the current line and start a new one.
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line.
    if len(current_line) > 1 or current_line[0]["type"] in inline_types:
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    # Order the spans of every text line from left to right.
    for line in text_inline_lines:
        line[0].sort(key=lambda span: span['bbox'][0])

    # Give every span of a text line the same top and bottom.
    for line in text_inline_lines:
        line_spans, (line_y0, line_y1) = line
        for span in line_spans:
            span["bbox"][1] = line_y0
            span["bbox"][3] = line_y1
|
178
|
+
|
179
|
+
|
180
|
+
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Re-classify interline equations that actually sit on a text line.

    ``displayed_list`` and ``text_inline_lines`` are walked together with a
    shared cursor ``j`` into ``text_inline_lines``; the cursor only moves
    forward. NOTE(review): this presumes both lists are ordered top to
    bottom -- confirm against the caller.

    For each displayed span, the first text line that it vertically overlaps
    (according to ``__is_overlaps_y_exceeds_threshold``) is examined. If the
    span is an interline equation it becomes an inline equation, and its
    ``bbox`` y-range is set to that of the text line, when either

    * the text line is the last one, or
    * the span does not overlap the following text line and is less than
      three times as tall as the current text line.

    Returns:
        ``spans`` exactly as passed in (only entries of ``displayed_list``
        are mutated).
    """
    j = 0
    for span in displayed_list:
        span_top, span_bottom = span["bbox"][1], span["bbox"][3]

        while j < len(text_inline_lines):
            y0, y1 = text_inline_lines[j][1]

            line_edge_inside_span = (
                span_top < y0 < span_bottom
                or span_top < y1 < span_bottom
                or (span_top < y0 and span_bottom > y1)
            )
            if line_edge_inside_span and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                if span["type"] == ContentType.InterlineEquation:
                    if j + 1 >= len(text_inline_lines):
                        # The matched text line is the last one.
                        convert = True
                    else:
                        # Keep it as an interline equation when it also reaches
                        # the next text line or is at least 3x the line height.
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        convert = (
                            not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next))
                            and 3 * (y1 - y0) > span_bottom - span_top
                        )
                    if convert:
                        span["type"] = ContentType.InlineEquation
                        span["bbox"][1] = y0
                        span["bbox"][3] = y1
                break

            # The text line is already below the span (or only grazes it):
            # stop here and keep the cursor for the next displayed span.
            if span_bottom < y0 or (
                span_top < y0 < span_bottom
                and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
            ):
                break

            j += 1

    return spans
|
220
|
+
|
221
|
+
|
222
|
+
def get_qa_need_list(blocks):
    """Collect image, table and equation spans from a list of blocks.

    Walks ``block["lines"][*]["spans"][*]`` for every block and sorts each
    span into a bucket by its ``type``. The spans themselves (not copies) are
    collected; spans of any other type are ignored.

    Returns:
        tuple: ``(images, tables, interline_equations, inline_equations)``.
    """
    images, tables = [], []
    interline_equations, inline_equations = [], []

    all_spans = (
        span
        for block in blocks
        for line in block["lines"]
        for span in line["spans"]
    )
    for span in all_spans:
        span_type = span["type"]
        if span_type == ContentType.Image:
            images.append(span)
        elif span_type == ContentType.Table:
            tables.append(span)
        elif span_type == ContentType.InlineEquation:
            inline_equations.append(span)
        elif span_type == ContentType.InterlineEquation:
            interline_equations.append(span)

    return images, tables, interline_equations, inline_equations
|
243
|
+
|
244
|
+
|
245
|
+
def get_qa_need_list_v2(blocks):
    """Collect image, table and interline-equation blocks.

    Sorts each block into a bucket by ``block["type"]``. The blocks
    themselves (not copies) are collected; other block types are ignored.

    Returns:
        tuple: ``(images, tables, interline_equations)``.
    """
    images, tables, interline_equations = [], [], []

    for block in blocks:
        block_type = block["type"]
        if block_type == BlockType.Image:
            images.append(block)
        elif block_type == BlockType.Table:
            tables.append(block)
        elif block_type == BlockType.InterlineEquation:
            interline_equations.append(block)

    return images, tables, interline_equations
|
@@ -0,0 +1,74 @@
|
|
1
|
+
from magic_pdf.libs.commons import fitz
|
2
|
+
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
|
3
|
+
from magic_pdf.libs.drop_reason import DropReason
|
4
|
+
|
5
|
+
|
6
|
+
def __area(box):
    """Return the signed area of an ``[x0, y0, x1, y1]`` box."""
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
|
8
|
+
|
9
|
+
def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
    """Check whether the page has a large coloured rectangle with text in it.

    Candidate rectangles are the drawings returned by ``page.get_cdrawings()``
    that have a non-empty, non-white ``fill``, cover at least 10x10 units and
    do not touch any box in ``image_bboxes`` (such rectangles are assumed to
    belong to vector images). Only the largest candidate is examined: it must
    be wider than 20% of the page and taller than 10% of the page, and at
    least one text block must lie inside it (coordinates truncated to int).

    Args:
        page: the page whose drawings are inspected.
        text_blocks: iterable of dicts with a ``bbox`` key.
        image_bboxes: iterable of image boxes used to filter drawings.

    Returns:
        True if such a text-bearing background rectangle exists.
    """
    color_bg_rect = []
    p_width, p_height = page.rect.width, page.rect.height

    for block in page.get_cdrawings():
        if 'fill' not in block or not block['fill']:
            # No fill at all: transparent drawing.
            continue

        fill = list(block['fill'])
        fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
        # Compare as a tuple: a list never equals a tuple in Python, so
        # comparing the list directly would never recognise a white fill.
        if tuple(fill) == (1.0, 1.0, 1.0):
            continue

        rect = block['rect']
        # Ignore tiny rectangles.
        if __area(rect) < 10 * 10:
            continue
        # Ignore colour patches that belong to (e.g. SVG) images.
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if not color_bg_rect:
        return False

    max_rect = max(color_bg_rect, key=__area)
    max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))

    # TODO: also require more than 3 lines / 50 characters of text inside.
    is_large_enough = (
        max_rect[2] - max_rect[0] > 0.2 * p_width
        and max_rect[3] - max_rect[1] > 0.1 * p_height
    )
    if not is_large_enough:
        return False

    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True

    return False
|
49
|
+
|
50
|
+
|
51
|
+
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Return True if ``table_bbox`` contains or partly overlaps any text block.

    Each element of ``text_blocks`` must provide a ``bbox`` key; the check is
    delegated to ``_is_in_or_part_overlap``.
    """
    # TODO (carried over from the original docstring, which did not elaborate).
    return any(
        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
        for text_block in text_blocks
    )
|
61
|
+
|
62
|
+
|
63
|
+
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page is acceptable for further processing.

    The only check performed is for text placed on a coloured background
    rectangle; ``table_bboxes`` is accepted but not used.

    Returns:
        tuple: ``(True, None)`` when the page passes, otherwise
        ``(False, info)`` where ``info`` is a dict carrying ``_need_drop`` and
        ``_drop_reason``.
    """
    has_colored_text_box = __is_contain_color_background_rect(page, text_blocks, image_bboxes)
    if not has_colored_text_box:
        return True, None

    return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
|
File without changes
|
@@ -0,0 +1,98 @@
|
|
1
|
+
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
|
2
|
+
from magic_pdf.libs.drop_reason import DropReason
|
3
|
+
|
4
|
+
def _remove_overlap_between_bbox(bbox1, bbox2):
    """Shrink two partially overlapping boxes so they no longer overlap.

    If ``_is_part_overlap`` reports no partial overlap, both boxes are
    returned untouched. Otherwise the boxes are cut apart along the axis in
    which the overlap is thinner: a cut position is chosen by floor-dividing
    the sum of the two facing edges by 2, and each box is pulled back to
    0.25 units on its own side of that cut (never growing a box).

    Returns:
        tuple: ``(bbox1, bbox2, drop_reason)``. On success the boxes are new
        4-element lists and ``drop_reason`` is None. If the cut would leave
        either box with non-positive width or height, the *original* boxes
        are returned with ``DropReason.NEGATIVE_BBOX_AREA``.
    """
    if not _is_part_overlap(bbox1, bbox2):
        return bbox1, bbox2, None

    ax0, ay0, ax1, ay1 = bbox1
    bx0, by0, bx1, by1 = bbox2

    overlap_w = min(bx1, ax1) - max(bx0, ax0)
    overlap_h = min(by1, ay1) - max(by0, ay0)

    if overlap_h > overlap_w:
        # Overlap is taller than wide: the boxes sit side by side, cut in x.
        if bx1 >= ax1:
            cut = (bx0 + ax1) // 2
            ax1 = min(cut - 0.25, ax1)
            bx0 = max(cut + 0.25, bx0)
        else:
            cut = (ax0 + bx1) // 2
            ax0 = max(cut + 0.25, ax0)
            bx1 = min(cut - 0.25, bx1)
    elif by1 >= ay1:
        # Otherwise cut in y; here bbox2 reaches at least as far down as bbox1.
        cut = (by0 + ay1) // 2
        by0 = max(cut + 0.25, by0)
        ay1 = min(ay1, cut - 0.25)
    else:
        cut = (ay0 + by1) // 2
        by1 = min(by1, cut - 0.25)
        ay0 = max(cut + 0.25, ay0)

    both_still_valid = ax1 > ax0 and ay1 > ay0 and by1 > by0 and bx1 > bx0
    if not both_still_valid:
        return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA

    return [ax0, ay0, ax1, ay1], [bx0, by0, bx1, by1], None
|
39
|
+
|
40
|
+
|
41
|
+
def _remove_overlap_between_bboxes(arr):
    """Resolve containment and partial overlaps within a list of scored boxes.

    Args:
        arr: list of dicts with ``bbox`` and ``score`` keys. The ``bbox``
            values of kept entries may be replaced with trimmed boxes.

    Returns:
        tuple: ``(res, drop_reasons)``. ``res`` has the same length as
        ``arr``; position ``i`` holds ``arr[i]`` if it was kept and None if it
        was dropped. ``drop_reasons`` lists the reason returned by
        ``_remove_overlap_between_bbox`` for every unresolvable overlap.
    """
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N

    # Pass 1: discard every box that _is_in reports as inside another box.
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
                keeps[i] = False

    # Pass 2: trim each surviving box against the boxes accepted so far.
    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue

            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
            if drop_reason is None:
                v["bbox"] = bbox1
                res[i]["bbox"] = bbox2
            else:
                # The overlap cannot be trimmed away: keep the higher score.
                if v["score"] > res[i]["score"]:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                    # NOTE(review): the loop keeps comparing ``v`` with the
                    # remaining accepted boxes even though ``v`` is now
                    # dropped -- confirm whether a ``break`` is intended.
                # Record the reason itself (not the list it is stored in).
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons
|
74
|
+
|
75
|
+
|
76
|
+
def remove_overlap_between_bbox_for_span(spans):
    """Remove overlaps between span boxes and return the surviving spans.

    Every span is reduced to its ``bbox`` and ``score`` (0.1 when the span
    has no score) and handed to ``_remove_overlap_between_bboxes``. Surviving
    spans get their ``bbox`` replaced by the resolved box.

    Returns:
        tuple: ``(kept_spans, drop_reasons)``.
    """
    candidates = [
        {"bbox": span["bbox"], "score": span.get("score", 0.1)}
        for span in spans
    ]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_spans = []
    for index, item in enumerate(resolved):
        if item is None:
            # This span lost to another one.
            continue
        survivor = spans[index]
        survivor["bbox"] = item["bbox"]
        kept_spans.append(survivor)
    return kept_spans, drop_reasons
|
86
|
+
|
87
|
+
|
88
|
+
def remove_overlap_between_bbox_for_block(all_bboxes):
    """Remove overlaps between block boxes and return the surviving entries.

    Each entry of ``all_bboxes`` is a sequence whose first four items are the
    box coordinates and whose last item is used as the score. Surviving
    entries get their first four items overwritten with the resolved box.

    Returns:
        tuple: ``(kept_bboxes, drop_reasons)``.
    """
    candidates = [
        {"bbox": entry[:4], "score": entry[-1]}
        for entry in all_bboxes
    ]
    resolved, drop_reasons = _remove_overlap_between_bboxes(candidates)

    kept_bboxes = []
    for index, item in enumerate(resolved):
        if item is None:
            # This block lost to another one.
            continue
        survivor = all_bboxes[index]
        survivor[:4] = item["bbox"]
        kept_bboxes.append(survivor)
    return kept_bboxes, drop_reasons
|
98
|
+
|
@@ -0,0 +1,79 @@
|
|
1
|
+
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
|
2
|
+
from loguru import logger
|
3
|
+
|
4
|
+
from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
|
5
|
+
|
6
|
+
|
7
|
+
def __area(box):
    """Return width times height of an ``[x0, y0, x1, y1]`` box."""
    span_x = box[2] - box[0]
    span_y = box[3] - box[1]
    return span_x * span_y
|
9
|
+
|
10
|
+
|
11
|
+
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies near the vertical centre line of the page.

    Args:
        rect (list): rectangle coordinates as ``[x1, y1, x2, y2]``.
        p_width (int): page width.

    Returns:
        bool: True if the rectangle straddles the centre line, or if the
        distance from its nearer edge to the centre line is less than 20% of
        the page width; False otherwise.
    """
    center_x = p_width / 2

    # A rectangle crossing the centre line qualifies immediately.
    if rect[0] < center_x and rect[2] > center_x:
        return True

    # Otherwise measure the gap on the side facing the centre line.
    if rect[0] > center_x:
        gap = rect[0] - center_x
    else:
        gap = center_x - rect[2]

    return gap < p_width * 0.2
|
36
|
+
|
37
|
+
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Remove text blocks that sit on a coloured header strip.

    A drawing from ``page.get_cdrawings()`` counts as a coloured strip when it
    has a fill other than ``(1.0, 1.0, 1.0)``, an area above 100, lies near
    the page's vertical centre line, ends within the top 30% of the page and
    is more than four times as wide as it is tall.

    A text block is removed when it lies inside such a strip, or overlaps it
    with ``calculate_overlap_area_2_minbox_area_ratio`` above 0.6. Removed
    blocks are tagged ``COLOR_BG_HEADER_TXT_BLOCK`` and taken out of
    ``remain_text_blocks`` in place. Each removed block is reported once,
    even if it matches several strips.

    Args:
        remain_text_blocks (list): text blocks still under consideration.
        page (Page): page object; not accessed when the list is empty.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)``.
    """
    colored_strip_textblocks = []
    if len(remain_text_blocks) == 0:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    colored_strip_bg_rect = []
    for block in page.get_cdrawings():
        # Skip drawings without a fill and pure white fills.
        is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
        rect = block['rect']
        area_is_large_enough = __area(rect) > 100
        near_center_line = rectangle_position_determination(rect, p_width)
        in_upper_part_of_page = rect[3] < p_height * 0.3
        aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4

        if is_filled and area_is_large_enough and near_center_line and in_upper_part_of_page and aspect_ratio_exceeds_4:
            colored_strip_bg_rect.append(rect)

    for colored_strip_block_bbox in colored_strip_bg_rect:
        for text_block in remain_text_blocks:
            # A block already matched by an earlier strip must not be
            # reported a second time.
            if any(text_block is seen for seen in colored_strip_textblocks):
                continue
            text_bbox = text_block['bbox']
            if _is_in(text_bbox, colored_strip_block_bbox) or (
                _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6
            ):
                logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)

    if colored_strip_textblocks:
        # Filter in place so the caller's list object is the one returned.
        removed_ids = {id(text_block) for text_block in colored_strip_textblocks}
        remain_text_blocks[:] = [
            text_block for text_block in remain_text_blocks
            if id(text_block) not in removed_ids
        ]

    return remain_text_blocks, colored_strip_textblocks
|
79
|
+
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
|
4
|
+
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
|
5
|
+
|
6
|
+
|
7
|
+
def _enclosing_bbox(bboxes):
    """Return the smallest ``[x0, y0, x1, y1]`` covering all boxes, or ``[]`` if there are none."""
    if not bboxes:
        return []
    return [
        min(x0 for x0, _, _, _ in bboxes),
        min(y0 for _, y0, _, _ in bboxes),
        max(x1 for _, _, x1, _ in bboxes),
        max(y1 for _, _, _, y1 in bboxes),
    ]


def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Remove header, footer and page-number content from one page.

    The content area is bounded above by the bottom of the detected headers
    and below by the top of the detected footers; page-number boxes in the
    upper / lower half of the page tighten those bounds further. Spans that
    fall into the header or footer band are deleted, then lines left without
    spans, and finally blocks left without lines are tagged
    ``CONTENT_IN_FOOT_OR_HEADER`` and moved to the removal list.

    Page numbers are handled at span level: spans touching a page-number box
    are tagged ``PAGE_NO``, and a block consisting of just that one span is
    removed. When no page-number boxes are given, the lowest block is treated
    as a page number if it is a single span containing a digit and no Latin
    letters.

    ``text_raw_blocks`` and the blocks inside it are modified in place.
    NOTE(review): in the no-page-number branch the list is also re-sorted by
    descending ``bbox[1]``, which changes the order of the returned blocks --
    confirm callers do not depend on the incoming order.

    Returns:
        tuple: ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
    """
    header = _enclosing_bbox(header_bboxs)
    footer = _enclosing_bbox(footer_bboxs)

    header_y0 = header[3] if header else 0
    footer_y0 = footer[1] if footer else page_h
    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]

    # From here on header/footer are full-width bands, not the detected boxes.
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_or_footer(span):
        if span['bbox'][3] < header_y0:
            return True
        return _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer)

    # The bands are known; prune every text block against them.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Collect *all* offending spans of the line before deleting; the
            # accumulator must not be reset per span.
            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span)]
        # Likewise drop every line that ended up (or already was) empty.
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]

        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # Page numbers are small and often overlap the content area slightly, so
    # they are additionally handled span by span.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # A block made of this single span goes away entirely.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    else:
        # Heuristic: the lowest block is a page number if it has exactly one
        # line with one span whose text contains a digit and no Latin letter.
        if len(text_raw_blocks) > 0:
            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
            last_block = text_raw_blocks[0]
            if len(last_block['lines']) == 1:
                last_line = last_block['lines'][0]
                if len(last_line['spans']) == 1:
                    last_span = last_line['spans'][0]
                    text = last_span['text']
                    if text.strip() and not re.search('[a-zA-Z]', text) and re.search('[0-9]', text):
                        last_span['tag'] = PAGE_NO
                        page_no_block_2_remove.append(last_block)

    for b in page_no_block_2_remove:
        text_block_to_remove.append(b)

    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)

    text_block_remain = text_raw_blocks
    image_bbox_to_remove = [bbox for bbox in image_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    image_bbox_remain = [bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_to_remove = [bbox for bbox in table_bboxes if not _is_in_or_part_overlap(bbox, content_boundry)]
    table_bbox_remain = [bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)]

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
|