magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,10 @@
|
|
1
|
-
from loguru import logger
|
2
1
|
|
3
|
-
from magic_pdf.
|
4
|
-
|
5
|
-
from magic_pdf.libs.
|
6
|
-
|
2
|
+
from magic_pdf.config.drop_tag import DropTag
|
3
|
+
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
4
|
+
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
|
5
|
+
calculate_iou,
|
6
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
7
|
+
get_minbox_if_overlap_by_ratio)
|
7
8
|
|
8
9
|
|
9
10
|
def remove_overlaps_low_confidence_spans(spans):
|
@@ -21,7 +22,10 @@ def remove_overlaps_low_confidence_spans(spans):
|
|
21
22
|
span_need_remove = span1
|
22
23
|
else:
|
23
24
|
span_need_remove = span2
|
24
|
-
if
|
25
|
+
if (
|
26
|
+
span_need_remove is not None
|
27
|
+
and span_need_remove not in dropped_spans
|
28
|
+
):
|
25
29
|
dropped_spans.append(span_need_remove)
|
26
30
|
|
27
31
|
if len(dropped_spans) > 0:
|
@@ -38,12 +42,15 @@ def remove_overlaps_min_spans(spans):
|
|
38
42
|
for span1 in spans:
|
39
43
|
for span2 in spans:
|
40
44
|
if span1 != span2:
|
41
|
-
|
42
|
-
if
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
45
|
+
# span1 或 span2 任何一个都不应该在 dropped_spans 中
|
46
|
+
if span1 in dropped_spans or span2 in dropped_spans:
|
47
|
+
continue
|
48
|
+
else:
|
49
|
+
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
|
50
|
+
if overlap_box is not None:
|
51
|
+
span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
|
52
|
+
if span_need_remove is not None and span_need_remove not in dropped_spans:
|
53
|
+
dropped_spans.append(span_need_remove)
|
47
54
|
if len(dropped_spans) > 0:
|
48
55
|
for span_need_remove in dropped_spans:
|
49
56
|
spans.remove(span_need_remove)
|
@@ -58,7 +65,10 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
|
|
58
65
|
need_remove_spans = []
|
59
66
|
for span in spans:
|
60
67
|
for removed_bbox in need_remove_spans_bboxes:
|
61
|
-
if
|
68
|
+
if (
|
69
|
+
calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
|
70
|
+
> 0.5
|
71
|
+
):
|
62
72
|
if span not in need_remove_spans:
|
63
73
|
need_remove_spans.append(span)
|
64
74
|
break
|
@@ -78,12 +88,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
|
|
78
88
|
for span in spans:
|
79
89
|
# 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
|
80
90
|
for removed_bbox in removed_bboxes:
|
81
|
-
if
|
91
|
+
if (
|
92
|
+
calculate_overlap_area_in_bbox1_area_ratio(
|
93
|
+
span['bbox'], removed_bbox
|
94
|
+
)
|
95
|
+
> 0.5
|
96
|
+
):
|
82
97
|
need_remove_spans.append(span)
|
83
98
|
break
|
84
99
|
# 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span
|
85
|
-
elif
|
86
|
-
|
100
|
+
elif (
|
101
|
+
drop_tag == DropTag.FOOTNOTE
|
102
|
+
and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
|
103
|
+
and removed_bbox[0]
|
104
|
+
< (span['bbox'][0] + span['bbox'][2]) / 2
|
105
|
+
< removed_bbox[2]
|
106
|
+
):
|
87
107
|
need_remove_spans.append(span)
|
88
108
|
break
|
89
109
|
|
@@ -98,11 +118,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
|
|
98
118
|
def adjust_bbox_for_standalone_block(spans):
|
99
119
|
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
100
120
|
for sb_span in spans:
|
101
|
-
if sb_span['type'] in [
|
121
|
+
if sb_span['type'] in [
|
122
|
+
ContentType.InterlineEquation,
|
123
|
+
ContentType.Image,
|
124
|
+
ContentType.Table,
|
125
|
+
]:
|
102
126
|
for text_span in spans:
|
103
127
|
if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
|
104
128
|
# 判断span2的纵向高度是否被span所覆盖
|
105
|
-
if
|
129
|
+
if (
|
130
|
+
sb_span['bbox'][1] < text_span['bbox'][1]
|
131
|
+
and sb_span['bbox'][3] > text_span['bbox'][3]
|
132
|
+
):
|
106
133
|
# 判断span2是否在span左边
|
107
134
|
if text_span['bbox'][0] < sb_span['bbox'][0]:
|
108
135
|
# 调整span的y0和span2的y0一致
|
@@ -120,11 +147,15 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
|
120
147
|
|
121
148
|
lines = []
|
122
149
|
current_line = [spans[0]]
|
123
|
-
if spans[0][
|
150
|
+
if spans[0]['type'] in [
|
151
|
+
ContentType.InterlineEquation,
|
152
|
+
ContentType.Image,
|
153
|
+
ContentType.Table,
|
154
|
+
]:
|
124
155
|
displayed_list.append(spans[0])
|
125
156
|
|
126
|
-
line_first_y0 = spans[0][
|
127
|
-
line_first_y = spans[0][
|
157
|
+
line_first_y0 = spans[0]['bbox'][1]
|
158
|
+
line_first_y = spans[0]['bbox'][3]
|
128
159
|
# 用于给行间公式搜索
|
129
160
|
# text_inline_lines = []
|
130
161
|
for span in spans[1:]:
|
@@ -132,26 +163,43 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
|
132
163
|
# print("debug")
|
133
164
|
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
|
134
165
|
# image和table类型,同上
|
135
|
-
if span['type'] in [
|
136
|
-
|
137
|
-
|
166
|
+
if span['type'] in [
|
167
|
+
ContentType.InterlineEquation,
|
168
|
+
ContentType.Image,
|
169
|
+
ContentType.Table,
|
170
|
+
] or any(
|
171
|
+
s['type']
|
172
|
+
in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
|
173
|
+
for s in current_line
|
174
|
+
):
|
138
175
|
# 传入
|
139
|
-
if span[
|
176
|
+
if span['type'] in [
|
177
|
+
ContentType.InterlineEquation,
|
178
|
+
ContentType.Image,
|
179
|
+
ContentType.Table,
|
180
|
+
]:
|
140
181
|
displayed_list.append(span)
|
141
182
|
# 则开始新行
|
142
183
|
lines.append(current_line)
|
143
|
-
if len(current_line) > 1 or current_line[0][
|
144
|
-
|
184
|
+
if len(current_line) > 1 or current_line[0]['type'] in [
|
185
|
+
ContentType.Text,
|
186
|
+
ContentType.InlineEquation,
|
187
|
+
]:
|
188
|
+
text_inline_lines.append(
|
189
|
+
(current_line, (line_first_y0, line_first_y))
|
190
|
+
)
|
145
191
|
current_line = [span]
|
146
|
-
line_first_y0 = span[
|
147
|
-
line_first_y = span[
|
192
|
+
line_first_y0 = span['bbox'][1]
|
193
|
+
line_first_y = span['bbox'][3]
|
148
194
|
continue
|
149
195
|
|
150
196
|
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
151
|
-
if __is_overlaps_y_exceeds_threshold(
|
152
|
-
|
153
|
-
|
154
|
-
|
197
|
+
if __is_overlaps_y_exceeds_threshold(
|
198
|
+
span['bbox'], current_line[-1]['bbox']
|
199
|
+
):
|
200
|
+
if span['type'] == 'text':
|
201
|
+
line_first_y0 = span['bbox'][1]
|
202
|
+
line_first_y = span['bbox'][3]
|
155
203
|
current_line.append(span)
|
156
204
|
|
157
205
|
else:
|
@@ -159,13 +207,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
|
159
207
|
lines.append(current_line)
|
160
208
|
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
161
209
|
current_line = [span]
|
162
|
-
line_first_y0 = span[
|
163
|
-
line_first_y = span[
|
210
|
+
line_first_y0 = span['bbox'][1]
|
211
|
+
line_first_y = span['bbox'][3]
|
164
212
|
|
165
213
|
# 添加最后一行
|
166
214
|
if current_line:
|
167
215
|
lines.append(current_line)
|
168
|
-
if len(current_line) > 1 or current_line[0][
|
216
|
+
if len(current_line) > 1 or current_line[0]['type'] in [
|
217
|
+
ContentType.Text,
|
218
|
+
ContentType.InlineEquation,
|
219
|
+
]:
|
169
220
|
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
|
170
221
|
for line in text_inline_lines:
|
171
222
|
# 按照x0坐标排序
|
@@ -176,8 +227,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
|
|
176
227
|
for line in text_inline_lines:
|
177
228
|
current_line, (line_first_y0, line_first_y) = line
|
178
229
|
for span in current_line:
|
179
|
-
span[
|
180
|
-
span[
|
230
|
+
span['bbox'][1] = line_first_y0
|
231
|
+
span['bbox'][3] = line_first_y
|
181
232
|
|
182
233
|
# return spans, displayed_list, text_inline_lines
|
183
234
|
|
@@ -189,34 +240,42 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
|
|
189
240
|
# if i == 8:
|
190
241
|
# print("debug")
|
191
242
|
span = displayed_list[i]
|
192
|
-
span_y0, span_y = span[
|
243
|
+
span_y0, span_y = span['bbox'][1], span['bbox'][3]
|
193
244
|
|
194
245
|
while j < len(text_inline_lines):
|
195
246
|
text_line = text_inline_lines[j]
|
196
247
|
y0, y1 = text_line[1]
|
197
248
|
if (
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
249
|
+
span_y0 < y0 < span_y
|
250
|
+
or span_y0 < y1 < span_y
|
251
|
+
or span_y0 < y0
|
252
|
+
and span_y > y1
|
253
|
+
) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
|
202
254
|
# 调整公式类型
|
203
|
-
if span[
|
255
|
+
if span['type'] == ContentType.InterlineEquation:
|
204
256
|
# 最后一行是行间公式
|
205
257
|
if j + 1 >= len(text_inline_lines):
|
206
|
-
span[
|
207
|
-
span[
|
208
|
-
span[
|
258
|
+
span['type'] = ContentType.InlineEquation
|
259
|
+
span['bbox'][1] = y0
|
260
|
+
span['bbox'][3] = y1
|
209
261
|
else:
|
210
262
|
# 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
|
211
263
|
y0_next, y1_next = text_inline_lines[j + 1][1]
|
212
|
-
if
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
264
|
+
if (
|
265
|
+
not __is_overlaps_y_exceeds_threshold(
|
266
|
+
span['bbox'], (0, y0_next, 0, y1_next)
|
267
|
+
)
|
268
|
+
and 3 * (y1 - y0) > span_y - span_y0
|
269
|
+
):
|
270
|
+
span['type'] = ContentType.InlineEquation
|
271
|
+
span['bbox'][1] = y0
|
272
|
+
span['bbox'][3] = y1
|
217
273
|
break
|
218
|
-
elif
|
219
|
-
|
274
|
+
elif (
|
275
|
+
span_y < y0
|
276
|
+
or span_y0 < y0 < span_y
|
277
|
+
and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
|
278
|
+
):
|
220
279
|
break
|
221
280
|
else:
|
222
281
|
j += 1
|
@@ -232,15 +291,15 @@ def get_qa_need_list(blocks):
|
|
232
291
|
inline_equations = []
|
233
292
|
|
234
293
|
for block in blocks:
|
235
|
-
for line in block[
|
236
|
-
for span in line[
|
237
|
-
if span[
|
294
|
+
for line in block['lines']:
|
295
|
+
for span in line['spans']:
|
296
|
+
if span['type'] == ContentType.Image:
|
238
297
|
images.append(span)
|
239
|
-
elif span[
|
298
|
+
elif span['type'] == ContentType.Table:
|
240
299
|
tables.append(span)
|
241
|
-
elif span[
|
300
|
+
elif span['type'] == ContentType.InlineEquation:
|
242
301
|
inline_equations.append(span)
|
243
|
-
elif span[
|
302
|
+
elif span['type'] == ContentType.InterlineEquation:
|
244
303
|
interline_equations.append(span)
|
245
304
|
else:
|
246
305
|
continue
|
@@ -254,10 +313,10 @@ def get_qa_need_list_v2(blocks):
|
|
254
313
|
interline_equations = []
|
255
314
|
|
256
315
|
for block in blocks:
|
257
|
-
if block[
|
316
|
+
if block['type'] == BlockType.Image:
|
258
317
|
images.append(block)
|
259
|
-
elif block[
|
318
|
+
elif block['type'] == BlockType.Table:
|
260
319
|
tables.append(block)
|
261
|
-
elif block[
|
320
|
+
elif block['type'] == BlockType.InterlineEquation:
|
262
321
|
interline_equations.append(block)
|
263
322
|
return images, tables, interline_equations
|
@@ -1,58 +1,65 @@
|
|
1
|
-
from magic_pdf.
|
1
|
+
from magic_pdf.config.drop_reason import DropReason
|
2
2
|
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
|
3
|
-
from magic_pdf.libs.
|
3
|
+
from magic_pdf.libs.commons import fitz
|
4
4
|
|
5
5
|
|
6
6
|
def __area(box):
|
7
7
|
return (box[2] - box[0]) * (box[3] - box[1])
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
9
|
+
|
10
|
+
def __is_contain_color_background_rect(
|
11
|
+
page: fitz.Page, text_blocks, image_bboxes
|
12
|
+
) -> bool:
|
13
|
+
"""检查page是包含有颜色背景的矩形."""
|
13
14
|
color_bg_rect = []
|
14
15
|
p_width, p_height = page.rect.width, page.rect.height
|
15
|
-
|
16
|
+
|
16
17
|
# 先找到最大的带背景矩形
|
17
18
|
blocks = page.get_cdrawings()
|
18
19
|
for block in blocks:
|
19
|
-
|
20
|
-
if 'fill' in block and block['fill']: # 过滤掉透明的
|
20
|
+
if 'fill' in block and block['fill']: # 过滤掉透明的
|
21
21
|
fill = list(block['fill'])
|
22
22
|
fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
|
23
|
-
if fill==(1.0,1.0,1.0):
|
23
|
+
if fill == (1.0, 1.0, 1.0):
|
24
24
|
continue
|
25
25
|
rect = block['rect']
|
26
26
|
# 过滤掉特别小的矩形
|
27
|
-
if __area(rect) < 10*10:
|
27
|
+
if __area(rect) < 10 * 10:
|
28
28
|
continue
|
29
29
|
# 为了防止是svg图片上的色块,这里过滤掉这类
|
30
|
-
|
31
|
-
if any(
|
30
|
+
|
31
|
+
if any(
|
32
|
+
[_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]
|
33
|
+
):
|
32
34
|
continue
|
33
35
|
color_bg_rect.append(rect)
|
34
|
-
|
36
|
+
|
35
37
|
# 找到最大的背景矩形
|
36
38
|
if len(color_bg_rect) > 0:
|
37
|
-
max_rect = max(color_bg_rect, key=lambda x:__area(x))
|
38
|
-
max_rect_int = (
|
39
|
+
max_rect = max(color_bg_rect, key=lambda x: __area(x))
|
40
|
+
max_rect_int = (
|
41
|
+
int(max_rect[0]),
|
42
|
+
int(max_rect[1]),
|
43
|
+
int(max_rect[2]),
|
44
|
+
int(max_rect[3]),
|
45
|
+
)
|
39
46
|
# 判断最大的背景矩形是否包含超过3行文字,或者50个字 TODO
|
40
|
-
if
|
41
|
-
|
47
|
+
if (
|
48
|
+
max_rect[2] - max_rect[0] > 0.2 * p_width
|
49
|
+
and max_rect[3] - max_rect[1] > 0.1 * p_height
|
50
|
+
): # 宽度符合
|
51
|
+
# 看是否有文本块落入到这个矩形中
|
42
52
|
for text_block in text_blocks:
|
43
53
|
box = text_block['bbox']
|
44
54
|
box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
|
45
55
|
if _is_in(box_int, max_rect_int):
|
46
56
|
return True
|
47
|
-
|
57
|
+
|
48
58
|
return False
|
49
59
|
|
50
60
|
|
51
61
|
def __is_table_overlap_text_block(text_blocks, table_bbox):
|
52
|
-
"""
|
53
|
-
检查table_bbox是否覆盖了text_blocks里的文本块
|
54
|
-
TODO
|
55
|
-
"""
|
62
|
+
"""检查table_bbox是否覆盖了text_blocks里的文本块 TODO."""
|
56
63
|
for text_block in text_blocks:
|
57
64
|
box = text_block['bbox']
|
58
65
|
if _is_in_or_part_overlap(table_bbox, box):
|
@@ -60,15 +67,12 @@ def __is_table_overlap_text_block(text_blocks, table_bbox):
|
|
60
67
|
return False
|
61
68
|
|
62
69
|
|
63
|
-
def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
|
64
|
-
"""
|
65
|
-
return:(True|False, err_msg)
|
66
|
-
True, 如果pdf符合要求
|
67
|
-
False, 如果pdf不符合要求
|
68
|
-
|
69
|
-
"""
|
70
|
+
def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
|
71
|
+
"""return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
|
70
72
|
if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
|
71
|
-
return False, {
|
73
|
+
return False, {
|
74
|
+
'_need_drop': True,
|
75
|
+
'_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
|
76
|
+
}
|
72
77
|
|
73
|
-
|
74
|
-
return True, None
|
78
|
+
return True, None
|
@@ -1,8 +1,9 @@
|
|
1
|
-
from magic_pdf.
|
2
|
-
from magic_pdf.libs.
|
1
|
+
from magic_pdf.config.drop_reason import DropReason
|
2
|
+
from magic_pdf.libs.boxbase import _is_in, _is_part_overlap
|
3
|
+
|
3
4
|
|
4
5
|
def _remove_overlap_between_bbox(bbox1, bbox2):
|
5
|
-
|
6
|
+
if _is_part_overlap(bbox1, bbox2):
|
6
7
|
ix0, iy0, ix1, iy1 = bbox1
|
7
8
|
x0, y0, x1, y1 = bbox2
|
8
9
|
|
@@ -22,10 +23,10 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
|
|
22
23
|
if y1 >= iy1:
|
23
24
|
mid = (y0 + iy1) // 2
|
24
25
|
y0 = max(mid + 0.25, y0)
|
25
|
-
iy1 = min(iy1, mid-0.25)
|
26
|
+
iy1 = min(iy1, mid - 0.25)
|
26
27
|
else:
|
27
28
|
mid = (iy0 + y1) // 2
|
28
|
-
y1 = min(y1, mid-0.25)
|
29
|
+
y1 = min(y1, mid - 0.25)
|
29
30
|
iy0 = max(mid + 0.25, iy0)
|
30
31
|
|
31
32
|
if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
|
@@ -34,8 +35,8 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
|
|
34
35
|
return bbox1, bbox2, None
|
35
36
|
else:
|
36
37
|
return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
|
37
|
-
|
38
|
-
|
38
|
+
else:
|
39
|
+
return bbox1, bbox2, None
|
39
40
|
|
40
41
|
|
41
42
|
def _remove_overlap_between_bboxes(arr):
|
@@ -47,7 +48,7 @@ def _remove_overlap_between_bboxes(arr):
|
|
47
48
|
for j in range(N):
|
48
49
|
if i == j:
|
49
50
|
continue
|
50
|
-
if _is_in(arr[i][
|
51
|
+
if _is_in(arr[i]['bbox'], arr[j]['bbox']):
|
51
52
|
keeps[i] = False
|
52
53
|
|
53
54
|
for idx, v in enumerate(arr):
|
@@ -56,13 +57,15 @@ def _remove_overlap_between_bboxes(arr):
|
|
56
57
|
for i in range(N):
|
57
58
|
if res[i] is None:
|
58
59
|
continue
|
59
|
-
|
60
|
-
bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
|
60
|
+
|
61
|
+
bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
|
62
|
+
v['bbox'], res[i]['bbox']
|
63
|
+
)
|
61
64
|
if drop_reason is None:
|
62
|
-
v[
|
63
|
-
res[i][
|
65
|
+
v['bbox'] = bbox1
|
66
|
+
res[i]['bbox'] = bbox2
|
64
67
|
else:
|
65
|
-
if v[
|
68
|
+
if v['score'] > res[i]['score']:
|
66
69
|
keeps[i] = False
|
67
70
|
res[i] = None
|
68
71
|
else:
|
@@ -74,25 +77,24 @@ def _remove_overlap_between_bboxes(arr):
|
|
74
77
|
|
75
78
|
|
76
79
|
def remove_overlap_between_bbox_for_span(spans):
|
77
|
-
arr = [{
|
80
|
+
arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans]
|
78
81
|
res, drop_reasons = _remove_overlap_between_bboxes(arr)
|
79
82
|
ret = []
|
80
83
|
for i in range(len(res)):
|
81
84
|
if res[i] is None:
|
82
85
|
continue
|
83
|
-
spans[i][
|
86
|
+
spans[i]['bbox'] = res[i]['bbox']
|
84
87
|
ret.append(spans[i])
|
85
88
|
return ret, drop_reasons
|
86
89
|
|
87
90
|
|
88
91
|
def remove_overlap_between_bbox_for_block(all_bboxes):
|
89
|
-
arr = [{
|
92
|
+
arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes]
|
90
93
|
res, drop_reasons = _remove_overlap_between_bboxes(arr)
|
91
94
|
ret = []
|
92
95
|
for i in range(len(res)):
|
93
96
|
if res[i] is None:
|
94
97
|
continue
|
95
|
-
all_bboxes[i][:4] = res[i][
|
98
|
+
all_bboxes[i][:4] = res[i]['bbox']
|
96
99
|
ret.append(all_bboxes[i])
|
97
100
|
return ret, drop_reasons
|
98
|
-
|
@@ -1,7 +1,8 @@
|
|
1
|
-
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
|
2
1
|
from loguru import logger
|
3
2
|
|
4
|
-
from magic_pdf.
|
3
|
+
from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
|
4
|
+
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
|
5
|
+
calculate_overlap_area_2_minbox_area_ratio)
|
5
6
|
|
6
7
|
|
7
8
|
def __area(box):
|
@@ -9,8 +10,7 @@ def __area(box):
|
|
9
10
|
|
10
11
|
|
11
12
|
def rectangle_position_determination(rect, p_width):
|
12
|
-
"""
|
13
|
-
判断矩形是否在页面中轴线附近。
|
13
|
+
"""判断矩形是否在页面中轴线附近。
|
14
14
|
|
15
15
|
Args:
|
16
16
|
rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。
|
@@ -34,9 +34,10 @@ def rectangle_position_determination(rect, p_width):
|
|
34
34
|
else:
|
35
35
|
return False
|
36
36
|
|
37
|
+
|
37
38
|
def remove_colored_strip_textblock(remain_text_blocks, page):
|
38
|
-
"""
|
39
|
-
|
39
|
+
"""根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_str
|
40
|
+
ip_textblock。
|
40
41
|
|
41
42
|
Args:
|
42
43
|
remain_text_blocks (list): 剩余文本块列表。
|
@@ -51,22 +52,44 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
|
|
51
52
|
blocks = page.get_cdrawings()
|
52
53
|
colored_strip_bg_rect = []
|
53
54
|
for block in blocks:
|
54
|
-
is_filled =
|
55
|
+
is_filled = (
|
56
|
+
'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
|
57
|
+
) # 过滤掉透明的
|
55
58
|
rect = block['rect']
|
56
59
|
area_is_large_enough = __area(rect) > 100 # 过滤掉特别小的矩形
|
57
|
-
rectangle_position_determination_result = rectangle_position_determination(
|
58
|
-
|
59
|
-
|
60
|
+
rectangle_position_determination_result = rectangle_position_determination(
|
61
|
+
rect, p_width
|
62
|
+
)
|
63
|
+
in_upper_half_page = (
|
64
|
+
rect[3] < p_height * 0.3
|
65
|
+
) # 找到位于页面上半部分的矩形,下边界小于页面高度的30%
|
66
|
+
aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (
|
67
|
+
rect[3] - rect[1]
|
68
|
+
) * 4 # 找到长宽比超过4的矩形
|
60
69
|
|
61
|
-
if
|
70
|
+
if (
|
71
|
+
is_filled
|
72
|
+
and area_is_large_enough
|
73
|
+
and rectangle_position_determination_result
|
74
|
+
and in_upper_half_page
|
75
|
+
and aspect_ratio_exceeds_4
|
76
|
+
):
|
62
77
|
colored_strip_bg_rect.append(rect)
|
63
78
|
|
64
79
|
if len(colored_strip_bg_rect) > 0:
|
65
80
|
for colored_strip_block_bbox in colored_strip_bg_rect:
|
66
81
|
for text_block in remain_text_blocks:
|
67
82
|
text_bbox = text_block['bbox']
|
68
|
-
if _is_in(text_bbox, colored_strip_block_bbox) or (
|
69
|
-
|
83
|
+
if _is_in(text_bbox, colored_strip_block_bbox) or (
|
84
|
+
_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
|
85
|
+
and calculate_overlap_area_2_minbox_area_ratio(
|
86
|
+
text_bbox, colored_strip_block_bbox
|
87
|
+
)
|
88
|
+
> 0.6
|
89
|
+
):
|
90
|
+
logger.info(
|
91
|
+
f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
|
92
|
+
)
|
70
93
|
text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
|
71
94
|
colored_strip_textblocks.append(text_block)
|
72
95
|
|
@@ -76,4 +99,3 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
|
|
76
99
|
remain_text_blocks.remove(colored_strip_textblock)
|
77
100
|
|
78
101
|
return remain_text_blocks, colored_strip_textblocks
|
79
|
-
|
@@ -1,15 +1,12 @@
|
|
1
1
|
import re
|
2
2
|
|
3
|
+
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
|
3
4
|
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
|
4
|
-
from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
|
5
5
|
|
6
6
|
|
7
7
|
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
|
8
8
|
page_no_bboxs, page_w, page_h):
|
9
|
-
"""
|
10
|
-
删除页眉页脚,页码
|
11
|
-
从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中
|
12
|
-
"""
|
9
|
+
"""删除页眉页脚,页码 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中."""
|
13
10
|
header = []
|
14
11
|
footer = []
|
15
12
|
if len(header) == 0:
|