magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,60 +1,181 @@
|
|
1
|
-
from loguru import logger
|
2
1
|
|
3
|
-
from magic_pdf.
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
2
|
+
from magic_pdf.config.ocr_content_type import BlockType
|
3
|
+
from magic_pdf.libs.boxbase import (
|
4
|
+
calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
|
5
|
+
calculate_vertical_projection_overlap_ratio,
|
6
|
+
get_minbox_if_overlap_by_ratio)
|
7
|
+
from magic_pdf.pre_proc.remove_bbox_overlap import \
|
8
|
+
remove_overlap_between_bbox_for_block
|
9
|
+
|
10
|
+
|
11
|
+
def ocr_prepare_bboxes_for_layout_split(
|
12
|
+
img_blocks,
|
13
|
+
table_blocks,
|
14
|
+
discarded_blocks,
|
15
|
+
text_blocks,
|
16
|
+
title_blocks,
|
17
|
+
interline_equation_blocks,
|
18
|
+
page_w,
|
19
|
+
page_h,
|
20
|
+
):
|
12
21
|
all_bboxes = []
|
13
22
|
all_discarded_blocks = []
|
14
23
|
for image in img_blocks:
|
15
24
|
x0, y0, x1, y1 = image['bbox']
|
16
|
-
all_bboxes.append(
|
25
|
+
all_bboxes.append(
|
26
|
+
[
|
27
|
+
x0,
|
28
|
+
y0,
|
29
|
+
x1,
|
30
|
+
y1,
|
31
|
+
None,
|
32
|
+
None,
|
33
|
+
None,
|
34
|
+
BlockType.Image,
|
35
|
+
None,
|
36
|
+
None,
|
37
|
+
None,
|
38
|
+
None,
|
39
|
+
image['score'],
|
40
|
+
]
|
41
|
+
)
|
17
42
|
|
18
43
|
for table in table_blocks:
|
19
44
|
x0, y0, x1, y1 = table['bbox']
|
20
|
-
all_bboxes.append(
|
45
|
+
all_bboxes.append(
|
46
|
+
[
|
47
|
+
x0,
|
48
|
+
y0,
|
49
|
+
x1,
|
50
|
+
y1,
|
51
|
+
None,
|
52
|
+
None,
|
53
|
+
None,
|
54
|
+
BlockType.Table,
|
55
|
+
None,
|
56
|
+
None,
|
57
|
+
None,
|
58
|
+
None,
|
59
|
+
table['score'],
|
60
|
+
]
|
61
|
+
)
|
21
62
|
|
22
63
|
for text in text_blocks:
|
23
64
|
x0, y0, x1, y1 = text['bbox']
|
24
|
-
all_bboxes.append(
|
65
|
+
all_bboxes.append(
|
66
|
+
[
|
67
|
+
x0,
|
68
|
+
y0,
|
69
|
+
x1,
|
70
|
+
y1,
|
71
|
+
None,
|
72
|
+
None,
|
73
|
+
None,
|
74
|
+
BlockType.Text,
|
75
|
+
None,
|
76
|
+
None,
|
77
|
+
None,
|
78
|
+
None,
|
79
|
+
text['score'],
|
80
|
+
]
|
81
|
+
)
|
25
82
|
|
26
83
|
for title in title_blocks:
|
27
84
|
x0, y0, x1, y1 = title['bbox']
|
28
|
-
all_bboxes.append(
|
85
|
+
all_bboxes.append(
|
86
|
+
[
|
87
|
+
x0,
|
88
|
+
y0,
|
89
|
+
x1,
|
90
|
+
y1,
|
91
|
+
None,
|
92
|
+
None,
|
93
|
+
None,
|
94
|
+
BlockType.Title,
|
95
|
+
None,
|
96
|
+
None,
|
97
|
+
None,
|
98
|
+
None,
|
99
|
+
title['score'],
|
100
|
+
]
|
101
|
+
)
|
29
102
|
|
30
103
|
for interline_equation in interline_equation_blocks:
|
31
104
|
x0, y0, x1, y1 = interline_equation['bbox']
|
32
|
-
all_bboxes.append(
|
33
|
-
|
34
|
-
|
35
|
-
|
105
|
+
all_bboxes.append(
|
106
|
+
[
|
107
|
+
x0,
|
108
|
+
y0,
|
109
|
+
x1,
|
110
|
+
y1,
|
111
|
+
None,
|
112
|
+
None,
|
113
|
+
None,
|
114
|
+
BlockType.InterlineEquation,
|
115
|
+
None,
|
116
|
+
None,
|
117
|
+
None,
|
118
|
+
None,
|
119
|
+
interline_equation['score'],
|
120
|
+
]
|
121
|
+
)
|
122
|
+
|
123
|
+
"""block嵌套问题解决"""
|
124
|
+
"""文本框与标题框重叠,优先信任文本框"""
|
36
125
|
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
|
37
|
-
|
126
|
+
"""任何框体与舍弃框重叠,优先信任舍弃框"""
|
38
127
|
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
|
39
128
|
|
40
129
|
# interline_equation 与title或text框冲突的情况,分两种情况处理
|
41
|
-
|
130
|
+
"""interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
|
42
131
|
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
|
43
|
-
|
132
|
+
"""interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
|
44
133
|
# 通过后续大框套小框逻辑删除
|
45
134
|
|
46
|
-
|
135
|
+
"""discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
|
47
136
|
for discarded in discarded_blocks:
|
48
137
|
x0, y0, x1, y1 = discarded['bbox']
|
49
|
-
all_discarded_blocks.append(
|
138
|
+
all_discarded_blocks.append(
|
139
|
+
[
|
140
|
+
x0,
|
141
|
+
y0,
|
142
|
+
x1,
|
143
|
+
y1,
|
144
|
+
None,
|
145
|
+
None,
|
146
|
+
None,
|
147
|
+
BlockType.Discarded,
|
148
|
+
None,
|
149
|
+
None,
|
150
|
+
None,
|
151
|
+
None,
|
152
|
+
discarded['score'],
|
153
|
+
]
|
154
|
+
)
|
50
155
|
# 将footnote加入到all_bboxes中,用来计算layout
|
51
156
|
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
|
52
|
-
all_bboxes.append(
|
53
|
-
|
54
|
-
|
157
|
+
all_bboxes.append(
|
158
|
+
[
|
159
|
+
x0,
|
160
|
+
y0,
|
161
|
+
x1,
|
162
|
+
y1,
|
163
|
+
None,
|
164
|
+
None,
|
165
|
+
None,
|
166
|
+
BlockType.Footnote,
|
167
|
+
None,
|
168
|
+
None,
|
169
|
+
None,
|
170
|
+
None,
|
171
|
+
discarded['score'],
|
172
|
+
]
|
173
|
+
)
|
174
|
+
|
175
|
+
"""经过以上处理后,还存在大框套小框的情况,则删除小框"""
|
55
176
|
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
56
177
|
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
57
|
-
|
178
|
+
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
58
179
|
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
59
180
|
|
60
181
|
return all_bboxes, all_discarded_blocks, drop_reasons
|
@@ -64,18 +185,64 @@ def add_bboxes(blocks, block_type, bboxes):
|
|
64
185
|
for block in blocks:
|
65
186
|
x0, y0, x1, y1 = block['bbox']
|
66
187
|
if block_type in [
|
67
|
-
BlockType.ImageBody,
|
68
|
-
BlockType.
|
188
|
+
BlockType.ImageBody,
|
189
|
+
BlockType.ImageCaption,
|
190
|
+
BlockType.ImageFootnote,
|
191
|
+
BlockType.TableBody,
|
192
|
+
BlockType.TableCaption,
|
193
|
+
BlockType.TableFootnote,
|
69
194
|
]:
|
70
|
-
bboxes.append(
|
195
|
+
bboxes.append(
|
196
|
+
[
|
197
|
+
x0,
|
198
|
+
y0,
|
199
|
+
x1,
|
200
|
+
y1,
|
201
|
+
None,
|
202
|
+
None,
|
203
|
+
None,
|
204
|
+
block_type,
|
205
|
+
None,
|
206
|
+
None,
|
207
|
+
None,
|
208
|
+
None,
|
209
|
+
block['score'],
|
210
|
+
block['group_id'],
|
211
|
+
]
|
212
|
+
)
|
71
213
|
else:
|
72
|
-
bboxes.append(
|
214
|
+
bboxes.append(
|
215
|
+
[
|
216
|
+
x0,
|
217
|
+
y0,
|
218
|
+
x1,
|
219
|
+
y1,
|
220
|
+
None,
|
221
|
+
None,
|
222
|
+
None,
|
223
|
+
block_type,
|
224
|
+
None,
|
225
|
+
None,
|
226
|
+
None,
|
227
|
+
None,
|
228
|
+
block['score'],
|
229
|
+
]
|
230
|
+
)
|
73
231
|
|
74
232
|
|
75
233
|
def ocr_prepare_bboxes_for_layout_split_v2(
|
76
|
-
|
77
|
-
|
78
|
-
|
234
|
+
img_body_blocks,
|
235
|
+
img_caption_blocks,
|
236
|
+
img_footnote_blocks,
|
237
|
+
table_body_blocks,
|
238
|
+
table_caption_blocks,
|
239
|
+
table_footnote_blocks,
|
240
|
+
discarded_blocks,
|
241
|
+
text_blocks,
|
242
|
+
title_blocks,
|
243
|
+
interline_equation_blocks,
|
244
|
+
page_w,
|
245
|
+
page_h,
|
79
246
|
):
|
80
247
|
all_bboxes = []
|
81
248
|
|
@@ -89,40 +256,40 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
|
89
256
|
add_bboxes(title_blocks, BlockType.Title, all_bboxes)
|
90
257
|
add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)
|
91
258
|
|
92
|
-
|
93
|
-
|
259
|
+
"""block嵌套问题解决"""
|
260
|
+
"""文本框与标题框重叠,优先信任文本框"""
|
94
261
|
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
|
95
|
-
|
262
|
+
"""任何框体与舍弃框重叠,优先信任舍弃框"""
|
96
263
|
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
|
97
264
|
|
98
265
|
# interline_equation 与title或text框冲突的情况,分两种情况处理
|
99
|
-
|
266
|
+
"""interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
|
100
267
|
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
|
101
|
-
|
268
|
+
"""interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
|
102
269
|
# 通过后续大框套小框逻辑删除
|
103
270
|
|
104
|
-
|
271
|
+
"""discarded_blocks"""
|
105
272
|
all_discarded_blocks = []
|
106
273
|
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
|
107
274
|
|
108
|
-
|
275
|
+
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
|
109
276
|
footnote_blocks = []
|
110
277
|
for discarded in discarded_blocks:
|
111
278
|
x0, y0, x1, y1 = discarded['bbox']
|
112
279
|
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
|
113
280
|
footnote_blocks.append([x0, y0, x1, y1])
|
114
281
|
|
115
|
-
|
282
|
+
"""移除在footnote下面的任何框"""
|
116
283
|
need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
|
117
284
|
if len(need_remove_blocks) > 0:
|
118
285
|
for block in need_remove_blocks:
|
119
286
|
all_bboxes.remove(block)
|
120
287
|
all_discarded_blocks.append(block)
|
121
288
|
|
122
|
-
|
289
|
+
"""经过以上处理后,还存在大框套小框的情况,则删除小框"""
|
123
290
|
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
124
291
|
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
125
|
-
|
292
|
+
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
126
293
|
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
127
294
|
|
128
295
|
return all_bboxes, all_discarded_blocks
|
@@ -135,7 +302,13 @@ def find_blocks_under_footnote(all_bboxes, footnote_blocks):
|
|
135
302
|
for footnote_bbox in footnote_blocks:
|
136
303
|
footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
|
137
304
|
# 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1
|
138
|
-
if
|
305
|
+
if (
|
306
|
+
block_y0 >= footnote_y1
|
307
|
+
and calculate_vertical_projection_overlap_ratio(
|
308
|
+
(block_x0, block_y0, block_x1, block_y1), footnote_bbox
|
309
|
+
)
|
310
|
+
>= 0.8
|
311
|
+
):
|
139
312
|
if block not in need_remove_blocks:
|
140
313
|
need_remove_blocks.append(block)
|
141
314
|
break
|
@@ -203,7 +376,12 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
|
|
203
376
|
for block in all_bboxes:
|
204
377
|
for discarded_block in discarded_blocks:
|
205
378
|
block_bbox = block[:4]
|
206
|
-
if
|
379
|
+
if (
|
380
|
+
calculate_overlap_area_in_bbox1_area_ratio(
|
381
|
+
block_bbox, discarded_block['bbox']
|
382
|
+
)
|
383
|
+
> 0.6
|
384
|
+
):
|
207
385
|
if block not in need_remove:
|
208
386
|
need_remove.append(block)
|
209
387
|
break
|
@@ -223,10 +401,18 @@ def remove_overlaps_min_blocks(all_bboxes):
|
|
223
401
|
if block1 != block2:
|
224
402
|
block1_bbox = block1[:4]
|
225
403
|
block2_bbox = block2[:4]
|
226
|
-
overlap_box = get_minbox_if_overlap_by_ratio(
|
404
|
+
overlap_box = get_minbox_if_overlap_by_ratio(
|
405
|
+
block1_bbox, block2_bbox, 0.8
|
406
|
+
)
|
227
407
|
if overlap_box is not None:
|
228
|
-
block_to_remove = next(
|
229
|
-
|
408
|
+
block_to_remove = next(
|
409
|
+
(block for block in all_bboxes if block[:4] == overlap_box),
|
410
|
+
None,
|
411
|
+
)
|
412
|
+
if (
|
413
|
+
block_to_remove is not None
|
414
|
+
and block_to_remove not in need_remove
|
415
|
+
):
|
230
416
|
large_block = block1 if block1 != block_to_remove else block2
|
231
417
|
x1, y1, x2, y2 = large_block[:4]
|
232
418
|
sx1, sy1, sx2, sy2 = block_to_remove[:4]
|
@@ -1,8 +1,8 @@
|
|
1
|
+
from magic_pdf.config.drop_tag import DropTag
|
2
|
+
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
1
3
|
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
|
2
4
|
_is_in_or_part_overlap_with_area_ratio,
|
3
5
|
calculate_overlap_area_in_bbox1_area_ratio)
|
4
|
-
from magic_pdf.libs.drop_tag import DropTag
|
5
|
-
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
|
6
6
|
|
7
7
|
|
8
8
|
# 将每一个line中的span从左到右排序
|
@@ -24,7 +24,7 @@ def line_sort_spans_by_left_to_right(lines):
|
|
24
24
|
return line_objects
|
25
25
|
|
26
26
|
|
27
|
-
def merge_spans_to_line(spans):
|
27
|
+
def merge_spans_to_line(spans, threshold=0.6):
|
28
28
|
if len(spans) == 0:
|
29
29
|
return []
|
30
30
|
else:
|
@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
|
|
49
49
|
continue
|
50
50
|
|
51
51
|
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
|
52
|
-
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'],
|
52
|
+
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
|
53
53
|
current_line.append(span)
|
54
54
|
else:
|
55
55
|
# 否则,开始新行
|
@@ -157,7 +157,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
157
157
|
BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
|
158
158
|
BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
|
159
159
|
]:
|
160
|
-
block_dict[
|
160
|
+
block_dict['group_id'] = block[-1]
|
161
161
|
block_spans = []
|
162
162
|
for span in spans:
|
163
163
|
span_bbox = span['bbox']
|