magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -5,19 +5,18 @@ import time
|
|
5
5
|
from typing import List
|
6
6
|
|
7
7
|
import torch
|
8
|
+
import fitz
|
8
9
|
from loguru import logger
|
9
10
|
|
10
|
-
from magic_pdf.config.drop_reason import DropReason
|
11
11
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
12
12
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
13
13
|
from magic_pdf.data.dataset import Dataset, PageableData
|
14
14
|
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
15
15
|
from magic_pdf.libs.clean_memory import clean_memory
|
16
|
-
from magic_pdf.libs.commons import fitz, get_delta_time
|
17
16
|
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
18
17
|
from magic_pdf.libs.convert_utils import dict_to_list
|
19
18
|
from magic_pdf.libs.hash_utils import compute_md5
|
20
|
-
|
19
|
+
|
21
20
|
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
22
21
|
from magic_pdf.model.magic_model import MagicModel
|
23
22
|
|
@@ -34,13 +33,11 @@ except ImportError:
|
|
34
33
|
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
35
34
|
|
36
35
|
from magic_pdf.para.para_split_v3 import para_split
|
37
|
-
|
36
|
+
|
38
37
|
from magic_pdf.pre_proc.construct_page_dict import \
|
39
38
|
ocr_construct_page_component_v2
|
40
39
|
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
|
41
|
-
|
42
|
-
combine_chars_to_pymudict, remove_chars_in_text_blocks,
|
43
|
-
replace_equations_in_textblock)
|
40
|
+
|
44
41
|
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
45
42
|
ocr_prepare_bboxes_for_layout_split_v2
|
46
43
|
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
@@ -49,26 +46,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
|
49
46
|
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
50
47
|
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
51
48
|
remove_overlaps_min_spans)
|
52
|
-
from magic_pdf.pre_proc.resolve_bbox_conflict import \
|
53
|
-
check_useful_block_horizontal_overlap
|
54
|
-
|
55
|
-
|
56
|
-
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
|
57
|
-
useful_blocks = []
|
58
|
-
for bbox in all_bboxes:
|
59
|
-
useful_blocks.append({'bbox': bbox[:4]})
|
60
|
-
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
|
61
|
-
check_useful_block_horizontal_overlap(useful_blocks)
|
62
|
-
)
|
63
|
-
if is_useful_block_horz_overlap:
|
64
|
-
logger.warning(
|
65
|
-
f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
|
66
|
-
) # noqa: E501
|
67
|
-
for bbox in all_bboxes.copy():
|
68
|
-
if smaller_bbox == bbox[:4]:
|
69
|
-
all_bboxes.remove(bbox)
|
70
|
-
|
71
|
-
return is_useful_block_horz_overlap, all_bboxes
|
72
49
|
|
73
50
|
|
74
51
|
def __replace_STX_ETX(text_str: str):
|
@@ -89,29 +66,26 @@ def __replace_STX_ETX(text_str: str):
|
|
89
66
|
|
90
67
|
|
91
68
|
def chars_to_content(span):
|
92
|
-
|
93
|
-
|
94
|
-
|
69
|
+
# 检查span中的char是否为空
|
70
|
+
if len(span['chars']) == 0:
|
71
|
+
span['content'] = ''
|
72
|
+
else:
|
95
73
|
# 先给chars按char['bbox']的中心点的x坐标排序
|
96
74
|
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
|
97
|
-
content = ''
|
98
75
|
|
99
76
|
# 求char的平均宽度
|
100
|
-
|
101
|
-
|
102
|
-
del span['chars']
|
103
|
-
return
|
104
|
-
else:
|
105
|
-
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
|
106
|
-
char_avg_width = char_width_sum / len(span['chars'])
|
77
|
+
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
|
78
|
+
char_avg_width = char_width_sum / len(span['chars'])
|
107
79
|
|
80
|
+
content = ''
|
108
81
|
for char in span['chars']:
|
109
82
|
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
|
110
83
|
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
|
111
84
|
content += ' '
|
112
85
|
content += char['c']
|
113
86
|
span['content'] = __replace_STX_ETX(content)
|
114
|
-
|
87
|
+
|
88
|
+
del span['chars']
|
115
89
|
|
116
90
|
|
117
91
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
@@ -128,8 +102,13 @@ def fill_char_in_spans(spans, all_chars):
|
|
128
102
|
span['chars'].append(char)
|
129
103
|
break
|
130
104
|
|
105
|
+
empty_spans = []
|
106
|
+
|
131
107
|
for span in spans:
|
132
108
|
chars_to_content(span)
|
109
|
+
if len(span['content']) == 0:
|
110
|
+
empty_spans.append(span)
|
111
|
+
return empty_spans
|
133
112
|
|
134
113
|
|
135
114
|
# 使用鲁棒性更强的中心点坐标判断
|
@@ -162,48 +141,79 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
162
141
|
|
163
142
|
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
164
143
|
|
144
|
+
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
145
|
+
|
146
|
+
# @todo: 拿到char之后把倾斜角度较大的先删一遍
|
147
|
+
all_pymu_chars = []
|
148
|
+
for block in text_blocks_raw:
|
149
|
+
for line in block['lines']:
|
150
|
+
for span in line['spans']:
|
151
|
+
all_pymu_chars.extend(span['chars'])
|
152
|
+
|
153
|
+
# 计算所有sapn的高度的中位数
|
154
|
+
span_height_list = []
|
155
|
+
for span in spans:
|
156
|
+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
157
|
+
continue
|
158
|
+
span_height = span['bbox'][3] - span['bbox'][1]
|
159
|
+
span['height'] = span_height
|
160
|
+
span_height_list.append(span_height)
|
161
|
+
if len(span_height_list) == 0:
|
162
|
+
return spans
|
163
|
+
else:
|
164
|
+
median_span_height = statistics.median(span_height_list)
|
165
|
+
|
165
166
|
useful_spans = []
|
166
167
|
unuseful_spans = []
|
168
|
+
# 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
|
169
|
+
vertical_spans = []
|
167
170
|
for span in spans:
|
168
|
-
|
171
|
+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
172
|
+
continue
|
173
|
+
for block in all_bboxes + all_discarded_blocks:
|
169
174
|
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
|
170
175
|
continue
|
171
|
-
else:
|
172
|
-
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
173
|
-
useful_spans.append(span)
|
174
|
-
break
|
175
|
-
for block in all_discarded_blocks:
|
176
176
|
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
177
|
-
|
177
|
+
if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
|
178
|
+
vertical_spans.append(span)
|
179
|
+
elif block in all_bboxes:
|
180
|
+
useful_spans.append(span)
|
181
|
+
else:
|
182
|
+
unuseful_spans.append(span)
|
183
|
+
|
184
|
+
del span['height']
|
185
|
+
|
178
186
|
break
|
179
187
|
|
180
|
-
|
188
|
+
"""垂直的span框直接用pymu的line进行填充"""
|
189
|
+
if len(vertical_spans) > 0:
|
190
|
+
text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
191
|
+
all_pymu_lines = []
|
192
|
+
for block in text_blocks:
|
193
|
+
for line in block['lines']:
|
194
|
+
all_pymu_lines.append(line)
|
181
195
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
196
|
+
for pymu_line in all_pymu_lines:
|
197
|
+
for span in vertical_spans:
|
198
|
+
if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
|
199
|
+
for pymu_span in pymu_line['spans']:
|
200
|
+
span['content'] += pymu_span['text']
|
201
|
+
break
|
188
202
|
|
189
|
-
|
203
|
+
for span in vertical_spans:
|
204
|
+
if len(span['content']) == 0:
|
205
|
+
spans.remove(span)
|
190
206
|
|
191
|
-
|
192
|
-
|
193
|
-
span['chars'] = []
|
194
|
-
new_spans.append(span)
|
207
|
+
"""水平的span框如果没有char则用ocr进行填充"""
|
208
|
+
new_spans = []
|
195
209
|
|
196
|
-
for span in unuseful_spans:
|
210
|
+
for span in useful_spans + unuseful_spans:
|
197
211
|
if span['type'] in [ContentType.Text]:
|
198
212
|
span['chars'] = []
|
199
213
|
new_spans.append(span)
|
200
214
|
|
201
|
-
fill_char_in_spans(new_spans, all_pymu_chars)
|
215
|
+
empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
|
202
216
|
|
203
|
-
empty_spans = []
|
204
|
-
for span in new_spans:
|
205
|
-
if len(span['content']) == 0:
|
206
|
-
empty_spans.append(span)
|
207
217
|
if len(empty_spans) > 0:
|
208
218
|
|
209
219
|
# 初始化ocr模型
|
@@ -216,55 +226,21 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
216
226
|
)
|
217
227
|
|
218
228
|
for span in empty_spans:
|
219
|
-
|
220
|
-
# 对span的bbox截图
|
229
|
+
# 对span的bbox截图再ocr
|
221
230
|
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
|
222
231
|
ocr_res = ocr_model.ocr(span_img, det=False)
|
223
|
-
# logger.info(f"ocr_res: {ocr_res}")
|
224
|
-
# logger.info(f"empty_span: {span}")
|
225
232
|
if ocr_res and len(ocr_res) > 0:
|
226
233
|
if len(ocr_res[0]) > 0:
|
227
234
|
ocr_text, ocr_score = ocr_res[0][0]
|
228
235
|
if ocr_score > 0.5 and len(ocr_text) > 0:
|
229
|
-
|
230
|
-
|
236
|
+
span['content'] = ocr_text
|
237
|
+
span['score'] = ocr_score
|
238
|
+
else:
|
239
|
+
spans.remove(span)
|
231
240
|
|
232
241
|
return spans
|
233
242
|
|
234
243
|
|
235
|
-
def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
|
236
|
-
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
237
|
-
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
|
238
|
-
'blocks'
|
239
|
-
]
|
240
|
-
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
|
241
|
-
text_blocks = replace_equations_in_textblock(
|
242
|
-
text_blocks, inline_equations, interline_equations
|
243
|
-
)
|
244
|
-
text_blocks = remove_citation_marker(text_blocks)
|
245
|
-
text_blocks = remove_chars_in_text_blocks(text_blocks)
|
246
|
-
spans = []
|
247
|
-
for v in text_blocks:
|
248
|
-
for line in v['lines']:
|
249
|
-
for span in line['spans']:
|
250
|
-
bbox = span['bbox']
|
251
|
-
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
|
252
|
-
continue
|
253
|
-
if span.get('type') not in (
|
254
|
-
ContentType.InlineEquation,
|
255
|
-
ContentType.InterlineEquation,
|
256
|
-
):
|
257
|
-
spans.append(
|
258
|
-
{
|
259
|
-
'bbox': list(span['bbox']),
|
260
|
-
'content': __replace_STX_ETX(span['text']),
|
261
|
-
'type': ContentType.Text,
|
262
|
-
'score': 1.0,
|
263
|
-
}
|
264
|
-
)
|
265
|
-
return spans
|
266
|
-
|
267
|
-
|
268
244
|
def replace_text_span(pymu_spans, ocr_spans):
|
269
245
|
return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
|
270
246
|
|
@@ -682,6 +658,23 @@ def parse_page_core(
|
|
682
658
|
"""顺便删除大水印并保留abandon的span"""
|
683
659
|
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
|
684
660
|
|
661
|
+
"""删除重叠spans中置信度较低的那些"""
|
662
|
+
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
663
|
+
"""删除重叠spans中较小的那些"""
|
664
|
+
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
665
|
+
|
666
|
+
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
667
|
+
if parse_mode == SupportedPdfParseMethod.TXT:
|
668
|
+
|
669
|
+
"""使用新版本的混合ocr方案"""
|
670
|
+
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
671
|
+
|
672
|
+
elif parse_mode == SupportedPdfParseMethod.OCR:
|
673
|
+
pass
|
674
|
+
else:
|
675
|
+
raise Exception('parse_mode must be txt or ocr')
|
676
|
+
|
677
|
+
|
685
678
|
"""先处理不需要排版的discarded_blocks"""
|
686
679
|
discarded_block_with_spans, spans = fill_spans_in_blocks(
|
687
680
|
all_discarded_blocks, spans, 0.4
|
@@ -706,26 +699,6 @@ def parse_page_core(
|
|
706
699
|
drop_reason,
|
707
700
|
)
|
708
701
|
|
709
|
-
"""删除重叠spans中置信度较低的那些"""
|
710
|
-
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
711
|
-
"""删除重叠spans中较小的那些"""
|
712
|
-
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
713
|
-
|
714
|
-
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
715
|
-
if parse_mode == SupportedPdfParseMethod.TXT:
|
716
|
-
|
717
|
-
"""之前的公式替换方案"""
|
718
|
-
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
|
719
|
-
# spans = replace_text_span(pymu_spans, spans)
|
720
|
-
|
721
|
-
"""ocr 中文本类的 span 用 pymu spans 替换!"""
|
722
|
-
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
723
|
-
|
724
|
-
elif parse_mode == SupportedPdfParseMethod.OCR:
|
725
|
-
pass
|
726
|
-
else:
|
727
|
-
raise Exception('parse_mode must be txt or ocr')
|
728
|
-
|
729
702
|
"""对image和table截图"""
|
730
703
|
spans = ocr_cut_image_and_table(
|
731
704
|
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
|
@@ -811,7 +784,7 @@ def pdf_parse_union(
|
|
811
784
|
if debug_mode:
|
812
785
|
time_now = time.time()
|
813
786
|
logger.info(
|
814
|
-
f'page_id: {page_id}, last_page_cost_time: {
|
787
|
+
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
|
815
788
|
)
|
816
789
|
start_time = time_now
|
817
790
|
|
@@ -1,58 +1,3 @@
|
|
1
|
-
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
|
2
|
-
interline_eq_info, raw_pymu_blocks,
|
3
|
-
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
|
4
|
-
layout_tree,
|
5
|
-
page_w, page_h, footnote_bboxes_tmp):
|
6
|
-
"""
|
7
|
-
|
8
|
-
"""
|
9
|
-
return_dict = {}
|
10
|
-
|
11
|
-
return_dict['para_blocks'] = {}
|
12
|
-
return_dict['preproc_blocks'] = text_blocks_preproc
|
13
|
-
return_dict['images'] = image_info
|
14
|
-
return_dict['tables'] = table_info
|
15
|
-
return_dict['interline_equations'] = interline_eq_info
|
16
|
-
return_dict['inline_equations'] = inline_eq_info
|
17
|
-
return_dict['layout_bboxes'] = layout_bboxes
|
18
|
-
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
|
19
|
-
return_dict['global_statistic'] = {}
|
20
|
-
|
21
|
-
return_dict['droped_text_block'] = removed_text_blocks
|
22
|
-
return_dict['droped_image_block'] = removed_image_blocks
|
23
|
-
return_dict['droped_table_block'] = []
|
24
|
-
return_dict['image_backup'] = images_backup
|
25
|
-
return_dict['table_backup'] = []
|
26
|
-
return_dict['page_idx'] = page_id
|
27
|
-
return_dict['page_size'] = [page_w, page_h]
|
28
|
-
return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
|
29
|
-
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
|
30
|
-
|
31
|
-
return return_dict
|
32
|
-
|
33
|
-
|
34
|
-
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
35
|
-
images, tables, interline_equations, inline_equations,
|
36
|
-
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
|
37
|
-
need_remove_spans_bboxes_dict):
|
38
|
-
return_dict = {
|
39
|
-
'preproc_blocks': blocks,
|
40
|
-
'layout_bboxes': layout_bboxes,
|
41
|
-
'page_idx': page_id,
|
42
|
-
'page_size': [page_w, page_h],
|
43
|
-
'_layout_tree': layout_tree,
|
44
|
-
'images': images,
|
45
|
-
'tables': tables,
|
46
|
-
'interline_equations': interline_equations,
|
47
|
-
'inline_equations': inline_equations,
|
48
|
-
'droped_text_block': dropped_text_block,
|
49
|
-
'droped_image_block': dropped_image_block,
|
50
|
-
'droped_table_block': dropped_table_block,
|
51
|
-
'dropped_equation_block': dropped_equation_block,
|
52
|
-
'droped_bboxes': need_remove_spans_bboxes_dict,
|
53
|
-
}
|
54
|
-
return return_dict
|
55
|
-
|
56
1
|
|
57
2
|
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
58
3
|
images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
|
magic_pdf/pre_proc/cut_image.py
CHANGED
@@ -25,43 +25,6 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
|
25
25
|
return spans
|
26
26
|
|
27
27
|
|
28
|
-
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
|
29
|
-
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
|
30
|
-
equation_inline_bboxes: list,
|
31
|
-
equation_interline_bboxes: list, imageWriter) -> dict:
|
32
|
-
"""返回一个dict, key为bbox, 值是图片地址."""
|
33
|
-
image_info = []
|
34
|
-
image_backup_info = []
|
35
|
-
table_info = []
|
36
|
-
inline_eq_info = []
|
37
|
-
interline_eq_info = []
|
38
|
-
|
39
|
-
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
|
40
|
-
|
41
|
-
def return_path(type):
|
42
|
-
return join_path(pdf_bytes_md5, type)
|
43
|
-
|
44
|
-
for bbox in image_bboxes:
|
45
|
-
if not check_img_bbox(bbox):
|
46
|
-
continue
|
47
|
-
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
48
|
-
image_info.append({'bbox': bbox, 'image_path': image_path})
|
49
|
-
|
50
|
-
for bbox in images_overlap_backup:
|
51
|
-
if not check_img_bbox(bbox):
|
52
|
-
continue
|
53
|
-
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
54
|
-
image_backup_info.append({'bbox': bbox, 'image_path': image_path})
|
55
|
-
|
56
|
-
for bbox in table_bboxes:
|
57
|
-
if not check_img_bbox(bbox):
|
58
|
-
continue
|
59
|
-
image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
|
60
|
-
table_info.append({'bbox': bbox, 'image_path': image_path})
|
61
|
-
|
62
|
-
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
|
63
|
-
|
64
|
-
|
65
28
|
def check_img_bbox(bbox) -> bool:
|
66
29
|
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
|
67
30
|
logger.warning(f'image_bboxes: 错误的box, {bbox}')
|
@@ -1,184 +1,11 @@
|
|
1
|
-
|
2
1
|
from magic_pdf.config.ocr_content_type import BlockType
|
3
2
|
from magic_pdf.libs.boxbase import (
|
4
|
-
calculate_iou,
|
3
|
+
calculate_iou,
|
4
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
5
5
|
calculate_vertical_projection_overlap_ratio,
|
6
|
-
get_minbox_if_overlap_by_ratio
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def ocr_prepare_bboxes_for_layout_split(
|
12
|
-
img_blocks,
|
13
|
-
table_blocks,
|
14
|
-
discarded_blocks,
|
15
|
-
text_blocks,
|
16
|
-
title_blocks,
|
17
|
-
interline_equation_blocks,
|
18
|
-
page_w,
|
19
|
-
page_h,
|
20
|
-
):
|
21
|
-
all_bboxes = []
|
22
|
-
all_discarded_blocks = []
|
23
|
-
for image in img_blocks:
|
24
|
-
x0, y0, x1, y1 = image['bbox']
|
25
|
-
all_bboxes.append(
|
26
|
-
[
|
27
|
-
x0,
|
28
|
-
y0,
|
29
|
-
x1,
|
30
|
-
y1,
|
31
|
-
None,
|
32
|
-
None,
|
33
|
-
None,
|
34
|
-
BlockType.Image,
|
35
|
-
None,
|
36
|
-
None,
|
37
|
-
None,
|
38
|
-
None,
|
39
|
-
image['score'],
|
40
|
-
]
|
41
|
-
)
|
42
|
-
|
43
|
-
for table in table_blocks:
|
44
|
-
x0, y0, x1, y1 = table['bbox']
|
45
|
-
all_bboxes.append(
|
46
|
-
[
|
47
|
-
x0,
|
48
|
-
y0,
|
49
|
-
x1,
|
50
|
-
y1,
|
51
|
-
None,
|
52
|
-
None,
|
53
|
-
None,
|
54
|
-
BlockType.Table,
|
55
|
-
None,
|
56
|
-
None,
|
57
|
-
None,
|
58
|
-
None,
|
59
|
-
table['score'],
|
60
|
-
]
|
61
|
-
)
|
62
|
-
|
63
|
-
for text in text_blocks:
|
64
|
-
x0, y0, x1, y1 = text['bbox']
|
65
|
-
all_bboxes.append(
|
66
|
-
[
|
67
|
-
x0,
|
68
|
-
y0,
|
69
|
-
x1,
|
70
|
-
y1,
|
71
|
-
None,
|
72
|
-
None,
|
73
|
-
None,
|
74
|
-
BlockType.Text,
|
75
|
-
None,
|
76
|
-
None,
|
77
|
-
None,
|
78
|
-
None,
|
79
|
-
text['score'],
|
80
|
-
]
|
81
|
-
)
|
82
|
-
|
83
|
-
for title in title_blocks:
|
84
|
-
x0, y0, x1, y1 = title['bbox']
|
85
|
-
all_bboxes.append(
|
86
|
-
[
|
87
|
-
x0,
|
88
|
-
y0,
|
89
|
-
x1,
|
90
|
-
y1,
|
91
|
-
None,
|
92
|
-
None,
|
93
|
-
None,
|
94
|
-
BlockType.Title,
|
95
|
-
None,
|
96
|
-
None,
|
97
|
-
None,
|
98
|
-
None,
|
99
|
-
title['score'],
|
100
|
-
]
|
101
|
-
)
|
102
|
-
|
103
|
-
for interline_equation in interline_equation_blocks:
|
104
|
-
x0, y0, x1, y1 = interline_equation['bbox']
|
105
|
-
all_bboxes.append(
|
106
|
-
[
|
107
|
-
x0,
|
108
|
-
y0,
|
109
|
-
x1,
|
110
|
-
y1,
|
111
|
-
None,
|
112
|
-
None,
|
113
|
-
None,
|
114
|
-
BlockType.InterlineEquation,
|
115
|
-
None,
|
116
|
-
None,
|
117
|
-
None,
|
118
|
-
None,
|
119
|
-
interline_equation['score'],
|
120
|
-
]
|
121
|
-
)
|
122
|
-
|
123
|
-
"""block嵌套问题解决"""
|
124
|
-
"""文本框与标题框重叠,优先信任文本框"""
|
125
|
-
all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
|
126
|
-
"""任何框体与舍弃框重叠,优先信任舍弃框"""
|
127
|
-
all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
|
128
|
-
|
129
|
-
# interline_equation 与title或text框冲突的情况,分两种情况处理
|
130
|
-
"""interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框"""
|
131
|
-
all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
|
132
|
-
"""interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框"""
|
133
|
-
# 通过后续大框套小框逻辑删除
|
134
|
-
|
135
|
-
"""discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)"""
|
136
|
-
for discarded in discarded_blocks:
|
137
|
-
x0, y0, x1, y1 = discarded['bbox']
|
138
|
-
all_discarded_blocks.append(
|
139
|
-
[
|
140
|
-
x0,
|
141
|
-
y0,
|
142
|
-
x1,
|
143
|
-
y1,
|
144
|
-
None,
|
145
|
-
None,
|
146
|
-
None,
|
147
|
-
BlockType.Discarded,
|
148
|
-
None,
|
149
|
-
None,
|
150
|
-
None,
|
151
|
-
None,
|
152
|
-
discarded['score'],
|
153
|
-
]
|
154
|
-
)
|
155
|
-
# 将footnote加入到all_bboxes中,用来计算layout
|
156
|
-
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
|
157
|
-
all_bboxes.append(
|
158
|
-
[
|
159
|
-
x0,
|
160
|
-
y0,
|
161
|
-
x1,
|
162
|
-
y1,
|
163
|
-
None,
|
164
|
-
None,
|
165
|
-
None,
|
166
|
-
BlockType.Footnote,
|
167
|
-
None,
|
168
|
-
None,
|
169
|
-
None,
|
170
|
-
None,
|
171
|
-
discarded['score'],
|
172
|
-
]
|
173
|
-
)
|
174
|
-
|
175
|
-
"""经过以上处理后,还存在大框套小框的情况,则删除小框"""
|
176
|
-
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
177
|
-
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
178
|
-
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
179
|
-
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
180
|
-
|
181
|
-
return all_bboxes, all_discarded_blocks, drop_reasons
|
6
|
+
get_minbox_if_overlap_by_ratio
|
7
|
+
)
|
8
|
+
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
|
182
9
|
|
183
10
|
|
184
11
|
def add_bboxes(blocks, block_type, bboxes):
|