magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
import fitz
|
2
|
+
|
3
|
+
from magic_pdf.layout.layout_sort import get_bboxes_layout
|
4
|
+
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
|
5
|
+
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
6
|
+
|
7
|
+
|
8
|
+
def get_center_point(bbox):
    """Return the center point of a bounding box.

    Args:
        bbox (list): Box coordinates; items 0-3 are read as
            ``[x0, y0, x1, y1]`` (top-left corner, then bottom-right corner).

    Returns:
        list: ``[center_x, center_y]``.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    return [center_x, center_y]
|
17
|
+
|
18
|
+
|
19
|
+
def get_area(bbox):
    """Return the area of a bounding box.

    Args:
        bbox (list): Box coordinates; items 0-3 are read as
            ``[x0, y0, x1, y1]`` (top-left corner, then bottom-right corner).

    Returns:
        The product of the box width (``x1 - x0``) and height (``y1 - y0``).
        No clamping is applied, so an inverted box yields a negative value.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
|
28
|
+
|
29
|
+
|
30
|
+
def adjust_layouts(layout_bboxes, page_boundry, page_id):
    """Pull overlapping layout boxes apart, then rebuild the sorted layout.

    Every pair of boxes that ``_is_part_overlap`` reports as overlapping is
    resolved by shrinking the box with the larger area: one of its edges is
    moved to sit one unit beyond the facing edge of the smaller box. The axis
    is chosen by comparing the horizontal and vertical distance between the
    two box centers.

    Args:
        layout_bboxes (list): Mutable ``[x0, y0, x1, y1]`` boxes. The larger
            box of each overlapping pair is modified in place.
        page_boundry (list): Forwarded unchanged to ``get_bboxes_layout``.
        page_id: Forwarded unchanged to ``get_bboxes_layout``.

    Returns:
        tuple: The ``(layout_bboxes, layout_tree)`` pair produced by
        ``get_bboxes_layout``.
    """
    box_count = len(layout_bboxes)
    for first in range(box_count):
        for second in range(first + 1, box_count):
            box_a = layout_bboxes[first]
            box_b = layout_bboxes[second]
            if not _is_part_overlap(box_a, box_b):
                continue

            # On equal areas the later box is the one treated as "larger".
            if get_area(box_a) > get_area(box_b):
                big, small = box_a, box_b
            else:
                big, small = box_b, box_a

            big_center = get_center_point(big)
            small_center = get_center_point(small)
            dx = big_center[0] - small_center[0]
            dy = big_center[1] - small_center[1]

            if abs(dx) > abs(dy):
                # Centers differ mostly horizontally: trim the left or right
                # edge of the larger box.
                if dx > 0 and big[0] < small[2]:
                    big[0] = small[2] + 1
                elif dx < 0 and big[2] > small[0]:
                    big[2] = small[0] - 1
            else:
                # Centers differ mostly vertically (or equally): trim the top
                # or bottom edge of the larger box.
                if dy > 0 and big[1] < small[3]:
                    big[1] = small[3] + 1
                elif dy < 0 and big[3] > small[1]:
                    big[3] = small[1] - 1

    # get_bboxes_layout is given 13-item rows: the four coordinates followed
    # by nine None placeholders.
    padded_bboxes = [
        [box[0], box[1], box[2], box[3]] + [None] * 9
        for box in layout_bboxes
    ]

    layout_bboxes, layout_tree = get_bboxes_layout(padded_bboxes, page_boundry, page_id)
    return layout_bboxes, layout_tree
|
73
|
+
|
74
|
+
|
75
|
+
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
    """Build, de-duplicate and sort the layout boxes of one page.

    Each sub-layout's ``poly`` is reduced to an ``[x0, y0, x1, y1]`` box using
    polygon items 0, 1, 4 and 5, divided by the ratios returned by
    ``get_scale_ratio`` and truncated to integers. Boxes that ``_is_in``
    reports as lying inside another box are dropped, and the survivors are
    passed to ``adjust_layouts``.

    Args:
        layout_info (list): Sub-layout dicts, each with a ``'poly'`` entry of
            eight numbers.
        page (fitz.Page): Page whose ``rect`` width and height define the page
            boundary ``[0, 0, width, height]``.
        ocr_page_info (dict): Must provide ``['page_info']['page_no']``; one
            is subtracted from it to obtain the page id. It is also passed to
            ``get_scale_ratio``.

    Returns:
        tuple: ``(layout_bboxes, layout_tree)`` as returned by
        ``adjust_layouts``.
    """
    page_id = ocr_page_info['page_info']['page_no'] - 1
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)

    # Convert every sub-layout polygon into a scaled integer bbox.
    layout_bboxes = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        layout_bboxes.append([
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ])

    # Keep only the boxes that are not contained in some other box.
    new_layout_bboxes = []
    for i, box_i in enumerate(layout_bboxes):
        keep = True
        for j, box_j in enumerate(layout_bboxes):
            if i == j:
                continue
            # NOTE(review): assumes _is_in may treat coincident edges as
            # "inside" - confirm against boxbase._is_in. If so, two identical
            # boxes (easy to get after int() truncation) would each evict the
            # other and the region would vanish. A later exact duplicate is
            # therefore never allowed to evict the first occurrence. If _is_in
            # is strict, this guard changes nothing.
            if j > i and box_i == box_j:
                continue
            if _is_in(box_i, box_j):
                keep = False
                break
        if keep:
            new_layout_bboxes.append(box_i)

    # Resolve partial overlaps and sort the remaining boxes.
    page_boundry = [0, 0, page.rect.width, page.rect.height]
    layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id)

    return layout_bboxes, layout_tree
|
@@ -0,0 +1,336 @@
|
|
1
|
+
from loguru import logger
|
2
|
+
|
3
|
+
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
|
4
|
+
calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
|
5
|
+
from magic_pdf.libs.drop_tag import DropTag
|
6
|
+
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
|
7
|
+
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
|
8
|
+
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
|
9
|
+
|
10
|
+
|
11
|
+
# 将每一个line中的span从左到右排序
|
12
|
+
def line_sort_spans_by_left_to_right(lines):
    """Order the spans of every line by x0 and wrap each line in a dict.

    Args:
        lines (list): Lists of span dicts, each span carrying a ``'bbox'`` of
            ``[x0, y0, x1, y1]``. Every inner list is sorted in place.

    Returns:
        list: One ``{"bbox": [...], "spans": line}`` dict per input line,
        where ``bbox`` is the smallest box enclosing all span boxes of that
        line and ``spans`` is the (now sorted) input list itself.
    """
    def left_edge(span):
        return span['bbox'][0]

    wrapped_lines = []
    for line_spans in lines:
        line_spans.sort(key=left_edge)
        boxes = [span['bbox'] for span in line_spans]
        enclosing_bbox = [
            min(box[0] for box in boxes),  # x0
            min(box[1] for box in boxes),  # y0
            max(box[2] for box in boxes),  # x1
            max(box[3] for box in boxes),  # y1
        ]
        wrapped_lines.append({"bbox": enclosing_bbox, "spans": line_spans})
    return wrapped_lines
|
28
|
+
|
29
|
+
|
30
|
+
def merge_spans_to_line(spans):
    """Group spans into lines, working from top to bottom.

    The spans are sorted in place by y0. A span joins the current line when
    ``__is_overlaps_y_exceeds_threshold`` accepts its bbox against the bbox of
    the line's last span. Interline-equation, image and table spans always
    sit on a line of their own: such a span starts a new line, and so does
    any span that follows one.

    Args:
        spans (list): Span dicts with ``'bbox'`` and ``'type'`` entries.
            Sorted in place.

    Returns:
        list: Lines, each a list of span dicts; ``[]`` for empty input.
    """
    if not spans:
        return []

    # Sort by the y0 coordinate.
    spans.sort(key=lambda item: item['bbox'][1])

    standalone_types = (ContentType.InterlineEquation, ContentType.Image, ContentType.Table)

    merged_lines = []
    open_line = [spans[0]]
    for candidate in spans[1:]:
        must_break = candidate['type'] in standalone_types or any(
            member['type'] in standalone_types for member in open_line
        )
        if not must_break and __is_overlaps_y_exceeds_threshold(
                candidate['bbox'], open_line[-1]['bbox']):
            open_line.append(candidate)
        else:
            # Close the current line and start a new one with this span.
            merged_lines.append(open_line)
            open_line = [candidate]

    # The line still being built always holds at least one span.
    merged_lines.append(open_line)
    return merged_lines
|
63
|
+
|
64
|
+
|
65
|
+
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layouts, build lines per layout, and tag leftovers.

    Args:
        spans (list): Span dicts with a ``'bbox'`` entry. Spans assigned to a
            layout are removed from this list in place; those that remain get
            ``span['tag'] = DropTag.NOT_IN_LAYOUT``.
        layout_bboxes (list): Dicts with a ``'layout_bbox'`` entry, visited in
            order; a span goes to the first layout that accepts it.

    Returns:
        tuple: ``(lines, dropped_spans)`` where ``lines`` is the output of
        ``line_sort_spans_by_left_to_right`` and ``dropped_spans`` lists the
        spans that matched no layout.
    """
    spans_per_layout = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        # A span belongs to this layout when its overlap ratio with the
        # region exceeds 0.6 (presumably the share of the span's own area
        # inside the region - confirm against the helper).
        members = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > 0.6
        ]
        if members:
            spans_per_layout.append(members)
            # Claimed spans must not be offered to later layouts.
            for claimed in members:
                spans.remove(claimed)

    raw_lines = []
    for members in spans_per_layout:
        raw_lines.extend(merge_spans_to_line(members))

    # Sort the spans inside every line from left to right.
    lines = line_sort_spans_by_left_to_right(raw_lines)

    dropped_spans = []
    for leftover in spans:
        leftover['tag'] = DropTag.NOT_IN_LAYOUT
        dropped_spans.append(leftover)

    return lines, dropped_spans
|
96
|
+
|
97
|
+
|
98
|
+
def merge_lines_to_block(lines):
    """Wrap every line in its own single-line block.

    No block merging is done for now: this only sets up the structure, so
    each block holds exactly one line and reuses that line's bbox.

    Args:
        lines (list): Line dicts with a ``"bbox"`` entry.

    Returns:
        list: ``{"bbox": line["bbox"], "lines": [line]}`` for each line.
    """
    return [{"bbox": line["bbox"], "lines": [line]} for line in lines]
|
109
|
+
|
110
|
+
|
111
|
+
def sort_blocks_by_layout(all_bboxes, layout_bboxes):
    """Order blocks by layout, and top to bottom within each layout.

    Args:
        all_bboxes (list): Block rows whose items 0-3 are the bbox and whose
            item 7 is the block type. Rows assigned to a layout are removed
            from this list in place. Footnote rows are never assigned.
        layout_bboxes (list): Dicts with a ``'layout_bbox'`` entry, visited in
            order.

    Returns:
        list: The assigned blocks, layout by layout, each layout's blocks
        sorted by y0. Blocks that matched no layout are not included.
    """
    groups = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']

        # Collect the non-footnote blocks whose overlap ratio with this
        # layout exceeds 0.8.
        members = [
            row for row in all_bboxes
            if row[7] != BlockType.Footnote
            and calculate_overlap_area_in_bbox1_area_ratio(row[:4], region) > 0.8
        ]

        if members:
            groups.append(members)
            # Claimed blocks must not be offered to later layouts.
            for claimed in members:
                all_bboxes.remove(claimed)

    sort_blocks = []
    for members in groups:
        # Inside one layout, boxes are ordered top to bottom by y0.
        members.sort(key=lambda row: row[1])
        sort_blocks.extend(members)

    # sort_blocks now holds every block kept for this page, in final order.
    return sort_blocks
|
142
|
+
|
143
|
+
|
144
|
+
def fill_spans_in_blocks(blocks, spans, radio):
    """Place spans into blocks according to their position.

    Args:
        blocks (list): Block rows whose items 0-3 are the bbox and whose
            item 7 is the block type.
        spans (list): Span dicts with a ``'bbox'`` entry. Spans placed in a
            block are removed from this list in place, so each span ends up
            in at most one block (the first that accepts it).
        radio: Threshold that
            ``calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox)``
            must exceed for a span to be placed in a block.

    Returns:
        tuple: ``(block_with_spans, spans)`` - one
        ``{'type', 'bbox', 'spans'}`` dict per input block, and the input
        ``spans`` list holding whatever was left unassigned.
    """
    block_with_spans = []
    for row in blocks:
        kind = row[7]
        region = row[0:4]

        claimed = [
            candidate for candidate in spans
            if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], region) > radio
        ]

        # Inline-equation height adjustment, interline-to-inline type
        # conversion and bbox de-overlapping are deliberately not applied
        # here; de-overlapping changes span bboxes and makes the later fill
        # step go wrong.

        block_with_spans.append({
            'type': kind,
            'bbox': region,
            'spans': claimed,
        })

        # Spans placed in this block must not be offered to later blocks.
        for taken in claimed:
            spans.remove(taken)

    return block_with_spans, spans
|
182
|
+
|
183
|
+
|
184
|
+
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
    """Turn each block's flat span list into its final nested structure.

    1. Image and table blocks carry captions and footnotes and are therefore
       nested: the caption/footnote text spans must be moved into the
       matching caption and footnote sub-blocks of the image/table block.
    2. The ``spans`` field is removed from every block along the way.

    Blocks whose type is not image, table, text, title or interline equation
    are left out of the result.

    Args:
        block_with_spans (list): Block dicts with ``'type'`` and ``'spans'``.
        img_blocks (list): Forwarded to ``fix_image_block``.
        table_blocks (list): Forwarded to ``fix_table_block``.

    Returns:
        list: The fixed blocks, in input order.
    """
    fix_blocks = []
    for entry in block_with_spans:
        kind = entry['type']

        if kind == BlockType.Image:
            fixed = fix_image_block(entry, img_blocks)
        elif kind == BlockType.Table:
            fixed = fix_table_block(entry, table_blocks)
        elif kind in (BlockType.Text, BlockType.Title):
            fixed = fix_text_block(entry)
        elif kind == BlockType.InterlineEquation:
            fixed = fix_interline_block(entry)
        else:
            # Unsupported block types are dropped.
            continue

        fix_blocks.append(fixed)
    return fix_blocks
|
207
|
+
|
208
|
+
|
209
|
+
def fix_discarded_block(discarded_block_with_spans):
    """Apply ``fix_text_block`` to every discarded block.

    Args:
        discarded_block_with_spans (list): Block dicts with a ``'spans'``
            entry.

    Returns:
        list: The results of ``fix_text_block``, in input order.
    """
    return [fix_text_block(entry) for entry in discarded_block_with_spans]
|
215
|
+
|
216
|
+
|
217
|
+
def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
    """Build a block from the spans that fall inside ``block_bbox``.

    Args:
        spans (list): Candidate span dicts with a ``'bbox'`` entry. This list
            itself is not modified.
        block_bbox (list): Bbox of the block being built (for example a
            caption area).
        block_type (str): Value stored under the block's ``'type'`` key.

    Returns:
        tuple: ``(block, block_spans)`` - the ``{'bbox', 'type', 'lines'}``
        dict, and the list of spans that were selected for it.
    """
    # Select the spans whose overlap ratio with the block bbox exceeds 0.6.
    block_spans = [
        candidate for candidate in spans
        if calculate_overlap_area_in_bbox1_area_ratio(candidate['bbox'], block_bbox) > 0.6
    ]

    # Group the selected spans into lines, then order each line left to right.
    sorted_lines = line_sort_spans_by_left_to_right(merge_spans_to_line(block_spans))

    block = {
        'bbox': block_bbox,
        'type': block_type,
        'lines': sorted_lines,
    }
    return block, block_spans
|
232
|
+
|
233
|
+
|
234
|
+
def make_body_block(span: dict, block_bbox: list, block_type: str):
    """Create a body block holding a single line with a single span.

    Args:
        span (dict): The span to wrap.
        block_bbox (list): Bbox used for both the block and its only line.
        block_type (str): Value stored under the block's ``'type'`` key.

    Returns:
        dict: ``{'bbox', 'type', 'lines'}`` where ``lines`` contains one
        ``{'bbox', 'spans'}`` line dict.
    """
    return {
        'bbox': block_bbox,
        'type': block_type,
        'lines': [
            {
                'bbox': block_bbox,
                'spans': [span],
            },
        ],
    }
|
246
|
+
|
247
|
+
|
248
|
+
def fix_image_block(block, img_blocks):
    """Rebuild an image block as nested body and caption sub-blocks.

    Only the first entry of ``img_blocks`` that
    ``_is_in_or_part_overlap_with_area_ratio`` accepts (threshold 0.95) is
    used. The block is modified in place: ``'blocks'`` is added and
    ``'spans'`` is removed.

    Args:
        block (dict): Block with ``'bbox'`` and ``'spans'`` entries.
        img_blocks (list): Dicts with ``'bbox'``, ``'img_body_bbox'`` and
            ``'img_caption_bbox'`` entries.

    Returns:
        dict: The same ``block`` object.
    """
    block['blocks'] = []
    # Look for the img_block that matches the current block.
    for candidate in img_blocks:
        if not _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95):
            continue

        pending = block['spans']

        # The image body is the first image span whose bbox equals the
        # recorded body bbox.
        body_span = next(
            (
                item for item in pending
                if item['type'] == ContentType.Image
                and candidate['img_body_bbox'] == item['bbox']
            ),
            None,
        )
        if body_span is not None:
            block['blocks'].append(
                make_body_block(body_span, candidate['img_body_bbox'], BlockType.ImageBody)
            )
            # The body span is used up; keep it out of the caption search.
            pending.remove(body_span)

        # Build the caption sub-block only when a caption bbox is recorded.
        if candidate['img_caption_bbox'] is not None:
            caption_block, _caption_spans = merge_spans_to_block(
                pending, candidate['img_caption_bbox'], BlockType.ImageCaption
            )
            block['blocks'].append(caption_block)

        break

    del block['spans']
    return block
|
275
|
+
|
276
|
+
|
277
|
+
def fix_table_block(block, table_blocks):
    """Rebuild a table block as nested body, caption and footnote sub-blocks.

    Only the first entry of ``table_blocks`` that
    ``_is_in_or_part_overlap_with_area_ratio`` accepts (threshold 0.95) is
    used. The block is modified in place: ``'blocks'`` is added and
    ``'spans'`` is removed.

    Args:
        block (dict): Block with ``'bbox'`` and ``'spans'`` entries.
        table_blocks (list): Dicts with ``'bbox'``, ``'table_body_bbox'``,
            ``'table_caption_bbox'`` and ``'table_footnote_bbox'`` entries.

    Returns:
        dict: The same ``block`` object.
    """
    block['blocks'] = []
    # Look for the table_block that matches the current block.
    for candidate in table_blocks:
        if not _is_in_or_part_overlap_with_area_ratio(block['bbox'], candidate['bbox'], 0.95):
            continue

        pending = block['spans']

        # The table body is the first table span whose bbox equals the
        # recorded body bbox.
        body_span = next(
            (
                item for item in pending
                if item['type'] == ContentType.Table
                and candidate['table_body_bbox'] == item['bbox']
            ),
            None,
        )
        if body_span is not None:
            block['blocks'].append(
                make_body_block(body_span, candidate['table_body_bbox'], BlockType.TableBody)
            )
            # The body span is used up; keep it out of the later searches.
            pending.remove(body_span)

        # Build the caption sub-block only when a caption bbox is recorded.
        if candidate['table_caption_bbox'] is not None:
            caption_block, caption_spans = merge_spans_to_block(
                pending, candidate['table_caption_bbox'], BlockType.TableCaption
            )
            block['blocks'].append(caption_block)

            # Spans now owned by the caption must not be picked up again by
            # the footnote below.
            for used in caption_spans:
                pending.remove(used)

        # Build the footnote sub-block only when a footnote bbox is recorded.
        if candidate['table_footnote_bbox'] is not None:
            footnote_block, _footnote_spans = merge_spans_to_block(
                pending, candidate['table_footnote_bbox'], BlockType.TableFootnote
            )
            block['blocks'].append(footnote_block)

        break

    del block['spans']
    return block
|
317
|
+
|
318
|
+
|
319
|
+
def fix_text_block(block):
    """Convert a text block's spans into sorted lines.

    Every equation span inside a text block should be inline, so spans typed
    as interline equations are retyped as inline equations first. The block
    is modified in place: ``'lines'`` is added and ``'spans'`` is removed.

    Args:
        block (dict): Block with a ``'spans'`` entry.

    Returns:
        dict: The same ``block`` object.
    """
    members = block['spans']
    for member in members:
        if member['type'] == ContentType.InterlineEquation:
            member['type'] = ContentType.InlineEquation

    # Group the spans into lines, then order each line left to right.
    block['lines'] = line_sort_spans_by_left_to_right(merge_spans_to_line(members))
    del block['spans']
    return block
|
329
|
+
|
330
|
+
|
331
|
+
def fix_interline_block(block):
    """Convert an interline-equation block's spans into sorted lines.

    The block is modified in place: ``'lines'`` is added and ``'spans'`` is
    removed. Span types are left untouched.

    Args:
        block (dict): Block with a ``'spans'`` entry.

    Returns:
        dict: The same ``block`` object.
    """
    # Group the spans into lines, then order each line left to right.
    grouped = merge_spans_to_line(block['spans'])
    block['lines'] = line_sort_spans_by_left_to_right(grouped)
    del block['spans']
    return block
|