magic-pdf 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +24 -75
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
|
|
1
|
-
from magic_pdf.config.drop_reason import DropReason
|
2
|
-
from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
|
3
|
-
from magic_pdf.libs.commons import fitz
|
4
|
-
|
5
|
-
|
6
|
-
def __area(box):
    """Return the area of an axis-aligned box.

    Args:
        box: Indexable of four numbers laid out as (x0, y0, x1, y1).

    Returns:
        (x1 - x0) * (y1 - y0). No normalisation is done, so a box whose
        corners are swapped on one axis yields a negative value.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height
|
8
|
-
|
9
|
-
|
10
|
-
def __is_contain_color_background_rect(
    page: fitz.Page, text_blocks, image_bboxes
) -> bool:
    """Check whether the page has a large colored rectangle with text inside.

    Every drawing returned by ``page.get_cdrawings()`` that has a non-white
    fill, an area of at least 10x10 and no overlap with any image bbox is a
    candidate. Only the largest candidate is inspected: it must be wider than
    20% of the page width and taller than 10% of the page height, and at
    least one text block must lie inside it according to ``_is_in``.

    Args:
        page: Page whose vector drawings are scanned.
        text_blocks: Iterable of dicts carrying a ``'bbox'`` entry.
        image_bboxes: Iterable of image boxes; rectangles touching one of
            them are ignored (they may be color patches of an SVG image).

    Returns:
        True when such a text-bearing colored rectangle exists, else False.
    """
    p_width, p_height = page.rect.width, page.rect.height

    color_bg_rect = []
    for drawing in page.get_cdrawings():
        # Drawings without a fill are transparent and cannot be a background.
        if 'fill' not in drawing or not drawing['fill']:
            continue
        fill = drawing['fill']
        # The components are truncated to int and compared as a tuple. The
        # previous code compared a list with a tuple, which is never equal,
        # so white rectangles were never skipped.
        if (int(fill[0]), int(fill[1]), int(fill[2])) == (1, 1, 1):
            continue
        rect = drawing['rect']
        # Skip very small rectangles.
        if __area(rect) < 10 * 10:
            continue
        # Skip rectangles that touch an image: likely part of an SVG picture.
        if any(_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes):
            continue
        color_bg_rect.append(rect)

    if not color_bg_rect:
        return False

    max_rect = max(color_bg_rect, key=lambda candidate: __area(candidate))
    wide_enough = max_rect[2] - max_rect[0] > 0.2 * p_width
    tall_enough = max_rect[3] - max_rect[1] > 0.1 * p_height
    if not (wide_enough and tall_enough):
        return False

    # Coordinates are truncated to int on both sides before the containment
    # test. TODO (from the original): require more than 3 lines or 50 chars.
    max_rect_int = (
        int(max_rect[0]),
        int(max_rect[1]),
        int(max_rect[2]),
        int(max_rect[3]),
    )
    for text_block in text_blocks:
        box = text_block['bbox']
        box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        if _is_in(box_int, max_rect_int):
            return True
    return False
|
59
|
-
|
60
|
-
|
61
|
-
def __is_table_overlap_text_block(text_blocks, table_bbox):
    """Check whether ``table_bbox`` touches any of the given text blocks.

    Args:
        text_blocks: Iterable of dicts carrying a ``'bbox'`` entry.
        table_bbox: Box of the table to test.

    Returns:
        True as soon as ``_is_in_or_part_overlap(table_bbox, bbox)`` holds
        for one text block, False otherwise (including for no text blocks).
    """
    # TODO (kept from the original): the check is marked as unfinished.
    return any(
        _is_in_or_part_overlap(table_bbox, text_block['bbox'])
        for text_block in text_blocks
    )
|
68
|
-
|
69
|
-
|
70
|
-
def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
    """Decide whether a page is acceptable for further processing.

    Args:
        page: Page to inspect.
        text_blocks: Text blocks of the page (dicts with a ``'bbox'`` entry).
        table_bboxes: Accepted for interface compatibility; not used here.
        image_bboxes: Image boxes of the page.

    Returns:
        ``(True, None)`` when the page passes, or ``(False, info)`` when it
        contains text on a colored background rectangle, where ``info`` is a
        dict with ``'_need_drop'`` and ``'_drop_reason'``.
    """
    if not __is_contain_color_background_rect(page, text_blocks, image_bboxes):
        return True, None

    drop_info = {
        '_need_drop': True,
        '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
    }
    return False, drop_info
|
File without changes
|
@@ -1,101 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
|
4
|
-
from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
|
5
|
-
calculate_overlap_area_2_minbox_area_ratio)
|
6
|
-
|
7
|
-
|
8
|
-
def __area(box):
    """Return the area of an axis-aligned box.

    Args:
        box: Indexable of four numbers laid out as (x0, y0, x1, y1).

    Returns:
        (x1 - x0) * (y1 - y0); negative when the corners are swapped on
        exactly one axis, since nothing is normalised.
    """
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    return (x1 - x0) * (y1 - y0)
|
10
|
-
|
11
|
-
|
12
|
-
def rectangle_position_determination(rect, p_width):
    """Tell whether a rectangle lies on or near the page's vertical centre line.

    Args:
        rect (list): Rectangle coordinates as [x1, y1, x2, y2].
        p_width (int): Page width.

    Returns:
        bool: True when the rectangle straddles the centre line, or when the
        horizontal gap between the centre line and the rectangle's nearer
        edge is less than 20% of the page width; False otherwise.
    """
    center_x = p_width / 2
    left, right = rect[0], rect[2]

    # A rectangle crossing the centre line always qualifies.
    if left < center_x < right:
        return True

    # Otherwise measure the gap on the side facing the centre line only.
    gap = left - center_x if left > center_x else center_x - right
    return gap < p_width * 0.2
|
36
|
-
|
37
|
-
|
38
|
-
def remove_colored_strip_textblock(remain_text_blocks, page):
    """Move text blocks that sit on a colored header strip out of the list.

    A drawing from ``page.get_cdrawings()`` counts as a colored strip when it
    has a non-white fill, an area above 100, passes
    ``rectangle_position_determination``, ends above 30% of the page height
    and is more than four times as wide as it is tall. A text block is
    dropped when it is inside such a strip (``_is_in``) or overlaps it with
    an overlap/min-box area ratio above 0.6.

    Args:
        remain_text_blocks (list): Text blocks (dicts with ``'bbox'``);
            modified in place.
        page (Page): Page object providing ``rect`` and ``get_cdrawings()``.

    Returns:
        tuple: ``(remain_text_blocks, colored_strip_textblocks)`` -- the same
        list object with the dropped blocks taken out, and the dropped
        blocks, each tagged ``COLOR_BG_HEADER_TXT_BLOCK`` and listed once.
    """
    colored_strip_textblocks = []
    if not remain_text_blocks:
        return remain_text_blocks, colored_strip_textblocks

    p_width, p_height = page.rect.width, page.rect.height

    colored_strip_bg_rect = []
    for drawing in page.get_cdrawings():
        rect = drawing['rect']
        # Drawings without a fill (transparent) or with a white fill are ignored.
        is_filled = (
            'fill' in drawing
            and drawing['fill']
            and drawing['fill'] != (1.0, 1.0, 1.0)
        )
        if not is_filled:
            continue
        if not __area(rect) > 100:  # ignore very small rectangles
            continue
        if not rectangle_position_determination(rect, p_width):
            continue
        if not rect[3] < p_height * 0.3:  # bottom edge within the top 30%
            continue
        if not (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4:  # aspect > 4
            continue
        colored_strip_bg_rect.append(rect)

    # A block overlapping several strips used to be reported once per strip;
    # remember what was already flagged so each block is returned only once.
    flagged_ids = set()
    for strip_bbox in colored_strip_bg_rect:
        for text_block in remain_text_blocks:
            if id(text_block) in flagged_ids:
                continue
            text_bbox = text_block['bbox']
            on_strip = _is_in(text_bbox, strip_bbox) or (
                _is_in_or_part_overlap(text_bbox, strip_bbox)
                and calculate_overlap_area_2_minbox_area_ratio(text_bbox, strip_bbox)
                > 0.6
            )
            if on_strip:
                logger.info(
                    f'remove_colored_strip_textblock: {text_bbox}, {strip_bbox}'
                )
                text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                colored_strip_textblocks.append(text_block)
                flagged_ids.add(id(text_block))

    if flagged_ids:
        # Slice assignment keeps the caller's list object.
        remain_text_blocks[:] = [
            blk for blk in remain_text_blocks if id(blk) not in flagged_ids
        ]

    return remain_text_blocks, colored_strip_textblocks
|
@@ -1,114 +0,0 @@
|
|
1
|
-
import re
|
2
|
-
|
3
|
-
from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
|
4
|
-
from magic_pdf.libs.boxbase import _is_in_or_part_overlap
|
5
|
-
|
6
|
-
|
7
|
-
def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """Strip header, footer and page-number content from one page.

    Spans are deleted first, then lines left without spans, and a text block
    left without lines is tagged ``CONTENT_IN_FOOT_OR_HEADER`` and moved to
    the remove list. Page numbers are tagged ``PAGE_NO`` at span level; a
    block consisting of a single page-number span is removed as a whole.

    Args:
        text_raw_blocks: List of text block dicts (``'bbox'``, ``'lines'``;
            each line has ``'spans'``; each span has ``'bbox'`` and
            ``'text'``). Modified in place.
        image_bboxes: Image boxes of the page.
        table_bboxes: Table boxes of the page.
        header_bboxs: Detected header boxes (may be empty).
        footer_bboxs: Detected footer boxes (may be empty).
        page_no_bboxs: Detected page-number boxes (may be empty).
        page_w: Page width.
        page_h: Page height.

    Returns:
        Tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``.
        ``text_block_remain`` is the ``text_raw_blocks`` list object itself.
    """
    # Only the vertical extent of the detected regions is used: the header
    # zone ends at the lowest header edge, the footer zone starts at the
    # highest footer edge.
    header_y0 = max(b[3] for b in header_bboxs) if header_bboxs else 0
    footer_y0 = min(b[1] for b in footer_bboxs) if footer_bboxs else page_h

    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    def _in_header_or_footer(span):
        """Return True when the span lies in the header or footer zone."""
        bbox = span['bbox']
        return (
            bbox[3] < header_y0
            or _is_in_or_part_overlap(bbox, header)
            or _is_in_or_part_overlap(bbox, footer)
        )

    # The zones are known; now delete. The earlier implementation reset its
    # "to delete" lists on every iteration and only tagged blocks that were
    # empty before any deletion, so most deletions were lost.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Slice assignment keeps the original list objects alive.
            line['spans'][:] = [
                span for span in line['spans'] if not _in_header_or_footer(span)
            ]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        if not blk['lines']:
            blk['tag'] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # A page number can be so small that it still overlaps the content area
    # slightly and ends up in the body, so page numbers are handled at span
    # granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if _is_in_or_part_overlap(pagenobox, span['bbox']):
                            span['tag'] = PAGE_NO
                            # A block made of just this one span is removed
                            # as a whole.
                            if len(line['spans']) == 1 and len(block['lines']) == 1:
                                page_no_block_2_remove.append(block)
    elif text_raw_blocks:
        # No detected page number: test whether the lowest block is one. The
        # rule is a single line with a single span whose text contains a
        # digit and no ASCII letter.
        # NOTE: this sort reorders the caller's list (largest y0 first); it
        # is kept because the returned block order has always depended on it.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                text = last_span['text']
                if (
                    text.strip()
                    and not re.search('[a-zA-Z]', text)
                    and re.search('[0-9]', text)
                ):
                    last_span['tag'] = PAGE_NO
                    page_no_block_2_remove.append(last_block)

    # Merge both remove lists without reporting the same block twice.
    removed_ids = {id(blk) for blk in text_block_to_remove}
    for blk in page_no_block_2_remove:
        if id(blk) not in removed_ids:
            removed_ids.add(id(blk))
            text_block_to_remove.append(blk)

    if removed_ids:
        text_raw_blocks[:] = [
            blk for blk in text_raw_blocks if id(blk) not in removed_ids
        ]
    text_block_remain = text_raw_blocks

    # Images and tables stay when they touch the content area.
    image_bbox_remain, image_bbox_to_remove = [], []
    for bbox in image_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            image_bbox_remain.append(bbox)
        else:
            image_bbox_to_remove.append(bbox)

    table_bbox_remain, table_bbox_to_remove = [], []
    for bbox in table_bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            table_bbox_remain.append(bbox)
        else:
            table_bbox_to_remove.append(bbox)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
|
@@ -1,236 +0,0 @@
|
|
1
|
-
import math
|
2
|
-
import re
|
3
|
-
|
4
|
-
from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
|
5
|
-
VERTICAL_TEXT)
|
6
|
-
from magic_pdf.libs.boxbase import is_vbox_on_side
|
7
|
-
|
8
|
-
|
9
|
-
def detect_non_horizontal_texts(result_dict):
    """Flag repeated slanted blocks as watermarks and vertical ones as margin notes.

    Blocks are grouped by their (bbox, text) pair. A pair whose writing
    direction is slanted (between 5 and 85 degrees, exclusive) on more than
    half of the pages is treated as a watermark; a pair whose direction is
    near vertical (between 85 and 105 degrees, exclusive) on more than half
    of the pages is treated as a vertical margin note.

    Parameters
    ----------
    result_dict : dict
        Maps ``'page_*'`` keys to page dicts, which in turn map ``'block_*'``
        keys to block dicts with ``'bbox'``, ``'text'`` and optionally
        ``'dir'`` (a direction vector). Other keys are ignored.

    Returns
    -------
    result_dict : dict
        The same dictionary; every page entry that is a dict with ``'bbox'``
        and ``'text'`` gains ``'is_watermark'`` and
        ``'is_vertical_margin_note'`` set to 0 or 1.
    """

    def _key(block_data):
        # bbox is turned into a tuple so that list-valued boxes are hashable.
        return tuple(block_data['bbox']), block_data['text']

    potential_watermarks = {}
    potential_margin_notes = {}
    page_count = 0

    for page_id, page_content in result_dict.items():
        if not page_id.startswith('page_'):
            continue
        page_count += 1
        for block_id, block_data in page_content.items():
            if not block_id.startswith('block_') or 'dir' not in block_data:
                continue
            coordinates_text = _key(block_data)

            direction = block_data['dir']
            angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

            if 5 < angle < 85:  # slanted text: watermark candidate
                potential_watermarks[coordinates_text] = (
                    potential_watermarks.get(coordinates_text, 0) + 1
                )
            if 85 < angle < 105:  # near-vertical text: margin-note candidate
                potential_margin_notes[coordinates_text] = (
                    potential_margin_notes.get(coordinates_text, 0) + 1
                )

    # "More than half of the pages": only page entries are counted, so extra
    # bookkeeping keys in result_dict no longer raise the threshold.
    threshold = page_count // 2
    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith('page_'):
            continue
        for block_data in blocks.values():
            # Page dicts may hold entries that are not blocks; skip anything
            # that cannot be keyed instead of failing on it.
            if not (
                isinstance(block_data, dict)
                and 'bbox' in block_data
                and 'text' in block_data
            ):
                continue
            coordinates_text = _key(block_data)
            block_data['is_watermark'] = 1 if coordinates_text in watermarks else 0
            block_data['is_vertical_margin_note'] = (
                1 if coordinates_text in margin_notes else 0
            )

    return result_dict
|
93
|
-
|
94
|
-
|
95
|
-
"""
|
96
|
-
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
|
97
|
-
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
|
98
|
-
"""
|
99
|
-
|
100
|
-
|
101
|
-
def __is_a_word(sentence):
    """Tell whether ``sentence`` is a single short token.

    Args:
        sentence: String to test.

    Returns:
        True when the string is exactly one character in the range
        U+4E00..U+9FA5, or consists of one or two ASCII letters/digits;
        False otherwise. Punctuation and the empty string do not match.
    """
    # Exactly one CJK character.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
        return True
    # One or two ASCII alphanumerics.
    return (
        re.fullmatch(r'[a-zA-Z0-9]+', sentence) is not None
        and len(sentence) <= 2
    )
|
110
|
-
|
111
|
-
|
112
|
-
def __get_text_color(num):
    """Split a packed integer colour into its RGB components.

    Args:
        num: Integer whose bits 16-23, 8-15 and 0-7 hold red, green and blue.

    Returns:
        Tuple ``(red, green, blue)``, each in 0..255. Bits above bit 23 are
        ignored.
    """
    red, green, blue = ((num >> shift) & 255 for shift in (16, 8, 0))
    return red, green, blue
|
118
|
-
|
119
|
-
|
120
|
-
def __is_empty_side_box(text_block):
    """Tell whether a text block carries no visible content.

    A span counts as visible when its text is non-blank and its colour is
    not pure white (the low 24 bits of ``span['color']`` are not all set).

    Args:
        text_block: Dict with ``'lines'``; each line has ``'spans'``; each
            span has an integer ``'color'`` and a string ``'text'``.

    Returns:
        False as soon as one visible span is found, True otherwise
        (including for a block without lines or spans).
    """
    white = 0xFFFFFF
    for line in text_block['lines']:
        for span in line['spans']:
            # Same test as unpacking into (r, g, b) and comparing with
            # (255, 255, 255): only the low 24 bits matter.
            rgb = span['color'] & white
            has_text = len(span['text'].strip()) > 0
            if has_text and rgb != white:
                return False
    return True
|
130
|
-
|
131
|
-
|
132
|
-
def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks found at the page sides.

    Only blocks accepted by ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` are considered. Such a block is removed and tagged
    ``VERTICAL_TEXT`` when it has at least two lines, every line holds exactly
    one span, every span is a single short token (``__is_a_word``) and all
    spans share the same integer x0. Otherwise it is removed and tagged
    ``ROTATE_TEXT`` when any of its lines has a direction other than (1, 0).

    Args:
        pymu_text_block: List of text block dicts (``'bbox'``, ``'lines'``;
            lines carry ``'dir'`` and ``'spans'``). Modified in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        Tuple ``(pymu_text_block, removed_text_block)``: the same list object
        without the removed blocks, and the removed (tagged) blocks.
    """
    removed_text_block = []

    for block in pymu_text_block:
        lines = block['lines']
        if not is_vbox_on_side(block['bbox'], page_width, page_height, 0.2):
            continue  # only blocks at the page sides are candidates

        one_token_per_line = (
            len(lines) > 1
            and all(len(line['spans']) == 1 for line in lines)
            and all(__is_a_word(line['spans'][0]['text']) for line in lines)
        )
        if one_token_per_line:
            # Stacked vertically: every token starts at the same integer x0.
            x0_values = {int(line['spans'][0]['bbox'][0]) for line in lines}
            if len(x0_values) == 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # One non-horizontal line is enough to drop the whole block. The
        # direction is normalised to a tuple first: a list such as [1, 0]
        # never compares equal to (1, 0), which flagged every such block.
        if any(tuple(line['dir']) != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    if removed_text_block:
        removed_ids = {id(block) for block in removed_text_block}
        # Slice assignment keeps the caller's list object.
        pymu_text_block[:] = [
            block for block in pymu_text_block if id(block) not in removed_ids
        ]

    return pymu_text_block, removed_text_block
|
199
|
-
|
200
|
-
|
201
|
-
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left and right body-text limits from the removed side blocks.

    A block whose right edge lies left of the page centre pushes the left
    limit to that edge; every other block pulls the right limit to its left
    edge.

    Args:
        rotate_bbox: Iterable of dicts carrying a ``'bbox'`` entry.
        page_width: Page width.
        page_height: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(left_x + 1, right_x - 1)``; ``(1, page_width - 1)`` when
        ``rotate_bbox`` is empty.
    """
    half_width = page_width / 2
    left_x, right_x = 0, page_width

    for item in rotate_bbox:
        x0, x1 = item['bbox'][0], item['bbox'][2]
        if x1 < half_width:
            left_x = max(left_x, x1)
        else:
            right_x = min(right_x, x0)

    return left_x + 1, right_x - 1
|
213
|
-
|
214
|
-
|
215
|
-
def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank text blocks found at the page sides.

    A block is removed when ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` accepts it and ``__is_empty_side_box`` reports it as empty.

    Args:
        pymu_text_block: List of text block dicts (``'bbox'``, ``'lines'``).
            Modified in place.
        page_width: Page width.
        page_height: Page height.

    Returns:
        Tuple ``(pymu_text_block, removed_text_block)``: the same list object
        without the removed blocks, and the removed blocks, each tagged
        ``EMPTY_SIDE_BLOCK``.
    """
    removed_text_block = []

    for block in pymu_text_block:
        # Only blocks at the page sides are candidates.
        on_side = is_vbox_on_side(block['bbox'], page_width, page_height, 0.2)
        if on_side and __is_empty_side_box(block):
            block['tag'] = EMPTY_SIDE_BLOCK
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block
|