magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +16 -22
- magic_pdf/filter/pdf_meta_scan.py +5 -19
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_check.py +52 -25
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/pp_structure_v2.py +23 -3
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
- magic_pdf/para/para_split_v3.py +21 -7
- magic_pdf/pdf_parse_union_core_v2.py +134 -146
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,17 @@
|
|
1
1
|
import copy
|
2
2
|
import time
|
3
|
-
|
4
3
|
import cv2
|
5
4
|
import numpy as np
|
5
|
+
|
6
6
|
from paddleocr import PaddleOCR
|
7
|
-
from
|
8
|
-
from
|
9
|
-
from
|
10
|
-
from
|
7
|
+
from ppocr.utils.logging import get_logger
|
8
|
+
from ppocr.utils.utility import alpha_to_color, binarize_img
|
9
|
+
from tools.infer.predict_system import sorted_boxes
|
10
|
+
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
|
11
|
+
|
12
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
|
11
13
|
|
12
|
-
|
14
|
+
logger = get_logger()
|
13
15
|
|
14
16
|
|
15
17
|
class ModifiedPaddleOCR(PaddleOCR):
|
@@ -63,7 +65,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
63
65
|
|
64
66
|
if det and rec:
|
65
67
|
ocr_res = []
|
66
|
-
for
|
68
|
+
for img in imgs:
|
67
69
|
img = preprocess_image(img)
|
68
70
|
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
|
69
71
|
if not dt_boxes and not rec_res:
|
@@ -75,7 +77,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
75
77
|
return ocr_res
|
76
78
|
elif det and not rec:
|
77
79
|
ocr_res = []
|
78
|
-
for
|
80
|
+
for img in imgs:
|
79
81
|
img = preprocess_image(img)
|
80
82
|
dt_boxes, elapse = self.text_detector(img)
|
81
83
|
if dt_boxes is None:
|
@@ -96,7 +98,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
96
98
|
else:
|
97
99
|
ocr_res = []
|
98
100
|
cls_res = []
|
99
|
-
for
|
101
|
+
for img in imgs:
|
100
102
|
if not isinstance(img, list):
|
101
103
|
img = preprocess_image(img)
|
102
104
|
img = [img]
|
@@ -2,8 +2,8 @@ import os
|
|
2
2
|
|
3
3
|
import cv2
|
4
4
|
import numpy as np
|
5
|
-
from
|
6
|
-
from
|
5
|
+
from ppstructure.table.predict_table import TableSystem
|
6
|
+
from ppstructure.utility import init_args
|
7
7
|
from PIL import Image
|
8
8
|
|
9
9
|
from magic_pdf.config.constants import * # noqa: F403
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
import copy
|
2
2
|
|
3
|
+
from loguru import logger
|
4
|
+
|
3
5
|
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
|
4
6
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
7
|
+
from magic_pdf.libs.language import detect_lang
|
5
8
|
|
6
9
|
LINE_STOP_FLAG = (
|
7
10
|
'.',
|
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
|
|
125
128
|
|
126
129
|
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
127
130
|
lines_text_list.append(line_text)
|
131
|
+
block_text = ''.join(lines_text_list)
|
132
|
+
block_lang = detect_lang(block_text)
|
133
|
+
# logger.info(f"block_lang: {block_lang}")
|
128
134
|
|
129
135
|
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
130
136
|
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
|
|
136
142
|
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
|
137
143
|
right_close_num += 1
|
138
144
|
else:
|
139
|
-
#
|
140
|
-
|
141
|
-
|
142
|
-
if block_weight_radio >= 0.5:
|
145
|
+
# 类中文没有超长单词的情况,可以用统一的阈值
|
146
|
+
if block_lang in ['zh', 'ja', 'ko']:
|
143
147
|
closed_area = 0.26 * block_weight
|
144
148
|
else:
|
145
|
-
|
149
|
+
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
|
150
|
+
# block宽的阈值可以小些,block窄的阈值要大
|
151
|
+
if block_weight_radio >= 0.5:
|
152
|
+
closed_area = 0.26 * block_weight
|
153
|
+
else:
|
154
|
+
closed_area = 0.36 * block_weight
|
146
155
|
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
|
147
156
|
right_not_close_num += 1
|
148
157
|
|
@@ -271,13 +280,18 @@ def __merge_2_text_blocks(block1, block2):
|
|
271
280
|
first_span = first_line['spans'][0]
|
272
281
|
if len(first_span['content']) > 0:
|
273
282
|
span_start_with_num = first_span['content'][0].isdigit()
|
283
|
+
span_start_with_big_char = first_span['content'][0].isupper()
|
274
284
|
if (
|
275
|
-
|
276
|
-
< line_height
|
285
|
+
# 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
|
286
|
+
abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
|
287
|
+
# 上一个block的最后一个span不是以特定符号结尾
|
277
288
|
and not last_span['content'].endswith(LINE_STOP_FLAG)
|
278
289
|
# 两个block宽度差距超过2倍也不合并
|
279
290
|
and abs(block1_weight - block2_weight) < min_block_weight
|
291
|
+
# 下一个block的第一个字符是数字
|
280
292
|
and not span_start_with_num
|
293
|
+
# 下一个block的第一个字符是大写字母
|
294
|
+
and not span_start_with_big_char
|
281
295
|
):
|
282
296
|
if block1['page_num'] != block2['page_num']:
|
283
297
|
for line in block1['lines']:
|
@@ -5,19 +5,18 @@ import time
|
|
5
5
|
from typing import List
|
6
6
|
|
7
7
|
import torch
|
8
|
+
import fitz
|
8
9
|
from loguru import logger
|
9
10
|
|
10
|
-
from magic_pdf.config.drop_reason import DropReason
|
11
11
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
12
12
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
13
13
|
from magic_pdf.data.dataset import Dataset, PageableData
|
14
14
|
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
15
15
|
from magic_pdf.libs.clean_memory import clean_memory
|
16
|
-
from magic_pdf.libs.commons import fitz, get_delta_time
|
17
16
|
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
|
18
17
|
from magic_pdf.libs.convert_utils import dict_to_list
|
19
18
|
from magic_pdf.libs.hash_utils import compute_md5
|
20
|
-
|
19
|
+
|
21
20
|
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
22
21
|
from magic_pdf.model.magic_model import MagicModel
|
23
22
|
|
@@ -31,44 +30,14 @@ try:
|
|
31
30
|
torchtext.disable_torchtext_deprecation_warning()
|
32
31
|
except ImportError:
|
33
32
|
pass
|
34
|
-
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
35
33
|
|
34
|
+
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
36
35
|
from magic_pdf.para.para_split_v3 import para_split
|
37
|
-
from magic_pdf.pre_proc.
|
38
|
-
from magic_pdf.pre_proc.construct_page_dict import \
|
39
|
-
ocr_construct_page_component_v2
|
36
|
+
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
|
40
37
|
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
|
41
|
-
from magic_pdf.pre_proc.
|
42
|
-
|
43
|
-
|
44
|
-
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
45
|
-
ocr_prepare_bboxes_for_layout_split_v2
|
46
|
-
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
47
|
-
fix_block_spans_v2,
|
48
|
-
fix_discarded_block)
|
49
|
-
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
50
|
-
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
51
|
-
remove_overlaps_min_spans)
|
52
|
-
from magic_pdf.pre_proc.resolve_bbox_conflict import \
|
53
|
-
check_useful_block_horizontal_overlap
|
54
|
-
|
55
|
-
|
56
|
-
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
|
57
|
-
useful_blocks = []
|
58
|
-
for bbox in all_bboxes:
|
59
|
-
useful_blocks.append({'bbox': bbox[:4]})
|
60
|
-
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
|
61
|
-
check_useful_block_horizontal_overlap(useful_blocks)
|
62
|
-
)
|
63
|
-
if is_useful_block_horz_overlap:
|
64
|
-
logger.warning(
|
65
|
-
f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
|
66
|
-
) # noqa: E501
|
67
|
-
for bbox in all_bboxes.copy():
|
68
|
-
if smaller_bbox == bbox[:4]:
|
69
|
-
all_bboxes.remove(bbox)
|
70
|
-
|
71
|
-
return is_useful_block_horz_overlap, all_bboxes
|
38
|
+
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
|
39
|
+
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
|
40
|
+
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
|
72
41
|
|
73
42
|
|
74
43
|
def __replace_STX_ETX(text_str: str):
|
@@ -88,52 +57,67 @@ def __replace_STX_ETX(text_str: str):
|
|
88
57
|
return text_str
|
89
58
|
|
90
59
|
|
91
|
-
def
|
92
|
-
|
93
|
-
|
60
|
+
def __replace_0xfffd(text_str: str):
|
61
|
+
"""Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
|
62
|
+
if text_str:
|
63
|
+
s = text_str.replace('\ufffd', " ")
|
64
|
+
return s
|
65
|
+
return text_str
|
94
66
|
|
67
|
+
def chars_to_content(span):
|
68
|
+
# 检查span中的char是否为空
|
69
|
+
if len(span['chars']) == 0:
|
70
|
+
pass
|
71
|
+
# span['content'] = ''
|
72
|
+
else:
|
95
73
|
# 先给chars按char['bbox']的中心点的x坐标排序
|
96
74
|
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
|
97
|
-
content = ''
|
98
75
|
|
99
76
|
# 求char的平均宽度
|
100
|
-
|
101
|
-
|
102
|
-
del span['chars']
|
103
|
-
return
|
104
|
-
else:
|
105
|
-
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
|
106
|
-
char_avg_width = char_width_sum / len(span['chars'])
|
77
|
+
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
|
78
|
+
char_avg_width = char_width_sum / len(span['chars'])
|
107
79
|
|
80
|
+
content = ''
|
108
81
|
for char in span['chars']:
|
109
82
|
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
|
110
83
|
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
|
111
84
|
content += ' '
|
112
85
|
content += char['c']
|
113
|
-
|
114
|
-
|
86
|
+
|
87
|
+
span['content'] = __replace_0xfffd(content)
|
88
|
+
|
89
|
+
del span['chars']
|
115
90
|
|
116
91
|
|
117
92
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
93
|
+
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
|
94
|
+
|
95
|
+
|
118
96
|
def fill_char_in_spans(spans, all_chars):
|
119
97
|
|
98
|
+
# 简单从上到下排一下序
|
99
|
+
spans = sorted(spans, key=lambda x: x['bbox'][1])
|
100
|
+
|
120
101
|
for char in all_chars:
|
121
102
|
for span in spans:
|
122
|
-
|
123
|
-
if char['c'] in LINE_STOP_FLAG:
|
124
|
-
char_is_line_stop_flag = True
|
125
|
-
else:
|
126
|
-
char_is_line_stop_flag = False
|
127
|
-
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
|
103
|
+
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
|
128
104
|
span['chars'].append(char)
|
129
105
|
break
|
130
106
|
|
107
|
+
empty_spans = []
|
108
|
+
|
131
109
|
for span in spans:
|
132
110
|
chars_to_content(span)
|
111
|
+
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
|
112
|
+
if len(span['content']) * span['height'] < span['width'] * 0.5:
|
113
|
+
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
|
114
|
+
empty_spans.append(span)
|
115
|
+
del span['height'], span['width']
|
116
|
+
return empty_spans
|
133
117
|
|
134
118
|
|
135
119
|
# 使用鲁棒性更强的中心点坐标判断
|
136
|
-
def calculate_char_in_span(char_bbox, span_bbox,
|
120
|
+
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
|
137
121
|
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
|
138
122
|
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
|
139
123
|
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
|
@@ -142,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
142
126
|
if (
|
143
127
|
span_bbox[0] < char_center_x < span_bbox[2]
|
144
128
|
and span_bbox[1] < char_center_y < span_bbox[3]
|
145
|
-
and abs(char_center_y - span_center_y) < span_height
|
129
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
|
146
130
|
):
|
147
131
|
return True
|
148
132
|
else:
|
149
133
|
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
|
150
134
|
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
|
151
|
-
if
|
135
|
+
if char in LINE_STOP_FLAG:
|
152
136
|
if (
|
153
137
|
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
|
154
138
|
and char_center_x > span_bbox[0]
|
155
139
|
and span_bbox[1] < char_center_y < span_bbox[3]
|
156
|
-
and abs(char_center_y - span_center_y) < span_height
|
140
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio
|
141
|
+
):
|
142
|
+
return True
|
143
|
+
elif char in LINE_START_FLAG:
|
144
|
+
if (
|
145
|
+
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
|
146
|
+
and char_center_x < span_bbox[2]
|
147
|
+
and span_bbox[1] < char_center_y < span_bbox[3]
|
148
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio
|
157
149
|
):
|
158
150
|
return True
|
159
151
|
else:
|
@@ -162,48 +154,80 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
162
154
|
|
163
155
|
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
164
156
|
|
157
|
+
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
158
|
+
|
159
|
+
all_pymu_chars = []
|
160
|
+
for block in text_blocks_raw:
|
161
|
+
for line in block['lines']:
|
162
|
+
cosine, sine = line['dir']
|
163
|
+
if abs (cosine) < 0.9 or abs(sine) > 0.1:
|
164
|
+
continue
|
165
|
+
for span in line['spans']:
|
166
|
+
all_pymu_chars.extend(span['chars'])
|
167
|
+
|
168
|
+
# 计算所有sapn的高度的中位数
|
169
|
+
span_height_list = []
|
170
|
+
for span in spans:
|
171
|
+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
172
|
+
continue
|
173
|
+
span_height = span['bbox'][3] - span['bbox'][1]
|
174
|
+
span['height'] = span_height
|
175
|
+
span['width'] = span['bbox'][2] - span['bbox'][0]
|
176
|
+
span_height_list.append(span_height)
|
177
|
+
if len(span_height_list) == 0:
|
178
|
+
return spans
|
179
|
+
else:
|
180
|
+
median_span_height = statistics.median(span_height_list)
|
181
|
+
|
165
182
|
useful_spans = []
|
166
183
|
unuseful_spans = []
|
184
|
+
# 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值
|
185
|
+
vertical_spans = []
|
167
186
|
for span in spans:
|
168
|
-
|
187
|
+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
|
188
|
+
continue
|
189
|
+
for block in all_bboxes + all_discarded_blocks:
|
169
190
|
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
|
170
191
|
continue
|
171
|
-
else:
|
172
|
-
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
173
|
-
useful_spans.append(span)
|
174
|
-
break
|
175
|
-
for block in all_discarded_blocks:
|
176
192
|
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
177
|
-
|
193
|
+
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
|
194
|
+
vertical_spans.append(span)
|
195
|
+
elif block in all_bboxes:
|
196
|
+
useful_spans.append(span)
|
197
|
+
else:
|
198
|
+
unuseful_spans.append(span)
|
199
|
+
|
178
200
|
break
|
179
201
|
|
180
|
-
|
202
|
+
"""垂直的span框直接用pymu的line进行填充"""
|
203
|
+
if len(vertical_spans) > 0:
|
204
|
+
text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
205
|
+
all_pymu_lines = []
|
206
|
+
for block in text_blocks:
|
207
|
+
for line in block['lines']:
|
208
|
+
all_pymu_lines.append(line)
|
181
209
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
210
|
+
for pymu_line in all_pymu_lines:
|
211
|
+
for span in vertical_spans:
|
212
|
+
if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
|
213
|
+
for pymu_span in pymu_line['spans']:
|
214
|
+
span['content'] += pymu_span['text']
|
215
|
+
break
|
188
216
|
|
189
|
-
|
217
|
+
for span in vertical_spans:
|
218
|
+
if len(span['content']) == 0:
|
219
|
+
spans.remove(span)
|
190
220
|
|
191
|
-
|
192
|
-
|
193
|
-
span['chars'] = []
|
194
|
-
new_spans.append(span)
|
221
|
+
"""水平的span框如果没有char则用ocr进行填充"""
|
222
|
+
new_spans = []
|
195
223
|
|
196
|
-
for span in unuseful_spans:
|
224
|
+
for span in useful_spans + unuseful_spans:
|
197
225
|
if span['type'] in [ContentType.Text]:
|
198
226
|
span['chars'] = []
|
199
227
|
new_spans.append(span)
|
200
228
|
|
201
|
-
fill_char_in_spans(new_spans, all_pymu_chars)
|
229
|
+
empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
|
202
230
|
|
203
|
-
empty_spans = []
|
204
|
-
for span in new_spans:
|
205
|
-
if len(span['content']) == 0:
|
206
|
-
empty_spans.append(span)
|
207
231
|
if len(empty_spans) > 0:
|
208
232
|
|
209
233
|
# 初始化ocr模型
|
@@ -216,52 +240,19 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
216
240
|
)
|
217
241
|
|
218
242
|
for span in empty_spans:
|
219
|
-
|
220
|
-
# 对span的bbox截图
|
243
|
+
# 对span的bbox截图再ocr
|
221
244
|
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
|
222
245
|
ocr_res = ocr_model.ocr(span_img, det=False)
|
223
|
-
# logger.info(f"ocr_res: {ocr_res}")
|
224
|
-
# logger.info(f"empty_span: {span}")
|
225
246
|
if ocr_res and len(ocr_res) > 0:
|
226
247
|
if len(ocr_res[0]) > 0:
|
227
248
|
ocr_text, ocr_score = ocr_res[0][0]
|
249
|
+
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
|
228
250
|
if ocr_score > 0.5 and len(ocr_text) > 0:
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
251
|
+
span['content'] = ocr_text
|
252
|
+
span['score'] = ocr_score
|
253
|
+
else:
|
254
|
+
spans.remove(span)
|
233
255
|
|
234
|
-
|
235
|
-
def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
|
236
|
-
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
|
237
|
-
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
|
238
|
-
'blocks'
|
239
|
-
]
|
240
|
-
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
|
241
|
-
text_blocks = replace_equations_in_textblock(
|
242
|
-
text_blocks, inline_equations, interline_equations
|
243
|
-
)
|
244
|
-
text_blocks = remove_citation_marker(text_blocks)
|
245
|
-
text_blocks = remove_chars_in_text_blocks(text_blocks)
|
246
|
-
spans = []
|
247
|
-
for v in text_blocks:
|
248
|
-
for line in v['lines']:
|
249
|
-
for span in line['spans']:
|
250
|
-
bbox = span['bbox']
|
251
|
-
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
|
252
|
-
continue
|
253
|
-
if span.get('type') not in (
|
254
|
-
ContentType.InlineEquation,
|
255
|
-
ContentType.InterlineEquation,
|
256
|
-
):
|
257
|
-
spans.append(
|
258
|
-
{
|
259
|
-
'bbox': list(span['bbox']),
|
260
|
-
'content': __replace_STX_ETX(span['text']),
|
261
|
-
'type': ContentType.Text,
|
262
|
-
'score': 1.0,
|
263
|
-
}
|
264
|
-
)
|
265
256
|
return spans
|
266
257
|
|
267
258
|
|
@@ -682,6 +673,23 @@ def parse_page_core(
|
|
682
673
|
"""顺便删除大水印并保留abandon的span"""
|
683
674
|
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
|
684
675
|
|
676
|
+
"""删除重叠spans中置信度较低的那些"""
|
677
|
+
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
678
|
+
"""删除重叠spans中较小的那些"""
|
679
|
+
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
680
|
+
|
681
|
+
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
682
|
+
if parse_mode == SupportedPdfParseMethod.TXT:
|
683
|
+
|
684
|
+
"""使用新版本的混合ocr方案"""
|
685
|
+
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
686
|
+
|
687
|
+
elif parse_mode == SupportedPdfParseMethod.OCR:
|
688
|
+
pass
|
689
|
+
else:
|
690
|
+
raise Exception('parse_mode must be txt or ocr')
|
691
|
+
|
692
|
+
|
685
693
|
"""先处理不需要排版的discarded_blocks"""
|
686
694
|
discarded_block_with_spans, spans = fill_spans_in_blocks(
|
687
695
|
all_discarded_blocks, spans, 0.4
|
@@ -706,26 +714,6 @@ def parse_page_core(
|
|
706
714
|
drop_reason,
|
707
715
|
)
|
708
716
|
|
709
|
-
"""删除重叠spans中置信度较低的那些"""
|
710
|
-
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
|
711
|
-
"""删除重叠spans中较小的那些"""
|
712
|
-
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
|
713
|
-
|
714
|
-
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
|
715
|
-
if parse_mode == SupportedPdfParseMethod.TXT:
|
716
|
-
|
717
|
-
"""之前的公式替换方案"""
|
718
|
-
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
|
719
|
-
# spans = replace_text_span(pymu_spans, spans)
|
720
|
-
|
721
|
-
"""ocr 中文本类的 span 用 pymu spans 替换!"""
|
722
|
-
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
|
723
|
-
|
724
|
-
elif parse_mode == SupportedPdfParseMethod.OCR:
|
725
|
-
pass
|
726
|
-
else:
|
727
|
-
raise Exception('parse_mode must be txt or ocr')
|
728
|
-
|
729
717
|
"""对image和table截图"""
|
730
718
|
spans = ocr_cut_image_and_table(
|
731
719
|
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
|
@@ -811,7 +799,7 @@ def pdf_parse_union(
|
|
811
799
|
if debug_mode:
|
812
800
|
time_now = time.time()
|
813
801
|
logger.info(
|
814
|
-
f'page_id: {page_id}, last_page_cost_time: {
|
802
|
+
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
|
815
803
|
)
|
816
804
|
start_time = time_now
|
817
805
|
|
@@ -1,58 +1,3 @@
|
|
1
|
-
def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
|
2
|
-
interline_eq_info, raw_pymu_blocks,
|
3
|
-
removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
|
4
|
-
layout_tree,
|
5
|
-
page_w, page_h, footnote_bboxes_tmp):
|
6
|
-
"""
|
7
|
-
|
8
|
-
"""
|
9
|
-
return_dict = {}
|
10
|
-
|
11
|
-
return_dict['para_blocks'] = {}
|
12
|
-
return_dict['preproc_blocks'] = text_blocks_preproc
|
13
|
-
return_dict['images'] = image_info
|
14
|
-
return_dict['tables'] = table_info
|
15
|
-
return_dict['interline_equations'] = interline_eq_info
|
16
|
-
return_dict['inline_equations'] = inline_eq_info
|
17
|
-
return_dict['layout_bboxes'] = layout_bboxes
|
18
|
-
return_dict['pymu_raw_blocks'] = raw_pymu_blocks
|
19
|
-
return_dict['global_statistic'] = {}
|
20
|
-
|
21
|
-
return_dict['droped_text_block'] = removed_text_blocks
|
22
|
-
return_dict['droped_image_block'] = removed_image_blocks
|
23
|
-
return_dict['droped_table_block'] = []
|
24
|
-
return_dict['image_backup'] = images_backup
|
25
|
-
return_dict['table_backup'] = []
|
26
|
-
return_dict['page_idx'] = page_id
|
27
|
-
return_dict['page_size'] = [page_w, page_h]
|
28
|
-
return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用
|
29
|
-
return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
|
30
|
-
|
31
|
-
return return_dict
|
32
|
-
|
33
|
-
|
34
|
-
def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
35
|
-
images, tables, interline_equations, inline_equations,
|
36
|
-
dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
|
37
|
-
need_remove_spans_bboxes_dict):
|
38
|
-
return_dict = {
|
39
|
-
'preproc_blocks': blocks,
|
40
|
-
'layout_bboxes': layout_bboxes,
|
41
|
-
'page_idx': page_id,
|
42
|
-
'page_size': [page_w, page_h],
|
43
|
-
'_layout_tree': layout_tree,
|
44
|
-
'images': images,
|
45
|
-
'tables': tables,
|
46
|
-
'interline_equations': interline_equations,
|
47
|
-
'inline_equations': inline_equations,
|
48
|
-
'droped_text_block': dropped_text_block,
|
49
|
-
'droped_image_block': dropped_image_block,
|
50
|
-
'droped_table_block': dropped_table_block,
|
51
|
-
'dropped_equation_block': dropped_equation_block,
|
52
|
-
'droped_bboxes': need_remove_spans_bboxes_dict,
|
53
|
-
}
|
54
|
-
return return_dict
|
55
|
-
|
56
1
|
|
57
2
|
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
58
3
|
images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
|
magic_pdf/pre_proc/cut_image.py
CHANGED
@@ -25,43 +25,6 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
|
|
25
25
|
return spans
|
26
26
|
|
27
27
|
|
28
|
-
def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
|
29
|
-
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
|
30
|
-
equation_inline_bboxes: list,
|
31
|
-
equation_interline_bboxes: list, imageWriter) -> dict:
|
32
|
-
"""返回一个dict, key为bbox, 值是图片地址."""
|
33
|
-
image_info = []
|
34
|
-
image_backup_info = []
|
35
|
-
table_info = []
|
36
|
-
inline_eq_info = []
|
37
|
-
interline_eq_info = []
|
38
|
-
|
39
|
-
# 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
|
40
|
-
|
41
|
-
def return_path(type):
|
42
|
-
return join_path(pdf_bytes_md5, type)
|
43
|
-
|
44
|
-
for bbox in image_bboxes:
|
45
|
-
if not check_img_bbox(bbox):
|
46
|
-
continue
|
47
|
-
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
48
|
-
image_info.append({'bbox': bbox, 'image_path': image_path})
|
49
|
-
|
50
|
-
for bbox in images_overlap_backup:
|
51
|
-
if not check_img_bbox(bbox):
|
52
|
-
continue
|
53
|
-
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
54
|
-
image_backup_info.append({'bbox': bbox, 'image_path': image_path})
|
55
|
-
|
56
|
-
for bbox in table_bboxes:
|
57
|
-
if not check_img_bbox(bbox):
|
58
|
-
continue
|
59
|
-
image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
|
60
|
-
table_info.append({'bbox': bbox, 'image_path': image_path})
|
61
|
-
|
62
|
-
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
|
63
|
-
|
64
|
-
|
65
28
|
def check_img_bbox(bbox) -> bool:
|
66
29
|
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
|
67
30
|
logger.warning(f'image_bboxes: 错误的box, {bbox}')
|