magic-pdf 0.10.2__py3-none-any.whl → 0.10.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +24 -19
- magic_pdf/filter/pdf_meta_scan.py +2 -2
- magic_pdf/libs/pdf_check.py +52 -25
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/pp_structure_v2.py +23 -3
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +47 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +8 -6
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
- magic_pdf/para/para_split_v3.py +14 -5
- magic_pdf/pdf_parse_union_core_v2.py +46 -31
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +2 -2
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/METADATA +1 -2
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/RECORD +17 -17
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.2.dist-info → magic_pdf-0.10.4.dist-info}/top_level.txt +0 -0
@@ -30,6 +30,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
|
30
30
|
for page_info in pdf_info_dict:
|
31
31
|
paras_of_layout = page_info.get('para_blocks')
|
32
32
|
if not paras_of_layout:
|
33
|
+
markdown_with_para_and_pagination.append({
|
34
|
+
'page_no':
|
35
|
+
page_no,
|
36
|
+
'md_content':
|
37
|
+
'',
|
38
|
+
})
|
39
|
+
page_no += 1
|
33
40
|
continue
|
34
41
|
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
35
42
|
paras_of_layout, 'mm', img_buket_path)
|
@@ -129,21 +136,19 @@ def __replace_ligatures(text: str):
|
|
129
136
|
|
130
137
|
|
131
138
|
def merge_para_with_text(para_block):
|
139
|
+
block_text = ''
|
140
|
+
for line in para_block['lines']:
|
141
|
+
for span in line['spans']:
|
142
|
+
if span['type'] in [ContentType.Text]:
|
143
|
+
block_text += span['content']
|
144
|
+
block_lang = detect_lang(block_text)
|
145
|
+
|
132
146
|
para_text = ''
|
133
147
|
for i, line in enumerate(para_block['lines']):
|
134
148
|
|
135
149
|
if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
|
136
150
|
para_text += ' \n'
|
137
151
|
|
138
|
-
line_text = ''
|
139
|
-
line_lang = ''
|
140
|
-
for span in line['spans']:
|
141
|
-
span_type = span['type']
|
142
|
-
if span_type == ContentType.Text:
|
143
|
-
line_text += span['content'].strip()
|
144
|
-
|
145
|
-
if line_text != '':
|
146
|
-
line_lang = detect_lang(line_text)
|
147
152
|
for j, span in enumerate(line['spans']):
|
148
153
|
|
149
154
|
span_type = span['type']
|
@@ -156,20 +161,20 @@ def merge_para_with_text(para_block):
|
|
156
161
|
content = f"\n$$\n{span['content']}\n$$\n"
|
157
162
|
|
158
163
|
content = content.strip()
|
159
|
-
|
164
|
+
|
165
|
+
if content:
|
160
166
|
langs = ['zh', 'ja', 'ko']
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
167
|
+
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
168
|
+
if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔
|
169
|
+
if j == len(line['spans']) - 1:
|
170
|
+
para_text += content
|
171
|
+
else:
|
172
|
+
para_text += f'{content} '
|
166
173
|
else:
|
167
174
|
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
168
175
|
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
169
|
-
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
|
176
|
+
if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
|
170
177
|
para_text += content[:-1]
|
171
|
-
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
|
172
|
-
para_text += content
|
173
178
|
else: # 西方文本语境下 content间需要空格分隔
|
174
179
|
para_text += f'{content} '
|
175
180
|
elif span_type == ContentType.InterlineEquation:
|
@@ -177,7 +182,7 @@ def merge_para_with_text(para_block):
|
|
177
182
|
else:
|
178
183
|
continue
|
179
184
|
# 连写字符拆分
|
180
|
-
para_text = __replace_ligatures(para_text)
|
185
|
+
# para_text = __replace_ligatures(para_text)
|
181
186
|
|
182
187
|
return para_text
|
183
188
|
|
@@ -8,7 +8,7 @@ from loguru import logger
|
|
8
8
|
from magic_pdf.config.drop_reason import DropReason
|
9
9
|
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
10
10
|
from magic_pdf.libs.language import detect_lang
|
11
|
-
from magic_pdf.libs.pdf_check import
|
11
|
+
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
|
12
12
|
|
13
13
|
scan_max_page = 50
|
14
14
|
junk_limit_min = 10
|
@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
|
|
323
323
|
|
324
324
|
def check_invalid_chars(pdf_bytes):
|
325
325
|
"""乱码检测."""
|
326
|
-
return
|
326
|
+
return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
327
327
|
|
328
328
|
|
329
329
|
def pdf_meta_scan(pdf_bytes: bytes):
|
magic_pdf/libs/pdf_check.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
from io import BytesIO
|
2
|
-
import re
|
3
1
|
import fitz
|
4
2
|
import numpy as np
|
5
3
|
from loguru import logger
|
6
|
-
|
4
|
+
# import re
|
5
|
+
# from io import BytesIO
|
6
|
+
# from pdfminer.high_level import extract_text
|
7
7
|
|
8
8
|
|
9
9
|
def calculate_sample_count(total_page: int):
|
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
|
|
14
14
|
return select_page_cnt
|
15
15
|
|
16
16
|
|
17
|
-
def extract_pages(src_pdf_bytes: bytes):
|
17
|
+
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
|
18
18
|
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
19
19
|
total_page = len(pdf_docs)
|
20
20
|
if total_page == 0:
|
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
|
|
33
33
|
return sample_docs
|
34
34
|
|
35
35
|
|
36
|
-
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
37
|
-
|
38
|
-
|
36
|
+
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
37
|
+
# """"
|
38
|
+
# 检测PDF中是否包含非法字符
|
39
|
+
# """
|
40
|
+
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
41
|
+
# sample_docs = extract_pages(src_pdf_bytes)
|
42
|
+
# sample_pdf_bytes = sample_docs.tobytes()
|
43
|
+
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
44
|
+
# text = extract_text(sample_pdf_file_like_object)
|
45
|
+
# text = text.replace("\n", "")
|
46
|
+
# # logger.info(text)
|
47
|
+
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
48
|
+
# cid_pattern = re.compile(r'\(cid:\d+\)')
|
49
|
+
# matches = cid_pattern.findall(text)
|
50
|
+
# cid_count = len(matches)
|
51
|
+
# cid_len = sum(len(match) for match in matches)
|
52
|
+
# text_len = len(text)
|
53
|
+
# if text_len == 0:
|
54
|
+
# cid_chars_radio = 0
|
55
|
+
# else:
|
56
|
+
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
57
|
+
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
58
|
+
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
59
|
+
# if cid_chars_radio > 0.05:
|
60
|
+
# return False # 乱码文档
|
61
|
+
# else:
|
62
|
+
# return True # 正常文档
|
63
|
+
|
64
|
+
|
65
|
+
def count_replacement_characters(text: str) -> int:
|
66
|
+
"""
|
67
|
+
统计字符串中 0xfffd 字符的数量。
|
39
68
|
"""
|
40
|
-
''
|
69
|
+
return text.count('\ufffd')
|
70
|
+
|
71
|
+
|
72
|
+
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
|
41
73
|
sample_docs = extract_pages(src_pdf_bytes)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
cid_pattern = re.compile(r'\(cid:\d+\)')
|
49
|
-
matches = cid_pattern.findall(text)
|
50
|
-
cid_count = len(matches)
|
51
|
-
cid_len = sum(len(match) for match in matches)
|
52
|
-
text_len = len(text)
|
74
|
+
doc_text = ""
|
75
|
+
for page in sample_docs:
|
76
|
+
page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
|
77
|
+
doc_text += page_text
|
78
|
+
text_len = len(doc_text)
|
79
|
+
uffd_count = count_replacement_characters(doc_text)
|
53
80
|
if text_len == 0:
|
54
|
-
|
81
|
+
uffd_chars_radio = 0
|
55
82
|
else:
|
56
|
-
|
57
|
-
logger.info(f"
|
58
|
-
'''当一篇文章存在
|
59
|
-
if
|
83
|
+
uffd_chars_radio = uffd_count / text_len
|
84
|
+
logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
|
85
|
+
'''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
|
86
|
+
if uffd_chars_radio > 0.01:
|
60
87
|
return False # 乱码文档
|
61
88
|
else:
|
62
|
-
return True # 正常文档
|
89
|
+
return True # 正常文档
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.
|
1
|
+
__version__ = "0.10.4"
|
@@ -18,11 +18,31 @@ def region_to_bbox(region):
|
|
18
18
|
|
19
19
|
|
20
20
|
class CustomPaddleModel:
|
21
|
-
def __init__(self,
|
21
|
+
def __init__(self,
|
22
|
+
ocr: bool = False,
|
23
|
+
show_log: bool = False,
|
24
|
+
lang=None,
|
25
|
+
det_db_box_thresh=0.3,
|
26
|
+
use_dilation=True,
|
27
|
+
det_db_unclip_ratio=1.8
|
28
|
+
):
|
22
29
|
if lang is not None:
|
23
|
-
self.model = PPStructure(table=False,
|
30
|
+
self.model = PPStructure(table=False,
|
31
|
+
ocr=True,
|
32
|
+
show_log=show_log,
|
33
|
+
lang=lang,
|
34
|
+
det_db_box_thresh=det_db_box_thresh,
|
35
|
+
use_dilation=use_dilation,
|
36
|
+
det_db_unclip_ratio=det_db_unclip_ratio,
|
37
|
+
)
|
24
38
|
else:
|
25
|
-
self.model = PPStructure(table=False,
|
39
|
+
self.model = PPStructure(table=False,
|
40
|
+
ocr=True,
|
41
|
+
show_log=show_log,
|
42
|
+
det_db_box_thresh=det_db_box_thresh,
|
43
|
+
use_dilation=use_dilation,
|
44
|
+
det_db_unclip_ratio=det_db_unclip_ratio,
|
45
|
+
)
|
26
46
|
|
27
47
|
def __call__(self, img):
|
28
48
|
try:
|
@@ -1,9 +1,55 @@
|
|
1
|
+
import cv2
|
1
2
|
import numpy as np
|
2
3
|
from loguru import logger
|
3
|
-
|
4
|
+
from io import BytesIO
|
5
|
+
from PIL import Image
|
6
|
+
import base64
|
4
7
|
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
|
5
8
|
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
|
6
9
|
|
10
|
+
from ppocr.utils.utility import check_and_read
|
11
|
+
|
12
|
+
|
13
|
+
def img_decode(content: bytes):
|
14
|
+
np_arr = np.frombuffer(content, dtype=np.uint8)
|
15
|
+
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
|
16
|
+
|
17
|
+
|
18
|
+
def check_img(img):
|
19
|
+
if isinstance(img, bytes):
|
20
|
+
img = img_decode(img)
|
21
|
+
if isinstance(img, str):
|
22
|
+
image_file = img
|
23
|
+
img, flag_gif, flag_pdf = check_and_read(image_file)
|
24
|
+
if not flag_gif and not flag_pdf:
|
25
|
+
with open(image_file, 'rb') as f:
|
26
|
+
img_str = f.read()
|
27
|
+
img = img_decode(img_str)
|
28
|
+
if img is None:
|
29
|
+
try:
|
30
|
+
buf = BytesIO()
|
31
|
+
image = BytesIO(img_str)
|
32
|
+
im = Image.open(image)
|
33
|
+
rgb = im.convert('RGB')
|
34
|
+
rgb.save(buf, 'jpeg')
|
35
|
+
buf.seek(0)
|
36
|
+
image_bytes = buf.read()
|
37
|
+
data_base64 = str(base64.b64encode(image_bytes),
|
38
|
+
encoding="utf-8")
|
39
|
+
image_decode = base64.b64decode(data_base64)
|
40
|
+
img_array = np.frombuffer(image_decode, np.uint8)
|
41
|
+
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
|
42
|
+
except:
|
43
|
+
logger.error("error in loading image:{}".format(image_file))
|
44
|
+
return None
|
45
|
+
if img is None:
|
46
|
+
logger.error("error in loading image:{}".format(image_file))
|
47
|
+
return None
|
48
|
+
if isinstance(img, np.ndarray) and len(img.shape) == 2:
|
49
|
+
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|
50
|
+
|
51
|
+
return img
|
52
|
+
|
7
53
|
|
8
54
|
def bbox_to_points(bbox):
|
9
55
|
""" 将bbox格式转换为四个顶点的数组 """
|
@@ -1,15 +1,17 @@
|
|
1
1
|
import copy
|
2
2
|
import time
|
3
|
-
|
4
3
|
import cv2
|
5
4
|
import numpy as np
|
5
|
+
|
6
6
|
from paddleocr import PaddleOCR
|
7
|
-
from
|
8
|
-
from
|
9
|
-
from
|
10
|
-
from
|
7
|
+
from ppocr.utils.logging import get_logger
|
8
|
+
from ppocr.utils.utility import alpha_to_color, binarize_img
|
9
|
+
from tools.infer.predict_system import sorted_boxes
|
10
|
+
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
|
11
|
+
|
12
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
|
11
13
|
|
12
|
-
|
14
|
+
logger = get_logger()
|
13
15
|
|
14
16
|
|
15
17
|
class ModifiedPaddleOCR(PaddleOCR):
|
@@ -2,8 +2,8 @@ import os
|
|
2
2
|
|
3
3
|
import cv2
|
4
4
|
import numpy as np
|
5
|
-
from
|
6
|
-
from
|
5
|
+
from ppstructure.table.predict_table import TableSystem
|
6
|
+
from ppstructure.utility import init_args
|
7
7
|
from PIL import Image
|
8
8
|
|
9
9
|
from magic_pdf.config.constants import * # noqa: F403
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
import copy
|
2
2
|
|
3
|
+
from loguru import logger
|
4
|
+
|
3
5
|
from magic_pdf.config.constants import CROSS_PAGE, LINES_DELETED
|
4
6
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
7
|
+
from magic_pdf.libs.language import detect_lang
|
5
8
|
|
6
9
|
LINE_STOP_FLAG = (
|
7
10
|
'.',
|
@@ -125,6 +128,9 @@ def __is_list_or_index_block(block):
|
|
125
128
|
|
126
129
|
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
127
130
|
lines_text_list.append(line_text)
|
131
|
+
block_text = ''.join(lines_text_list)
|
132
|
+
block_lang = detect_lang(block_text)
|
133
|
+
# logger.info(f"block_lang: {block_lang}")
|
128
134
|
|
129
135
|
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
130
136
|
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
@@ -136,13 +142,16 @@ def __is_list_or_index_block(block):
|
|
136
142
|
if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height:
|
137
143
|
right_close_num += 1
|
138
144
|
else:
|
139
|
-
#
|
140
|
-
|
141
|
-
|
142
|
-
if block_weight_radio >= 0.5:
|
145
|
+
# 类中文没有超长单词的情况,可以用统一的阈值
|
146
|
+
if block_lang in ['zh', 'ja', 'ko']:
|
143
147
|
closed_area = 0.26 * block_weight
|
144
148
|
else:
|
145
|
-
|
149
|
+
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
|
150
|
+
# block宽的阈值可以小些,block窄的阈值要大
|
151
|
+
if block_weight_radio >= 0.5:
|
152
|
+
closed_area = 0.26 * block_weight
|
153
|
+
else:
|
154
|
+
closed_area = 0.36 * block_weight
|
146
155
|
if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
|
147
156
|
right_not_close_num += 1
|
148
157
|
|
@@ -30,22 +30,14 @@ try:
|
|
30
30
|
torchtext.disable_torchtext_deprecation_warning()
|
31
31
|
except ImportError:
|
32
32
|
pass
|
33
|
-
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
34
33
|
|
34
|
+
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
35
35
|
from magic_pdf.para.para_split_v3 import para_split
|
36
|
-
|
37
|
-
from magic_pdf.pre_proc.construct_page_dict import \
|
38
|
-
ocr_construct_page_component_v2
|
36
|
+
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
|
39
37
|
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
|
40
|
-
|
41
|
-
from magic_pdf.pre_proc.
|
42
|
-
|
43
|
-
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
44
|
-
fix_block_spans_v2,
|
45
|
-
fix_discarded_block)
|
46
|
-
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
47
|
-
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
48
|
-
remove_overlaps_min_spans)
|
38
|
+
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
|
39
|
+
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
|
40
|
+
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
|
49
41
|
|
50
42
|
|
51
43
|
def __replace_STX_ETX(text_str: str):
|
@@ -65,10 +57,18 @@ def __replace_STX_ETX(text_str: str):
|
|
65
57
|
return text_str
|
66
58
|
|
67
59
|
|
60
|
+
def __replace_0xfffd(text_str: str):
|
61
|
+
"""Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
|
62
|
+
if text_str:
|
63
|
+
s = text_str.replace('\ufffd', " ")
|
64
|
+
return s
|
65
|
+
return text_str
|
66
|
+
|
68
67
|
def chars_to_content(span):
|
69
68
|
# 检查span中的char是否为空
|
70
69
|
if len(span['chars']) == 0:
|
71
|
-
|
70
|
+
pass
|
71
|
+
# span['content'] = ''
|
72
72
|
else:
|
73
73
|
# 先给chars按char['bbox']的中心点的x坐标排序
|
74
74
|
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
|
@@ -83,22 +83,24 @@ def chars_to_content(span):
|
|
83
83
|
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
|
84
84
|
content += ' '
|
85
85
|
content += char['c']
|
86
|
-
|
86
|
+
|
87
|
+
span['content'] = __replace_0xfffd(content)
|
87
88
|
|
88
89
|
del span['chars']
|
89
90
|
|
90
91
|
|
91
92
|
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
|
93
|
+
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)
|
94
|
+
|
95
|
+
|
92
96
|
def fill_char_in_spans(spans, all_chars):
|
93
97
|
|
98
|
+
# 简单从上到下排一下序
|
99
|
+
spans = sorted(spans, key=lambda x: x['bbox'][1])
|
100
|
+
|
94
101
|
for char in all_chars:
|
95
102
|
for span in spans:
|
96
|
-
|
97
|
-
if char['c'] in LINE_STOP_FLAG:
|
98
|
-
char_is_line_stop_flag = True
|
99
|
-
else:
|
100
|
-
char_is_line_stop_flag = False
|
101
|
-
if calculate_char_in_span(char['bbox'], span['bbox'], char_is_line_stop_flag):
|
103
|
+
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
|
102
104
|
span['chars'].append(char)
|
103
105
|
break
|
104
106
|
|
@@ -106,13 +108,16 @@ def fill_char_in_spans(spans, all_chars):
|
|
106
108
|
|
107
109
|
for span in spans:
|
108
110
|
chars_to_content(span)
|
109
|
-
|
111
|
+
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
|
112
|
+
if len(span['content']) * span['height'] < span['width'] * 0.5:
|
113
|
+
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
|
110
114
|
empty_spans.append(span)
|
115
|
+
del span['height'], span['width']
|
111
116
|
return empty_spans
|
112
117
|
|
113
118
|
|
114
119
|
# 使用鲁棒性更强的中心点坐标判断
|
115
|
-
def calculate_char_in_span(char_bbox, span_bbox,
|
120
|
+
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
|
116
121
|
char_center_x = (char_bbox[0] + char_bbox[2]) / 2
|
117
122
|
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
|
118
123
|
span_center_y = (span_bbox[1] + span_bbox[3]) / 2
|
@@ -121,18 +126,26 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
121
126
|
if (
|
122
127
|
span_bbox[0] < char_center_x < span_bbox[2]
|
123
128
|
and span_bbox[1] < char_center_y < span_bbox[3]
|
124
|
-
and abs(char_center_y - span_center_y) < span_height
|
129
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio # 字符的中轴和span的中轴高度差不能超过1/4span高度
|
125
130
|
):
|
126
131
|
return True
|
127
132
|
else:
|
128
133
|
# 如果char是LINE_STOP_FLAG,就不用中心点判定,换一种方案(左边界在span区域内,高度判定和之前逻辑一致)
|
129
134
|
# 主要是给结尾符号一个进入span的机会,这个char还应该离span右边界较近
|
130
|
-
if
|
135
|
+
if char in LINE_STOP_FLAG:
|
131
136
|
if (
|
132
137
|
(span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
|
133
138
|
and char_center_x > span_bbox[0]
|
134
139
|
and span_bbox[1] < char_center_y < span_bbox[3]
|
135
|
-
and abs(char_center_y - span_center_y) < span_height
|
140
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio
|
141
|
+
):
|
142
|
+
return True
|
143
|
+
elif char in LINE_START_FLAG:
|
144
|
+
if (
|
145
|
+
span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
|
146
|
+
and char_center_x < span_bbox[2]
|
147
|
+
and span_bbox[1] < char_center_y < span_bbox[3]
|
148
|
+
and abs(char_center_y - span_center_y) < span_height * span_height_radio
|
136
149
|
):
|
137
150
|
return True
|
138
151
|
else:
|
@@ -141,12 +154,14 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
|
|
141
154
|
|
142
155
|
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
143
156
|
|
144
|
-
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.
|
157
|
+
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
145
158
|
|
146
|
-
# @todo: 拿到char之后把倾斜角度较大的先删一遍
|
147
159
|
all_pymu_chars = []
|
148
160
|
for block in text_blocks_raw:
|
149
161
|
for line in block['lines']:
|
162
|
+
cosine, sine = line['dir']
|
163
|
+
if abs (cosine) < 0.9 or abs(sine) > 0.1:
|
164
|
+
continue
|
150
165
|
for span in line['spans']:
|
151
166
|
all_pymu_chars.extend(span['chars'])
|
152
167
|
|
@@ -157,6 +172,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
157
172
|
continue
|
158
173
|
span_height = span['bbox'][3] - span['bbox'][1]
|
159
174
|
span['height'] = span_height
|
175
|
+
span['width'] = span['bbox'][2] - span['bbox'][0]
|
160
176
|
span_height_list.append(span_height)
|
161
177
|
if len(span_height_list) == 0:
|
162
178
|
return spans
|
@@ -174,15 +190,13 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
174
190
|
if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
|
175
191
|
continue
|
176
192
|
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
|
177
|
-
if span['height'] > median_span_height * 3 and span['height'] >
|
193
|
+
if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
|
178
194
|
vertical_spans.append(span)
|
179
195
|
elif block in all_bboxes:
|
180
196
|
useful_spans.append(span)
|
181
197
|
else:
|
182
198
|
unuseful_spans.append(span)
|
183
199
|
|
184
|
-
del span['height']
|
185
|
-
|
186
200
|
break
|
187
201
|
|
188
202
|
"""垂直的span框直接用pymu的line进行填充"""
|
@@ -232,6 +246,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
232
246
|
if ocr_res and len(ocr_res) > 0:
|
233
247
|
if len(ocr_res[0]) > 0:
|
234
248
|
ocr_text, ocr_score = ocr_res[0][0]
|
249
|
+
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
|
235
250
|
if ocr_score > 0.5 and len(ocr_text) > 0:
|
236
251
|
span['content'] = ocr_text
|
237
252
|
span['score'] = ocr_score
|
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
|
117
117
|
all_bboxes = remove_overlaps_min_blocks(all_bboxes)
|
118
118
|
all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
|
119
119
|
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
|
120
|
-
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
121
|
-
|
120
|
+
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
|
121
|
+
all_bboxes.sort(key=lambda x: x[0]+x[1])
|
122
122
|
return all_bboxes, all_discarded_blocks
|
123
123
|
|
124
124
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.10.
|
3
|
+
Version: 0.10.4
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -12,7 +12,6 @@ Requires-Dist: click>=8.1.7
|
|
12
12
|
Requires-Dist: fast-langdetect==0.2.0
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
-
Requires-Dist: pdfminer.six==20231228
|
16
15
|
Requires-Dist: pydantic<2.8.0,>=2.7.2
|
17
16
|
Requires-Dist: PyMuPDF>=1.24.9
|
18
17
|
Requires-Dist: scikit-learn>=1.0.2
|
@@ -1,7 +1,7 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256
|
4
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
|
5
5
|
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
6
6
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
|
@@ -27,10 +27,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
27
27
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
28
28
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
29
29
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
30
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
|
31
31
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
32
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
33
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
33
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
|
34
34
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
35
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
@@ -50,16 +50,16 @@ magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,10
|
|
50
50
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
51
51
|
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
52
52
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
53
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
53
|
+
magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
|
54
54
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
56
|
+
magic_pdf/libs/version.py,sha256=fGZMaoPHZfTX9I4TDkr07gp-kj_1U_SD-gjQC_2flQs,23
|
57
57
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
58
58
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
|
59
59
|
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
60
60
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
61
61
|
magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
|
62
|
-
magic_pdf/model/pp_structure_v2.py,sha256=
|
62
|
+
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
63
63
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
64
|
magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
|
65
65
|
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
@@ -94,8 +94,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
|
|
94
94
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
95
95
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
96
96
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
97
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
98
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
97
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=rwKphio9SZgiNgqASWOBWZIf6PPi3kvgQO_qJLc_diE,10726
|
98
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=d__xICejA_Q-Cz4cfajwroDjfA0dT4TL18XAFYYc4OQ,7265
|
99
99
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
100
100
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
101
101
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -108,9 +108,9 @@ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx
|
|
108
108
|
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
109
|
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
110
110
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
111
|
-
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=
|
111
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
112
112
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
113
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
113
|
+
magic_pdf/para/para_split_v3.py,sha256=UOQe0HUVX7FAlMbJp1OkGfdM7JECWeqscv3s8Hge7ps,16922
|
114
114
|
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
115
115
|
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
116
116
|
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
@@ -119,7 +119,7 @@ magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
119
119
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
120
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
121
|
magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
|
122
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
122
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
123
123
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
124
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
|
125
125
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
@@ -139,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
|
|
139
139
|
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
140
140
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
141
141
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
142
|
-
magic_pdf-0.10.
|
143
|
-
magic_pdf-0.10.
|
144
|
-
magic_pdf-0.10.
|
145
|
-
magic_pdf-0.10.
|
146
|
-
magic_pdf-0.10.
|
147
|
-
magic_pdf-0.10.
|
142
|
+
magic_pdf-0.10.4.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-0.10.4.dist-info/METADATA,sha256=pujqC_qUWiPT-L6R065MoL0QO9q4IEra0iW4BCRkxr4,36992
|
144
|
+
magic_pdf-0.10.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-0.10.4.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-0.10.4.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-0.10.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|