magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +16 -22
- magic_pdf/filter/pdf_meta_scan.py +5 -19
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_check.py +52 -25
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/pp_structure_v2.py +23 -3
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
- magic_pdf/para/para_split_v3.py +21 -7
- magic_pdf/pdf_parse_union_core_v2.py +134 -146
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,6 @@ from loguru import logger
|
|
5
5
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
6
|
from magic_pdf.config.ocr_content_type import BlockType, ContentType
|
7
7
|
from magic_pdf.libs.commons import join_path
|
8
|
-
from magic_pdf.libs.language import detect_lang
|
9
8
|
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
10
9
|
from magic_pdf.para.para_split_v3 import ListLineTag
|
11
10
|
|
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
|
|
30
29
|
for page_info in pdf_info_dict:
|
31
30
|
paras_of_layout = page_info.get('para_blocks')
|
32
31
|
if not paras_of_layout:
|
32
|
+
markdown_with_para_and_pagination.append({
|
33
|
+
'page_no':
|
34
|
+
page_no,
|
35
|
+
'md_content':
|
36
|
+
'',
|
37
|
+
})
|
38
|
+
page_no += 1
|
33
39
|
continue
|
34
40
|
page_markdown = ocr_mk_markdown_with_para_core_v2(
|
35
41
|
paras_of_layout, 'mm', img_buket_path)
|
@@ -136,14 +142,11 @@ def merge_para_with_text(para_block):
|
|
136
142
|
para_text += ' \n'
|
137
143
|
|
138
144
|
line_text = ''
|
139
|
-
line_lang = ''
|
140
145
|
for span in line['spans']:
|
141
146
|
span_type = span['type']
|
142
147
|
if span_type == ContentType.Text:
|
143
148
|
line_text += span['content'].strip()
|
144
149
|
|
145
|
-
if line_text != '':
|
146
|
-
line_lang = detect_lang(line_text)
|
147
150
|
for j, span in enumerate(line['spans']):
|
148
151
|
|
149
152
|
span_type = span['type']
|
@@ -157,27 +160,18 @@ def merge_para_with_text(para_block):
|
|
157
160
|
|
158
161
|
content = content.strip()
|
159
162
|
if content != '':
|
160
|
-
|
161
|
-
|
162
|
-
if
|
163
|
-
para_text += content
|
164
|
-
|
165
|
-
para_text += f'
|
166
|
-
|
167
|
-
|
168
|
-
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
169
|
-
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
|
170
|
-
para_text += content[:-1]
|
171
|
-
elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
|
172
|
-
para_text += content
|
173
|
-
else: # 西方文本语境下 content间需要空格分隔
|
174
|
-
para_text += f'{content} '
|
175
|
-
elif span_type == ContentType.InterlineEquation:
|
176
|
-
para_text += content
|
163
|
+
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
164
|
+
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
165
|
+
if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
|
166
|
+
para_text += content[:-1]
|
167
|
+
else: # content间需要空格分隔
|
168
|
+
para_text += f'{content} '
|
169
|
+
elif span_type == ContentType.InterlineEquation:
|
170
|
+
para_text += content
|
177
171
|
else:
|
178
172
|
continue
|
179
173
|
# 连写字符拆分
|
180
|
-
para_text = __replace_ligatures(para_text)
|
174
|
+
# para_text = __replace_ligatures(para_text)
|
181
175
|
|
182
176
|
return para_text
|
183
177
|
|
@@ -1,15 +1,14 @@
|
|
1
1
|
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
|
2
2
|
|
3
|
-
import sys
|
4
3
|
from collections import Counter
|
5
4
|
|
6
|
-
import
|
5
|
+
import fitz
|
7
6
|
from loguru import logger
|
8
7
|
|
9
8
|
from magic_pdf.config.drop_reason import DropReason
|
10
|
-
from magic_pdf.libs.commons import
|
9
|
+
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
11
10
|
from magic_pdf.libs.language import detect_lang
|
12
|
-
from magic_pdf.libs.pdf_check import
|
11
|
+
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
|
13
12
|
|
14
13
|
scan_max_page = 50
|
15
14
|
junk_limit_min = 10
|
@@ -324,7 +323,7 @@ def get_language(doc: fitz.Document):
|
|
324
323
|
|
325
324
|
def check_invalid_chars(pdf_bytes):
|
326
325
|
"""乱码检测."""
|
327
|
-
return
|
326
|
+
return detect_invalid_chars_by_pymupdf(pdf_bytes)
|
328
327
|
|
329
328
|
|
330
329
|
def pdf_meta_scan(pdf_bytes: bytes):
|
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
384
383
|
return res
|
385
384
|
|
386
385
|
|
387
|
-
@click.command()
|
388
|
-
@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
|
389
|
-
@click.option('--s3-profile', help='s3上的profile')
|
390
|
-
def main(s3_pdf_path: str, s3_profile: str):
|
391
|
-
""""""
|
392
|
-
try:
|
393
|
-
file_content = read_file(s3_pdf_path, s3_profile)
|
394
|
-
pdf_meta_scan(file_content)
|
395
|
-
except Exception as e:
|
396
|
-
print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
|
397
|
-
logger.exception(e)
|
398
|
-
|
399
|
-
|
400
386
|
if __name__ == '__main__':
|
401
|
-
|
387
|
+
pass
|
402
388
|
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
|
403
389
|
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
|
404
390
|
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
|
magic_pdf/libs/commons.py
CHANGED
@@ -1,34 +1,8 @@
|
|
1
|
-
import datetime
|
2
|
-
import json
|
3
|
-
import os, re, configparser
|
4
|
-
import subprocess
|
5
|
-
import time
|
6
|
-
|
7
|
-
import boto3
|
8
|
-
from loguru import logger
|
9
|
-
from boto3.s3.transfer import TransferConfig
|
10
|
-
from botocore.config import Config
|
11
|
-
|
12
|
-
import fitz # 1.23.9中已经切换到rebase
|
13
|
-
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
|
14
|
-
|
15
|
-
|
16
|
-
def get_delta_time(input_time):
|
17
|
-
return round(time.time() - input_time, 2)
|
18
|
-
|
19
1
|
|
20
2
|
def join_path(*args):
|
21
3
|
return '/'.join(str(s).rstrip('/') for s in args)
|
22
4
|
|
23
5
|
|
24
|
-
#配置全局的errlog_path,方便demo同步引用
|
25
|
-
error_log_path = "s3://llm-pdf-text/err_logs/"
|
26
|
-
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
|
27
|
-
json_dump_path = "s3://llm-pdf-text/json_dump/"
|
28
|
-
|
29
|
-
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
|
30
|
-
|
31
|
-
|
32
6
|
def get_top_percent_list(num_list, percent):
|
33
7
|
"""
|
34
8
|
获取列表中前百分之多少的元素
|
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
|
|
48
22
|
return top_percent_list
|
49
23
|
|
50
24
|
|
51
|
-
def formatted_time(time_stamp):
|
52
|
-
dt_object = datetime.datetime.fromtimestamp(time_stamp)
|
53
|
-
output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
|
54
|
-
return output_time
|
55
|
-
|
56
|
-
|
57
25
|
def mymax(alist: list):
|
58
26
|
if len(alist) == 0:
|
59
27
|
return 0 # 空是0, 0*0也是0大小q
|
60
28
|
else:
|
61
29
|
return max(alist)
|
62
30
|
|
63
|
-
def parse_aws_param(profile):
|
64
|
-
if isinstance(profile, str):
|
65
|
-
# 解析配置文件
|
66
|
-
config_file = join_path(os.path.expanduser("~"), ".aws", "config")
|
67
|
-
credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
|
68
|
-
config = configparser.ConfigParser()
|
69
|
-
config.read(credentials_file)
|
70
|
-
config.read(config_file)
|
71
|
-
# 获取 AWS 账户相关信息
|
72
|
-
ak = config.get(profile, "aws_access_key_id")
|
73
|
-
sk = config.get(profile, "aws_secret_access_key")
|
74
|
-
if profile == "default":
|
75
|
-
s3_str = config.get(f"{profile}", "s3")
|
76
|
-
else:
|
77
|
-
s3_str = config.get(f"profile {profile}", "s3")
|
78
|
-
end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
|
79
|
-
if end_match:
|
80
|
-
endpoint = end_match.group(1)
|
81
|
-
else:
|
82
|
-
raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
|
83
|
-
style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
|
84
|
-
if style_match:
|
85
|
-
addressing_style = style_match.group(1)
|
86
|
-
else:
|
87
|
-
addressing_style = "path"
|
88
|
-
elif isinstance(profile, dict):
|
89
|
-
ak = profile["ak"]
|
90
|
-
sk = profile["sk"]
|
91
|
-
endpoint = profile["endpoint"]
|
92
|
-
addressing_style = "auto"
|
93
|
-
|
94
|
-
return ak, sk, endpoint, addressing_style
|
95
|
-
|
96
31
|
|
97
32
|
def parse_bucket_key(s3_full_path: str):
|
98
33
|
"""
|
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
|
|
106
41
|
s3_full_path = s3_full_path[1:]
|
107
42
|
bucket, key = s3_full_path.split("/", 1)
|
108
43
|
return bucket, key
|
109
|
-
|
110
|
-
|
111
|
-
def read_file(pdf_path: str, s3_profile):
|
112
|
-
if pdf_path.startswith("s3://"):
|
113
|
-
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
|
114
|
-
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
|
115
|
-
config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
|
116
|
-
bucket_name, bucket_key = parse_bucket_key(pdf_path)
|
117
|
-
res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
|
118
|
-
file_content = res["Body"].read()
|
119
|
-
return file_content
|
120
|
-
else:
|
121
|
-
with open(pdf_path, "rb") as f:
|
122
|
-
return f.read()
|
123
|
-
|
124
|
-
|
125
|
-
def get_docx_model_output(pdf_model_output, page_id):
|
126
|
-
|
127
|
-
model_output_json = pdf_model_output[page_id]
|
128
|
-
|
129
|
-
return model_output_json
|
130
|
-
|
131
|
-
|
132
|
-
def list_dir(dir_path:str, s3_profile:str):
|
133
|
-
"""
|
134
|
-
列出dir_path下的所有文件
|
135
|
-
"""
|
136
|
-
ret = []
|
137
|
-
|
138
|
-
if dir_path.startswith("s3"):
|
139
|
-
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
|
140
|
-
s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
|
141
|
-
bucket, path = s3info[0][0], s3info[0][1]
|
142
|
-
try:
|
143
|
-
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
|
144
|
-
config=Config(s3={'addressing_style': addressing_style}))
|
145
|
-
def list_obj_scluster():
|
146
|
-
marker = None
|
147
|
-
while True:
|
148
|
-
list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
|
149
|
-
if marker:
|
150
|
-
list_kwargs['Marker'] = marker
|
151
|
-
response = cli.list_objects(**list_kwargs)
|
152
|
-
contents = response.get("Contents", [])
|
153
|
-
yield from contents
|
154
|
-
if not response.get("IsTruncated") or len(contents)==0:
|
155
|
-
break
|
156
|
-
marker = contents[-1]['Key']
|
157
|
-
|
158
|
-
|
159
|
-
for info in list_obj_scluster():
|
160
|
-
file_path = info['Key']
|
161
|
-
#size = info['Size']
|
162
|
-
|
163
|
-
if path!="":
|
164
|
-
afile = file_path[len(path):]
|
165
|
-
if afile.endswith(".json"):
|
166
|
-
ret.append(f"s3://{bucket}/{file_path}")
|
167
|
-
|
168
|
-
return ret
|
169
|
-
|
170
|
-
except Exception as e:
|
171
|
-
logger.exception(e)
|
172
|
-
exit(-1)
|
173
|
-
else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
|
174
|
-
|
175
|
-
for root, dirs, files in os.walk(dir_path):
|
176
|
-
for file in files:
|
177
|
-
if file.endswith(".json"):
|
178
|
-
ret.append(join_path(root, file))
|
179
|
-
ret.sort()
|
180
|
-
return ret
|
181
|
-
|
182
|
-
def get_img_s3_client(save_path:str, image_s3_config:str):
|
183
|
-
"""
|
184
|
-
"""
|
185
|
-
if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
|
186
|
-
ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
|
187
|
-
img_s3_client = boto3.client(
|
188
|
-
service_name="s3",
|
189
|
-
aws_access_key_id=ak,
|
190
|
-
aws_secret_access_key=sk,
|
191
|
-
endpoint_url=end_point,
|
192
|
-
config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
|
193
|
-
)
|
194
|
-
else:
|
195
|
-
img_s3_client = None
|
196
|
-
|
197
|
-
return img_s3_client
|
198
|
-
|
199
|
-
if __name__=="__main__":
|
200
|
-
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
|
201
|
-
s3_profile = "langchao"
|
202
|
-
ret = list_dir(s3_path, s3_profile)
|
203
|
-
print(ret)
|
204
|
-
|
magic_pdf/libs/draw_bbox.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
+
import fitz
|
1
2
|
from magic_pdf.config.constants import CROSS_PAGE
|
2
|
-
from magic_pdf.config.ocr_content_type import
|
3
|
-
ContentType)
|
3
|
+
from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
|
4
4
|
from magic_pdf.data.dataset import PymuDocDataset
|
5
|
-
from magic_pdf.libs.commons import fitz # PyMuPDF
|
6
5
|
from magic_pdf.model.magic_model import MagicModel
|
7
6
|
|
8
7
|
|
magic_pdf/libs/markdown_utils.py
CHANGED
@@ -1,24 +1,3 @@
|
|
1
|
-
import re
|
2
|
-
|
3
|
-
|
4
|
-
def escape_special_markdown_char(pymu_blocks):
|
5
|
-
"""
|
6
|
-
转义正文里对markdown语法有特殊意义的字符
|
7
|
-
"""
|
8
|
-
special_chars = ["*", "`", "~", "$"]
|
9
|
-
for blk in pymu_blocks:
|
10
|
-
for line in blk['lines']:
|
11
|
-
for span in line['spans']:
|
12
|
-
for char in special_chars:
|
13
|
-
span_text = span['text']
|
14
|
-
span_type = span.get("_type", None)
|
15
|
-
if span_type in ['inline-equation', 'interline-equation']:
|
16
|
-
continue
|
17
|
-
elif span_text:
|
18
|
-
span['text'] = span['text'].replace(char, "\\" + char)
|
19
|
-
|
20
|
-
return pymu_blocks
|
21
|
-
|
22
1
|
|
23
2
|
def ocr_escape_special_markdown_char(content):
|
24
3
|
"""
|
magic_pdf/libs/pdf_check.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
from io import BytesIO
|
2
|
-
import re
|
3
1
|
import fitz
|
4
2
|
import numpy as np
|
5
3
|
from loguru import logger
|
6
|
-
|
4
|
+
# import re
|
5
|
+
# from io import BytesIO
|
6
|
+
# from pdfminer.high_level import extract_text
|
7
7
|
|
8
8
|
|
9
9
|
def calculate_sample_count(total_page: int):
|
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
|
|
14
14
|
return select_page_cnt
|
15
15
|
|
16
16
|
|
17
|
-
def extract_pages(src_pdf_bytes: bytes):
|
17
|
+
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
|
18
18
|
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
19
19
|
total_page = len(pdf_docs)
|
20
20
|
if total_page == 0:
|
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
|
|
33
33
|
return sample_docs
|
34
34
|
|
35
35
|
|
36
|
-
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
37
|
-
|
38
|
-
|
36
|
+
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
37
|
+
# """"
|
38
|
+
# 检测PDF中是否包含非法字符
|
39
|
+
# """
|
40
|
+
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
41
|
+
# sample_docs = extract_pages(src_pdf_bytes)
|
42
|
+
# sample_pdf_bytes = sample_docs.tobytes()
|
43
|
+
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
44
|
+
# text = extract_text(sample_pdf_file_like_object)
|
45
|
+
# text = text.replace("\n", "")
|
46
|
+
# # logger.info(text)
|
47
|
+
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
48
|
+
# cid_pattern = re.compile(r'\(cid:\d+\)')
|
49
|
+
# matches = cid_pattern.findall(text)
|
50
|
+
# cid_count = len(matches)
|
51
|
+
# cid_len = sum(len(match) for match in matches)
|
52
|
+
# text_len = len(text)
|
53
|
+
# if text_len == 0:
|
54
|
+
# cid_chars_radio = 0
|
55
|
+
# else:
|
56
|
+
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
57
|
+
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
58
|
+
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
59
|
+
# if cid_chars_radio > 0.05:
|
60
|
+
# return False # 乱码文档
|
61
|
+
# else:
|
62
|
+
# return True # 正常文档
|
63
|
+
|
64
|
+
|
65
|
+
def count_replacement_characters(text: str) -> int:
|
66
|
+
"""
|
67
|
+
统计字符串中 0xfffd 字符的数量。
|
39
68
|
"""
|
40
|
-
''
|
69
|
+
return text.count('\ufffd')
|
70
|
+
|
71
|
+
|
72
|
+
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
|
41
73
|
sample_docs = extract_pages(src_pdf_bytes)
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
cid_pattern = re.compile(r'\(cid:\d+\)')
|
49
|
-
matches = cid_pattern.findall(text)
|
50
|
-
cid_count = len(matches)
|
51
|
-
cid_len = sum(len(match) for match in matches)
|
52
|
-
text_len = len(text)
|
74
|
+
doc_text = ""
|
75
|
+
for page in sample_docs:
|
76
|
+
page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
|
77
|
+
doc_text += page_text
|
78
|
+
text_len = len(doc_text)
|
79
|
+
uffd_count = count_replacement_characters(doc_text)
|
53
80
|
if text_len == 0:
|
54
|
-
|
81
|
+
uffd_chars_radio = 0
|
55
82
|
else:
|
56
|
-
|
57
|
-
logger.info(f"
|
58
|
-
'''当一篇文章存在
|
59
|
-
if
|
83
|
+
uffd_chars_radio = uffd_count / text_len
|
84
|
+
logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
|
85
|
+
'''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
|
86
|
+
if uffd_chars_radio > 0.01:
|
60
87
|
return False # 乱码文档
|
61
88
|
else:
|
62
|
-
return True # 正常文档
|
89
|
+
return True # 正常文档
|
@@ -1,9 +1,10 @@
|
|
1
1
|
from io import BytesIO
|
2
2
|
import cv2
|
3
|
+
import fitz
|
3
4
|
import numpy as np
|
4
5
|
from PIL import Image
|
5
6
|
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
-
from magic_pdf.libs.commons import
|
7
|
+
from magic_pdf.libs.commons import join_path
|
7
8
|
from magic_pdf.libs.hash_utils import compute_sha256
|
8
9
|
|
9
10
|
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.
|
1
|
+
__version__ = "0.10.3"
|
@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
|
|
46
46
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
47
47
|
pm = page.get_pixmap(matrix=mat, alpha=False)
|
48
48
|
|
49
|
-
# If the width or height exceeds
|
50
|
-
if pm.width >
|
49
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
50
|
+
if pm.width > 4500 or pm.height > 4500:
|
51
51
|
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
52
52
|
|
53
53
|
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
magic_pdf/model/magic_model.py
CHANGED
@@ -1,16 +1,12 @@
|
|
1
1
|
import enum
|
2
|
-
import json
|
3
2
|
|
4
3
|
from magic_pdf.config.model_block_type import ModelBlockTypeEnum
|
5
4
|
from magic_pdf.config.ocr_content_type import CategoryId, ContentType
|
6
|
-
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
7
|
-
FileBasedDataWriter)
|
8
5
|
from magic_pdf.data.dataset import Dataset
|
9
6
|
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
10
7
|
bbox_relative_pos, box_area, calculate_iou,
|
11
8
|
calculate_overlap_area_in_bbox1_area_ratio,
|
12
9
|
get_overlap_area)
|
13
|
-
from magic_pdf.libs.commons import fitz, join_path
|
14
10
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
15
11
|
from magic_pdf.libs.local_math import float_gt
|
16
12
|
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
|
@@ -1048,29 +1044,3 @@ class MagicModel:
|
|
1048
1044
|
def get_model_list(self, page_no):
|
1049
1045
|
return self.__model_list[page_no]
|
1050
1046
|
|
1051
|
-
|
1052
|
-
if __name__ == '__main__':
|
1053
|
-
drw = FileBasedDataReader(r'D:/project/20231108code-clean')
|
1054
|
-
if 0:
|
1055
|
-
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
1056
|
-
model_file_path = r'linshixuqiu\19983-00_new.json'
|
1057
|
-
pdf_bytes = drw.read(pdf_file_path)
|
1058
|
-
model_json_txt = drw.read(model_file_path).decode()
|
1059
|
-
model_list = json.loads(model_json_txt)
|
1060
|
-
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
1061
|
-
img_bucket_path = 'imgs'
|
1062
|
-
img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
|
1063
|
-
pdf_docs = fitz.open('pdf', pdf_bytes)
|
1064
|
-
magic_model = MagicModel(model_list, pdf_docs)
|
1065
|
-
|
1066
|
-
if 1:
|
1067
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
1068
|
-
|
1069
|
-
model_list = json.loads(
|
1070
|
-
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
|
1071
|
-
)
|
1072
|
-
pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
|
1073
|
-
|
1074
|
-
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
|
1075
|
-
for i in range(7):
|
1076
|
-
print(magic_model.get_imgs(i))
|
@@ -18,11 +18,31 @@ def region_to_bbox(region):
|
|
18
18
|
|
19
19
|
|
20
20
|
class CustomPaddleModel:
|
21
|
-
def __init__(self,
|
21
|
+
def __init__(self,
|
22
|
+
ocr: bool = False,
|
23
|
+
show_log: bool = False,
|
24
|
+
lang=None,
|
25
|
+
det_db_box_thresh=0.3,
|
26
|
+
use_dilation=True,
|
27
|
+
det_db_unclip_ratio=1.8
|
28
|
+
):
|
22
29
|
if lang is not None:
|
23
|
-
self.model = PPStructure(table=False,
|
30
|
+
self.model = PPStructure(table=False,
|
31
|
+
ocr=True,
|
32
|
+
show_log=show_log,
|
33
|
+
lang=lang,
|
34
|
+
det_db_box_thresh=det_db_box_thresh,
|
35
|
+
use_dilation=use_dilation,
|
36
|
+
det_db_unclip_ratio=det_db_unclip_ratio,
|
37
|
+
)
|
24
38
|
else:
|
25
|
-
self.model = PPStructure(table=False,
|
39
|
+
self.model = PPStructure(table=False,
|
40
|
+
ocr=True,
|
41
|
+
show_log=show_log,
|
42
|
+
det_db_box_thresh=det_db_box_thresh,
|
43
|
+
use_dilation=use_dilation,
|
44
|
+
det_db_unclip_ratio=det_db_unclip_ratio,
|
45
|
+
)
|
26
46
|
|
27
47
|
def __call__(self, img):
|
28
48
|
try:
|
@@ -1,11 +1,55 @@
|
|
1
|
-
import
|
2
|
-
|
1
|
+
import cv2
|
3
2
|
import numpy as np
|
4
3
|
from loguru import logger
|
5
|
-
|
4
|
+
from io import BytesIO
|
5
|
+
from PIL import Image
|
6
|
+
import base64
|
6
7
|
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
|
7
8
|
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
|
8
9
|
|
10
|
+
from ppocr.utils.utility import check_and_read
|
11
|
+
|
12
|
+
|
13
|
+
def img_decode(content: bytes):
|
14
|
+
np_arr = np.frombuffer(content, dtype=np.uint8)
|
15
|
+
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
|
16
|
+
|
17
|
+
|
18
|
+
def check_img(img):
|
19
|
+
if isinstance(img, bytes):
|
20
|
+
img = img_decode(img)
|
21
|
+
if isinstance(img, str):
|
22
|
+
image_file = img
|
23
|
+
img, flag_gif, flag_pdf = check_and_read(image_file)
|
24
|
+
if not flag_gif and not flag_pdf:
|
25
|
+
with open(image_file, 'rb') as f:
|
26
|
+
img_str = f.read()
|
27
|
+
img = img_decode(img_str)
|
28
|
+
if img is None:
|
29
|
+
try:
|
30
|
+
buf = BytesIO()
|
31
|
+
image = BytesIO(img_str)
|
32
|
+
im = Image.open(image)
|
33
|
+
rgb = im.convert('RGB')
|
34
|
+
rgb.save(buf, 'jpeg')
|
35
|
+
buf.seek(0)
|
36
|
+
image_bytes = buf.read()
|
37
|
+
data_base64 = str(base64.b64encode(image_bytes),
|
38
|
+
encoding="utf-8")
|
39
|
+
image_decode = base64.b64decode(data_base64)
|
40
|
+
img_array = np.frombuffer(image_decode, np.uint8)
|
41
|
+
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
|
42
|
+
except:
|
43
|
+
logger.error("error in loading image:{}".format(image_file))
|
44
|
+
return None
|
45
|
+
if img is None:
|
46
|
+
logger.error("error in loading image:{}".format(image_file))
|
47
|
+
return None
|
48
|
+
if isinstance(img, np.ndarray) and len(img.shape) == 2:
|
49
|
+
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|
50
|
+
|
51
|
+
return img
|
52
|
+
|
9
53
|
|
10
54
|
def bbox_to_points(bbox):
|
11
55
|
""" 将bbox格式转换为四个顶点的数组 """
|
@@ -214,6 +258,9 @@ def get_ocr_result_list(ocr_res, useful_list):
|
|
214
258
|
if len(box_ocr_res) == 2:
|
215
259
|
p1, p2, p3, p4 = box_ocr_res[0]
|
216
260
|
text, score = box_ocr_res[1]
|
261
|
+
# logger.info(f"text: {text}, score: {score}")
|
262
|
+
if score < 0.6: # 过滤低置信度的结果
|
263
|
+
continue
|
217
264
|
else:
|
218
265
|
p1, p2, p3, p4 = box_ocr_res
|
219
266
|
text, score = "", 1
|
@@ -249,32 +296,6 @@ def get_ocr_result_list(ocr_res, useful_list):
|
|
249
296
|
return ocr_result_list
|
250
297
|
|
251
298
|
|
252
|
-
def calculate_angle_degrees(poly):
|
253
|
-
# 定义对角线的顶点
|
254
|
-
diagonal1 = (poly[0], poly[2])
|
255
|
-
diagonal2 = (poly[1], poly[3])
|
256
|
-
|
257
|
-
# 计算对角线的斜率
|
258
|
-
def slope(p1, p2):
|
259
|
-
return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
|
260
|
-
|
261
|
-
slope1 = slope(diagonal1[0], diagonal1[1])
|
262
|
-
slope2 = slope(diagonal2[0], diagonal2[1])
|
263
|
-
|
264
|
-
# 计算对角线与x轴的夹角(以弧度为单位)
|
265
|
-
angle1_radians = math.atan(slope1)
|
266
|
-
angle2_radians = math.atan(slope2)
|
267
|
-
|
268
|
-
# 将弧度转换为角度
|
269
|
-
angle1_degrees = math.degrees(angle1_radians)
|
270
|
-
angle2_degrees = math.degrees(angle2_radians)
|
271
|
-
|
272
|
-
# 取两条对角线与x轴夹角的平均值
|
273
|
-
average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
|
274
|
-
# logger.info(f"average_angle_degrees: {average_angle_degrees}")
|
275
|
-
return average_angle_degrees
|
276
|
-
|
277
|
-
|
278
299
|
def calculate_is_angle(poly):
|
279
300
|
p1, p2, p3, p4 = poly
|
280
301
|
height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
|