magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
magic_pdf/libs/commons.py
@@ -0,0 +1,204 @@
import datetime
import json
import os, re, configparser
import subprocess
import time

import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config

import fitz  # PyMuPDF; switched to the rebased implementation as of 1.23.9
# import fitz_old as fitz  # use the pre-1.23.9 PyMuPDF implementation


def get_delta_time(input_time):
    return round(time.time() - input_time, 2)


def join_path(*args):
    return '/'.join(str(s).rstrip('/') for s in args)


# global error_log_path so the demo can reference the same location
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/"  # only for temporary local testing; must not be committed to main
json_dump_path = "s3://llm-pdf-text/json_dump/"

# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # the base library should not hard-code such paths; define them in business code


def get_top_percent_list(num_list, percent):
    """
    Return the top `percent` fraction of elements from the list.
    :param num_list:
    :param percent:
    :return:
    """
    if len(num_list) == 0:
        top_percent_list = []
    else:
        # sort in descending order
        sorted_imgs_len_list = sorted(num_list, reverse=True)
        # compute the cutoff index for `percent`
        top_percent_index = int(len(sorted_imgs_len_list) * percent)
        # take the leading elements up to the cutoff
        top_percent_list = sorted_imgs_len_list[:top_percent_index]
    return top_percent_list


def formatted_time(time_stamp):
    dt_object = datetime.datetime.fromtimestamp(time_stamp)
    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
    return output_time


def mymax(alist: list):
    if len(alist) == 0:
        return 0  # empty counts as 0; a 0*0 bbox also has size 0
    else:
        return max(alist)


def parse_aws_param(profile):
    if isinstance(profile, str):
        # parse the AWS config files
        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
        config = configparser.ConfigParser()
        config.read(credentials_file)
        config.read(config_file)
        # read the AWS account credentials
        ak = config.get(profile, "aws_access_key_id")
        sk = config.get(profile, "aws_secret_access_key")
        if profile == "default":
            s3_str = config.get(f"{profile}", "s3")
        else:
            s3_str = config.get(f"profile {profile}", "s3")
        end_match = re.search(r"endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
        if end_match:
            endpoint = end_match.group(1)
        else:
            raise ValueError("endpoint_url not found in the aws config file")
        style_match = re.search(r"addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
        if style_match:
            addressing_style = style_match.group(1)
        else:
            addressing_style = "path"
    elif isinstance(profile, dict):
        ak = profile["ak"]
        sk = profile["sk"]
        endpoint = profile["endpoint"]
        addressing_style = "auto"

    return ak, sk, endpoint, addressing_style


def parse_bucket_key(s3_full_path: str):
    """
    Input:  s3://bucket/path/to/my/file.txt
    Output: bucket, path/to/my/file.txt
    """
    s3_full_path = s3_full_path.strip()
    if s3_full_path.startswith("s3://"):
        s3_full_path = s3_full_path[5:]
    if s3_full_path.startswith("/"):
        s3_full_path = s3_full_path[1:]
    bucket, key = s3_full_path.split("/", 1)
    return bucket, key


def read_file(pdf_path: str, s3_profile):
    if pdf_path.startswith("s3://"):
        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
        bucket_name, bucket_key = parse_bucket_key(pdf_path)
        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
        file_content = res["Body"].read()
        return file_content
    else:
        with open(pdf_path, "rb") as f:
            return f.read()


def get_docx_model_output(pdf_model_output, page_id):

    model_output_json = pdf_model_output[page_id]

    return model_output_json


def list_dir(dir_path: str, s3_profile: str):
    """
    List all files under dir_path.
    """
    ret = []

    if dir_path.startswith("s3"):
        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
        bucket, path = s3info[0][0], s3info[0][1]
        try:
            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
                               config=Config(s3={'addressing_style': addressing_style}))

            def list_obj_scluster():
                marker = None
                while True:
                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
                    if marker:
                        list_kwargs['Marker'] = marker
                    response = cli.list_objects(**list_kwargs)
                    contents = response.get("Contents", [])
                    yield from contents
                    if not response.get("IsTruncated") or len(contents) == 0:
                        break
                    marker = contents[-1]['Key']

            for info in list_obj_scluster():
                file_path = info['Key']
                # size = info['Size']

                if path != "":
                    afile = file_path[len(path):]
                    if afile.endswith(".json"):
                        ret.append(f"s3://{bucket}/{file_path}")

            return ret

        except Exception as e:
            logger.exception(e)
            exit(-1)
    else:  # local directory: walk it and return every .json file inside
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                if file.endswith(".json"):
                    ret.append(join_path(root, file))
        ret.sort()
        return ret


def get_img_s3_client(save_path: str, image_s3_config: str):
    """
    Create an S3 client for image upload only when save_path is on S3.
    """
    if save_path.startswith("s3://"):  # checked here so an s3 client is only created when actually needed
        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
        img_s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=end_point,
            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
        )
    else:
        img_s3_client = None

    return img_s3_client


if __name__ == "__main__":
    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
    s3_profile = "langchao"
    ret = list_dir(s3_path, s3_profile)
    print(ret)
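The two path helpers in commons.py are pure string manipulation, so they can be exercised without any S3 credentials. A minimal sketch, assuming the wheel and its dependencies are installed; the key name sample.json is made up for illustration:

from magic_pdf.libs.commons import join_path, parse_bucket_key

# join_path strips trailing slashes from every part before joining with "/"
print(join_path("s3://llm-pdf-text/", "json_dump/", "sample.json"))
# -> s3://llm-pdf-text/json_dump/sample.json

# parse_bucket_key splits an s3:// URL into (bucket, key)
print(parse_bucket_key("s3://llm-pdf-text/json_dump/sample.json"))
# -> ('llm-pdf-text', 'json_dump/sample.json')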
magic_pdf/libs/config_reader.py
@@ -0,0 +1,63 @@
"""
Return the S3 (AK, SK, endpoint) triple that corresponds to a bucket name.

"""

import json
import os

from loguru import logger

from magic_pdf.libs.commons import parse_bucket_key


def read_config():
    home_dir = os.path.expanduser("~")

    config_file = os.path.join(home_dir, "magic-pdf.json")

    if not os.path.exists(config_file):
        raise Exception(f"{config_file} not found")

    with open(config_file, "r") as f:
        config = json.load(f)
    return config


def get_s3_config(bucket_name: str):
    """
    Read the credentials for bucket_name from ~/magic-pdf.json.
    """
    config = read_config()

    bucket_info = config.get("bucket_info")
    if bucket_name not in bucket_info:
        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
    else:
        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]

    if access_key is None or secret_key is None or storage_endpoint is None:
        raise Exception("ak, sk or endpoint not found in magic-pdf.json")

    # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")

    return access_key, secret_key, storage_endpoint


def get_s3_config_dict(path: str):
    access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
    return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}


def get_bucket_name(path):
    bucket, key = parse_bucket_key(path)
    return bucket


def get_local_dir():
    config = read_config()
    return config.get("temp-output-dir", "/tmp")


if __name__ == "__main__":
    ak, sk, endpoint = get_s3_config("llm-raw")
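read_config() and get_s3_config() above imply a particular shape for ~/magic-pdf.json: a bucket_info map from bucket name to an [ak, sk, endpoint] triple, a literal "[default]" fallback entry, and an optional temp-output-dir. A hedged sketch of that structure, checked only against the accessors above; every value is a placeholder, not a real credential:

import json

# shape inferred from read_config()/get_s3_config()/get_local_dir(); all values are placeholders
sample_config = {
    "bucket_info": {
        "[default]": ["<access-key>", "<secret-key>", "<endpoint-url>"],
        "llm-raw": ["<access-key>", "<secret-key>", "<endpoint-url>"],
    },
    "temp-output-dir": "/tmp",
}
print(json.dumps(sample_config, indent=2))  # what ~/magic-pdf.json is expected to look like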
magic_pdf/libs/coordinate_transform.py
@@ -0,0 +1,9 @@
def get_scale_ratio(model_page_info, page):
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    width_from_json = model_page_info['page_info']['width']
    height_from_json = model_page_info['page_info']['height']
    horizontal_scale_ratio = width_from_json / pymu_width
    vertical_scale_ratio = height_from_json / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
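get_scale_ratio() compares the page size reported in the model output against the page rendered at 72 dpi, so the ratios are model units per PyMuPDF unit. A typical follow-up step (an assumption, not part of this diff; model_bbox_to_pymu is a hypothetical helper) is to divide a model-space bbox by the ratios to bring it back into PyMuPDF page coordinates:

def model_bbox_to_pymu(bbox, horizontal_scale_ratio, vertical_scale_ratio):
    # bbox is [x0, y0, x1, y1] in the model's coordinate space;
    # dividing by the ratios maps it onto the 72-dpi PyMuPDF page
    x0, y0, x1, y1 = bbox
    return [x0 / horizontal_scale_ratio,
            y0 / vertical_scale_ratio,
            x1 / horizontal_scale_ratio,
            y1 / vertical_scale_ratio]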
magic_pdf/libs/detect_language_from_model.py
@@ -0,0 +1,21 @@
from collections import Counter

from magic_pdf.libs.language import detect_lang

def get_language_from_model(model_list: list):
    language_lst = []
    for ocr_page_info in model_list:
        page_text = ""
        layout_dets = ocr_page_info["layout_dets"]
        for layout_det in layout_dets:
            category_id = layout_det["category_id"]
            allow_category_id_list = [15]
            if category_id in allow_category_id_list:
                page_text += layout_det["text"]
        page_language = detect_lang(page_text)
        language_lst.append(page_language)
    # count how often each language appears across pages
    count_dict = Counter(language_lst)
    # return the language that occurs most often
    language = max(count_dict, key=count_dict.get)
    return language
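get_language_from_model() only reads layout detections with category_id 15 and then takes a majority vote over the per-page results. A small sketch of the input shape it expects, assuming the wheel and its language-detection dependency are installed; the text values are invented:

from magic_pdf.libs.detect_language_from_model import get_language_from_model

model_list = [  # one dict per page; only category_id 15 entries contribute text
    {"layout_dets": [{"category_id": 15, "text": "This is an English sentence."},
                     {"category_id": 3, "text": "ignored by the language vote"}]},
    {"layout_dets": [{"category_id": 15, "text": "Another English sentence."}]},
]
print(get_language_from_model(model_list))  # the most frequent per-page language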
magic_pdf/libs/draw_bbox.py
@@ -0,0 +1,227 @@
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType, BlockType


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for bbox in page_data:
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle


def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for j, bbox in enumerate(page_data):
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        page.insert_text(
            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
        )  # Insert the index in the top left corner of the rectangle


def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
    for page in pdf_info:
        page_layout_list = []
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles = []
        texts = []
        interequations = []
        for layout in page["layout_bboxes"]:
            page_layout_list.append(layout["layout_bbox"])
        layout_bbox_list.append(page_layout_list)
        for dropped_bbox in page["discarded_blocks"]:
            page_dropped_list.append(dropped_bbox["bbox"])
        dropped_bbox_list.append(page_dropped_list)
        for block in page["para_blocks"]:
            bbox = block["bbox"]
            if block["type"] == BlockType.Table:
                tables.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.TableBody:
                        tables_body.append(bbox)
                    elif nested_block["type"] == BlockType.TableCaption:
                        tables_caption.append(bbox)
                    elif nested_block["type"] == BlockType.TableFootnote:
                        tables_footnote.append(bbox)
            elif block["type"] == BlockType.Image:
                imgs.append(bbox)
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.ImageBody:
                        imgs_body.append(bbox)
                    elif nested_block["type"] == BlockType.ImageCaption:
                        imgs_caption.append(bbox)
            elif block["type"] == BlockType.Title:
                titles.append(bbox)
            elif block["type"] == BlockType.Text:
                texts.append(bbox)
            elif block["type"] == BlockType.InterlineEquation:
                interequations.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
        draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)

    # Save the PDF
    pdf_docs.save(f"{out_path}/layout.pdf")


def draw_span_bbox(pdf_info, pdf_bytes, out_path):
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        if span["type"] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span["bbox"])
            else:
                page_text_list.append(span["bbox"])
        elif span["type"] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span["bbox"])
            else:
                page_inline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.InterlineEquation:
            page_interline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.Image:
            page_image_list.append(span["bbox"])
        elif span["type"] == ContentType.Table:
            page_table_list.append(span["bbox"])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # merge cross-page spans carried over from the previous page into this page's lists
        if len(next_page_text_list) > 0:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if len(next_page_inline_equation_list) > 0:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()

        # build dropped_list
        for block in page["discarded_blocks"]:
            if block["type"] == BlockType.Discarded:
                for line in block["lines"]:
                    for span in line["spans"]:
                        page_dropped_list.append(span["bbox"])
        dropped_list.append(page_dropped_list)
        # build the remaining useful lists
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
            ]:
                for line in block["lines"]:
                    for span in line["spans"]:
                        get_span_info(span)
            elif block["type"] in [BlockType.Image, BlockType.Table]:
                for sub_block in block["blocks"]:
                    for line in sub_block["lines"]:
                        for span in line["spans"]:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)
    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # draw the current page's spans
        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)

    # Save the PDF
    pdf_docs.save(f"{out_path}/spans.pdf")
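Both drawing helpers expect the per-page pdf_info list plus the raw PDF bytes, and write an annotated copy under out_path. A hedged usage sketch: the file names are placeholders, and it assumes pdf_info was already produced by the pipeline with the layout_bboxes, discarded_blocks and para_blocks fields the code above reads (taking it from a dump with a "pdf_info" key is an assumption about the dump format):

import json
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox

with open("example.pdf", "rb") as f:             # placeholder input PDF
    pdf_bytes = f.read()
with open("example_middle.json") as f:           # placeholder pipeline dump
    pdf_info = json.load(f)["pdf_info"]

draw_layout_bbox(pdf_info, pdf_bytes, "./output")  # writes ./output/layout.pdf
draw_span_bbox(pdf_info, pdf_bytes, "./output")    # writes ./output/spans.pdf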
magic_pdf/libs/drop_reason.py
@@ -0,0 +1,27 @@

class DropReason:
    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap"  # text blocks overlap horizontally, so the reading order cannot be determined reliably
    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap"  # blocks that must be kept overlap horizontally
    COMPLICATED_LAYOUT = "complicated_layout"  # layout is too complicated; not supported yet
    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns"  # more than two columns is not supported yet
    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box"  # the PDF contains colored background blocks, which change the reading order; text blocks with a colored background are not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs"  # special images make the computation too expensive, so the PDF is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # special SVG graphics make the computation too expensive, so the PDF is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # the total computation exceeds what the current method can afford
    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
    Exception = "_exception"  # an exception occurred during parsing
    ENCRYPTED = "encrypted"  # the PDF is encrypted
    EMPTY_PDF = "total_page=0"  # the PDF has zero pages
    NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block"  # paragraphs cannot be segmented cleanly
    TITLE_DETECTION_FAILED = "title_detection_failed"  # title detection failed
    TITLE_LEVEL_FAILED = "title_level_failed"  # failed to determine title levels (e.g. level-1/2/3 headings)
    PARA_SPLIT_FAILED = "para_split_failed"  # paragraph detection failed
    PARA_MERGE_FAILED = "para_merge_failed"  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = "not_allow_language"  # unsupported language
    SPECIAL_PDF = "special_pdf"
    PSEUDO_SINGLE_COLUMN = "pseudo_single_column"  # the column layout cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT = "can_not_detect_page_layout"  # the page layout cannot be analyzed
    NEGATIVE_BBOX_AREA = "negative_bbox_area"  # scaling produced a bbox with negative area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation"  # overlapping blocks cannot be separated
magic_pdf/libs/drop_tag.py
@@ -0,0 +1,19 @@

COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
PAGE_NO = "page-no"  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header or footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # empty block at the page margin with no content
ON_IMAGE_TEXT = 'on-image-text'  # text lying on top of an image
ON_TABLE_TEXT = 'on-table-text'  # text lying on top of a table


class DropTag:
    PAGE_NUMBER = "page_no"
    HEADER = "header"
    FOOTER = "footer"
    FOOTNOTE = "footnote"
    NOT_IN_LAYOUT = "not_in_layout"
    SPAN_OVERLAP = "span_overlap"
    BLOCK_OVERLAP = "block_overlap"
magic_pdf/libs/hash_utils.py
@@ -0,0 +1,15 @@
import hashlib


def compute_md5(file_bytes):
    hasher = hashlib.md5()
    hasher.update(file_bytes)
    return hasher.hexdigest().upper()


def compute_sha256(input_string):
    hasher = hashlib.sha256()
    # in Python 3 the string must be encoded to bytes before it can be hashed
    input_bytes = input_string.encode('utf-8')
    hasher.update(input_bytes)
    return hasher.hexdigest()
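Note the asymmetry above: compute_md5 hashes raw bytes and upper-cases the digest, while compute_sha256 takes a string and returns the usual lower-case hex. A one-liner each to illustrate:

from magic_pdf.libs.hash_utils import compute_md5, compute_sha256

print(compute_md5(b"hello"))      # upper-case MD5 hex digest of the bytes
print(compute_sha256("hello"))    # lower-case SHA-256 hex digest of the UTF-8 encoded string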
magic_pdf/libs/json_compressor.py
@@ -0,0 +1,27 @@
import json
import brotli
import base64

class JsonCompressor:

    @staticmethod
    def compress_json(data):
        """
        Compress a json object and encode it with base64
        """
        json_str = json.dumps(data)
        json_bytes = json_str.encode('utf-8')
        compressed = brotli.compress(json_bytes, quality=6)
        compressed_str = base64.b64encode(compressed).decode('utf-8')  # convert bytes to string
        return compressed_str

    @staticmethod
    def decompress_json(compressed_str):
        """
        Decode the base64 string and decompress the json object
        """
        compressed = base64.b64decode(compressed_str.encode('utf-8'))  # convert string to bytes
        decompressed_bytes = brotli.decompress(compressed)
        json_str = decompressed_bytes.decode('utf-8')
        data = json.loads(json_str)
        return data
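JsonCompressor is a plain brotli-then-base64 round trip, so compress_json and decompress_json are inverses for any JSON-serialisable object (the brotli package must be installed). A quick round-trip check:

from magic_pdf.libs.json_compressor import JsonCompressor

data = {"pages": 3, "lang": "en"}             # any JSON-serialisable object
packed = JsonCompressor.compress_json(data)   # base64 text, safe to embed in another JSON document
assert JsonCompressor.decompress_json(packed) == data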
magic_pdf/libs/language.py
@@ -0,0 +1,31 @@
import regex
import unicodedata
from fast_langdetect import detect_langs

RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")


def remove_bad_chars(text):
    return RE_BAD_CHARS.sub("", text)


def detect_lang(text: str) -> str:
    if len(text) == 0:
        return ""
    try:
        lang_upper = detect_langs(text)
    except:
        html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
        lang_upper = detect_langs(html_no_ctrl_chars)
    try:
        lang = lang_upper.lower()
    except:
        lang = ""
    return lang


if __name__ == '__main__':
    print(detect_lang("This is a test."))
    print(detect_lang("<html>This is a test</html>"))
    print(detect_lang("这个是中文测试。"))
    print(detect_lang("<html>这个是中文测试。</html>"))