magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -55,5 +55,8 @@ class FileBasedDataWriter(DataWriter):
|
|
55
55
|
if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
|
56
56
|
fn_path = os.path.join(self._parent_dir, path)
|
57
57
|
|
58
|
+
if not os.path.exists(os.path.dirname(fn_path)):
|
59
|
+
os.makedirs(os.path.dirname(fn_path), exist_ok=True)
|
60
|
+
|
58
61
|
with open(fn_path, 'wb') as f:
|
59
62
|
f.write(data)
|
@@ -1,13 +1,12 @@
|
|
1
1
|
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
|
2
2
|
|
3
|
-
import sys
|
4
3
|
from collections import Counter
|
5
4
|
|
6
|
-
import click
|
5
|
+
import fitz
|
7
6
|
from loguru import logger
|
8
7
|
|
9
8
|
from magic_pdf.config.drop_reason import DropReason
|
10
|
-
from magic_pdf.libs.commons import
|
9
|
+
from magic_pdf.libs.commons import get_top_percent_list, mymax
|
11
10
|
from magic_pdf.libs.language import detect_lang
|
12
11
|
from magic_pdf.libs.pdf_check import detect_invalid_chars
|
13
12
|
|
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
384
383
|
return res
|
385
384
|
|
386
385
|
|
387
|
-
@click.command()
|
388
|
-
@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
|
389
|
-
@click.option('--s3-profile', help='s3上的profile')
|
390
|
-
def main(s3_pdf_path: str, s3_profile: str):
|
391
|
-
""""""
|
392
|
-
try:
|
393
|
-
file_content = read_file(s3_pdf_path, s3_profile)
|
394
|
-
pdf_meta_scan(file_content)
|
395
|
-
except Exception as e:
|
396
|
-
print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
|
397
|
-
logger.exception(e)
|
398
|
-
|
399
|
-
|
400
386
|
if __name__ == '__main__':
|
401
|
-
|
387
|
+
pass
|
402
388
|
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
|
403
389
|
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
|
404
390
|
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
|
magic_pdf/libs/commons.py
CHANGED
@@ -1,34 +1,8 @@
|
|
1
|
-
import datetime
|
2
|
-
import json
|
3
|
-
import os, re, configparser
|
4
|
-
import subprocess
|
5
|
-
import time
|
6
|
-
|
7
|
-
import boto3
|
8
|
-
from loguru import logger
|
9
|
-
from boto3.s3.transfer import TransferConfig
|
10
|
-
from botocore.config import Config
|
11
|
-
|
12
|
-
import fitz # 1.23.9中已经切换到rebase
|
13
|
-
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
|
14
|
-
|
15
|
-
|
16
|
-
def get_delta_time(input_time):
|
17
|
-
return round(time.time() - input_time, 2)
|
18
|
-
|
19
1
|
|
20
2
|
def join_path(*args):
|
21
3
|
return '/'.join(str(s).rstrip('/') for s in args)
|
22
4
|
|
23
5
|
|
24
|
-
#配置全局的errlog_path,方便demo同步引用
|
25
|
-
error_log_path = "s3://llm-pdf-text/err_logs/"
|
26
|
-
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
|
27
|
-
json_dump_path = "s3://llm-pdf-text/json_dump/"
|
28
|
-
|
29
|
-
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
|
30
|
-
|
31
|
-
|
32
6
|
def get_top_percent_list(num_list, percent):
|
33
7
|
"""
|
34
8
|
获取列表中前百分之多少的元素
|
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
|
|
48
22
|
return top_percent_list
|
49
23
|
|
50
24
|
|
51
|
-
def formatted_time(time_stamp):
|
52
|
-
dt_object = datetime.datetime.fromtimestamp(time_stamp)
|
53
|
-
output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
|
54
|
-
return output_time
|
55
|
-
|
56
|
-
|
57
25
|
def mymax(alist: list):
|
58
26
|
if len(alist) == 0:
|
59
27
|
return 0 # 空是0, 0*0也是0大小q
|
60
28
|
else:
|
61
29
|
return max(alist)
|
62
30
|
|
63
|
-
def parse_aws_param(profile):
|
64
|
-
if isinstance(profile, str):
|
65
|
-
# 解析配置文件
|
66
|
-
config_file = join_path(os.path.expanduser("~"), ".aws", "config")
|
67
|
-
credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
|
68
|
-
config = configparser.ConfigParser()
|
69
|
-
config.read(credentials_file)
|
70
|
-
config.read(config_file)
|
71
|
-
# 获取 AWS 账户相关信息
|
72
|
-
ak = config.get(profile, "aws_access_key_id")
|
73
|
-
sk = config.get(profile, "aws_secret_access_key")
|
74
|
-
if profile == "default":
|
75
|
-
s3_str = config.get(f"{profile}", "s3")
|
76
|
-
else:
|
77
|
-
s3_str = config.get(f"profile {profile}", "s3")
|
78
|
-
end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
|
79
|
-
if end_match:
|
80
|
-
endpoint = end_match.group(1)
|
81
|
-
else:
|
82
|
-
raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
|
83
|
-
style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
|
84
|
-
if style_match:
|
85
|
-
addressing_style = style_match.group(1)
|
86
|
-
else:
|
87
|
-
addressing_style = "path"
|
88
|
-
elif isinstance(profile, dict):
|
89
|
-
ak = profile["ak"]
|
90
|
-
sk = profile["sk"]
|
91
|
-
endpoint = profile["endpoint"]
|
92
|
-
addressing_style = "auto"
|
93
|
-
|
94
|
-
return ak, sk, endpoint, addressing_style
|
95
|
-
|
96
31
|
|
97
32
|
def parse_bucket_key(s3_full_path: str):
|
98
33
|
"""
|
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
|
|
106
41
|
s3_full_path = s3_full_path[1:]
|
107
42
|
bucket, key = s3_full_path.split("/", 1)
|
108
43
|
return bucket, key
|
109
|
-
|
110
|
-
|
111
|
-
def read_file(pdf_path: str, s3_profile):
|
112
|
-
if pdf_path.startswith("s3://"):
|
113
|
-
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
|
114
|
-
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
|
115
|
-
config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
|
116
|
-
bucket_name, bucket_key = parse_bucket_key(pdf_path)
|
117
|
-
res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
|
118
|
-
file_content = res["Body"].read()
|
119
|
-
return file_content
|
120
|
-
else:
|
121
|
-
with open(pdf_path, "rb") as f:
|
122
|
-
return f.read()
|
123
|
-
|
124
|
-
|
125
|
-
def get_docx_model_output(pdf_model_output, page_id):
|
126
|
-
|
127
|
-
model_output_json = pdf_model_output[page_id]
|
128
|
-
|
129
|
-
return model_output_json
|
130
|
-
|
131
|
-
|
132
|
-
def list_dir(dir_path:str, s3_profile:str):
|
133
|
-
"""
|
134
|
-
列出dir_path下的所有文件
|
135
|
-
"""
|
136
|
-
ret = []
|
137
|
-
|
138
|
-
if dir_path.startswith("s3"):
|
139
|
-
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
|
140
|
-
s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
|
141
|
-
bucket, path = s3info[0][0], s3info[0][1]
|
142
|
-
try:
|
143
|
-
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
|
144
|
-
config=Config(s3={'addressing_style': addressing_style}))
|
145
|
-
def list_obj_scluster():
|
146
|
-
marker = None
|
147
|
-
while True:
|
148
|
-
list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
|
149
|
-
if marker:
|
150
|
-
list_kwargs['Marker'] = marker
|
151
|
-
response = cli.list_objects(**list_kwargs)
|
152
|
-
contents = response.get("Contents", [])
|
153
|
-
yield from contents
|
154
|
-
if not response.get("IsTruncated") or len(contents)==0:
|
155
|
-
break
|
156
|
-
marker = contents[-1]['Key']
|
157
|
-
|
158
|
-
|
159
|
-
for info in list_obj_scluster():
|
160
|
-
file_path = info['Key']
|
161
|
-
#size = info['Size']
|
162
|
-
|
163
|
-
if path!="":
|
164
|
-
afile = file_path[len(path):]
|
165
|
-
if afile.endswith(".json"):
|
166
|
-
ret.append(f"s3://{bucket}/{file_path}")
|
167
|
-
|
168
|
-
return ret
|
169
|
-
|
170
|
-
except Exception as e:
|
171
|
-
logger.exception(e)
|
172
|
-
exit(-1)
|
173
|
-
else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
|
174
|
-
|
175
|
-
for root, dirs, files in os.walk(dir_path):
|
176
|
-
for file in files:
|
177
|
-
if file.endswith(".json"):
|
178
|
-
ret.append(join_path(root, file))
|
179
|
-
ret.sort()
|
180
|
-
return ret
|
181
|
-
|
182
|
-
def get_img_s3_client(save_path:str, image_s3_config:str):
|
183
|
-
"""
|
184
|
-
"""
|
185
|
-
if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
|
186
|
-
ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
|
187
|
-
img_s3_client = boto3.client(
|
188
|
-
service_name="s3",
|
189
|
-
aws_access_key_id=ak,
|
190
|
-
aws_secret_access_key=sk,
|
191
|
-
endpoint_url=end_point,
|
192
|
-
config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
|
193
|
-
)
|
194
|
-
else:
|
195
|
-
img_s3_client = None
|
196
|
-
|
197
|
-
return img_s3_client
|
198
|
-
|
199
|
-
if __name__=="__main__":
|
200
|
-
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
|
201
|
-
s3_profile = "langchao"
|
202
|
-
ret = list_dir(s3_path, s3_profile)
|
203
|
-
print(ret)
|
204
|
-
|
magic_pdf/libs/draw_bbox.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
|
+
import fitz
|
1
2
|
from magic_pdf.config.constants import CROSS_PAGE
|
2
|
-
from magic_pdf.config.ocr_content_type import
|
3
|
-
ContentType)
|
3
|
+
from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
|
4
4
|
from magic_pdf.data.dataset import PymuDocDataset
|
5
|
-
from magic_pdf.libs.commons import fitz # PyMuPDF
|
6
5
|
from magic_pdf.model.magic_model import MagicModel
|
7
6
|
|
8
7
|
|
magic_pdf/libs/markdown_utils.py
CHANGED
@@ -1,24 +1,3 @@
|
|
1
|
-
import re
|
2
|
-
|
3
|
-
|
4
|
-
def escape_special_markdown_char(pymu_blocks):
|
5
|
-
"""
|
6
|
-
转义正文里对markdown语法有特殊意义的字符
|
7
|
-
"""
|
8
|
-
special_chars = ["*", "`", "~", "$"]
|
9
|
-
for blk in pymu_blocks:
|
10
|
-
for line in blk['lines']:
|
11
|
-
for span in line['spans']:
|
12
|
-
for char in special_chars:
|
13
|
-
span_text = span['text']
|
14
|
-
span_type = span.get("_type", None)
|
15
|
-
if span_type in ['inline-equation', 'interline-equation']:
|
16
|
-
continue
|
17
|
-
elif span_text:
|
18
|
-
span['text'] = span['text'].replace(char, "\\" + char)
|
19
|
-
|
20
|
-
return pymu_blocks
|
21
|
-
|
22
1
|
|
23
2
|
def ocr_escape_special_markdown_char(content):
|
24
3
|
"""
|
@@ -1,9 +1,10 @@
|
|
1
1
|
from io import BytesIO
|
2
2
|
import cv2
|
3
|
+
import fitz
|
3
4
|
import numpy as np
|
4
5
|
from PIL import Image
|
5
6
|
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
-
from magic_pdf.libs.commons import fitz, join_path
|
7
|
+
from magic_pdf.libs.commons import join_path
|
7
8
|
from magic_pdf.libs.hash_utils import compute_sha256
|
8
9
|
|
9
10
|
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.10.0"
|
1
|
+
__version__ = "0.10.2"
|
@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
|
|
46
46
|
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
47
47
|
pm = page.get_pixmap(matrix=mat, alpha=False)
|
48
48
|
|
49
|
-
# If the width or height exceeds
|
50
|
-
if pm.width >
|
49
|
+
# If the width or height exceeds 4500 after scaling, do not scale further.
|
50
|
+
if pm.width > 4500 or pm.height > 4500:
|
51
51
|
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
52
52
|
|
53
53
|
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
magic_pdf/model/magic_model.py
CHANGED
@@ -1,16 +1,12 @@
|
|
1
1
|
import enum
|
2
|
-
import json
|
3
2
|
|
4
3
|
from magic_pdf.config.model_block_type import ModelBlockTypeEnum
|
5
4
|
from magic_pdf.config.ocr_content_type import CategoryId, ContentType
|
6
|
-
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
|
7
|
-
FileBasedDataWriter)
|
8
5
|
from magic_pdf.data.dataset import Dataset
|
9
6
|
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
10
7
|
bbox_relative_pos, box_area, calculate_iou,
|
11
8
|
calculate_overlap_area_in_bbox1_area_ratio,
|
12
9
|
get_overlap_area)
|
13
|
-
from magic_pdf.libs.commons import fitz, join_path
|
14
10
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
15
11
|
from magic_pdf.libs.local_math import float_gt
|
16
12
|
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
|
@@ -1048,29 +1044,3 @@ class MagicModel:
|
|
1048
1044
|
def get_model_list(self, page_no):
|
1049
1045
|
return self.__model_list[page_no]
|
1050
1046
|
|
1051
|
-
|
1052
|
-
if __name__ == '__main__':
|
1053
|
-
drw = FileBasedDataReader(r'D:/project/20231108code-clean')
|
1054
|
-
if 0:
|
1055
|
-
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
1056
|
-
model_file_path = r'linshixuqiu\19983-00_new.json'
|
1057
|
-
pdf_bytes = drw.read(pdf_file_path)
|
1058
|
-
model_json_txt = drw.read(model_file_path).decode()
|
1059
|
-
model_list = json.loads(model_json_txt)
|
1060
|
-
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
1061
|
-
img_bucket_path = 'imgs'
|
1062
|
-
img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
|
1063
|
-
pdf_docs = fitz.open('pdf', pdf_bytes)
|
1064
|
-
magic_model = MagicModel(model_list, pdf_docs)
|
1065
|
-
|
1066
|
-
if 1:
|
1067
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
1068
|
-
|
1069
|
-
model_list = json.loads(
|
1070
|
-
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
|
1071
|
-
)
|
1072
|
-
pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
|
1073
|
-
|
1074
|
-
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
|
1075
|
-
for i in range(7):
|
1076
|
-
print(magic_model.get_imgs(i))
|
@@ -1,5 +1,3 @@
|
|
1
|
-
import math
|
2
|
-
|
3
1
|
import numpy as np
|
4
2
|
from loguru import logger
|
5
3
|
|
@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
|
|
214
212
|
if len(box_ocr_res) == 2:
|
215
213
|
p1, p2, p3, p4 = box_ocr_res[0]
|
216
214
|
text, score = box_ocr_res[1]
|
215
|
+
# logger.info(f"text: {text}, score: {score}")
|
216
|
+
if score < 0.6: # 过滤低置信度的结果
|
217
|
+
continue
|
217
218
|
else:
|
218
219
|
p1, p2, p3, p4 = box_ocr_res
|
219
220
|
text, score = "", 1
|
@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
|
|
249
250
|
return ocr_result_list
|
250
251
|
|
251
252
|
|
252
|
-
def calculate_angle_degrees(poly):
|
253
|
-
# 定义对角线的顶点
|
254
|
-
diagonal1 = (poly[0], poly[2])
|
255
|
-
diagonal2 = (poly[1], poly[3])
|
256
|
-
|
257
|
-
# 计算对角线的斜率
|
258
|
-
def slope(p1, p2):
|
259
|
-
return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
|
260
|
-
|
261
|
-
slope1 = slope(diagonal1[0], diagonal1[1])
|
262
|
-
slope2 = slope(diagonal2[0], diagonal2[1])
|
263
|
-
|
264
|
-
# 计算对角线与x轴的夹角(以弧度为单位)
|
265
|
-
angle1_radians = math.atan(slope1)
|
266
|
-
angle2_radians = math.atan(slope2)
|
267
|
-
|
268
|
-
# 将弧度转换为角度
|
269
|
-
angle1_degrees = math.degrees(angle1_radians)
|
270
|
-
angle2_degrees = math.degrees(angle2_radians)
|
271
|
-
|
272
|
-
# 取两条对角线与x轴夹角的平均值
|
273
|
-
average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
|
274
|
-
# logger.info(f"average_angle_degrees: {average_angle_degrees}")
|
275
|
-
return average_angle_degrees
|
276
|
-
|
277
|
-
|
278
253
|
def calculate_is_angle(poly):
|
279
254
|
p1, p2, p3, p4 = poly
|
280
255
|
height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
|
@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
63
63
|
|
64
64
|
if det and rec:
|
65
65
|
ocr_res = []
|
66
|
-
for
|
66
|
+
for img in imgs:
|
67
67
|
img = preprocess_image(img)
|
68
68
|
dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
|
69
69
|
if not dt_boxes and not rec_res:
|
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
75
75
|
return ocr_res
|
76
76
|
elif det and not rec:
|
77
77
|
ocr_res = []
|
78
|
-
for
|
78
|
+
for img in imgs:
|
79
79
|
img = preprocess_image(img)
|
80
80
|
dt_boxes, elapse = self.text_detector(img)
|
81
81
|
if dt_boxes is None:
|
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
|
|
96
96
|
else:
|
97
97
|
ocr_res = []
|
98
98
|
cls_res = []
|
99
|
-
for
|
99
|
+
for img in imgs:
|
100
100
|
if not isinstance(img, list):
|
101
101
|
img = preprocess_image(img)
|
102
102
|
img = [img]
|
magic_pdf/para/para_split_v3.py
CHANGED
@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
|
|
271
271
|
first_span = first_line['spans'][0]
|
272
272
|
if len(first_span['content']) > 0:
|
273
273
|
span_start_with_num = first_span['content'][0].isdigit()
|
274
|
+
span_start_with_big_char = first_span['content'][0].isupper()
|
274
275
|
if (
|
275
|
-
|
276
|
-
< line_height
|
276
|
+
# 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
|
277
|
+
abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
|
278
|
+
# 上一个block的最后一个span不是以特定符号结尾
|
277
279
|
and not last_span['content'].endswith(LINE_STOP_FLAG)
|
278
280
|
# 两个block宽度差距超过2倍也不合并
|
279
281
|
and abs(block1_weight - block2_weight) < min_block_weight
|
282
|
+
# 下一个block的第一个字符是数字
|
280
283
|
and not span_start_with_num
|
284
|
+
# 下一个block的第一个字符是大写字母
|
285
|
+
and not span_start_with_big_char
|
281
286
|
):
|
282
287
|
if block1['page_num'] != block2['page_num']:
|
283
288
|
for line in block1['lines']:
|