magic-pdf 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/pdf_check.py +12 -9
- magic_pdf/libs/version.py +1 -1
- {magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/METADATA +1 -1
- {magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/RECORD +7 -7
- {magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/top_level.txt +0 -0
magic_pdf/libs/pdf_check.py
CHANGED
@@ -6,15 +6,11 @@ from loguru import logger
|
|
6
6
|
from pdfminer.high_level import extract_text
|
7
7
|
|
8
8
|
|
9
|
-
def calculate_sample_count(total_page: int
|
9
|
+
def calculate_sample_count(total_page: int):
|
10
10
|
"""
|
11
11
|
根据总页数和采样率计算采样页面的数量。
|
12
12
|
"""
|
13
|
-
select_page_cnt =
|
14
|
-
if select_page_cnt < 5:
|
15
|
-
select_page_cnt = min(10, total_page)
|
16
|
-
elif select_page_cnt > 10:
|
17
|
-
select_page_cnt = 10
|
13
|
+
select_page_cnt = min(10, total_page)
|
18
14
|
return select_page_cnt
|
19
15
|
|
20
16
|
|
@@ -41,19 +37,26 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
|
41
37
|
""""
|
42
38
|
检测PDF中是否包含非法字符
|
43
39
|
"""
|
44
|
-
'''
|
40
|
+
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
|
45
41
|
sample_docs = extract_pages(src_pdf_bytes)
|
46
42
|
sample_pdf_bytes = sample_docs.tobytes()
|
47
43
|
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
48
44
|
text = extract_text(sample_pdf_file_like_object)
|
45
|
+
text = text.replace("\n", "")
|
49
46
|
# logger.info(text)
|
50
47
|
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
51
48
|
cid_pattern = re.compile(r'\(cid:\d+\)')
|
52
49
|
matches = cid_pattern.findall(text)
|
53
50
|
cid_count = len(matches)
|
51
|
+
cid_len = sum(len(match) for match in matches)
|
54
52
|
text_len = len(text)
|
55
|
-
|
56
|
-
|
53
|
+
if text_len == 0:
|
54
|
+
cid_chars_radio = 0
|
55
|
+
else:
|
56
|
+
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
|
57
|
+
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
|
58
|
+
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
|
59
|
+
if cid_chars_radio > 0.05:
|
57
60
|
return False # 乱码文档
|
58
61
|
else:
|
59
62
|
return True # 正常文档
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.6"
|
1
|
+
__version__ = "0.5.7"
|
@@ -42,11 +42,11 @@ magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
|
42
42
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
43
43
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
44
44
|
magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
|
45
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
45
|
+
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
46
46
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
47
47
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
48
48
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
49
|
-
magic_pdf/libs/version.py,sha256=
|
49
|
+
magic_pdf/libs/version.py,sha256=KiyyYbyEe0O858kmiWcg1RdmqGUYtk_JqRmc3_Ev2Q8,22
|
50
50
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
51
51
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
52
52
|
magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -117,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
117
117
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
118
118
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
119
119
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
124
|
-
magic_pdf-0.5.
|
120
|
+
magic_pdf-0.5.7.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
121
|
+
magic_pdf-0.5.7.dist-info/METADATA,sha256=6tyRzBGDgaq7hCfgzI_KjOhnW_nStuIse-6bmB8WxN8,814
|
122
|
+
magic_pdf-0.5.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
123
|
+
magic_pdf-0.5.7.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
124
|
+
magic_pdf-0.5.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|