magic-pdf 0.5.6__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/libs/pdf_check.py CHANGED
@@ -6,15 +6,11 @@ from loguru import logger
6
6
  from pdfminer.high_level import extract_text
7
7
 
8
8
 
9
- def calculate_sample_count(total_page: int, sample_ratio=0.1):
9
+ def calculate_sample_count(total_page: int):
10
10
  """
11
11
  根据总页数和采样率计算采样页面的数量。
12
12
  """
13
- select_page_cnt = int(total_page * sample_ratio)
14
- if select_page_cnt < 5:
15
- select_page_cnt = min(10, total_page)
16
- elif select_page_cnt > 10:
17
- select_page_cnt = 10
13
+ select_page_cnt = min(10, total_page)
18
14
  return select_page_cnt
19
15
 
20
16
 
@@ -41,19 +37,26 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
41
37
  """"
42
38
  检测PDF中是否包含非法字符
43
39
  """
44
- '''需要使用'''
40
+ '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
45
41
  sample_docs = extract_pages(src_pdf_bytes)
46
42
  sample_pdf_bytes = sample_docs.tobytes()
47
43
  sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
48
44
  text = extract_text(sample_pdf_file_like_object)
45
+ text = text.replace("\n", "")
49
46
  # logger.info(text)
50
47
  '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
51
48
  cid_pattern = re.compile(r'\(cid:\d+\)')
52
49
  matches = cid_pattern.findall(text)
53
50
  cid_count = len(matches)
51
+ cid_len = sum(len(match) for match in matches)
54
52
  text_len = len(text)
55
- logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
56
- if cid_count > 10:
53
+ if text_len == 0:
54
+ cid_chars_radio = 0
55
+ else:
56
+ cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
+ logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
+ '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
+ if cid_chars_radio > 0.05:
57
60
  return False # 乱码文档
58
61
  else:
59
62
  return True # 正常文档
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.6"
1
+ __version__ = "0.5.7"
{magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.6
3
+ Version: 0.5.7
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE.md
6
6
  Requires-Dist: boto3 >=1.28.43
{magic_pdf-0.5.6.dist-info → magic_pdf-0.5.7.dist-info}/RECORD CHANGED
@@ -42,11 +42,11 @@ magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
42
42
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
43
43
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
44
44
  magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
45
- magic_pdf/libs/pdf_check.py,sha256=LeCoMTVaVPWTgE0MSD6OnyXbpdjV7HfiX1RD6xesIWM,1911
45
+ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
46
46
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
47
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
48
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=CMH34Gt1AqO7z_TqRj94XwohGoVCf8aes0djkqm45mk,22
49
+ magic_pdf/libs/version.py,sha256=KiyyYbyEe0O858kmiWcg1RdmqGUYtk_JqRmc3_Ev2Q8,22
50
50
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
51
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
52
  magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
117
117
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
118
118
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
119
119
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
120
- magic_pdf-0.5.6.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
- magic_pdf-0.5.6.dist-info/METADATA,sha256=R1Rjdsta6IJ197EPwgSb7c-LtgPg2HnLibsGKRUa-i4,814
122
- magic_pdf-0.5.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
- magic_pdf-0.5.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
- magic_pdf-0.5.6.dist-info/RECORD,,
120
+ magic_pdf-0.5.7.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
+ magic_pdf-0.5.7.dist-info/METADATA,sha256=6tyRzBGDgaq7hCfgzI_KjOhnW_nStuIse-6bmB8WxN8,814
122
+ magic_pdf-0.5.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
+ magic_pdf-0.5.7.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
+ magic_pdf-0.5.7.dist-info/RECORD,,