magic-pdf 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
50
50
 
51
51
 
52
52
  def prepare_env(pdf_file_name, method):
53
- local_parent_dir = os.path.join(
54
- get_local_dir(), "magic-pdf", pdf_file_name, method
55
- )
53
+ local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
56
54
 
57
55
  local_image_dir = os.path.join(str(local_parent_dir), "images")
58
56
  local_md_dir = local_parent_dir
@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method):
62
60
 
63
61
 
64
62
  def write_to_csv(csv_file_path, csv_data):
65
- with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile:
63
+ with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
66
64
  # 创建csv writer对象
67
65
  csv_writer = csv.writer(csvfile)
68
66
  # 写入数据
@@ -70,65 +68,92 @@ def write_to_csv(csv_file_path, csv_data):
70
68
  print(f"数据已成功追加到 '{csv_file_path}'")
71
69
 
72
70
 
73
- def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
71
+ def do_parse(
72
+ pdf_file_name,
73
+ pdf_bytes,
74
+ model_list,
75
+ parse_method,
76
+ f_draw_span_bbox=True,
77
+ f_draw_layout_bbox=True,
78
+ f_dump_md=True,
79
+ f_dump_middle_json=True,
80
+ f_dump_model_json=True,
81
+ f_dump_orig_pdf=True,
82
+ f_dump_content_list=True,
83
+ ):
84
+
85
+ local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
86
+ image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
87
+ image_dir = (os.path.basename(local_image_dir),)
88
+
74
89
  if parse_method == "auto":
75
- jso_useful_key = {
76
- "_pdf_type": "",
77
- "model_list": model_list
78
- }
90
+ jso_useful_key = {"_pdf_type": "", "model_list": model_list}
79
91
  pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
80
92
  elif parse_method == "txt":
81
93
  pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
82
94
  elif parse_method == "ocr":
83
95
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
84
96
  else:
85
- print("unknow parse method")
97
+ print("unknown parse method")
86
98
  sys.exit(1)
87
99
 
88
100
  pipe.pipe_classify()
89
101
 
90
- '''如果没有传入有效的模型数据,则使用内置paddle解析'''
102
+ """如果没有传入有效的模型数据,则使用内置paddle解析"""
91
103
  if len(model_list) == 0:
92
104
  pipe.pipe_analyze()
93
105
 
94
106
  pipe.pipe_parse()
95
- pdf_info = pipe.pdf_mid_data['pdf_info']
96
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
97
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
107
+ pdf_info = pipe.pdf_mid_data["pdf_info"]
108
+ if f_draw_layout_bbox:
109
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
110
+ if f_draw_span_bbox:
111
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
98
112
 
99
113
  # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
100
114
  # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
101
115
 
102
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
103
- '''写markdown'''
104
- md_writer.write(
105
- content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
106
- )
107
- '''写middle_json'''
108
- md_writer.write(
109
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
110
- path=f"{pdf_file_name}_middle.json",
111
- mode=AbsReaderWriter.MODE_TXT,
112
- )
113
- '''写model_json'''
114
- md_writer.write(
115
- content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
116
- path=f"{pdf_file_name}_model.json",
117
- mode=AbsReaderWriter.MODE_TXT,
118
- )
119
- '''写源pdf'''
120
- md_writer.write(
121
- content=pdf_bytes,
122
- path=f"{pdf_file_name}_origin.pdf",
123
- mode=AbsReaderWriter.MODE_BIN,
124
- )
125
- content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
126
- '''写content_list'''
127
- md_writer.write(
128
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
129
- path=f"{pdf_file_name}_content_list.json",
130
- mode=AbsReaderWriter.MODE_TXT
131
- )
116
+ md_content = pipe.pipe_mk_markdown(str(image_dir), drop_mode=DropMode.NONE)
117
+ if f_dump_md:
118
+ """写markdown"""
119
+ md_writer.write(
120
+ content=md_content,
121
+ path=f"{pdf_file_name}.md",
122
+ mode=AbsReaderWriter.MODE_TXT,
123
+ )
124
+
125
+ if f_dump_middle_json:
126
+ """写middle_json"""
127
+ md_writer.write(
128
+ content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
129
+ path=f"{pdf_file_name}_middle.json",
130
+ mode=AbsReaderWriter.MODE_TXT,
131
+ )
132
+
133
+ if f_dump_model_json:
134
+ """写model_json"""
135
+ md_writer.write(
136
+ content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
137
+ path=f"{pdf_file_name}_model.json",
138
+ mode=AbsReaderWriter.MODE_TXT,
139
+ )
140
+
141
+ if f_dump_orig_pdf:
142
+ """写源pdf"""
143
+ md_writer.write(
144
+ content=pdf_bytes,
145
+ path=f"{pdf_file_name}_origin.pdf",
146
+ mode=AbsReaderWriter.MODE_BIN,
147
+ )
148
+
149
+ content_list = pipe.pipe_mk_uni_format(str(image_dir), drop_mode=DropMode.NONE)
150
+ if f_dump_content_list:
151
+ """写content_list"""
152
+ md_writer.write(
153
+ content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
154
+ path=f"{pdf_file_name}_content_list.json",
155
+ mode=AbsReaderWriter.MODE_TXT,
156
+ )
132
157
 
133
158
 
134
159
  @click.group()
@@ -177,21 +202,12 @@ def json_command(json, method):
177
202
  s3_file_path = jso.get("path")
178
203
  pdf_file_name = Path(s3_file_path).stem
179
204
  pdf_data = read_s3_path(s3_file_path)
180
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
181
-
182
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
183
- local_md_dir
184
- )
185
205
 
186
- _do_parse(
206
+ do_parse(
187
207
  pdf_file_name,
188
208
  pdf_data,
189
209
  jso["doc_layout_result"],
190
210
  method,
191
- local_image_rw,
192
- local_md_rw,
193
- os.path.basename(local_image_dir),
194
- local_md_dir
195
211
  )
196
212
 
197
213
 
@@ -233,21 +249,11 @@ def local_json_command(local_json, method):
233
249
  s3_file_path = jso.get("path")
234
250
  pdf_file_name = Path(s3_file_path).stem
235
251
  pdf_data = read_s3_path(s3_file_path)
236
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
237
-
238
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
239
- local_md_dir
240
- )
241
-
242
- _do_parse(
252
+ do_parse(
243
253
  pdf_file_name,
244
254
  pdf_data,
245
255
  jso["doc_layout_result"],
246
256
  method,
247
- local_image_rw,
248
- local_md_rw,
249
- os.path.basename(local_image_dir),
250
- local_md_dir
251
257
  )
252
258
 
253
259
 
@@ -274,7 +280,9 @@ def pdf_command(pdf, model, method):
274
280
  if model_path is None:
275
281
  model_path = pdf.replace(".pdf", ".json")
276
282
  if not os.path.exists(model_path):
277
- logger.warning(f"not found json {model_path} existed, use paddle analyze")
283
+ logger.warning(
284
+ f"not found json {model_path} existed, use paddle analyze"
285
+ )
278
286
  # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
279
287
  model_json = "[]"
280
288
  else:
@@ -286,19 +294,12 @@ def pdf_command(pdf, model, method):
286
294
 
287
295
  jso = json_parse.loads(get_model_json(model))
288
296
  pdf_file_name = Path(pdf).stem
289
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
290
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
291
- local_md_dir
292
- )
293
- _do_parse(
297
+
298
+ do_parse(
294
299
  pdf_file_name,
295
300
  pdf_data,
296
301
  jso,
297
302
  method,
298
- local_image_rw,
299
- local_md_rw,
300
- os.path.basename(local_image_dir),
301
- local_md_dir
302
303
  )
303
304
 
304
305
 
@@ -6,15 +6,11 @@ from loguru import logger
6
6
  from pdfminer.high_level import extract_text
7
7
 
8
8
 
9
- def calculate_sample_count(total_page: int, sample_ratio=0.1):
9
+ def calculate_sample_count(total_page: int):
10
10
  """
11
11
  根据总页数和采样率计算采样页面的数量。
12
12
  """
13
- select_page_cnt = int(total_page * sample_ratio)
14
- if select_page_cnt < 5:
15
- select_page_cnt = min(10, total_page)
16
- elif select_page_cnt > 10:
17
- select_page_cnt = 10
13
+ select_page_cnt = min(10, total_page)
18
14
  return select_page_cnt
19
15
 
20
16
 
@@ -41,19 +37,26 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
41
37
  """"
42
38
  检测PDF中是否包含非法字符
43
39
  """
44
- '''需要使用'''
40
+ '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
45
41
  sample_docs = extract_pages(src_pdf_bytes)
46
42
  sample_pdf_bytes = sample_docs.tobytes()
47
43
  sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
48
44
  text = extract_text(sample_pdf_file_like_object)
45
+ text = text.replace("\n", "")
49
46
  # logger.info(text)
50
47
  '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
51
48
  cid_pattern = re.compile(r'\(cid:\d+\)')
52
49
  matches = cid_pattern.findall(text)
53
50
  cid_count = len(matches)
51
+ cid_len = sum(len(match) for match in matches)
54
52
  text_len = len(text)
55
- logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
56
- if cid_count > 10:
53
+ if text_len == 0:
54
+ cid_chars_radio = 0
55
+ else:
56
+ cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
57
+ logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
58
+ '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
59
+ if cid_chars_radio > 0.05:
57
60
  return False # 乱码文档
58
61
  else:
59
62
  return True # 正常文档
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.6"
1
+ __version__ = "0.5.8"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.6
3
+ Version: 0.5.8
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE.md
6
6
  Requires-Dist: boto3 >=1.28.43
@@ -7,7 +7,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
7
7
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
8
8
  magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
9
9
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- magic_pdf/cli/magicpdf.py,sha256=IoyuWsnJp5lLDS4G9brtCqNdIWKb57Ini4uftkCl2Mg,11357
10
+ magic_pdf/cli/magicpdf.py,sha256=aMmY_J83_8IdkkxyZUHg8WzIbCA_oW4cyjlUUZE0Wvc,11117
11
11
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
13
13
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -42,11 +42,11 @@ magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
42
42
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
43
43
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
44
44
  magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
45
- magic_pdf/libs/pdf_check.py,sha256=LeCoMTVaVPWTgE0MSD6OnyXbpdjV7HfiX1RD6xesIWM,1911
45
+ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
46
46
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
47
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
48
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=CMH34Gt1AqO7z_TqRj94XwohGoVCf8aes0djkqm45mk,22
49
+ magic_pdf/libs/version.py,sha256=bDuZ37zImJZsQ3a4pW87q4kg-zsIBrUFAv1aumIf_7k,22
50
50
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
51
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
52
  magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
117
117
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
118
118
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
119
119
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
120
- magic_pdf-0.5.6.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
- magic_pdf-0.5.6.dist-info/METADATA,sha256=R1Rjdsta6IJ197EPwgSb7c-LtgPg2HnLibsGKRUa-i4,814
122
- magic_pdf-0.5.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
- magic_pdf-0.5.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
- magic_pdf-0.5.6.dist-info/RECORD,,
120
+ magic_pdf-0.5.8.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
+ magic_pdf-0.5.8.dist-info/METADATA,sha256=Z7HrhP7T0_dQOjCX-CztMe77Mbt90IoY8JH0IhmRHH0,814
122
+ magic_pdf-0.5.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
+ magic_pdf-0.5.8.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
+ magic_pdf-0.5.8.dist-info/RECORD,,