magic-pdf 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -50,9 +50,7 @@ parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
50
50
 
51
51
 
52
52
  def prepare_env(pdf_file_name, method):
53
- local_parent_dir = os.path.join(
54
- get_local_dir(), "magic-pdf", pdf_file_name, method
55
- )
53
+ local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
56
54
 
57
55
  local_image_dir = os.path.join(str(local_parent_dir), "images")
58
56
  local_md_dir = local_parent_dir
@@ -62,7 +60,7 @@ def prepare_env(pdf_file_name, method):
62
60
 
63
61
 
64
62
  def write_to_csv(csv_file_path, csv_data):
65
- with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csvfile:
63
+ with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
66
64
  # 创建csv writer对象
67
65
  csv_writer = csv.writer(csvfile)
68
66
  # 写入数据
@@ -70,65 +68,92 @@ def write_to_csv(csv_file_path, csv_data):
70
68
  print(f"数据已成功追加到 '{csv_file_path}'")
71
69
 
72
70
 
73
- def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
71
+ def do_parse(
72
+ pdf_file_name,
73
+ pdf_bytes,
74
+ model_list,
75
+ parse_method,
76
+ f_draw_span_bbox=True,
77
+ f_draw_layout_bbox=True,
78
+ f_dump_md=True,
79
+ f_dump_middle_json=True,
80
+ f_dump_model_json=True,
81
+ f_dump_orig_pdf=True,
82
+ f_dump_content_list=True,
83
+ ):
84
+
85
+ local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
86
+ image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
87
+ image_dir = (os.path.basename(local_image_dir),)
88
+
74
89
  if parse_method == "auto":
75
- jso_useful_key = {
76
- "_pdf_type": "",
77
- "model_list": model_list
78
- }
90
+ jso_useful_key = {"_pdf_type": "", "model_list": model_list}
79
91
  pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
80
92
  elif parse_method == "txt":
81
93
  pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
82
94
  elif parse_method == "ocr":
83
95
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
84
96
  else:
85
- print("unknow parse method")
97
+ print("unknown parse method")
86
98
  sys.exit(1)
87
99
 
88
100
  pipe.pipe_classify()
89
101
 
90
- '''如果没有传入有效的模型数据,则使用内置paddle解析'''
102
+ """如果没有传入有效的模型数据,则使用内置paddle解析"""
91
103
  if len(model_list) == 0:
92
104
  pipe.pipe_analyze()
93
105
 
94
106
  pipe.pipe_parse()
95
- pdf_info = pipe.pdf_mid_data['pdf_info']
96
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
97
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
107
+ pdf_info = pipe.pdf_mid_data["pdf_info"]
108
+ if f_draw_layout_bbox:
109
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
110
+ if f_draw_span_bbox:
111
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
98
112
 
99
113
  # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
100
114
  # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
101
115
 
102
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
103
- '''写markdown'''
104
- md_writer.write(
105
- content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
106
- )
107
- '''写middle_json'''
108
- md_writer.write(
109
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
110
- path=f"{pdf_file_name}_middle.json",
111
- mode=AbsReaderWriter.MODE_TXT,
112
- )
113
- '''写model_json'''
114
- md_writer.write(
115
- content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
116
- path=f"{pdf_file_name}_model.json",
117
- mode=AbsReaderWriter.MODE_TXT,
118
- )
119
- '''写源pdf'''
120
- md_writer.write(
121
- content=pdf_bytes,
122
- path=f"{pdf_file_name}_origin.pdf",
123
- mode=AbsReaderWriter.MODE_BIN,
124
- )
125
- content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
126
- '''写content_list'''
127
- md_writer.write(
128
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
129
- path=f"{pdf_file_name}_content_list.json",
130
- mode=AbsReaderWriter.MODE_TXT
131
- )
116
+ md_content = pipe.pipe_mk_markdown(str(image_dir), drop_mode=DropMode.NONE)
117
+ if f_dump_md:
118
+ """写markdown"""
119
+ md_writer.write(
120
+ content=md_content,
121
+ path=f"{pdf_file_name}.md",
122
+ mode=AbsReaderWriter.MODE_TXT,
123
+ )
124
+
125
+ if f_dump_middle_json:
126
+ """写middle_json"""
127
+ md_writer.write(
128
+ content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
129
+ path=f"{pdf_file_name}_middle.json",
130
+ mode=AbsReaderWriter.MODE_TXT,
131
+ )
132
+
133
+ if f_dump_model_json:
134
+ """写model_json"""
135
+ md_writer.write(
136
+ content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
137
+ path=f"{pdf_file_name}_model.json",
138
+ mode=AbsReaderWriter.MODE_TXT,
139
+ )
140
+
141
+ if f_dump_orig_pdf:
142
+ """写源pdf"""
143
+ md_writer.write(
144
+ content=pdf_bytes,
145
+ path=f"{pdf_file_name}_origin.pdf",
146
+ mode=AbsReaderWriter.MODE_BIN,
147
+ )
148
+
149
+ content_list = pipe.pipe_mk_uni_format(str(image_dir), drop_mode=DropMode.NONE)
150
+ if f_dump_content_list:
151
+ """写content_list"""
152
+ md_writer.write(
153
+ content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
154
+ path=f"{pdf_file_name}_content_list.json",
155
+ mode=AbsReaderWriter.MODE_TXT,
156
+ )
132
157
 
133
158
 
134
159
  @click.group()
@@ -177,21 +202,12 @@ def json_command(json, method):
177
202
  s3_file_path = jso.get("path")
178
203
  pdf_file_name = Path(s3_file_path).stem
179
204
  pdf_data = read_s3_path(s3_file_path)
180
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
181
-
182
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
183
- local_md_dir
184
- )
185
205
 
186
- _do_parse(
206
+ do_parse(
187
207
  pdf_file_name,
188
208
  pdf_data,
189
209
  jso["doc_layout_result"],
190
210
  method,
191
- local_image_rw,
192
- local_md_rw,
193
- os.path.basename(local_image_dir),
194
- local_md_dir
195
211
  )
196
212
 
197
213
 
@@ -233,21 +249,11 @@ def local_json_command(local_json, method):
233
249
  s3_file_path = jso.get("path")
234
250
  pdf_file_name = Path(s3_file_path).stem
235
251
  pdf_data = read_s3_path(s3_file_path)
236
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
237
-
238
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
239
- local_md_dir
240
- )
241
-
242
- _do_parse(
252
+ do_parse(
243
253
  pdf_file_name,
244
254
  pdf_data,
245
255
  jso["doc_layout_result"],
246
256
  method,
247
- local_image_rw,
248
- local_md_rw,
249
- os.path.basename(local_image_dir),
250
- local_md_dir
251
257
  )
252
258
 
253
259
 
@@ -274,7 +280,9 @@ def pdf_command(pdf, model, method):
274
280
  if model_path is None:
275
281
  model_path = pdf.replace(".pdf", ".json")
276
282
  if not os.path.exists(model_path):
277
- logger.warning(f"not found json {model_path} existed, use paddle analyze")
283
+ logger.warning(
284
+ f"not found json {model_path} existed, use paddle analyze"
285
+ )
278
286
  # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
279
287
  model_json = "[]"
280
288
  else:
@@ -286,19 +294,12 @@ def pdf_command(pdf, model, method):
286
294
 
287
295
  jso = json_parse.loads(get_model_json(model))
288
296
  pdf_file_name = Path(pdf).stem
289
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
290
- local_image_rw, local_md_rw = DiskReaderWriter(local_image_dir), DiskReaderWriter(
291
- local_md_dir
292
- )
293
- _do_parse(
297
+
298
+ do_parse(
294
299
  pdf_file_name,
295
300
  pdf_data,
296
301
  jso,
297
302
  method,
298
- local_image_rw,
299
- local_md_rw,
300
- os.path.basename(local_image_dir),
301
- local_md_dir
302
303
  )
303
304
 
304
305
 
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.7"
1
+ __version__ = "0.5.8"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.7
3
+ Version: 0.5.8
4
4
  Requires-Python: >=3.9
5
5
  License-File: LICENSE.md
6
6
  Requires-Dist: boto3 >=1.28.43
@@ -7,7 +7,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
7
7
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
8
8
  magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
9
9
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
- magic_pdf/cli/magicpdf.py,sha256=IoyuWsnJp5lLDS4G9brtCqNdIWKb57Ini4uftkCl2Mg,11357
10
+ magic_pdf/cli/magicpdf.py,sha256=aMmY_J83_8IdkkxyZUHg8WzIbCA_oW4cyjlUUZE0Wvc,11117
11
11
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
13
13
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -46,7 +46,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
46
46
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
47
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
48
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=KiyyYbyEe0O858kmiWcg1RdmqGUYtk_JqRmc3_Ev2Q8,22
49
+ magic_pdf/libs/version.py,sha256=bDuZ37zImJZsQ3a4pW87q4kg-zsIBrUFAv1aumIf_7k,22
50
50
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
51
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
52
52
  magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
117
117
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
118
118
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
119
119
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
120
- magic_pdf-0.5.7.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
- magic_pdf-0.5.7.dist-info/METADATA,sha256=6tyRzBGDgaq7hCfgzI_KjOhnW_nStuIse-6bmB8WxN8,814
122
- magic_pdf-0.5.7.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
- magic_pdf-0.5.7.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
- magic_pdf-0.5.7.dist-info/RECORD,,
120
+ magic_pdf-0.5.8.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
121
+ magic_pdf-0.5.8.dist-info/METADATA,sha256=Z7HrhP7T0_dQOjCX-CztMe77Mbt90IoY8JH0IhmRHH0,814
122
+ magic_pdf-0.5.8.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
123
+ magic_pdf-0.5.8.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
124
+ magic_pdf-0.5.8.dist-info/RECORD,,