magic-pdf 0.5.10__py3-none-any.whl → 0.5.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
23
23
 
24
24
  import os
25
25
  import json as json_parse
26
- import sys
27
26
  import click
28
27
  from loguru import logger
29
28
  from pathlib import Path
@@ -46,6 +45,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
46
45
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
47
46
  import csv
48
47
  import copy
48
+ import magic_pdf.model as model_config
49
49
 
50
50
  parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
51
51
 
@@ -66,21 +66,21 @@ def write_to_csv(csv_file_path, csv_data):
66
66
  csv_writer = csv.writer(csvfile)
67
67
  # 写入数据
68
68
  csv_writer.writerow(csv_data)
69
- print(f"数据已成功追加到 '{csv_file_path}'")
69
+ logger.info(f"数据已成功追加到 '{csv_file_path}'")
70
70
 
71
71
 
72
72
  def do_parse(
73
- pdf_file_name,
74
- pdf_bytes,
75
- model_list,
76
- parse_method,
77
- f_draw_span_bbox=True,
78
- f_draw_layout_bbox=True,
79
- f_dump_md=True,
80
- f_dump_middle_json=True,
81
- f_dump_model_json=True,
82
- f_dump_orig_pdf=True,
83
- f_dump_content_list=True,
73
+ pdf_file_name,
74
+ pdf_bytes,
75
+ model_list,
76
+ parse_method,
77
+ f_draw_span_bbox=True,
78
+ f_draw_layout_bbox=True,
79
+ f_dump_md=True,
80
+ f_dump_middle_json=True,
81
+ f_dump_model_json=True,
82
+ f_dump_orig_pdf=True,
83
+ f_dump_content_list=True,
84
84
  ):
85
85
  orig_model_list = copy.deepcopy(model_list)
86
86
 
@@ -96,14 +96,18 @@ def do_parse(
96
96
  elif parse_method == "ocr":
97
97
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
98
98
  else:
99
- print("unknown parse method")
100
- sys.exit(1)
99
+ logger.error("unknown parse method")
100
+ exit(1)
101
101
 
102
102
  pipe.pipe_classify()
103
103
 
104
- """如果没有传入有效的模型数据,则使用内置paddle解析"""
104
+ """如果没有传入有效的模型数据,则使用内置model解析"""
105
105
  if len(model_list) == 0:
106
- pipe.pipe_analyze()
106
+ if model_config.__use_inside_model__:
107
+ pipe.pipe_analyze()
108
+ else:
109
+ logger.error("need model list input")
110
+ exit(1)
107
111
 
108
112
  pipe.pipe_parse()
109
113
  pdf_info = pipe.pdf_mid_data["pdf_info"]
@@ -170,10 +174,13 @@ def cli():
170
174
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
171
175
  default="auto",
172
176
  )
173
- def json_command(json, method):
177
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
178
+ def json_command(json, method, inside_model):
179
+ model_config.__use_inside_model__ = inside_model
180
+
174
181
  if not json.startswith("s3://"):
175
- print("usage: python magipdf.py --json s3://some_bucket/some_path")
176
- sys.exit(1)
182
+ logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
183
+ exit(1)
177
184
 
178
185
  def read_s3_path(s3path):
179
186
  bucket, key = parse_s3path(s3path)
@@ -218,7 +225,10 @@ def json_command(json, method):
218
225
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
219
226
  default="auto",
220
227
  )
221
- def local_json_command(local_json, method):
228
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
229
+ def local_json_command(local_json, method, inside_model):
230
+ model_config.__use_inside_model__ = inside_model
231
+
222
232
  def read_s3_path(s3path):
223
233
  bucket, key = parse_s3path(s3path)
224
234
 
@@ -267,7 +277,10 @@ def local_json_command(local_json, method):
267
277
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
268
278
  default="auto",
269
279
  )
270
- def pdf_command(pdf, model, method):
280
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
281
+ def pdf_command(pdf, model, method, inside_model):
282
+ model_config.__use_inside_model__ = inside_model
283
+
271
284
  def read_fn(path):
272
285
  disk_rw = DiskReaderWriter(os.path.dirname(path))
273
286
  return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.10"
1
+ __version__ = "0.5.11"
@@ -0,0 +1 @@
1
+ __use_inside_model__ = False
@@ -2,9 +2,10 @@ import fitz
2
2
  import cv2
3
3
  from PIL import Image
4
4
  import numpy as np
5
+ from loguru import logger
5
6
 
6
7
  from magic_pdf.model.model_list import MODEL
7
- from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
8
+ import magic_pdf.model as model_config
8
9
 
9
10
 
10
11
  def dict_compare(d1, d2):
@@ -41,6 +42,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
41
42
 
42
43
 
43
44
  def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
45
+
46
+ if model_config.__use_inside_model__:
47
+ from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
48
+ else:
49
+ logger.error("use_inside_model is False, not allow to use inside model")
50
+ exit(1)
51
+
44
52
  images = load_images_from_pdf(pdf_bytes)
45
53
  custom_model = None
46
54
  if model == MODEL.Paddle:
@@ -1,7 +1,12 @@
1
1
  import random
2
2
 
3
3
  from loguru import logger
4
- from paddleocr import PPStructure
4
+
5
+ try:
6
+ from paddleocr import PPStructure
7
+ except ImportError:
8
+ logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
9
+ exit(1)
5
10
 
6
11
 
7
12
  def region_to_bbox(region):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.10
3
+ Version: 0.5.11
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/magicpdf/Magic-PDF
6
6
  Requires-Python: >=3.9
@@ -22,11 +22,26 @@ Requires-Dist: wordninja >=2.0.0
22
22
  Requires-Dist: scikit-learn >=1.0.2
23
23
  Requires-Dist: nltk ==3.8.1
24
24
  Requires-Dist: s3pathlib >=2.1.1
25
- Requires-Dist: paddleocr
26
25
  Requires-Dist: pdfminer.six >=20231228
26
+ Requires-Dist: Levenshtein
27
+ Requires-Dist: nltk
28
+ Requires-Dist: rapidfuzz
29
+ Requires-Dist: statistics
30
+ Requires-Dist: openxlab
31
+ Requires-Dist: pandas
32
+ Requires-Dist: numpy
33
+ Requires-Dist: matplotlib
34
+ Requires-Dist: seaborn
35
+ Requires-Dist: scipy
36
+ Requires-Dist: scikit-learn
37
+ Requires-Dist: tqdm
38
+ Requires-Dist: htmltabletomd
39
+ Requires-Dist: pypandoc
27
40
  Provides-Extra: cpu
41
+ Requires-Dist: paddleocr ; extra == 'cpu'
28
42
  Requires-Dist: paddlepaddle ; extra == 'cpu'
29
43
  Provides-Extra: gpu
44
+ Requires-Dist: paddleocr ; extra == 'gpu'
30
45
  Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
31
46
 
32
47
  <div id="top"></div>
@@ -64,6 +79,10 @@ Key features include:
64
79
  - Compatibility with CPU and GPU environments
65
80
  - Available for Windows, Linux, and macOS platforms
66
81
 
82
+ ## Project Panorama
83
+
84
+ ![Project Panorama](docs/images/project_panorama_en.png)
85
+
67
86
  ## Getting Started
68
87
 
69
88
  ### Requirements
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
5
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
6
6
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=FF6flO6wUcKG9Qx_FG6-xhHfmQzQWLjwrkMa5kowDgs,10937
8
+ magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
9
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -44,14 +44,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=1nlPInsRzDbcDPveZ3ghSJ6v6KveN9n6gnj-twW4DkI,23
47
+ magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
- magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=QD4NWEIz8UXdIG4V_3P8EaYesxk6PvC1SOtTWEy2GEY,2007
50
+ magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
52
52
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
53
53
  magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
54
- magic_pdf/model/pp_structure_v2.py,sha256=qsyt9vFDGaVizBMiSaeFVHTDsJTrIHx46Ec2J8SOj1A,2469
54
+ magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
55
55
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
57
57
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.10.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.10.dist-info/METADATA,sha256=B3e0sVOyFhk47EfHPuLFRUNxzdasWYx3XuYR53LSJX8,4175
120
- magic_pdf-0.5.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.10.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.10.dist-info/RECORD,,
118
+ magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
120
+ magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.11.dist-info/RECORD,,