magic-pdf 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -290,7 +290,11 @@ def pdf_command(pdf, model, method, inside_model):
290
290
  def get_model_json(model_path):
291
291
  # 这里处理pdf和模型相关的逻辑
292
292
  if model_path is None:
293
- model_path = pdf.replace(".pdf", ".json")
293
+ file_name_without_extension, extension = os.path.splitext(pdf)
294
+ if extension == ".pdf":
295
+ model_path = file_name_without_extension + ".json"
296
+ else:
297
+ raise Exception("pdf_path input error")
294
298
  if not os.path.exists(model_path):
295
299
  logger.warning(
296
300
  f"not found json {model_path} existed"
@@ -1,22 +1,15 @@
1
- import regex
2
1
  import unicodedata
3
- from fast_langdetect import detect_langs
4
-
5
- RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
6
-
7
-
8
- def remove_bad_chars(text):
9
- return RE_BAD_CHARS.sub("", text)
2
+ from fast_langdetect import detect_language
10
3
 
11
4
 
12
5
  def detect_lang(text: str) -> str:
13
6
  if len(text) == 0:
14
7
  return ""
15
8
  try:
16
- lang_upper = detect_langs(text)
9
+ lang_upper = detect_language(text)
17
10
  except:
18
11
  html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
19
- lang_upper = detect_langs(html_no_ctrl_chars)
12
+ lang_upper = detect_language(html_no_ctrl_chars)
20
13
  try:
21
14
  lang = lang_upper.lower()
22
15
  except:
@@ -1,7 +1,5 @@
1
1
 
2
2
 
3
- from s3pathlib import S3Path
4
-
5
3
  def remove_non_official_s3_args(s3path):
6
4
  """
7
5
  example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
10
8
  return arr[0]
11
9
 
12
10
  def parse_s3path(s3path: str):
13
- p = S3Path(remove_non_official_s3_args(s3path))
14
- return p.bucket, p.key
11
+ # from s3pathlib import S3Path
12
+ # p = S3Path(remove_non_official_s3_args(s3path))
13
+ # return p.bucket, p.key
14
+ s3path = remove_non_official_s3_args(s3path).strip()
15
+ if s3path.startswith(('s3://', 's3a://')):
16
+ prefix, path = s3path.split('://', 1)
17
+ bucket_name, key = path.split('/', 1)
18
+ return bucket_name, key
19
+ elif s3path.startswith('/'):
20
+ raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
21
+ else:
22
+ raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
23
+
15
24
 
16
25
  def parse_s3_range_params(s3path: str):
17
26
  """
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.11"
1
+ __version__ = "0.5.13"
@@ -1,9 +1,6 @@
1
1
  import fitz
2
- import cv2
3
- from PIL import Image
4
2
  import numpy as np
5
3
  from loguru import logger
6
-
7
4
  from magic_pdf.model.model_list import MODEL
8
5
  import magic_pdf.model as model_config
9
6
 
@@ -23,6 +20,11 @@ def remove_duplicates_dicts(lst):
23
20
 
24
21
 
25
22
  def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
23
+ try:
24
+ import cv2
25
+ from PIL import Image
26
+ except ImportError:
27
+ logger.error("opencv-python and Pillow are not installed, please install by pip.")
26
28
  images = []
27
29
  with fitz.open("pdf", pdf_bytes) as doc:
28
30
  for index in range(0, doc.page_count):
@@ -0,0 +1,231 @@
1
+ Metadata-Version: 2.1
2
+ Name: magic-pdf
3
+ Version: 0.5.13
4
+ Summary: A practical tool for converting PDF to Markdown
5
+ Home-page: https://github.com/magicpdf/Magic-PDF
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE.md
9
+ Requires-Dist: boto3 >=1.28.43
10
+ Requires-Dist: Brotli >=1.1.0
11
+ Requires-Dist: click >=8.1.7
12
+ Requires-Dist: PyMuPDF >=1.24.7
13
+ Requires-Dist: loguru >=0.6.0
14
+ Requires-Dist: numpy >=1.21.6
15
+ Requires-Dist: fast-langdetect >=0.1.1
16
+ Requires-Dist: wordninja >=2.0.0
17
+ Requires-Dist: scikit-learn >=1.0.2
18
+ Requires-Dist: pdfminer.six >=20231228
19
+ Requires-Dist: numpy <2.0.0
20
+ Provides-Extra: cpu
21
+ Requires-Dist: paddleocr ; extra == 'cpu'
22
+ Requires-Dist: paddlepaddle ; extra == 'cpu'
23
+ Provides-Extra: gpu
24
+ Requires-Dist: paddleocr ; extra == 'gpu'
25
+ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
26
+
27
+ <div id="top"></div>
28
+ <div align="center">
29
+
30
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
31
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
32
+ [![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
33
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
34
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
35
+
36
+ [English](README.md) | [简体中文](README_zh-CN.md)
37
+
38
+ </div>
39
+
40
+ <div align="center">
41
+
42
+ </div>
43
+
44
+ # MinerU
45
+
46
+
47
+ ## Introduction
48
+
49
+ MinerU is a one-stop, open-source, high-quality data extraction tool that includes the following primary features:
50
+
51
+ - [Magic-PDF](#Magic-PDF) PDF Document Extraction
52
+ - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
53
+
54
+
55
+ # Magic-PDF
56
+
57
+
58
+ ## Introduction
59
+
60
+ Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
61
+
62
+ Key features include:
63
+
64
+ - Support for multiple front-end model inputs
65
+ - Removal of headers, footers, footnotes, and page numbers
66
+ - Human-readable layout formatting
67
+ - Retains the original document's structure and formatting, including headings, paragraphs, lists, and more
68
+ - Extraction and display of images and tables within markdown
69
+ - Conversion of equations into LaTeX format
70
+ - Automatic detection and conversion of garbled PDFs
71
+ - Compatibility with CPU and GPU environments
72
+ - Available for Windows, Linux, and macOS platforms
73
+
74
+
75
+ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
76
+
77
+
78
+
79
+ ## Project Panorama
80
+
81
+ ![Project Panorama](docs/images/project_panorama_en.png)
82
+
83
+
84
+ ## Flowchart
85
+
86
+ ![Flowchart](docs/images/flowchart_en.png)
87
+
88
+ ### Submodule Repositories
89
+
90
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
91
+ - A Comprehensive Toolkit for High-Quality PDF Content Extraction
92
+
93
+ ## Getting Started
94
+
95
+ ### Requirements
96
+
97
+ - Python >= 3.9
98
+
99
+ ### Usage Instructions
100
+
101
+ #### 1. Install Magic-PDF
102
+
103
+ ```bash
104
+ pip install magic-pdf
105
+ ```
106
+
107
+ #### 2. Usage via Command Line
108
+
109
+ ###### simple
110
+
111
+ ```bash
112
+ cp magic-pdf.template.json ~/magic-pdf.json
113
+ magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
114
+ ```
115
+ After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
116
+
117
+ ###### more
118
+
119
+ ```bash
120
+ magic-pdf --help
121
+ ```
122
+
123
+ #### 3. Usage via API
124
+
125
+ ###### Local
126
+ ```python
127
+ image_writer = DiskReaderWriter(local_image_dir)
128
+ image_dir = str(os.path.basename(local_image_dir))
129
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
130
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
131
+ pipe.pipe_classify()
132
+ pipe.pipe_parse()
133
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
134
+ ```
135
+
136
+ ###### Object Storage
137
+ ```python
138
+ s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
139
+ image_dir = "s3://img_bucket/"
140
+ s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
141
+ pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
142
+ jso_useful_key = {"_pdf_type": "", "model_list": model_json}
143
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
144
+ pipe.pipe_classify()
145
+ pipe.pipe_parse()
146
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
147
+ ```
148
+
149
+ For a demo, refer to [demo.py](demo/demo.py)
150
+
151
+
152
+ # Magic-Doc
153
+
154
+
155
+ ## Introduction
156
+
157
+ Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
158
+
159
+ Key Features Include:
160
+
161
+ - Web Page Extraction
162
+ - Cross-modal precise parsing of text, images, tables, and formula information.
163
+
164
+ - E-Book Document Extraction
165
+ - Supports various document formats, including epub and mobi, with full adaptation for text and images.
166
+
167
+ - Language Type Identification
168
+ - Accurate recognition of 176 languages.
169
+
170
+ https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
171
+
172
+
173
+
174
+ https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
175
+
176
+
177
+
178
+ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
179
+
180
+
181
+
182
+
183
+ ## Project Repository
184
+
185
+ - [Magic-Doc](https://github.com/InternLM/magic-doc)
186
+ Outstanding Webpage and E-book Extraction Tool
187
+
188
+
189
+ # All Thanks To Our Contributors
190
+
191
+ <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
192
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
193
+ </a>
194
+
195
+
196
+ # License Information
197
+
198
+ [LICENSE.md](LICENSE.md)
199
+
200
+ The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
201
+
202
+
203
+ # Acknowledgments
204
+
205
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
206
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
207
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
208
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
209
+
210
+
211
+ # Citation
212
+
213
+ ```bibtex
214
+ @misc{2024mineru,
215
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
216
+ author={MinerU Contributors},
217
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
218
+ year={2024}
219
+ }
220
+ ```
221
+
222
+
223
+ # Star History
224
+
225
+ <a>
226
+ <picture>
227
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
228
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
229
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
230
+ </picture>
231
+ </a>
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
5
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
6
6
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
8
+ magic_pdf/cli/magicpdf.py,sha256=d4Wy2g7t_GsclV4r0vQR0enIh08-Ml2n1jf1zdrq4LE,11852
9
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -34,21 +34,21 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
34
34
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
35
35
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
36
36
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
37
- magic_pdf/libs/language.py,sha256=klymhpJyFSc9ukUPwIQmCx1DwGMuXueosaFjmMzETQw,812
37
+ magic_pdf/libs/language.py,sha256=l0LGIz-dlerU9Xct-7ypNKGNEI_q-CTadsJAnVTF9VY,692
38
38
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
39
39
  magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
40
40
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
41
41
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
42
- magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
42
+ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
43
43
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
47
+ magic_pdf/libs/version.py,sha256=jEM-pQV3SLNuNue5fxlBM8hWNuJydsyqi_WBzC1VQaM,23
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
50
  magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
51
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
52
52
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
53
53
  magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
54
54
  magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
120
- magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.11.dist-info/RECORD,,
118
+ magic_pdf-0.5.13.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.13.dist-info/METADATA,sha256=g5VqQbFmBpLwZyVNivClRek2vVoBAGwhjuT8Tnq3Wtc,6673
120
+ magic_pdf-0.5.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.13.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.13.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.13.dist-info/RECORD,,
@@ -1,152 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: magic-pdf
3
- Version: 0.5.11
4
- Summary: A practical tool for converting PDF to Markdown
5
- Home-page: https://github.com/magicpdf/Magic-PDF
6
- Requires-Python: >=3.9
7
- Description-Content-Type: text/markdown
8
- License-File: LICENSE.md
9
- Requires-Dist: boto3 >=1.28.43
10
- Requires-Dist: Brotli >=1.1.0
11
- Requires-Dist: click >=8.1.7
12
- Requires-Dist: Distance >=0.1.3
13
- Requires-Dist: PyMuPDF >=1.24.5
14
- Requires-Dist: loguru >=0.6.0
15
- Requires-Dist: matplotlib >=3.8.3
16
- Requires-Dist: numpy >=1.21.6
17
- Requires-Dist: pandas >=1.3.5
18
- Requires-Dist: fast-langdetect >=0.1.1
19
- Requires-Dist: regex >=2023.12.25
20
- Requires-Dist: termcolor >=2.4.0
21
- Requires-Dist: wordninja >=2.0.0
22
- Requires-Dist: scikit-learn >=1.0.2
23
- Requires-Dist: nltk ==3.8.1
24
- Requires-Dist: s3pathlib >=2.1.1
25
- Requires-Dist: pdfminer.six >=20231228
26
- Requires-Dist: Levenshtein
27
- Requires-Dist: nltk
28
- Requires-Dist: rapidfuzz
29
- Requires-Dist: statistics
30
- Requires-Dist: openxlab
31
- Requires-Dist: pandas
32
- Requires-Dist: numpy
33
- Requires-Dist: matplotlib
34
- Requires-Dist: seaborn
35
- Requires-Dist: scipy
36
- Requires-Dist: scikit-learn
37
- Requires-Dist: tqdm
38
- Requires-Dist: htmltabletomd
39
- Requires-Dist: pypandoc
40
- Provides-Extra: cpu
41
- Requires-Dist: paddleocr ; extra == 'cpu'
42
- Requires-Dist: paddlepaddle ; extra == 'cpu'
43
- Provides-Extra: gpu
44
- Requires-Dist: paddleocr ; extra == 'gpu'
45
- Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
46
-
47
- <div id="top"></div>
48
- <div align="center">
49
-
50
- [![stars](https://img.shields.io/github/stars/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
51
- [![forks](https://img.shields.io/github/forks/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
52
- [![license](https://img.shields.io/github/license/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF/tree/main/LICENSE)
53
- [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
54
- [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
55
-
56
- [English](README.md) | [简体中文](README_zh-CN.md)
57
-
58
- </div>
59
-
60
- <div align="center">
61
-
62
- </div>
63
-
64
- # Magic-PDF
65
-
66
- ## Introduction
67
-
68
- Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
69
-
70
- Key features include:
71
-
72
- - Support for multiple front-end model inputs
73
- - Removal of headers, footers, footnotes, and page numbers
74
- - Human-readable layout formatting
75
- - Retains the original document's structure and formatting, including headings, paragraphs, lists, and more
76
- - Extraction and display of images and tables within markdown
77
- - Conversion of equations into LaTeX format
78
- - Automatic detection and conversion of garbled PDFs
79
- - Compatibility with CPU and GPU environments
80
- - Available for Windows, Linux, and macOS platforms
81
-
82
- ## Project Panorama
83
-
84
- ![Project Panorama](docs/images/project_panorama_en.png)
85
-
86
- ## Getting Started
87
-
88
- ### Requirements
89
-
90
- - Python 3.9 or newer
91
-
92
- ### Usage Instructions
93
-
94
- #### 1. Install Magic-PDF
95
- ```bash
96
- pip install magic-pdf
97
- ```
98
-
99
- #### 2. Usage via Command Line
100
-
101
- ###### simple
102
- ```bash
103
- cp magic-pdf.template.json to ~/magic-pdf.json
104
- magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
105
- ```
106
- ###### more
107
- ```bash
108
- magic-pdf --help
109
- ```
110
-
111
- #### 3. Usage via Api
112
-
113
- ###### Local
114
- ```python
115
- image_writer = DiskReaderWriter(local_image_dir)
116
- image_dir = str(os.path.basename(local_image_dir))
117
- jso_useful_key = {"_pdf_type": "", "model_list": model_json}
118
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
119
- pipe.pipe_classify()
120
- pipe.pipe_parse()
121
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
122
- ```
123
-
124
- ###### Object Storage
125
- ```python
126
- s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
127
- image_dir = "s3://img_bucket/"
128
- s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
129
- pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
130
- jso_useful_key = {"_pdf_type": "", "model_list": model_json}
131
- pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
132
- pipe.pipe_classify()
133
- pipe.pipe_parse()
134
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
135
- ```
136
-
137
- Demo can be referred to [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py)
138
-
139
- ## All Thanks To Our Contributors
140
-
141
- <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
142
- <img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
143
- </a>
144
-
145
- ## License Information
146
-
147
- See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
148
-
149
- ## Acknowledgments
150
-
151
- - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
152
- - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)