magic-pdf 0.5.11__py3-none-any.whl → 0.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,6 @@
1
- import regex
2
1
  import unicodedata
3
2
  from fast_langdetect import detect_langs
4
3
 
5
- RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
6
-
7
-
8
- def remove_bad_chars(text):
9
- return RE_BAD_CHARS.sub("", text)
10
-
11
4
 
12
5
  def detect_lang(text: str) -> str:
13
6
  if len(text) == 0:
@@ -1,7 +1,5 @@
1
1
 
2
2
 
3
- from s3pathlib import S3Path
4
-
5
3
  def remove_non_official_s3_args(s3path):
6
4
  """
7
5
  example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
10
8
  return arr[0]
11
9
 
12
10
  def parse_s3path(s3path: str):
13
- p = S3Path(remove_non_official_s3_args(s3path))
14
- return p.bucket, p.key
11
+ # from s3pathlib import S3Path
12
+ # p = S3Path(remove_non_official_s3_args(s3path))
13
+ # return p.bucket, p.key
14
+ s3path = remove_non_official_s3_args(s3path).strip()
15
+ if s3path.startswith(('s3://', 's3a://')):
16
+ prefix, path = s3path.split('://', 1)
17
+ bucket_name, key = path.split('/', 1)
18
+ return bucket_name, key
19
+ elif s3path.startswith('/'):
20
+ raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
21
+ else:
22
+ raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
23
+
15
24
 
16
25
  def parse_s3_range_params(s3path: str):
17
26
  """
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.11"
1
+ __version__ = "0.5.12"
@@ -1,9 +1,6 @@
1
1
  import fitz
2
- import cv2
3
- from PIL import Image
4
2
  import numpy as np
5
3
  from loguru import logger
6
-
7
4
  from magic_pdf.model.model_list import MODEL
8
5
  import magic_pdf.model as model_config
9
6
 
@@ -23,6 +20,11 @@ def remove_duplicates_dicts(lst):
23
20
 
24
21
 
25
22
  def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
23
+ try:
24
+ import cv2
25
+ from PIL import Image
26
+ except ImportError:
27
+ logger.error("opencv-python and Pillow are not installed, please install by pip.")
26
28
  images = []
27
29
  with fitz.open("pdf", pdf_bytes) as doc:
28
30
  for index in range(0, doc.page_count):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.11
3
+ Version: 0.5.12
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/magicpdf/Magic-PDF
6
6
  Requires-Python: >=3.9
@@ -9,34 +9,13 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3 >=1.28.43
10
10
  Requires-Dist: Brotli >=1.1.0
11
11
  Requires-Dist: click >=8.1.7
12
- Requires-Dist: Distance >=0.1.3
13
- Requires-Dist: PyMuPDF >=1.24.5
12
+ Requires-Dist: PyMuPDF >=1.24.7
14
13
  Requires-Dist: loguru >=0.6.0
15
- Requires-Dist: matplotlib >=3.8.3
16
14
  Requires-Dist: numpy >=1.21.6
17
- Requires-Dist: pandas >=1.3.5
18
15
  Requires-Dist: fast-langdetect >=0.1.1
19
- Requires-Dist: regex >=2023.12.25
20
- Requires-Dist: termcolor >=2.4.0
21
16
  Requires-Dist: wordninja >=2.0.0
22
17
  Requires-Dist: scikit-learn >=1.0.2
23
- Requires-Dist: nltk ==3.8.1
24
- Requires-Dist: s3pathlib >=2.1.1
25
18
  Requires-Dist: pdfminer.six >=20231228
26
- Requires-Dist: Levenshtein
27
- Requires-Dist: nltk
28
- Requires-Dist: rapidfuzz
29
- Requires-Dist: statistics
30
- Requires-Dist: openxlab
31
- Requires-Dist: pandas
32
- Requires-Dist: numpy
33
- Requires-Dist: matplotlib
34
- Requires-Dist: seaborn
35
- Requires-Dist: scipy
36
- Requires-Dist: scikit-learn
37
- Requires-Dist: tqdm
38
- Requires-Dist: htmltabletomd
39
- Requires-Dist: pypandoc
40
19
  Provides-Extra: cpu
41
20
  Requires-Dist: paddleocr ; extra == 'cpu'
42
21
  Requires-Dist: paddlepaddle ; extra == 'cpu'
@@ -47,11 +26,11 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
47
26
  <div id="top"></div>
48
27
  <div align="center">
49
28
 
50
- [![stars](https://img.shields.io/github/stars/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
51
- [![forks](https://img.shields.io/github/forks/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
52
- [![license](https://img.shields.io/github/license/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF/tree/main/LICENSE)
53
- [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
54
- [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
29
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
30
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
31
+ [![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
32
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
33
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
55
34
 
56
35
  [English](README.md) | [简体中文](README_zh-CN.md)
57
36
 
@@ -61,8 +40,20 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
61
40
 
62
41
  </div>
63
42
 
43
+ # MinerU
44
+
45
+
46
+ ## Introduction
47
+
48
+ MinerU is a one-stop, open-source data extraction tool that primarily includes the following features:
49
+
50
+ - [Magic-PDF](#Magic-PDF) PDF Document Extraction
51
+ - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
52
+
53
+
64
54
  # Magic-PDF
65
55
 
56
+
66
57
  ## Introduction
67
58
 
68
59
  Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
@@ -79,19 +70,38 @@ Key features include:
79
70
  - Compatibility with CPU and GPU environments
80
71
  - Available for Windows, Linux, and macOS platforms
81
72
 
73
+
74
+ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
75
+
76
+
77
+
82
78
  ## Project Panorama
83
79
 
84
80
  ![Project Panorama](docs/images/project_panorama_en.png)
85
81
 
82
+
83
+ ## Flowchart
84
+
85
+ ![Flowchart](docs/images/flowchart_en.png)
86
+
87
+ ### Submodule Repositories
88
+
89
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
90
+ - A Comprehensive Toolkit for High-Quality PDF Content Extraction
91
+ - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
92
+ - An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
93
+
94
+
86
95
  ## Getting Started
87
96
 
88
97
  ### Requirements
89
98
 
90
- - Python 3.9 or newer
99
+ - Python >= 3.9
91
100
 
92
101
  ### Usage Instructions
93
102
 
94
103
  #### 1. Install Magic-PDF
104
+
95
105
  ```bash
96
106
  pip install magic-pdf
97
107
  ```
@@ -99,11 +109,14 @@ pip install magic-pdf
99
109
  #### 2. Usage via Command Line
100
110
 
101
111
  ###### simple
112
+
102
113
  ```bash
103
114
  cp magic-pdf.template.json ~/magic-pdf.json
104
115
  magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
105
116
  ```
117
+
106
118
  ###### more
119
+
107
120
  ```bash
108
121
  magic-pdf --help
109
122
  ```
@@ -134,19 +147,61 @@ pipe.pipe_parse()
134
147
  md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
135
148
  ```
136
149
 
137
- Demo can be referred to [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py)
150
+ For a complete example, refer to [demo.py](demo/demo.py).
151
+
138
152
 
139
153
  ## All Thanks To Our Contributors
140
154
 
141
155
  <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
142
- <img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
156
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
143
157
  </a>
144
158
 
159
+
145
160
  ## License Information
146
161
 
147
- See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
162
+ [LICENSE.md](LICENSE.md)
163
+
164
+ The project currently relies on PyMuPDF for advanced functionality; however, because PyMuPDF is licensed under the AGPL, this may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
165
+
148
166
 
149
167
  ## Acknowledgments
150
168
 
151
169
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
152
170
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
171
+
172
+
173
+ # Magic-Doc
174
+
175
+
176
+ ## Introduction
177
+
178
+ Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
179
+
180
+ Key Features Include:
181
+
182
+ - Web Page Extraction
183
+ - Cross-modal precise parsing of text, images, tables, and formula information.
184
+
185
+ - E-Book Document Extraction
186
+ - Supports various document formats, including epub and mobi, with full adaptation for text and images.
187
+
188
+ - Language Type Identification
189
+ - Accurate recognition of 176 languages.
190
+
191
+ https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
192
+
193
+
194
+
195
+ https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
196
+
197
+
198
+
199
+ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
200
+
201
+
202
+
203
+
204
+ ## Project Repository
205
+
206
+ - [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
207
+ Outstanding Webpage and E-book Extraction Tool
@@ -34,21 +34,21 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
34
34
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
35
35
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
36
36
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
37
- magic_pdf/libs/language.py,sha256=klymhpJyFSc9ukUPwIQmCx1DwGMuXueosaFjmMzETQw,812
37
+ magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
38
38
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
39
39
  magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
40
40
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
41
41
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
42
- magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
42
+ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
43
43
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
47
+ magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
50
  magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
51
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
52
52
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
53
53
  magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
54
54
  magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
120
- magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.11.dist-info/RECORD,,
118
+ magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
120
+ magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.12.dist-info/RECORD,,