magic-pdf 0.5.11__py3-none-any.whl → 0.5.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/language.py +0 -7
- magic_pdf/libs/path_utils.py +13 -4
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +5 -3
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/METADATA +87 -32
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/RECORD +10 -10
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.11.dist-info → magic_pdf-0.5.12.dist-info}/top_level.txt +0 -0
magic_pdf/libs/language.py
CHANGED
@@ -1,13 +1,6 @@
|
|
1
|
-
import regex
|
2
1
|
import unicodedata
|
3
2
|
from fast_langdetect import detect_langs
|
4
3
|
|
5
|
-
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
|
6
|
-
|
7
|
-
|
8
|
-
def remove_bad_chars(text):
|
9
|
-
return RE_BAD_CHARS.sub("", text)
|
10
|
-
|
11
4
|
|
12
5
|
def detect_lang(text: str) -> str:
|
13
6
|
if len(text) == 0:
|
magic_pdf/libs/path_utils.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
from s3pathlib import S3Path
|
4
|
-
|
5
3
|
def remove_non_official_s3_args(s3path):
|
6
4
|
"""
|
7
5
|
example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
|
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
|
|
10
8
|
return arr[0]
|
11
9
|
|
12
10
|
def parse_s3path(s3path: str):
|
13
|
-
|
14
|
-
|
11
|
+
# from s3pathlib import S3Path
|
12
|
+
# p = S3Path(remove_non_official_s3_args(s3path))
|
13
|
+
# return p.bucket, p.key
|
14
|
+
s3path = remove_non_official_s3_args(s3path).strip()
|
15
|
+
if s3path.startswith(('s3://', 's3a://')):
|
16
|
+
prefix, path = s3path.split('://', 1)
|
17
|
+
bucket_name, key = path.split('/', 1)
|
18
|
+
return bucket_name, key
|
19
|
+
elif s3path.startswith('/'):
|
20
|
+
raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
|
21
|
+
else:
|
22
|
+
raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
|
23
|
+
|
15
24
|
|
16
25
|
def parse_s3_range_params(s3path: str):
|
17
26
|
"""
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.11"
|
1
|
+
__version__ = "0.5.12"
|
@@ -1,9 +1,6 @@
|
|
1
1
|
import fitz
|
2
|
-
import cv2
|
3
|
-
from PIL import Image
|
4
2
|
import numpy as np
|
5
3
|
from loguru import logger
|
6
|
-
|
7
4
|
from magic_pdf.model.model_list import MODEL
|
8
5
|
import magic_pdf.model as model_config
|
9
6
|
|
@@ -23,6 +20,11 @@ def remove_duplicates_dicts(lst):
|
|
23
20
|
|
24
21
|
|
25
22
|
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
|
23
|
+
try:
|
24
|
+
import cv2
|
25
|
+
from PIL import Image
|
26
|
+
except ImportError:
|
27
|
+
logger.error("opencv-python and Pillow are not installed, please install by pip.")
|
26
28
|
images = []
|
27
29
|
with fitz.open("pdf", pdf_bytes) as doc:
|
28
30
|
for index in range(0, doc.page_count):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.11
|
3
|
+
Version: 0.5.12
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,34 +9,13 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3 >=1.28.43
|
10
10
|
Requires-Dist: Brotli >=1.1.0
|
11
11
|
Requires-Dist: click >=8.1.7
|
12
|
-
Requires-Dist:
|
13
|
-
Requires-Dist: PyMuPDF >=1.24.5
|
12
|
+
Requires-Dist: PyMuPDF >=1.24.7
|
14
13
|
Requires-Dist: loguru >=0.6.0
|
15
|
-
Requires-Dist: matplotlib >=3.8.3
|
16
14
|
Requires-Dist: numpy >=1.21.6
|
17
|
-
Requires-Dist: pandas >=1.3.5
|
18
15
|
Requires-Dist: fast-langdetect >=0.1.1
|
19
|
-
Requires-Dist: regex >=2023.12.25
|
20
|
-
Requires-Dist: termcolor >=2.4.0
|
21
16
|
Requires-Dist: wordninja >=2.0.0
|
22
17
|
Requires-Dist: scikit-learn >=1.0.2
|
23
|
-
Requires-Dist: nltk ==3.8.1
|
24
|
-
Requires-Dist: s3pathlib >=2.1.1
|
25
18
|
Requires-Dist: pdfminer.six >=20231228
|
26
|
-
Requires-Dist: Levenshtein
|
27
|
-
Requires-Dist: nltk
|
28
|
-
Requires-Dist: rapidfuzz
|
29
|
-
Requires-Dist: statistics
|
30
|
-
Requires-Dist: openxlab
|
31
|
-
Requires-Dist: pandas
|
32
|
-
Requires-Dist: numpy
|
33
|
-
Requires-Dist: matplotlib
|
34
|
-
Requires-Dist: seaborn
|
35
|
-
Requires-Dist: scipy
|
36
|
-
Requires-Dist: scikit-learn
|
37
|
-
Requires-Dist: tqdm
|
38
|
-
Requires-Dist: htmltabletomd
|
39
|
-
Requires-Dist: pypandoc
|
40
19
|
Provides-Extra: cpu
|
41
20
|
Requires-Dist: paddleocr ; extra == 'cpu'
|
42
21
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
@@ -47,11 +26,11 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
47
26
|
<div id="top"></div>
|
48
27
|
<div align="center">
|
49
28
|
|
50
|
-
[](https://github.com/opendatalab/MinerU)
|
30
|
+
[](https://github.com/opendatalab/MinerU)
|
31
|
+
[](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
|
32
|
+
[](https://github.com/opendatalab/MinerU/issues)
|
33
|
+
[](https://github.com/opendatalab/MinerU/issues)
|
55
34
|
|
56
35
|
[English](README.md) | [简体中文](README_zh-CN.md)
|
57
36
|
|
@@ -61,8 +40,20 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
61
40
|
|
62
41
|
</div>
|
63
42
|
|
43
|
+
# MinerU
|
44
|
+
|
45
|
+
|
46
|
+
## Introduction
|
47
|
+
|
48
|
+
MinerU is a one-stop, open-source data extraction tool that primarily includes the following features:
|
49
|
+
|
50
|
+
- [Magic-PDF](#Magic-PDF) PDF Document Extraction
|
51
|
+
- [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
|
52
|
+
|
53
|
+
|
64
54
|
# Magic-PDF
|
65
55
|
|
56
|
+
|
66
57
|
## Introduction
|
67
58
|
|
68
59
|
Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
|
@@ -79,19 +70,38 @@ Key features include:
|
|
79
70
|
- Compatibility with CPU and GPU environments
|
80
71
|
- Available for Windows, Linux, and macOS platforms
|
81
72
|
|
73
|
+
|
74
|
+
https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
|
75
|
+
|
76
|
+
|
77
|
+
|
82
78
|
## Project Panorama
|
83
79
|
|
84
80
|

|
85
81
|
|
82
|
+
|
83
|
+
## Flowchart
|
84
|
+
|
85
|
+

|
86
|
+
|
87
|
+
### Submodule Repositories
|
88
|
+
|
89
|
+
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
90
|
+
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
|
91
|
+
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
|
92
|
+
- An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
|
93
|
+
|
94
|
+
|
86
95
|
## Getting Started
|
87
96
|
|
88
97
|
### Requirements
|
89
98
|
|
90
|
-
- Python 3.9
|
99
|
+
- Python >= 3.9
|
91
100
|
|
92
101
|
### Usage Instructions
|
93
102
|
|
94
103
|
#### 1. Install Magic-PDF
|
104
|
+
|
95
105
|
```bash
|
96
106
|
pip install magic-pdf
|
97
107
|
```
|
@@ -99,11 +109,14 @@ pip install magic-pdf
|
|
99
109
|
#### 2. Usage via Command Line
|
100
110
|
|
101
111
|
###### simple
|
112
|
+
|
102
113
|
```bash
|
103
114
|
cp magic-pdf.template.json ~/magic-pdf.json
|
104
115
|
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
105
116
|
```
|
117
|
+
|
106
118
|
###### more
|
119
|
+
|
107
120
|
```bash
|
108
121
|
magic-pdf --help
|
109
122
|
```
|
@@ -134,19 +147,61 @@ pipe.pipe_parse()
|
|
134
147
|
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
135
148
|
```
|
136
149
|
|
137
|
-
Demo can be referred to [demo.py](
|
150
|
+
For a demo, refer to [demo.py](demo/demo.py)
|
151
|
+
|
138
152
|
|
139
153
|
## All Thanks To Our Contributors
|
140
154
|
|
141
155
|
<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
|
142
|
-
<img src="https://contrib.rocks/image?repo=
|
156
|
+
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
143
157
|
</a>
|
144
158
|
|
159
|
+
|
145
160
|
## License Information
|
146
161
|
|
147
|
-
|
162
|
+
[LICENSE.md](LICENSE.md)
|
163
|
+
|
164
|
+
The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
|
165
|
+
|
148
166
|
|
149
167
|
## Acknowledgments
|
150
168
|
|
151
169
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
152
170
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
171
|
+
|
172
|
+
|
173
|
+
# Magic-Doc
|
174
|
+
|
175
|
+
|
176
|
+
## Introduction
|
177
|
+
|
178
|
+
Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
|
179
|
+
|
180
|
+
Key Features Include:
|
181
|
+
|
182
|
+
- Web Page Extraction
|
183
|
+
- Cross-modal precise parsing of text, images, tables, and formula information.
|
184
|
+
|
185
|
+
- E-Book Document Extraction
|
186
|
+
- Supports various document formats, including EPUB and MOBI, with full adaptation for text and images.
|
187
|
+
|
188
|
+
- Language Type Identification
|
189
|
+
- Accurate recognition of 176 languages.
|
190
|
+
|
191
|
+
https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
|
200
|
+
|
201
|
+
|
202
|
+
|
203
|
+
|
204
|
+
## Project Repository
|
205
|
+
|
206
|
+
- [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
|
207
|
+
Outstanding Webpage and E-book Extraction Tool
|
@@ -34,21 +34,21 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
|
|
34
34
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
35
35
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
36
36
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
37
|
-
magic_pdf/libs/language.py,sha256=
|
37
|
+
magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
|
38
38
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
39
39
|
magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
40
40
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
41
41
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
42
|
-
magic_pdf/libs/path_utils.py,sha256=
|
42
|
+
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
43
43
|
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
50
50
|
magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
|
51
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
51
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
|
52
52
|
magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
|
53
53
|
magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
|
54
54
|
magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
|
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
115
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
116
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
117
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
118
|
-
magic_pdf-0.5.
|
119
|
-
magic_pdf-0.5.
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
|
120
|
+
magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|