magic-pdf 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/cli/magicpdf.py +5 -1
- magic_pdf/libs/language.py +3 -3
- magic_pdf/libs/version.py +1 -1
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/METADATA +52 -28
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/RECORD +9 -9
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.12.dist-info → magic_pdf-0.5.13.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py
CHANGED
@@ -290,7 +290,11 @@ def pdf_command(pdf, model, method, inside_model):
|
|
290
290
|
def get_model_json(model_path):
|
291
291
|
# 这里处理pdf和模型相关的逻辑
|
292
292
|
if model_path is None:
|
293
|
-
|
293
|
+
file_name_without_extension, extension = os.path.splitext(pdf)
|
294
|
+
if extension == ".pdf":
|
295
|
+
model_path = file_name_without_extension + ".json"
|
296
|
+
else:
|
297
|
+
raise Exception("pdf_path input error")
|
294
298
|
if not os.path.exists(model_path):
|
295
299
|
logger.warning(
|
296
300
|
f"not found json {model_path} existed"
|
magic_pdf/libs/language.py
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
import unicodedata
|
2
|
-
from fast_langdetect import
|
2
|
+
from fast_langdetect import detect_language
|
3
3
|
|
4
4
|
|
5
5
|
def detect_lang(text: str) -> str:
|
6
6
|
if len(text) == 0:
|
7
7
|
return ""
|
8
8
|
try:
|
9
|
-
lang_upper =
|
9
|
+
lang_upper = detect_language(text)
|
10
10
|
except:
|
11
11
|
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
|
12
|
-
lang_upper =
|
12
|
+
lang_upper = detect_language(html_no_ctrl_chars)
|
13
13
|
try:
|
14
14
|
lang = lang_upper.lower()
|
15
15
|
except:
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.
|
1
|
+
__version__ = "0.5.13"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.
|
3
|
+
Version: 0.5.13
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/magicpdf/Magic-PDF
|
6
6
|
Requires-Python: >=3.9
|
@@ -16,6 +16,7 @@ Requires-Dist: fast-langdetect >=0.1.1
|
|
16
16
|
Requires-Dist: wordninja >=2.0.0
|
17
17
|
Requires-Dist: scikit-learn >=1.0.2
|
18
18
|
Requires-Dist: pdfminer.six >=20231228
|
19
|
+
Requires-Dist: numpy <2.0.0
|
19
20
|
Provides-Extra: cpu
|
20
21
|
Requires-Dist: paddleocr ; extra == 'cpu'
|
21
22
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
@@ -45,7 +46,7 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
|
|
45
46
|
|
46
47
|
## Introduction
|
47
48
|
|
48
|
-
MinerU is a one-stop, open-source data extraction tool,
|
49
|
+
MinerU is a one-stop, open-source, high-quality data extraction tool that includes the following primary features:
|
49
50
|
|
50
51
|
- [Magic-PDF](#Magic-PDF) PDF Document Extraction
|
51
52
|
- [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
|
@@ -71,7 +72,7 @@ Key features include:
|
|
71
72
|
- Available for Windows, Linux, and macOS platforms
|
72
73
|
|
73
74
|
|
74
|
-
https://github.com/
|
75
|
+
https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
|
75
76
|
|
76
77
|
|
77
78
|
|
@@ -88,9 +89,6 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
|
|
88
89
|
|
89
90
|
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
90
91
|
- A Comprehensive Toolkit for High-Quality PDF Content Extraction
|
91
|
-
- [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
|
92
|
-
- An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
|
93
|
-
|
94
92
|
|
95
93
|
## Getting Started
|
96
94
|
|
@@ -111,9 +109,10 @@ pip install magic-pdf
|
|
111
109
|
###### simple
|
112
110
|
|
113
111
|
```bash
|
114
|
-
cp magic-pdf.template.json
|
112
|
+
cp magic-pdf.template.json ~/magic-pdf.json
|
115
113
|
magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
|
116
114
|
```
|
115
|
+
After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
|
117
116
|
|
118
117
|
###### more
|
119
118
|
|
@@ -150,26 +149,6 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
|
150
149
|
Demo can be referred to [demo.py](demo/demo.py)
|
151
150
|
|
152
151
|
|
153
|
-
## All Thanks To Our Contributors
|
154
|
-
|
155
|
-
<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
|
156
|
-
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
157
|
-
</a>
|
158
|
-
|
159
|
-
|
160
|
-
## License Information
|
161
|
-
|
162
|
-
[LICENSE.md](LICENSE.md)
|
163
|
-
|
164
|
-
The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
|
165
|
-
|
166
|
-
|
167
|
-
## Acknowledgments
|
168
|
-
|
169
|
-
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
170
|
-
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
171
|
-
|
172
|
-
|
173
152
|
# Magic-Doc
|
174
153
|
|
175
154
|
|
@@ -203,5 +182,50 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
|
|
203
182
|
|
204
183
|
## Project Repository
|
205
184
|
|
206
|
-
- [Magic-Doc](https://github.com/
|
185
|
+
- [Magic-Doc](https://github.com/InternLM/magic-doc)
|
207
186
|
Outstanding Webpage and E-book Extraction Tool
|
187
|
+
|
188
|
+
|
189
|
+
# All Thanks To Our Contributors
|
190
|
+
|
191
|
+
<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
|
192
|
+
<img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
|
193
|
+
</a>
|
194
|
+
|
195
|
+
|
196
|
+
# License Information
|
197
|
+
|
198
|
+
[LICENSE.md](LICENSE.md)
|
199
|
+
|
200
|
+
The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
|
201
|
+
|
202
|
+
|
203
|
+
# Acknowledgments
|
204
|
+
|
205
|
+
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
206
|
+
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
207
|
+
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
|
208
|
+
- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
|
209
|
+
|
210
|
+
|
211
|
+
# Citation
|
212
|
+
|
213
|
+
```bibtex
|
214
|
+
@misc{2024mineru,
|
215
|
+
title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
|
216
|
+
author={MinerU Contributors},
|
217
|
+
howpublished = {\url{https://github.com/opendatalab/MinerU}},
|
218
|
+
year={2024}
|
219
|
+
}
|
220
|
+
```
|
221
|
+
|
222
|
+
|
223
|
+
# Star History
|
224
|
+
|
225
|
+
<a>
|
226
|
+
<picture>
|
227
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
|
228
|
+
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
|
229
|
+
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
|
230
|
+
</picture>
|
231
|
+
</a>
|
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
|
|
5
5
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
6
6
|
magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
|
7
7
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
magic_pdf/cli/magicpdf.py,sha256=
|
8
|
+
magic_pdf/cli/magicpdf.py,sha256=d4Wy2g7t_GsclV4r0vQR0enIh08-Ml2n1jf1zdrq4LE,11852
|
9
9
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
11
11
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
@@ -34,7 +34,7 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
|
|
34
34
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
35
35
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
36
36
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
37
|
-
magic_pdf/libs/language.py,sha256=
|
37
|
+
magic_pdf/libs/language.py,sha256=l0LGIz-dlerU9Xct-7ypNKGNEI_q-CTadsJAnVTF9VY,692
|
38
38
|
magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
|
39
39
|
magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
40
40
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
@@ -44,7 +44,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
44
44
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
45
45
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
46
46
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
47
|
-
magic_pdf/libs/version.py,sha256=
|
47
|
+
magic_pdf/libs/version.py,sha256=jEM-pQV3SLNuNue5fxlBM8hWNuJydsyqi_WBzC1VQaM,23
|
48
48
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
49
49
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
50
50
|
magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
|
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
115
115
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
116
116
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
117
117
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
118
|
-
magic_pdf-0.5.
|
119
|
-
magic_pdf-0.5.
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
118
|
+
magic_pdf-0.5.13.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
119
|
+
magic_pdf-0.5.13.dist-info/METADATA,sha256=g5VqQbFmBpLwZyVNivClRek2vVoBAGwhjuT8Tnq3Wtc,6673
|
120
|
+
magic_pdf-0.5.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
121
|
+
magic_pdf-0.5.13.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
|
122
|
+
magic_pdf-0.5.13.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
123
|
+
magic_pdf-0.5.13.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|