magic-pdf 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -290,7 +290,11 @@ def pdf_command(pdf, model, method, inside_model):
290
290
  def get_model_json(model_path):
291
291
  # 这里处理pdf和模型相关的逻辑
292
292
  if model_path is None:
293
- model_path = pdf.replace(".pdf", ".json")
293
+ file_name_without_extension, extension = os.path.splitext(pdf)
294
+ if extension == ".pdf":
295
+ model_path = file_name_without_extension + ".json"
296
+ else:
297
+ raise Exception("pdf_path input error")
294
298
  if not os.path.exists(model_path):
295
299
  logger.warning(
296
300
  f"not found json {model_path} existed"
@@ -1,15 +1,15 @@
1
1
  import unicodedata
2
- from fast_langdetect import detect_langs
2
+ from fast_langdetect import detect_language
3
3
 
4
4
 
5
5
  def detect_lang(text: str) -> str:
6
6
  if len(text) == 0:
7
7
  return ""
8
8
  try:
9
- lang_upper = detect_langs(text)
9
+ lang_upper = detect_language(text)
10
10
  except:
11
11
  html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
12
- lang_upper = detect_langs(html_no_ctrl_chars)
12
+ lang_upper = detect_language(html_no_ctrl_chars)
13
13
  try:
14
14
  lang = lang_upper.lower()
15
15
  except:
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.12"
1
+ __version__ = "0.5.13"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.12
3
+ Version: 0.5.13
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/magicpdf/Magic-PDF
6
6
  Requires-Python: >=3.9
@@ -16,6 +16,7 @@ Requires-Dist: fast-langdetect >=0.1.1
16
16
  Requires-Dist: wordninja >=2.0.0
17
17
  Requires-Dist: scikit-learn >=1.0.2
18
18
  Requires-Dist: pdfminer.six >=20231228
19
+ Requires-Dist: numpy <2.0.0
19
20
  Provides-Extra: cpu
20
21
  Requires-Dist: paddleocr ; extra == 'cpu'
21
22
  Requires-Dist: paddlepaddle ; extra == 'cpu'
@@ -45,7 +46,7 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
45
46
 
46
47
  ## Introduction
47
48
 
48
- MinerU is a one-stop, open-source data extraction tool, primarily includes the following features:
49
+ MinerU is a one-stop, open-source, high-quality data extraction tool that includes the following primary features:
49
50
 
50
51
  - [Magic-PDF](#Magic-PDF) PDF Document Extraction
51
52
  - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
@@ -71,7 +72,7 @@ Key features include:
71
72
  - Available for Windows, Linux, and macOS platforms
72
73
 
73
74
 
74
- https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
75
+ https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
75
76
 
76
77
 
77
78
 
@@ -88,9 +89,6 @@ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3
88
89
 
89
90
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
90
91
  - A Comprehensive Toolkit for High-Quality PDF Content Extraction
91
- - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
92
- - An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
93
-
94
92
 
95
93
  ## Getting Started
96
94
 
@@ -111,9 +109,10 @@ pip install magic-pdf
111
109
  ###### simple
112
110
 
113
111
  ```bash
114
- cp magic-pdf.template.json to ~/magic-pdf.json
112
+ cp magic-pdf.template.json ~/magic-pdf.json
115
113
  magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
116
114
  ```
115
+ After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
117
116
 
118
117
  ###### more
119
118
 
@@ -150,26 +149,6 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
150
149
  Demo can be referred to [demo.py](demo/demo.py)
151
150
 
152
151
 
153
- ## All Thanks To Our Contributors
154
-
155
- <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
156
- <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
157
- </a>
158
-
159
-
160
- ## License Information
161
-
162
- [LICENSE.md](LICENSE.md)
163
-
164
- The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
165
-
166
-
167
- ## Acknowledgments
168
-
169
- - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
170
- - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
171
-
172
-
173
152
  # Magic-Doc
174
153
 
175
154
 
@@ -203,5 +182,50 @@ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d7
203
182
 
204
183
  ## Project Repository
205
184
 
206
- - [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
185
+ - [Magic-Doc](https://github.com/InternLM/magic-doc)
207
186
  Outstanding Webpage and E-book Extraction Tool
187
+
188
+
189
+ # All Thanks To Our Contributors
190
+
191
+ <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
192
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
193
+ </a>
194
+
195
+
196
+ # License Information
197
+
198
+ [LICENSE.md](LICENSE.md)
199
+
200
+ The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
201
+
202
+
203
+ # Acknowledgments
204
+
205
+ - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
206
+ - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
207
+ - [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
208
+ - [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
209
+
210
+
211
+ # Citation
212
+
213
+ ```bibtex
214
+ @misc{2024mineru,
215
+ title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
216
+ author={MinerU Contributors},
217
+ howpublished = {\url{https://github.com/opendatalab/MinerU}},
218
+ year={2024}
219
+ }
220
+ ```
221
+
222
+
223
+ # Star History
224
+
225
+ <a>
226
+ <picture>
227
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
228
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
229
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
230
+ </picture>
231
+ </a>
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
5
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
6
6
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
8
+ magic_pdf/cli/magicpdf.py,sha256=d4Wy2g7t_GsclV4r0vQR0enIh08-Ml2n1jf1zdrq4LE,11852
9
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -34,7 +34,7 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
34
34
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
35
35
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
36
36
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
37
- magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
37
+ magic_pdf/libs/language.py,sha256=l0LGIz-dlerU9Xct-7ypNKGNEI_q-CTadsJAnVTF9VY,692
38
38
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
39
39
  magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
40
40
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
@@ -44,7 +44,7 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
47
+ magic_pdf/libs/version.py,sha256=jEM-pQV3SLNuNue5fxlBM8hWNuJydsyqi_WBzC1VQaM,23
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
50
  magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
120
- magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.12.dist-info/RECORD,,
118
+ magic_pdf-0.5.13.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.13.dist-info/METADATA,sha256=g5VqQbFmBpLwZyVNivClRek2vVoBAGwhjuT8Tnq3Wtc,6673
120
+ magic_pdf-0.5.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.13.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.13.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.13.dist-info/RECORD,,