PyPI - magic-pdf - Versions diffs - 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

magic-pdf 0.5.11py3-none-any.whl → 0.5.13py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

magic_pdf/cli/magicpdf.py +5 -1
magic_pdf/libs/language.py +3 -10
magic_pdf/libs/path_utils.py +13 -4
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +5 -3
magic_pdf-0.5.13.dist-info/METADATA +231 -0
{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/RECORD +11 -11
magic_pdf-0.5.11.dist-info/METADATA +0 -152
{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/WHEEL +0 -0
{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/top_level.txt +0 -0

magic_pdf/cli/magicpdf.py CHANGED Viewed

@@ -290,7 +290,11 @@ def pdf_command(pdf, model, method, inside_model):
     def get_model_json(model_path):
         # 这里处理pdf和模型相关的逻辑
         if model_path is None:
-            model_path = pdf.replace(".pdf", ".json")
+            file_name_without_extension, extension = os.path.splitext(pdf)
+            if extension == ".pdf":
+                model_path = file_name_without_extension + ".json"
+            else:
+                raise Exception("pdf_path input error")
             if not os.path.exists(model_path):
                 logger.warning(
                     f"not found json {model_path} existed"

magic_pdf/libs/language.py CHANGED Viewed

@@ -1,22 +1,15 @@
-import regex
 import unicodedata
-from fast_langdetect import detect_langs
-RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
-def remove_bad_chars(text):
-    return RE_BAD_CHARS.sub("", text)
+from fast_langdetect import detect_language
 def detect_lang(text: str) -> str:
     if len(text) == 0:
         return ""
     try:
-        lang_upper = detect_langs(text)
+        lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
-        lang_upper = detect_langs(html_no_ctrl_chars)
+        lang_upper = detect_language(html_no_ctrl_chars)
     try:
         lang = lang_upper.lower()
     except:

magic_pdf/libs/path_utils.py CHANGED Viewed

@@ -1,7 +1,5 @@
-from s3pathlib import S3Path
 def remove_non_official_s3_args(s3path):
     """
     example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
     return arr[0]
 def parse_s3path(s3path: str):
-    p = S3Path(remove_non_official_s3_args(s3path))
-    return p.bucket, p.key
+    # from s3pathlib import S3Path
+    # p = S3Path(remove_non_official_s3_args(s3path))
+    # return p.bucket, p.key
+    s3path = remove_non_official_s3_args(s3path).strip()
+    if s3path.startswith(('s3://', 's3a://')):
+        prefix, path = s3path.split('://', 1)
+        bucket_name, key = path.split('/', 1)
+        return bucket_name, key
+    elif s3path.startswith('/'):
+        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
+    else:
+        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
 def parse_s3_range_params(s3path: str):
     """

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.5.11"
1	+ __version__ = "0.5.13"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -1,9 +1,6 @@
 import fitz
-import cv2
-from PIL import Image
 import numpy as np
 from loguru import logger
 from magic_pdf.model.model_list import MODEL
 import magic_pdf.model as model_config
@@ -23,6 +20,11 @@ def remove_duplicates_dicts(lst):
 def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
+    try:
+        import cv2
+        from PIL import Image
+    except ImportError:
+        logger.error("opencv-python and Pillow are not installed, please install by pip.")
     images = []
     with fitz.open("pdf", pdf_bytes) as doc:
         for index in range(0, doc.page_count):

magic_pdf-0.5.13.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,231 @@
+Metadata-Version: 2.1
+Name: magic-pdf
+Version: 0.5.13
+Summary: A practical tool for converting PDF to Markdown
+Home-page: https://github.com/magicpdf/Magic-PDF
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE.md
+Requires-Dist: boto3 >=1.28.43
+Requires-Dist: Brotli >=1.1.0
+Requires-Dist: click >=8.1.7
+Requires-Dist: PyMuPDF >=1.24.7
+Requires-Dist: loguru >=0.6.0
+Requires-Dist: numpy >=1.21.6
+Requires-Dist: fast-langdetect >=0.1.1
+Requires-Dist: wordninja >=2.0.0
+Requires-Dist: scikit-learn >=1.0.2
+Requires-Dist: pdfminer.six >=20231228
+Requires-Dist: numpy <2.0.0
+Provides-Extra: cpu
+Requires-Dist: paddleocr ; extra == 'cpu'
+Requires-Dist: paddlepaddle ; extra == 'cpu'
+Provides-Extra: gpu
+Requires-Dist: paddleocr ; extra == 'gpu'
+Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
+<div id="top"></div>
+<div align="center">
+[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
+[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
+[![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
+[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
+[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
+[English](README.md) | [简体中文](README_zh-CN.md)
+</div>
+<div align="center">
+</div>
+# MinerU
+## Introduction
+MinerU is a one-stop, open-source, high-quality data extraction tool that includes the following primary features:
+- [Magic-PDF](#Magic-PDF)  PDF Document Extraction
+- [Magic-Doc](#Magic-Doc)  Webpage & E-book Extraction
+# Magic-PDF
+## Introduction
+Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
+Key features include:
+- Support for multiple front-end model inputs
+- Removal of headers, footers, footnotes, and page numbers
+- Human-readable layout formatting
+- Retains the original document's structure and formatting, including headings, paragraphs, lists, and more
+- Extraction and display of images and tables within markdown
+- Conversion of equations into LaTeX format
+- Automatic detection and conversion of garbled PDFs
+- Compatibility with CPU and GPU environments
+- Available for Windows, Linux, and macOS platforms
+https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
+## Project Panorama
+![Project Panorama](docs/images/project_panorama_en.png)
+## Flowchart
+![Flowchart](docs/images/flowchart_en.png)
+### Submodule Repositories
+- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
+  - A Comprehensive Toolkit for High-Quality PDF Content Extraction
+## Getting Started
+### Requirements
+- Python >= 3.9
+### Usage Instructions
+#### 1. Install Magic-PDF
+```bash
+pip install magic-pdf
+```
+#### 2. Usage via Command Line
+###### simple
+```bash
+cp magic-pdf.template.json ~/magic-pdf.json
+magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
+```
+After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
+###### more
+```bash
+magic-pdf --help
+```
+#### 3. Usage via Api
+###### Local
+```python
+image_writer = DiskReaderWriter(local_image_dir)
+image_dir = str(os.path.basename(local_image_dir))
+jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+###### Object Storage
+```python
+s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
+image_dir = "s3://img_bucket/"
+s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
+pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
+jso_useful_key = {"_pdf_type": "", "model_list": model_json}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+For a complete demo, refer to [demo.py](demo/demo.py)
+# Magic-Doc
+## Introduction
+Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
+Key Features Include:
+- Web Page Extraction
+  - Cross-modal precise parsing of text, images, tables, and formula information.
+- E-Book Document Extraction
+  - Supports various document formats including epub, mobi, with full adaptation for text and images.
+- Language Type Identification
+  - Accurate recognition of 176 languages.
+https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
+https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
+https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
+## Project Repository
+- [Magic-Doc](https://github.com/InternLM/magic-doc)
+  Outstanding Webpage and E-book Extraction Tool
+# All Thanks To Our Contributors
+<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
+</a>
+# License Information
+[LICENSE.md](LICENSE.md)
+The project currently leverages PyMuPDF to deliver advanced functionalities; however, because PyMuPDF is licensed under the AGPL, this may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
+# Acknowledgments
+- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
+- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
+- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
+- [pdfminer.six](https://github.com/pdfminer/pdfminer.six)
+# Citation
+```bibtex
+@misc{2024mineru,
+    title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool},
+    author={MinerU Contributors},
+    howpublished = {\url{https://github.com/opendatalab/MinerU}},
+    year={2024}
+}
+```
+# Star History
+<a>
+ <picture>
+   <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date&theme=dark" />
+   <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
+   <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=opendatalab/MinerU&type=Date" />
+ </picture>
+</a>

{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/RECORD RENAMED Viewed

@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
 magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
 magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
 magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
+magic_pdf/cli/magicpdf.py,sha256=d4Wy2g7t_GsclV4r0vQR0enIh08-Ml2n1jf1zdrq4LE,11852
 magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
 magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -34,21 +34,21 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
 magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
 magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
 magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
-magic_pdf/libs/language.py,sha256=klymhpJyFSc9ukUPwIQmCx1DwGMuXueosaFjmMzETQw,812
+magic_pdf/libs/language.py,sha256=l0LGIz-dlerU9Xct-7ypNKGNEI_q-CTadsJAnVTF9VY,692
 magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
 magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
 magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
 magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
-magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
+magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
 magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
 magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
 magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
 magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
-magic_pdf/libs/version.py,sha256=xFez9dUQrcuZqZRWuEIsCbMskoR-Ke1_uUZ51Kyt1tw,23
+magic_pdf/libs/version.py,sha256=jEM-pQV3SLNuNue5fxlBM8hWNuJydsyqi_WBzC1VQaM,23
 magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
 magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
 magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
-magic_pdf/model/doc_analyze_by_custom_model.py,sha256=N3DqbVT1hc4s9KhppWDmZWkCj2ExKltoLrQl2IWGk7c,2231
+magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
 magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
 magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
 magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
 magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
 magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
 magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
-magic_pdf-0.5.11.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-magic_pdf-0.5.11.dist-info/METADATA,sha256=wMDXFCmnlXQKkUdp891cG46MrbDn92TlPPD8T7AT3tE,4649
-magic_pdf-0.5.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-magic_pdf-0.5.11.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
-magic_pdf-0.5.11.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
-magic_pdf-0.5.11.dist-info/RECORD,,
+magic_pdf-0.5.13.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+magic_pdf-0.5.13.dist-info/METADATA,sha256=g5VqQbFmBpLwZyVNivClRek2vVoBAGwhjuT8Tnq3Wtc,6673
+magic_pdf-0.5.13.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+magic_pdf-0.5.13.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
+magic_pdf-0.5.13.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
+magic_pdf-0.5.13.dist-info/RECORD,,

magic_pdf-0.5.11.dist-info/METADATA DELETED Viewed

@@ -1,152 +0,0 @@
-Metadata-Version: 2.1
-Name: magic-pdf
-Version: 0.5.11
-Summary: A practical tool for converting PDF to Markdown
-Home-page: https://github.com/magicpdf/Magic-PDF
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE.md
-Requires-Dist: boto3 >=1.28.43
-Requires-Dist: Brotli >=1.1.0
-Requires-Dist: click >=8.1.7
-Requires-Dist: Distance >=0.1.3
-Requires-Dist: PyMuPDF >=1.24.5
-Requires-Dist: loguru >=0.6.0
-Requires-Dist: matplotlib >=3.8.3
-Requires-Dist: numpy >=1.21.6
-Requires-Dist: pandas >=1.3.5
-Requires-Dist: fast-langdetect >=0.1.1
-Requires-Dist: regex >=2023.12.25
-Requires-Dist: termcolor >=2.4.0
-Requires-Dist: wordninja >=2.0.0
-Requires-Dist: scikit-learn >=1.0.2
-Requires-Dist: nltk ==3.8.1
-Requires-Dist: s3pathlib >=2.1.1
-Requires-Dist: pdfminer.six >=20231228
-Requires-Dist: Levenshtein
-Requires-Dist: nltk
-Requires-Dist: rapidfuzz
-Requires-Dist: statistics
-Requires-Dist: openxlab
-Requires-Dist: pandas
-Requires-Dist: numpy
-Requires-Dist: matplotlib
-Requires-Dist: seaborn
-Requires-Dist: scipy
-Requires-Dist: scikit-learn
-Requires-Dist: tqdm
-Requires-Dist: htmltabletomd
-Requires-Dist: pypandoc
-Provides-Extra: cpu
-Requires-Dist: paddleocr ; extra == 'cpu'
-Requires-Dist: paddlepaddle ; extra == 'cpu'
-Provides-Extra: gpu
-Requires-Dist: paddleocr ; extra == 'gpu'
-Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
-<div id="top"></div>
-<div align="center">
-[![stars](https://img.shields.io/github/stars/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
-[![forks](https://img.shields.io/github/forks/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
-[![license](https://img.shields.io/github/license/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF/tree/main/LICENSE)
-[![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
-[![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
-[English](README.md) | [简体中文](README_zh-CN.md)
-</div>
-<div align="center">
-</div>
-# Magic-PDF
-## Introduction
-Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
-Key features include:
-- Support for multiple front-end model inputs
-- Removal of headers, footers, footnotes, and page numbers
-- Human-readable layout formatting
-- Retains the original document's structure and formatting, including headings, paragraphs, lists, and more
-- Extraction and display of images and tables within markdown
-- Conversion of equations into LaTeX format
-- Automatic detection and conversion of garbled PDFs
-- Compatibility with CPU and GPU environments
-- Available for Windows, Linux, and macOS platforms
-## Project Panorama
-![Project Panorama](docs/images/project_panorama_en.png)
-## Getting Started
-### Requirements
-- Python 3.9 or newer
-### Usage Instructions
-#### 1. Install Magic-PDF
-```bash
-pip install magic-pdf
-```
-#### 2. Usage via Command Line
-###### simple
-```bash
-cp magic-pdf.template.json to ~/magic-pdf.json
-magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
-```
-###### more
-```bash
-magic-pdf --help
-```
-#### 3. Usage via Api
-###### Local
-```python
-image_writer = DiskReaderWriter(local_image_dir)
-image_dir = str(os.path.basename(local_image_dir))
-jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-pipe.pipe_classify()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
-###### Object Storage
-```python
-s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
-image_dir = "s3://img_bucket/"
-s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
-pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
-jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
-pipe.pipe_classify()
-pipe.pipe_parse()
-md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
-```
-Demo can be referred to [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py)
-## All Thanks To Our Contributors
-<a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
-</a>
-## License Information
-See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
-## Acknowledgments
-- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
-- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)

{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/WHEEL RENAMED Viewed

File without changes

{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{magic_pdf-0.5.11.dist-info → magic_pdf-0.5.13.dist-info}/top_level.txt RENAMED Viewed

File without changes

magic-pdf 0.5.11__py3-none-any.whl → 0.5.13__py3-none-any.whl

magic-pdf 0.5.11py3-none-any.whl → 0.5.13py3-none-any.whl