PyPI - magic-pdf - Versions diffs - 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

magic_pdf/dict2md/ocr_mkcontent.py +134 -76
magic_pdf/integrations/__init__.py +0 -0
magic_pdf/integrations/rag/__init__.py +0 -0
magic_pdf/integrations/rag/api.py +82 -0
magic_pdf/integrations/rag/type.py +82 -0
magic_pdf/integrations/rag/utils.py +285 -0
magic_pdf/layout/layout_sort.py +472 -283
magic_pdf/libs/Constants.py +27 -1
magic_pdf/libs/boxbase.py +169 -149
magic_pdf/libs/draw_bbox.py +113 -87
magic_pdf/libs/ocr_content_type.py +21 -18
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
magic_pdf/model/magic_model.py +230 -161
magic_pdf/model/model_list.py +8 -0
magic_pdf/model/pdf_extract_kit.py +135 -22
magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
magic_pdf/model/ppTableModel.py +67 -0
magic_pdf/para/para_split_v2.py +76 -74
magic_pdf/pdf_parse_union_core.py +34 -6
magic_pdf/pipe/AbsPipe.py +4 -1
magic_pdf/pipe/OCRPipe.py +7 -4
magic_pdf/pipe/TXTPipe.py +7 -4
magic_pdf/pipe/UNIPipe.py +11 -6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
magic_pdf/resources/model_config/model_configs.yaml +3 -1
magic_pdf/tools/cli.py +56 -29
magic_pdf/tools/cli_dev.py +61 -64
magic_pdf/tools/common.py +57 -37
magic_pdf/user_api.py +17 -9
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0

magic_pdf/tools/common.py CHANGED Viewed

@@ -1,22 +1,25 @@
-import os
-import json as json_parse
 import copy
+import json as json_parse
+import os
 import click
 from loguru import logger
+import magic_pdf.model as model_config
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
+                                      drow_model_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
-from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
+from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-import magic_pdf.model as model_config
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 def prepare_env(output_dir, pdf_file_name, method):
     local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
-    local_image_dir = os.path.join(str(local_parent_dir), "images")
+    local_image_dir = os.path.join(str(local_parent_dir), 'images')
     local_md_dir = local_parent_dir
     os.makedirs(local_image_dir, exist_ok=True)
     os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
     pdf_bytes,
     model_list,
     parse_method,
+    debug_able,
     f_draw_span_bbox=True,
     f_draw_layout_bbox=True,
     f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
     f_dump_content_list=False,
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
+    start_page_id=0,
+    end_page_id=None,
 ):
+    if debug_able:
+        logger.warning("debug mode is on")
+        f_dump_content_list = True
+        f_draw_model_bbox = True
     orig_model_list = copy.deepcopy(model_list)
-    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
+                                                parse_method)
-    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
-        local_md_dir
-    )
+    image_writer, md_writer = DiskReaderWriter(
+        local_image_dir), DiskReaderWriter(local_md_dir)
     image_dir = str(os.path.basename(local_image_dir))
-    if parse_method == "auto":
-        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
-    elif parse_method == "txt":
-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
-    elif parse_method == "ocr":
-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
+    if parse_method == 'auto':
+        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
+        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'txt':
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'ocr':
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
     else:
-        logger.error("unknown parse method")
+        logger.error('unknown parse method')
         exit(1)
     pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
             pipe.pipe_analyze()
             orig_model_list = copy.deepcopy(pipe.model_list)
         else:
-            logger.error("need model list input")
+            logger.error('need model list input')
             exit(2)
     pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
+    pdf_info = pipe.pdf_mid_data['pdf_info']
     if f_draw_layout_bbox:
-        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_span_bbox:
-        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_model_bbox:
-        drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
+        drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
-    md_content = pipe.pipe_mk_markdown(
-        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
-    )
+    md_content = pipe.pipe_mk_markdown(image_dir,
+                                       drop_mode=DropMode.NONE,
+                                       md_make_mode=f_make_md_mode)
     if f_dump_md:
         md_writer.write(
             content=md_content,
-            path=f"{pdf_file_name}.md",
+            path=f'{pdf_file_name}.md',
             mode=AbsReaderWriter.MODE_TXT,
         )
     if f_dump_middle_json:
         md_writer.write(
-            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-            path="middle.json",
+            content=json_parse.dumps(pipe.pdf_mid_data,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_middle.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
     if f_dump_model_json:
         md_writer.write(
-            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
-            path="model.json",
+            content=json_parse.dumps(orig_model_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_model.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
     if f_dump_orig_pdf:
         md_writer.write(
             content=pdf_bytes,
-            path="origin.pdf",
+            path=f'{pdf_file_name}_origin.pdf',
             mode=AbsReaderWriter.MODE_BIN,
         )
     content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     if f_dump_content_list:
         md_writer.write(
-            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
-            path="content_list.json",
+            content=json_parse.dumps(content_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_content_list.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
-    logger.info(f"local output dir is {local_md_dir}")
+    logger.info(f'local output dir is {local_md_dir}')
-parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
+parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])

magic_pdf/user_api.py CHANGED Viewed

@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
 PARSE_TYPE_OCR = "ocr"
-def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
-                  **kwargs):
+def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+                  start_page_id=0, end_page_id=None,
+                  *args, **kwargs):
     """
     解析文本类pdf
     """
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         pdf_bytes,
         pdf_models,
         imageWriter,
-        start_page_id=start_page,
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
         debug_mode=is_debug,
     )
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     return pdf_info_dict
-def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
-                  **kwargs):
+def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+                  start_page_id=0, end_page_id=None,
+                  *args, **kwargs):
     """
     解析ocr类pdf
     """
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         pdf_bytes,
         pdf_models,
         imageWriter,
-        start_page_id=start_page,
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
         debug_mode=is_debug,
     )
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     return pdf_info_dict
-def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
+def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
+                    start_page_id=0, end_page_id=None,
                     *args, **kwargs):
     """
     ocr和文本混合的pdf，全部解析出来
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
                 pdf_bytes,
                 pdf_models,
                 imageWriter,
-                start_page_id=start_page,
+                start_page_id=start_page_id,
+                end_page_id=end_page_id,
                 debug_mode=is_debug,
             )
         except Exception as e:
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True)
+            pdf_models = doc_analyze(pdf_bytes, ocr=True,
+                                     start_page_id=start_page_id,
+                                     end_page_id=end_page_id)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.7.0b1
+Version: 0.8.0
 Summary: A practical tool for converting PDF to Markdown
 Home-page: https://github.com/opendatalab/MinerU
 Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
 Requires-Dist: boto3>=1.28.43
 Requires-Dist: Brotli>=1.1.0
 Requires-Dist: click>=8.1.7
-Requires-Dist: PyMuPDF>=1.24.9
+Requires-Dist: fast-langdetect==0.2.0
 Requires-Dist: loguru>=0.6.0
 Requires-Dist: numpy<2.0.0,>=1.21.6
-Requires-Dist: fast-langdetect==0.2.0
-Requires-Dist: wordninja>=2.0.0
-Requires-Dist: scikit-learn>=1.0.2
 Requires-Dist: pdfminer.six==20231228
+Requires-Dist: pydantic<2.8.0,>=2.7.2
+Requires-Dist: PyMuPDF>=1.24.9
+Requires-Dist: scikit-learn>=1.0.2
+Requires-Dist: wordninja>=2.0.0
 Provides-Extra: full
 Requires-Dist: unimernet==0.1.6; extra == "full"
 Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </p>
 <!-- icon -->
 [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
 [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
 [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,26 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
 [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
 [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
+[![HuggingFace](https://img.shields.io/badge/HuggingFace-Demo-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValzjI7
Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC)](https://huggingface.co/spaces/opendatalab/MinerU)
+[![ModelScope](https://img.shields.io/badge/ModelScope-Demo-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[![Paper](https://img.shields.io/badge/Paper-arXiv-green)](#)
 <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 <!-- language -->
 [English](README.md) | [简体中文](README_zh-CN.md)
 <!-- hot link -->
 <p align="center">
 <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
 </p>
 <!-- join us -->
 <p align="center">
     👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
 </p>
@@ -64,11 +75,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </div>
 # Changelog
+- 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
+- 2024/08/30: Version 0.7.1 released, added paddle tablemaster table recognition option
 - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
 - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
 - 2024/07/05: Initial open-source release
 <!-- TABLE OF CONTENT -->
 <details open="open">
   <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
   <ol>
@@ -107,10 +121,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
   </ol>
 </details>
 # MinerU
 ## Project Introduction
 MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
 MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
 Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on the [issue tracker](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -134,6 +148,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
 If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
 There are three different ways to experience MinerU:
 - [Online Demo (No Installation Required)](#online-demo)
 - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
 - [Linux/Windows + CUDA](#Using-GPU)
@@ -191,7 +206,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
     <tr>
         <td colspan="2">Recommended Configuration 16G+ VRAM</td>
         <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
-        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
+        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
+        24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
+        </td>
     </tr>
 </table>
@@ -202,51 +219,73 @@ In non-mainline environments, due to the diversity of hardware and software conf
 ### Quick CPU Demo
 #### 1. Install magic-pdf
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
 ```
 #### 2. Download model weight files
 Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
 > ❗️After downloading the models, please make sure to verify the completeness of the model files.
->
+>
 > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
 #### 3. Copy and configure the template file
 You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
 > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
->
+>
 > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
 ```bash
 cp magic-pdf.template.json ~/magic-pdf.json
 ```
 Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
 > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
 >
 > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
->
+>
 > For example: If the models are stored in the "models" directory at the root of the D drive, the "models-dir" value should be `D:/models`.
 ```json
 {
   // other config
   "models-dir": "D:/models",
   "table-config": {
+        "model": "TableMaster", // Another option of this value is 'struct_eqtable'
         "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
         "max_time": 400
     }
 }
 ```
 ### Using GPU
 If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
 - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
 - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
+- Quick Deployment with Docker
+    > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+    >
+    > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
+    >
+    > ```bash
+    > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+    > ```
+  ```bash
+  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+  docker build -t mineru:latest .
+  docker run --rm -it --gpus=all mineru:latest /bin/bash
+  magic-pdf --help
+  ```
 ## Usage
@@ -260,12 +299,12 @@ Options:
   -v, --version                display the version and exit
   -p, --path PATH              local pdf filepath or directory  [required]
   -o, --output-dir TEXT        output local directory
-  -m, --method [ocr|txt|auto]  the method for parsing pdf.
+  -m, --method [ocr|txt|auto]  the method for parsing pdf.
                                ocr: using ocr technique to extract information from pdf,
                                txt: suitable for the text-based pdf only and outperform ocr,
                                auto: automatically choose the best method for parsing pdf
                                   from ocr and txt.
-                               without method specified, auto will be used by default.
+                               without method specified, auto will be used by default.
   --help                       Show this message and exit.
@@ -280,13 +319,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
 ```text
-├── some_pdf.md                 # markdown file
-├── images                      # directory for storing images
-├── layout.pdf                  # layout diagram
-├── middle.json                 # MinerU intermediate processing result
-├── model.json                  # model inference result
-├── origin.pdf                  # original PDF file
-└── spans.pdf                   # smallest granularity bbox position information diagram
+├── some_pdf.md                          # markdown file
+├── images                               # directory for storing images
+├── some_pdf_layout.pdf                  # layout diagram
+├── some_pdf_middle.json                 # MinerU intermediate processing result
+├── some_pdf_model.json                  # model inference result
+├── some_pdf_origin.pdf                  # original PDF file
+└── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
 ```
 For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -294,6 +333,7 @@ For more information about the output files, please refer to the [Output File De
 ### API
 Processing files from local disk
 ```python
 image_writer = DiskReaderWriter(local_image_dir)
 image_dir = str(os.path.basename(local_image_dir))
@@ -306,6 +346,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```
 Processing files from object storage
 ```python
 s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
 image_dir = "s3://img_bucket/"
@@ -320,10 +361,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```
 For detailed implementation, refer to:
 - [demo.py Simplest Processing Method](demo/demo.py)
 - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
 ### Development Guide
 TODO
@@ -335,30 +376,25 @@ TODO
 - [ ] Code block recognition within the text
 - [ ] Table of contents recognition
 - [x] Table recognition
-- [ ] Chemical formula recognition
+- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
 - [ ] Geometric shape recognition
 # Known Issues
 - Reading order is segmented based on rules, which can cause disordered sequences in some cases
 - Vertical text is not supported
 - Lists, code blocks, and table of contents are not yet supported in the layout model
 - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
 - Enabling OCR may produce better results in PDFs with a high density of formulas
 - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
-- **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
-| Table Size     | Parsing Time        |
-|---------------|----------------------------|
-| 6\*5 55kb     | 37s                   |
-| 16\*12 284kb  | 3m18s                 |
-| 44\*7 559kb   | 4m12s                 |
 # FAQ
 [FAQ in Chinese](docs/FAQ_zh_cn.md)
 [FAQ in English](docs/FAQ_en_us.md)
 # All Thanks To Our Contributors
 <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -371,8 +407,8 @@ TODO
 This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
 # Acknowledgments
 - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
 - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
 - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -409,9 +445,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
 </a>
 # Magic-doc
 [Magic-Doc](https://github.com/InternLM/magic-doc) Fast speed ppt/pptx/doc/docx/pdf extraction tool
 # Magic-html
 [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
 # Links

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl