magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -1,5 +1,3 @@
1
- import copy
2
- import json as json_parse
3
1
  import os
4
2
 
5
3
  import click
@@ -7,13 +5,13 @@ import fitz
7
5
  from loguru import logger
8
6
 
9
7
  import magic_pdf.model as model_config
8
+ from magic_pdf.config.enums import SupportedPdfParseMethod
10
9
  from magic_pdf.config.make_content_config import DropMode, MakeMode
11
10
  from magic_pdf.data.data_reader_writer import FileBasedDataWriter
12
- from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
13
- draw_model_bbox, draw_span_bbox)
14
- from magic_pdf.pipe.OCRPipe import OCRPipe
15
- from magic_pdf.pipe.TXTPipe import TXTPipe
16
- from magic_pdf.pipe.UNIPipe import UNIPipe
11
+ from magic_pdf.data.dataset import PymuDocDataset
12
+ from magic_pdf.libs.draw_bbox import draw_char_bbox
13
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
14
+ from magic_pdf.operators.models import InferenceResult
17
15
 
18
16
  # from io import BytesIO
19
17
  # from pypdf import PdfReader, PdfWriter
@@ -56,7 +54,11 @@ def prepare_env(output_dir, pdf_file_name, method):
56
54
  def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
55
  document = fitz.open('pdf', pdf_bytes)
58
56
  output_document = fitz.open()
59
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
57
+ end_page_id = (
58
+ end_page_id
59
+ if end_page_id is not None and end_page_id >= 0
60
+ else len(document) - 1
61
+ )
60
62
  if end_page_id > len(document) - 1:
61
63
  logger.warning('end_page_id is out of range, use pdf_docs length')
62
64
  end_page_id = len(document) - 1
@@ -82,6 +84,7 @@ def do_parse(
82
84
  f_make_md_mode=MakeMode.MM_MD,
83
85
  f_draw_model_bbox=False,
84
86
  f_draw_line_sort_bbox=False,
87
+ f_draw_char_bbox=False,
85
88
  start_page_id=0,
86
89
  end_page_id=None,
87
90
  lang=None,
@@ -93,79 +96,135 @@ def do_parse(
93
96
  logger.warning('debug mode is on')
94
97
  f_draw_model_bbox = True
95
98
  f_draw_line_sort_bbox = True
99
+ # f_draw_char_bbox = True
96
100
 
97
- if lang == "":
98
- lang = None
101
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
102
+ pdf_bytes, start_page_id, end_page_id
103
+ )
99
104
 
100
- pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
105
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
101
106
 
102
- orig_model_list = copy.deepcopy(model_list)
103
- local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
104
- parse_method)
105
-
106
- image_writer, md_writer = FileBasedDataWriter(
107
- local_image_dir), FileBasedDataWriter(local_md_dir)
107
+ image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
108
+ local_md_dir
109
+ )
108
110
  image_dir = str(os.path.basename(local_image_dir))
109
111
 
110
- if parse_method == 'auto':
111
- jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
112
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
113
- # start_page_id=start_page_id, end_page_id=end_page_id,
114
- lang=lang,
115
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
116
- elif parse_method == 'txt':
117
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
118
- # start_page_id=start_page_id, end_page_id=end_page_id,
119
- lang=lang,
120
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
121
- elif parse_method == 'ocr':
122
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
123
- # start_page_id=start_page_id, end_page_id=end_page_id,
124
- lang=lang,
125
- layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
126
- else:
127
- logger.error('unknown parse method')
128
- exit(1)
129
-
130
- pipe.pipe_classify()
112
+ ds = PymuDocDataset(pdf_bytes, lang=lang)
131
113
 
132
114
  if len(model_list) == 0:
133
115
  if model_config.__use_inside_model__:
134
- pipe.pipe_analyze()
135
- orig_model_list = copy.deepcopy(pipe.model_list)
116
+ if parse_method == 'auto':
117
+ if ds.classify() == SupportedPdfParseMethod.TXT:
118
+ infer_result = ds.apply(
119
+ doc_analyze,
120
+ ocr=False,
121
+ lang=ds._lang,
122
+ layout_model=layout_model,
123
+ formula_enable=formula_enable,
124
+ table_enable=table_enable,
125
+ )
126
+ pipe_result = infer_result.pipe_txt_mode(
127
+ image_writer, debug_mode=True, lang=ds._lang
128
+ )
129
+ else:
130
+ infer_result = ds.apply(
131
+ doc_analyze,
132
+ ocr=True,
133
+ lang=ds._lang,
134
+ layout_model=layout_model,
135
+ formula_enable=formula_enable,
136
+ table_enable=table_enable,
137
+ )
138
+ pipe_result = infer_result.pipe_ocr_mode(
139
+ image_writer, debug_mode=True, lang=ds._lang
140
+ )
141
+
142
+ elif parse_method == 'txt':
143
+ infer_result = ds.apply(
144
+ doc_analyze,
145
+ ocr=False,
146
+ lang=ds._lang,
147
+ layout_model=layout_model,
148
+ formula_enable=formula_enable,
149
+ table_enable=table_enable,
150
+ )
151
+ pipe_result = infer_result.pipe_txt_mode(
152
+ image_writer, debug_mode=True, lang=ds._lang
153
+ )
154
+ elif parse_method == 'ocr':
155
+ infer_result = ds.apply(
156
+ doc_analyze,
157
+ ocr=True,
158
+ lang=ds._lang,
159
+ layout_model=layout_model,
160
+ formula_enable=formula_enable,
161
+ table_enable=table_enable,
162
+ )
163
+ pipe_result = infer_result.pipe_ocr_mode(
164
+ image_writer, debug_mode=True, lang=ds._lang
165
+ )
166
+ else:
167
+ logger.error('unknown parse method')
168
+ exit(1)
136
169
  else:
137
170
  logger.error('need model list input')
138
171
  exit(2)
172
+ else:
173
+
174
+ infer_result = InferenceResult(model_list, ds)
175
+ if parse_method == 'ocr':
176
+ pipe_result = infer_result.pipe_ocr_mode(
177
+ image_writer, debug_mode=True, lang=ds._lang
178
+ )
179
+ elif parse_method == 'txt':
180
+ pipe_result = infer_result.pipe_txt_mode(
181
+ image_writer, debug_mode=True, lang=ds._lang
182
+ )
183
+ else:
184
+ if ds.classify() == SupportedPdfParseMethod.TXT:
185
+ pipe_result = infer_result.pipe_txt_mode(
186
+ image_writer, debug_mode=True, lang=ds._lang
187
+ )
188
+ else:
189
+ pipe_result = infer_result.pipe_ocr_mode(
190
+ image_writer, debug_mode=True, lang=ds._lang
191
+ )
192
+
193
+
194
+ if f_draw_model_bbox:
195
+ infer_result.draw_model(
196
+ os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
197
+ )
139
198
 
140
- pipe.pipe_parse()
141
- pdf_info = pipe.pdf_mid_data['pdf_info']
142
199
  if f_draw_layout_bbox:
143
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
200
+ pipe_result.draw_layout(
201
+ os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
202
+ )
144
203
  if f_draw_span_bbox:
145
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
146
- if f_draw_model_bbox:
147
- draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
204
+ pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
205
+
148
206
  if f_draw_line_sort_bbox:
149
- draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
207
+ pipe_result.draw_line_sort(
208
+ os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
209
+ )
210
+
211
+ if f_draw_char_bbox:
212
+ draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
150
213
 
151
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
152
214
  if f_dump_md:
153
- md_writer.write_string(
215
+ pipe_result.dump_md(
216
+ md_writer,
154
217
  f'{pdf_file_name}.md',
155
- md_content
218
+ image_dir,
219
+ drop_mode=DropMode.NONE,
220
+ md_make_mode=f_make_md_mode,
156
221
  )
157
222
 
158
223
  if f_dump_middle_json:
159
- md_writer.write_string(
160
- f'{pdf_file_name}_middle.json',
161
- json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
162
- )
224
+ pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
163
225
 
164
226
  if f_dump_model_json:
165
- md_writer.write_string(
166
- f'{pdf_file_name}_model.json',
167
- json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
168
- )
227
+ infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
169
228
 
170
229
  if f_dump_orig_pdf:
171
230
  md_writer.write(
@@ -173,11 +232,11 @@ def do_parse(
173
232
  pdf_bytes,
174
233
  )
175
234
 
176
- content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
177
235
  if f_dump_content_list:
178
- md_writer.write_string(
236
+ pipe_result.dump_content_list(
237
+ md_writer,
179
238
  f'{pdf_file_name}_content_list.json',
180
- json_parse.dumps(content_list, ensure_ascii=False, indent=4)
239
+ image_dir
181
240
  )
182
241
 
183
242
  logger.info(f'local output dir is {local_md_dir}')
@@ -0,0 +1,29 @@
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+
6
+ class ConvertToPdfError(Exception):
7
+ def __init__(self, msg):
8
+ self.msg = msg
9
+ super().__init__(self.msg)
10
+
11
+
12
+ def convert_file_to_pdf(input_path, output_dir):
13
+ if not os.path.isfile(input_path):
14
+ raise FileNotFoundError(f"The input file {input_path} does not exist.")
15
+
16
+ os.makedirs(output_dir, exist_ok=True)
17
+
18
+ cmd = [
19
+ 'soffice',
20
+ '--headless',
21
+ '--convert-to', 'pdf',
22
+ '--outdir', str(output_dir),
23
+ str(input_path)
24
+ ]
25
+
26
+ process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
27
+
28
+ if process.returncode != 0:
29
+ raise ConvertToPdfError(process.stderr.decode())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.10.5
3
+ Version: 1.0.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,25 +9,30 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: fast-langdetect==0.2.0
12
+ Requires-Dist: fast-langdetect>=0.2.3
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: pydantic<2.8.0,>=2.7.2
15
+ Requires-Dist: pydantic>=2.7.2
16
16
  Requires-Dist: PyMuPDF>=1.24.9
17
17
  Requires-Dist: scikit-learn>=1.0.2
18
- Requires-Dist: torch<=2.3.1,>=2.2.2
18
+ Requires-Dist: torch>=2.2.2
19
19
  Requires-Dist: transformers
20
+ Requires-Dist: pdfminer.six==20231228
20
21
  Provides-Extra: full
21
- Requires-Dist: unimernet==0.2.1; extra == "full"
22
- Requires-Dist: ultralytics; extra == "full"
22
+ Requires-Dist: unimernet==0.2.3; extra == "full"
23
+ Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
24
+ Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
25
+ Requires-Dist: ultralytics>=8.3.48; extra == "full"
23
26
  Requires-Dist: paddleocr==2.7.3; extra == "full"
24
27
  Requires-Dist: struct-eqtable==0.3.2; extra == "full"
25
28
  Requires-Dist: einops; extra == "full"
26
29
  Requires-Dist: accelerate; extra == "full"
27
30
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
28
31
  Requires-Dist: rapidocr-paddle; extra == "full"
29
- Requires-Dist: rapid-table; extra == "full"
32
+ Requires-Dist: rapidocr-onnxruntime; extra == "full"
33
+ Requires-Dist: rapid-table==0.3.0; extra == "full"
30
34
  Requires-Dist: PyYAML; extra == "full"
35
+ Requires-Dist: openai; extra == "full"
31
36
  Requires-Dist: detectron2; extra == "full"
32
37
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
33
38
  Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
@@ -56,7 +61,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
56
61
  [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
57
62
  [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
58
63
 
59
- [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
64
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
60
65
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
61
66
  [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
62
67
  [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
@@ -73,6 +78,11 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
73
78
 
74
79
  <p align="center">
75
80
  <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
81
+ <br>
82
+ <br>
83
+ <a href="https://mineru.org.cn/client?source=github">
84
+ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple interface and smooth interactions. Enjoy it without any fuss!</a>🚀🚀🚀
85
+
76
86
  </p>
77
87
 
78
88
  <!-- join us -->
@@ -84,6 +94,15 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
84
94
  </div>
85
95
 
86
96
  # Changelog
97
+ - 2025/01/10 1.0.0 released. This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:
98
+ - New API Interface
99
+ - For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.
100
+ - For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.
101
+ - Enhanced Compatibility
102
+ - By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.
103
+ - We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
104
+ - Automatic Language Identification
105
+ - With the new language recognition model, setting the `lang` configuration to `auto` during document parsing automatically selects the appropriate OCR language model, improving the accuracy of scanned document parsing.
87
106
  - 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
88
107
  - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
89
108
  - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
@@ -123,6 +142,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
123
142
  <li><a href="#online-demo">Online Demo</a></li>
124
143
  <li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
125
144
  <li><a href="#using-gpu">Using GPU</a></li>
145
+ <li><a href="#using-npu">Using NPU</a></li>
126
146
  </ul>
127
147
  </li>
128
148
  <li><a href="#usage">Usage</a>
@@ -171,7 +191,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
171
191
  - OCR supports detection and recognition of 84 languages.
172
192
  - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
173
193
  - Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
174
- - Supports both CPU and GPU environments.
194
+ - Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration.
175
195
  - Compatible with Windows, Linux, and Mac platforms.
176
196
 
177
197
  ## Quick Start
@@ -182,7 +202,10 @@ There are three different ways to experience MinerU:
182
202
 
183
203
  - [Online Demo (No Installation Required)](#online-demo)
184
204
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
185
- - [Linux/Windows + CUDA](#Using-GPU)
205
+ - Accelerate inference by using CUDA/CANN/MPS
206
+ - [Linux/Windows + CUDA](#Using-GPU)
207
+ - [Linux + CANN](#using-npu)
208
+ - [MacOS + MPS](#using-mps)
186
209
 
187
210
  > [!WARNING]
188
211
  > **Pre-installation Notice—Hardware and Software Environment Support**
@@ -198,20 +221,24 @@ There are three different ways to experience MinerU:
198
221
  <td colspan="3" rowspan="2">Operating System</td>
199
222
  </tr>
200
223
  <tr>
201
- <td>Ubuntu 22.04 LTS</td>
224
+ <td>Linux after 2019</td>
202
225
  <td>Windows 10 / 11</td>
203
226
  <td>macOS 11+</td>
204
227
  </tr>
205
228
  <tr>
206
229
  <td colspan="3">CPU</td>
207
- <td>x86_64(unsupported ARM Linux)</td>
230
+ <td>x86_64 / arm64</td>
208
231
  <td>x86_64(unsupported ARM Windows)</td>
209
232
  <td>x86_64 / arm64</td>
210
233
  </tr>
211
234
  <tr>
212
- <td colspan="3">Memory</td>
235
+ <td colspan="3">Memory Requirements</td>
213
236
  <td colspan="3">16GB or more, recommended 32GB+</td>
214
237
  </tr>
238
+ <tr>
239
+ <td colspan="3">Storage Requirements</td>
240
+ <td colspan="3">20GB or more, with a preference for SSD</td>
241
+ </tr>
215
242
  <tr>
216
243
  <td colspan="3">Python Version</td>
217
244
  <td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
@@ -228,6 +255,12 @@ There are three different ways to experience MinerU:
228
255
  <td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
229
256
  <td>None</td>
230
257
  </tr>
258
+ <tr>
259
+ <td colspan="3">CANN Environment(NPU support)</td>
260
+ <td>8.0+(Ascend 910b)</td>
261
+ <td>None</td>
262
+ <td>None</td>
263
+ </tr>
231
264
  <tr>
232
265
  <td rowspan="2">GPU Hardware Support List</td>
233
266
  <td colspan="2">GPU VRAM 8GB or more</td>
@@ -240,7 +273,7 @@ There are three different ways to experience MinerU:
240
273
  ### Online Demo
241
274
 
242
275
  Stable Version (Stable version verified by QA):
243
- [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
276
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
244
277
 
245
278
  Test Version (Synced with dev branch updates, testing new features):
246
279
  [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
@@ -253,7 +286,7 @@ Test Version (Synced with dev branch updates, testing new features):
253
286
  ```bash
254
287
  conda create -n MinerU python=3.10
255
288
  conda activate MinerU
256
- pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
289
+ pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
257
290
  ```
258
291
 
259
292
  #### 2. Download model weight files
@@ -278,7 +311,7 @@ You can modify certain configurations in this file to enable or disable features
278
311
  {
279
312
  // other config
280
313
  "layout-config": {
281
- "model": "layoutlmv3" // Please change to "doclayout_yolo" when using doclayout_yolo.
314
+ "model": "doclayout_yolo" // Please change to "layoutlmv3" when using layoutlmv3.
282
315
  },
283
316
  "formula-config": {
284
317
  "mfd_model": "yolo_v8_mfd",
@@ -287,7 +320,7 @@ You can modify certain configurations in this file to enable or disable features
287
320
  },
288
321
  "table-config": {
289
322
  "model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
290
- "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
323
+ "enable": true, // The table recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
291
324
  "max_time": 400
292
325
  }
293
326
  }
@@ -309,29 +342,49 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
309
342
  > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
310
343
  > ```
311
344
  ```bash
312
- wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
345
+ wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
313
346
  docker build -t mineru:latest .
314
- docker run --rm -it --gpus=all mineru:latest /bin/bash
347
+ docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
315
348
  magic-pdf --help
316
349
  ```
317
350
 
351
+ ### Using NPU
352
+
353
+ If your device has NPU acceleration hardware, you can follow the tutorial below to use NPU acceleration:
354
+
355
+ [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
356
+
357
+ ### Using MPS
358
+
359
+ If your device uses Apple silicon chips, you can enable MPS acceleration for certain supported tasks (such as layout detection and formula detection).
360
+
361
+ You can enable MPS acceleration by setting the `device-mode` parameter to `mps` in the `magic-pdf.json` configuration file.
362
+
363
+ ```json
364
+ {
365
+ // other config
366
+ "device-mode": "mps"
367
+ }
368
+ ```
369
+
370
+ > [!TIP]
371
+ > Since the formula recognition task cannot utilize MPS acceleration, you can disable the formula recognition feature in tasks where it is not needed to achieve optimal performance.
372
+ >
373
+ > You can disable the formula recognition feature by setting the `enable` parameter in the `formula-config` section to `false`.
374
+
318
375
  ## Usage
319
376
 
320
377
  ### Command Line
321
378
 
322
- [Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/command_line.html)
379
+ [Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/usage/command_line.html)
323
380
 
324
381
  > [!TIP]
325
382
  > For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
326
383
 
327
384
  ### API
328
385
 
329
- [Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html)
330
-
331
- For detailed implementation, refer to:
386
+ [Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/usage/api.html)
332
387
 
333
- - [demo.py Simplest Processing Method](demo/demo.py)
334
- - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
335
388
 
336
389
  ### Deploy Derived Projects
337
390