magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -1,22 +1,25 @@
1
- import os
2
- import json as json_parse
3
1
  import copy
2
+ import json as json_parse
3
+ import os
4
+
4
5
  import click
5
6
  from loguru import logger
7
+
8
+ import magic_pdf.model as model_config
9
+ from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
10
+ drow_model_bbox)
6
11
  from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
- from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
8
- from magic_pdf.pipe.UNIPipe import UNIPipe
9
12
  from magic_pdf.pipe.OCRPipe import OCRPipe
10
13
  from magic_pdf.pipe.TXTPipe import TXTPipe
11
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
+ from magic_pdf.pipe.UNIPipe import UNIPipe
12
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
- import magic_pdf.model as model_config
16
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
14
17
 
15
18
 
16
19
  def prepare_env(output_dir, pdf_file_name, method):
17
20
  local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
18
21
 
19
- local_image_dir = os.path.join(str(local_parent_dir), "images")
22
+ local_image_dir = os.path.join(str(local_parent_dir), 'images')
20
23
  local_md_dir = local_parent_dir
21
24
  os.makedirs(local_image_dir, exist_ok=True)
22
25
  os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
29
32
  pdf_bytes,
30
33
  model_list,
31
34
  parse_method,
35
+ debug_able,
32
36
  f_draw_span_bbox=True,
33
37
  f_draw_layout_bbox=True,
34
38
  f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
38
42
  f_dump_content_list=False,
39
43
  f_make_md_mode=MakeMode.MM_MD,
40
44
  f_draw_model_bbox=False,
45
+ start_page_id=0,
46
+ end_page_id=None,
41
47
  ):
48
+ if debug_able:
49
+ logger.warning('debug mode is on')
50
+ f_dump_content_list = True
51
+ f_draw_model_bbox = True
52
+
42
53
  orig_model_list = copy.deepcopy(model_list)
43
- local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
54
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
55
+ parse_method)
44
56
 
45
- image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(
46
- local_md_dir
47
- )
57
+ image_writer, md_writer = DiskReaderWriter(
58
+ local_image_dir), DiskReaderWriter(local_md_dir)
48
59
  image_dir = str(os.path.basename(local_image_dir))
49
60
 
50
- if parse_method == "auto":
51
- jso_useful_key = {"_pdf_type": "", "model_list": model_list}
52
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
53
- elif parse_method == "txt":
54
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
55
- elif parse_method == "ocr":
56
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
61
+ if parse_method == 'auto':
62
+ jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
63
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
64
+ start_page_id=start_page_id, end_page_id=end_page_id)
65
+ elif parse_method == 'txt':
66
+ pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
67
+ start_page_id=start_page_id, end_page_id=end_page_id)
68
+ elif parse_method == 'ocr':
69
+ pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
70
+ start_page_id=start_page_id, end_page_id=end_page_id)
57
71
  else:
58
- logger.error("unknown parse method")
72
+ logger.error('unknown parse method')
59
73
  exit(1)
60
74
 
61
75
  pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
65
79
  pipe.pipe_analyze()
66
80
  orig_model_list = copy.deepcopy(pipe.model_list)
67
81
  else:
68
- logger.error("need model list input")
82
+ logger.error('need model list input')
69
83
  exit(2)
70
84
 
71
85
  pipe.pipe_parse()
72
- pdf_info = pipe.pdf_mid_data["pdf_info"]
86
+ pdf_info = pipe.pdf_mid_data['pdf_info']
73
87
  if f_draw_layout_bbox:
74
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
88
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
75
89
  if f_draw_span_bbox:
76
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
90
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
77
91
  if f_draw_model_bbox:
78
- drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
92
+ drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
79
93
 
80
- md_content = pipe.pipe_mk_markdown(
81
- image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
82
- )
94
+ md_content = pipe.pipe_mk_markdown(image_dir,
95
+ drop_mode=DropMode.NONE,
96
+ md_make_mode=f_make_md_mode)
83
97
  if f_dump_md:
84
98
  md_writer.write(
85
99
  content=md_content,
86
- path=f"{pdf_file_name}.md",
100
+ path=f'{pdf_file_name}.md',
87
101
  mode=AbsReaderWriter.MODE_TXT,
88
102
  )
89
103
 
90
104
  if f_dump_middle_json:
91
105
  md_writer.write(
92
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
93
- path="middle.json",
106
+ content=json_parse.dumps(pipe.pdf_mid_data,
107
+ ensure_ascii=False,
108
+ indent=4),
109
+ path=f'{pdf_file_name}_middle.json',
94
110
  mode=AbsReaderWriter.MODE_TXT,
95
111
  )
96
112
 
97
113
  if f_dump_model_json:
98
114
  md_writer.write(
99
- content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
100
- path="model.json",
115
+ content=json_parse.dumps(orig_model_list,
116
+ ensure_ascii=False,
117
+ indent=4),
118
+ path=f'{pdf_file_name}_model.json',
101
119
  mode=AbsReaderWriter.MODE_TXT,
102
120
  )
103
121
 
104
122
  if f_dump_orig_pdf:
105
123
  md_writer.write(
106
124
  content=pdf_bytes,
107
- path="origin.pdf",
125
+ path=f'{pdf_file_name}_origin.pdf',
108
126
  mode=AbsReaderWriter.MODE_BIN,
109
127
  )
110
128
 
111
129
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
112
130
  if f_dump_content_list:
113
131
  md_writer.write(
114
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
115
- path="content_list.json",
132
+ content=json_parse.dumps(content_list,
133
+ ensure_ascii=False,
134
+ indent=4),
135
+ path=f'{pdf_file_name}_content_list.json',
116
136
  mode=AbsReaderWriter.MODE_TXT,
117
137
  )
118
138
 
119
- logger.info(f"local output dir is {local_md_dir}")
139
+ logger.info(f'local output dir is {local_md_dir}')
120
140
 
121
141
 
122
- parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
142
+ parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
magic_pdf/user_api.py CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
25
25
  PARSE_TYPE_OCR = "ocr"
26
26
 
27
27
 
28
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
29
- **kwargs):
28
+ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
29
+ start_page_id=0, end_page_id=None,
30
+ *args, **kwargs):
30
31
  """
31
32
  解析文本类pdf
32
33
  """
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
34
35
  pdf_bytes,
35
36
  pdf_models,
36
37
  imageWriter,
37
- start_page_id=start_page,
38
+ start_page_id=start_page_id,
39
+ end_page_id=end_page_id,
38
40
  debug_mode=is_debug,
39
41
  )
40
42
 
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
45
47
  return pdf_info_dict
46
48
 
47
49
 
48
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
49
- **kwargs):
50
+ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
51
+ start_page_id=0, end_page_id=None,
52
+ *args, **kwargs):
50
53
  """
51
54
  解析ocr类pdf
52
55
  """
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
54
57
  pdf_bytes,
55
58
  pdf_models,
56
59
  imageWriter,
57
- start_page_id=start_page,
60
+ start_page_id=start_page_id,
61
+ end_page_id=end_page_id,
58
62
  debug_mode=is_debug,
59
63
  )
60
64
 
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
65
69
  return pdf_info_dict
66
70
 
67
71
 
68
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
72
+ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
69
73
  input_model_is_empty: bool = False,
74
+ start_page_id=0, end_page_id=None,
70
75
  *args, **kwargs):
71
76
  """
72
77
  ocr和文本混合的pdf,全部解析出来
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
78
83
  pdf_bytes,
79
84
  pdf_models,
80
85
  imageWriter,
81
- start_page_id=start_page,
86
+ start_page_id=start_page_id,
87
+ end_page_id=end_page_id,
82
88
  debug_mode=is_debug,
83
89
  )
84
90
  except Exception as e:
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
89
95
  if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
90
96
  logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
91
97
  if input_model_is_empty:
92
- pdf_models = doc_analyze(pdf_bytes, ocr=True)
98
+ pdf_models = doc_analyze(pdf_bytes, ocr=True,
99
+ start_page_id=start_page_id,
100
+ end_page_id=end_page_id)
93
101
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
94
102
  if pdf_info_dict is None:
95
103
  raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.7.1
3
+ Version: 0.8.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: PyMuPDF>=1.24.9
12
+ Requires-Dist: fast-langdetect==0.2.0
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
- Requires-Dist: fast-langdetect==0.2.0
16
- Requires-Dist: wordninja>=2.0.0
17
- Requires-Dist: scikit-learn>=1.0.2
18
15
  Requires-Dist: pdfminer.six==20231228
16
+ Requires-Dist: pydantic<2.8.0,>=2.7.2
17
+ Requires-Dist: PyMuPDF>=1.24.9
18
+ Requires-Dist: scikit-learn>=1.0.2
19
+ Requires-Dist: wordninja>=2.0.0
19
20
  Provides-Extra: full
20
21
  Requires-Dist: unimernet==0.1.6; extra == "full"
21
22
  Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
39
40
  </p>
40
41
 
41
42
  <!-- icon -->
43
+
42
44
  [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
43
45
  [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
44
46
  [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,27 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
46
48
  [![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf)
47
49
  [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
48
50
  [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
51
+
52
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
53
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
54
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
55
+ [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
56
+ [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](#)
57
+
49
58
  <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
50
59
 
51
60
  <!-- language -->
61
+
52
62
  [English](README.md) | [简体中文](README_zh-CN.md)
53
63
 
54
64
  <!-- hot link -->
65
+
55
66
  <p align="center">
56
67
  <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
57
68
  </p>
58
69
 
59
70
  <!-- join us -->
71
+
60
72
  <p align="center">
61
73
  👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
62
74
  </p>
@@ -64,12 +76,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
64
76
  </div>
65
77
 
66
78
  # Changelog
79
+ - 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
67
80
  - 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
68
81
  - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
69
82
  - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
70
83
  - 2024/07/05: Initial open-source release
71
84
 
72
85
  <!-- TABLE OF CONTENT -->
86
+
73
87
  <details open="open">
74
88
  <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
75
89
  <ol>
@@ -108,10 +122,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
108
122
  </ol>
109
123
  </details>
110
124
 
111
-
112
-
113
125
  # MinerU
126
+
114
127
  ## Project Introduction
128
+
115
129
  MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
116
130
  MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
117
131
  Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on the [issue tracker](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -135,6 +149,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
135
149
  If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
136
150
  If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
137
151
  There are three different ways to experience MinerU:
152
+
138
153
  - [Online Demo (No Installation Required)](#online-demo)
139
154
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
140
155
  - [Linux/Windows + CUDA](#Using-GPU)
@@ -192,44 +207,56 @@ In non-mainline environments, due to the diversity of hardware and software conf
192
207
  <tr>
193
208
  <td colspan="2">Recommended Configuration 16G+ VRAM</td>
194
209
  <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
195
- 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously</td>
210
+ 16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
211
+ 24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
212
+ </td>
196
213
  </tr>
197
214
  </table>
198
215
 
199
216
  ### Online Demo
200
217
 
201
- [Click here for the online demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
218
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
219
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
220
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
202
221
 
203
222
  ### Quick CPU Demo
204
223
 
205
224
  #### 1. Install magic-pdf
225
+
206
226
  ```bash
207
227
  conda create -n MinerU python=3.10
208
228
  conda activate MinerU
209
229
  pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
210
230
  ```
231
+
211
232
  #### 2. Download model weight files
212
233
 
213
234
  Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
235
+
214
236
  > ❗️After downloading the models, please make sure to verify the completeness of the model files.
215
- >
237
+ >
216
238
  > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
217
239
 
218
240
  #### 3. Copy and configure the template file
241
+
219
242
  You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
243
+
220
244
  > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
221
- >
245
+ >
222
246
  > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
247
+
223
248
  ```bash
224
249
  cp magic-pdf.template.json ~/magic-pdf.json
225
250
  ```
226
251
 
227
252
  Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
253
+
228
254
  > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
229
255
  >
230
256
  > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
231
- >
257
+ >
232
258
  > For example: If the models are stored in the "models" directory at the root of the D drive, the "models-dir" value should be `D:/models`.
259
+
233
260
  ```json
234
261
  {
235
262
  // other config
@@ -242,13 +269,26 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
242
269
  }
243
270
  ```
244
271
 
245
-
246
272
  ### Using GPU
273
+
247
274
  If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
248
275
 
249
276
  - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
250
277
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
251
-
278
+ - Quick Deployment with Docker
279
+ > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
280
+ >
281
+ > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
282
+ >
283
+ > ```bash
284
+ > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
285
+ > ```
286
+ ```bash
287
+ wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
288
+ docker build -t mineru:latest .
289
+ docker run --rm -it --gpus=all mineru:latest /bin/bash
290
+ magic-pdf --help
291
+ ```
252
292
 
253
293
  ## Usage
254
294
 
@@ -262,12 +302,12 @@ Options:
262
302
  -v, --version display the version and exit
263
303
  -p, --path PATH local pdf filepath or directory [required]
264
304
  -o, --output-dir TEXT output local directory
265
- -m, --method [ocr|txt|auto] the method for parsing pdf.
305
+ -m, --method [ocr|txt|auto] the method for parsing pdf.
266
306
  ocr: using ocr technique to extract information from pdf,
267
307
  txt: suitable for the text-based pdf only and outperform ocr,
268
308
  auto: automatically choose the best method for parsing pdf
269
309
  from ocr and txt.
270
- without method specified, auto will be used by default.
310
+ without method specified, auto will be used by default.
271
311
  --help Show this message and exit.
272
312
 
273
313
 
@@ -282,13 +322,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
282
322
  The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
283
323
 
284
324
  ```text
285
- ├── some_pdf.md # markdown file
286
- ├── images # directory for storing images
287
- ├── layout.pdf # layout diagram
288
- ├── middle.json # MinerU intermediate processing result
289
- ├── model.json # model inference result
290
- ├── origin.pdf # original PDF file
291
- └── spans.pdf # smallest granularity bbox position information diagram
325
+ ├── some_pdf.md # markdown file
326
+ ├── images # directory for storing images
327
+ ├── some_pdf_layout.pdf # layout diagram
328
+ ├── some_pdf_middle.json # MinerU intermediate processing result
329
+ ├── some_pdf_model.json # model inference result
330
+ ├── some_pdf_origin.pdf # original PDF file
331
+ └── some_pdf_spans.pdf # smallest granularity bbox position information diagram
292
332
  ```
293
333
 
294
334
  For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -296,6 +336,7 @@ For more information about the output files, please refer to the [Output File De
296
336
  ### API
297
337
 
298
338
  Processing files from local disk
339
+
299
340
  ```python
300
341
  image_writer = DiskReaderWriter(local_image_dir)
301
342
  image_dir = str(os.path.basename(local_image_dir))
@@ -308,6 +349,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
308
349
  ```
309
350
 
310
351
  Processing files from object storage
352
+
311
353
  ```python
312
354
  s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
313
355
  image_dir = "s3://img_bucket/"
@@ -322,10 +364,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
322
364
  ```
323
365
 
324
366
  For detailed implementation, refer to:
367
+
325
368
  - [demo.py Simplest Processing Method](demo/demo.py)
326
369
  - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
327
370
 
328
-
329
371
  ### Development Guide
330
372
 
331
373
  TODO
@@ -337,10 +379,11 @@ TODO
337
379
  - [ ] Code block recognition within the text
338
380
  - [ ] Table of contents recognition
339
381
  - [x] Table recognition
340
- - [ ] Chemical formula recognition
382
+ - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
341
383
  - [ ] Geometric shape recognition
342
384
 
343
385
  # Known Issues
386
+
344
387
  - Reading order is segmented based on rules, which can cause disordered sequences in some cases
345
388
  - Vertical text is not supported
346
389
  - Lists, code blocks, and table of contents are not yet supported in the layout model
@@ -350,11 +393,11 @@ TODO
350
393
 
351
394
 
352
395
  # FAQ
396
+
353
397
  [FAQ in Chinese](docs/FAQ_zh_cn.md)
354
398
 
355
399
  [FAQ in English](docs/FAQ_en_us.md)
356
400
 
357
-
358
401
  # All Thanks To Our Contributors
359
402
 
360
403
  <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -367,8 +410,8 @@ TODO
367
410
 
368
411
  This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
369
412
 
370
-
371
413
  # Acknowledgments
414
+
372
415
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
373
416
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
374
417
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -405,9 +448,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
405
448
  </a>
406
449
 
407
450
  # Magic-doc
451
+
408
452
  [Magic-Doc](https://github.com/InternLM/magic-doc) Fast speed ppt/pptx/doc/docx/pdf extraction tool
409
453
 
410
454
  # Magic-html
455
+
411
456
  [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
412
457
 
413
458
  # Links