magic-pdf 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +227 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +68 -26
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
@@ -1,22 +1,25 @@
-import os
-import json as json_parse
 import copy
+import json as json_parse
+import os
+
 import click
 from loguru import logger
+
+import magic_pdf.model as model_config
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
+                                      drow_model_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
-from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.
+from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter


 def prepare_env(output_dir, pdf_file_name, method):
     local_parent_dir = os.path.join(output_dir, pdf_file_name, method)

-    local_image_dir = os.path.join(str(local_parent_dir),
+    local_image_dir = os.path.join(str(local_parent_dir), 'images')
     local_md_dir = local_parent_dir
     os.makedirs(local_image_dir, exist_ok=True)
     os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
     pdf_bytes,
     model_list,
     parse_method,
+    debug_able,
     f_draw_span_bbox=True,
     f_draw_layout_bbox=True,
     f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
     f_dump_content_list=False,
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
+    start_page_id=0,
+    end_page_id=None,
 ):
+    if debug_able:
+        logger.warning("debug mode is on")
+        f_dump_content_list = True
+        f_draw_model_bbox = True
+
     orig_model_list = copy.deepcopy(model_list)
-    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
+                                                parse_method)

-    image_writer, md_writer = DiskReaderWriter(
-        local_md_dir
-    )
+    image_writer, md_writer = DiskReaderWriter(
+        local_image_dir), DiskReaderWriter(local_md_dir)
     image_dir = str(os.path.basename(local_image_dir))

-    if parse_method ==
-        jso_useful_key = {
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True
-
-
-
-
+    if parse_method == 'auto':
+        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
+        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'txt':
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'ocr':
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
     else:
-        logger.error(
+        logger.error('unknown parse method')
         exit(1)

     pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
         pipe.pipe_analyze()
         orig_model_list = copy.deepcopy(pipe.model_list)
     else:
-        logger.error(
+        logger.error('need model list input')
         exit(2)

     pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data[
+    pdf_info = pipe.pdf_mid_data['pdf_info']
     if f_draw_layout_bbox:
-        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_span_bbox:
-        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_model_bbox:
-        drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
+        drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)

-    md_content = pipe.pipe_mk_markdown(
-
-
+    md_content = pipe.pipe_mk_markdown(image_dir,
+                                       drop_mode=DropMode.NONE,
+                                       md_make_mode=f_make_md_mode)
     if f_dump_md:
         md_writer.write(
             content=md_content,
-            path=f
+            path=f'{pdf_file_name}.md',
             mode=AbsReaderWriter.MODE_TXT,
         )

     if f_dump_middle_json:
         md_writer.write(
-            content=json_parse.dumps(pipe.pdf_mid_data,
-
+            content=json_parse.dumps(pipe.pdf_mid_data,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_middle.json',
             mode=AbsReaderWriter.MODE_TXT,
         )

     if f_dump_model_json:
         md_writer.write(
-            content=json_parse.dumps(orig_model_list,
-
+            content=json_parse.dumps(orig_model_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_model.json',
             mode=AbsReaderWriter.MODE_TXT,
         )

     if f_dump_orig_pdf:
         md_writer.write(
             content=pdf_bytes,
-            path=
+            path=f'{pdf_file_name}_origin.pdf',
             mode=AbsReaderWriter.MODE_BIN,
         )

     content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     if f_dump_content_list:
         md_writer.write(
-            content=json_parse.dumps(content_list,
-
+            content=json_parse.dumps(content_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_content_list.json',
             mode=AbsReaderWriter.MODE_TXT,
         )

-    logger.info(f
+    logger.info(f'local output dir is {local_md_dir}')


-parse_pdf_methods = click.Choice([
+parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
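The net effect of this change on callers: `do_parse` now takes a required `debug_able` argument and optional `start_page_id`/`end_page_id` keywords. Below is a minimal usage sketch, not part of the diff; the leading `output_dir`/`pdf_file_name` positional parameters and the empty-`model_list` behaviour are assumed from the `prepare_env()` call and the built-in-model branch, which the hunks above do not show in full.

```python
# Hedged sketch, not part of the package diff: exercising the new do_parse parameters.
# output_dir / pdf_file_name as leading positional parameters are assumed from the
# prepare_env(output_dir, pdf_file_name, parse_method) call visible above.
from magic_pdf.tools.common import do_parse

with open("demo.pdf", "rb") as f:  # hypothetical input file
    pdf_bytes = f.read()

do_parse(
    "output",            # output_dir (assumed positional parameter)
    "demo",              # pdf_file_name (assumed positional parameter)
    pdf_bytes,
    [],                  # model_list; assumed to trigger built-in analysis when empty
    "auto",              # parse_method: 'auto' | 'txt' | 'ocr'
    False,               # debug_able (new): True also forces content_list and model-bbox dumps
    start_page_id=0,     # new in 0.8.0: first page index to parse
    end_page_id=None,    # new in 0.8.0: None parses through the last page
)
```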
magic_pdf/user_api.py
CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
 PARSE_TYPE_OCR = "ocr"


-def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-
+def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+                  start_page_id=0, end_page_id=None,
+                  *args, **kwargs):
     """
     解析文本类pdf
     """
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         pdf_bytes,
         pdf_models,
         imageWriter,
-        start_page_id=
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
         debug_mode=is_debug,
     )

@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     return pdf_info_dict


-def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
-
+def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+                  start_page_id=0, end_page_id=None,
+                  *args, **kwargs):
     """
     解析ocr类pdf
     """
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
         pdf_bytes,
         pdf_models,
         imageWriter,
-        start_page_id=
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
         debug_mode=is_debug,
     )

@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
     return pdf_info_dict


-def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
+def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                     input_model_is_empty: bool = False,
+                    start_page_id=0, end_page_id=None,
                     *args, **kwargs):
     """
     ocr和文本混合的pdf,全部解析出来
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
         pdf_bytes,
         pdf_models,
         imageWriter,
-        start_page_id=
+        start_page_id=start_page_id,
+        end_page_id=end_page_id,
         debug_mode=is_debug,
     )
     except Exception as e:
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True
+            pdf_models = doc_analyze(pdf_bytes, ocr=True,
+                                     start_page_id=start_page_id,
+                                     end_page_id=end_page_id)
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
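All three `user_api` entry points now accept the same page-range keywords. A minimal sketch of a `parse_union_pdf` call using them follows (not part of the diff); the file path, image directory, and empty `pdf_models` list are illustrative assumptions, while the keyword names follow the signatures shown above.

```python
# Hedged sketch, not part of the package diff: page-limited parsing via the new
# start_page_id / end_page_id keywords on the user_api functions.
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.user_api import parse_union_pdf

with open("demo.pdf", "rb") as f:  # hypothetical input file
    pdf_bytes = f.read()

image_writer = DiskReaderWriter("/tmp/mineru_images")  # hypothetical image output dir
pdf_info_dict = parse_union_pdf(
    pdf_bytes,
    [],                         # pdf_models; results from a prior doc_analyze run, if any
    image_writer,
    is_debug=False,
    input_model_is_empty=True,  # lets the OCR fallback re-run doc_analyze, per the diff above
    start_page_id=0,            # new in 0.8.0
    end_page_id=4,              # new in 0.8.0: parse only page indexes 0-4
)
```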
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.7.1
+Version: 0.8.0
 Summary: A practical tool for converting PDF to Markdown
 Home-page: https://github.com/opendatalab/MinerU
 Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
 Requires-Dist: boto3>=1.28.43
 Requires-Dist: Brotli>=1.1.0
 Requires-Dist: click>=8.1.7
-Requires-Dist:
+Requires-Dist: fast-langdetect==0.2.0
 Requires-Dist: loguru>=0.6.0
 Requires-Dist: numpy<2.0.0,>=1.21.6
-Requires-Dist: fast-langdetect==0.2.0
-Requires-Dist: wordninja>=2.0.0
-Requires-Dist: scikit-learn>=1.0.2
 Requires-Dist: pdfminer.six==20231228
+Requires-Dist: pydantic<2.8.0,>=2.7.2
+Requires-Dist: PyMuPDF>=1.24.9
+Requires-Dist: scikit-learn>=1.0.2
+Requires-Dist: wordninja>=2.0.0
 Provides-Extra: full
 Requires-Dist: unimernet==0.1.6; extra == "full"
 Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </p>

 <!-- icon -->
+
 [](https://github.com/opendatalab/MinerU)
 [](https://github.com/opendatalab/MinerU)
 [](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,26 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 [](https://badge.fury.io/py/magic-pdf)
 [](https://pepy.tech/project/magic-pdf)
 [](https://pepy.tech/project/magic-pdf)
+
+[](https://huggingface.co/spaces/opendatalab/MinerU)
+[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
+[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[](#)
+
 <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

 <!-- language -->
+
 [English](README.md) | [简体中文](README_zh-CN.md)

 <!-- hot link -->
+
 <p align="center">
     <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
 </p>

 <!-- join us -->
+
 <p align="center">
     👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
 </p>
@@ -64,12 +75,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </div>

 # Changelog
+- 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
 - 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
 - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
 - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
 - 2024/07/05: Initial open-source release

 <!-- TABLE OF CONTENT -->
+
 <details open="open">
   <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
   <ol>
@@ -108,10 +121,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
   </ol>
 </details>

-
-
 # MinerU
+
 ## Project Introduction
+
 MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
 MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
 Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on [issue](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -135,6 +148,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
 If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
 There are three different ways to experience MinerU:
+
 - [Online Demo (No Installation Required)](#online-demo)
 - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
 - [Linux/Windows + CUDA](#Using-GPU)
@@ -192,7 +206,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
     <tr>
         <td colspan="2">Recommended Configuration 16G+ VRAM</td>
         <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
-        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously
+        16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
+        24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
+        </td>
     </tr>
 </table>

@@ -203,33 +219,41 @@ In non-mainline environments, due to the diversity of hardware and software conf
 ### Quick CPU Demo

 #### 1. Install magic-pdf
+
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
 pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
 ```
+
 #### 2. Download model weight files

 Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
+
 > ❗️After downloading the models, please make sure to verify the completeness of the model files.
->
+>
 > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.

 #### 3. Copy and configure the template file
+
 You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
+
 > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
->
+>
 > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
+
 ```bash
 cp magic-pdf.template.json ~/magic-pdf.json
 ```

 Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
+
 > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
 >
 > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
->
+>
 > For example: If the models are stored in the "models" directory at the root of the D drive, the "model-dir" value should be `D:/models`.
+
 ```json
 {
   // other config
@@ -242,13 +266,26 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
 }
 ```

-
 ### Using GPU
+
 If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:

 - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
 - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
-
+- Quick Deployment with Docker
+> Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+>
+> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
+>
+> ```bash
+> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+> ```
+```bash
+wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+docker build -t mineru:latest .
+docker run --rm -it --gpus=all mineru:latest /bin/bash
+magic-pdf --help
+```

 ## Usage

@@ -262,12 +299,12 @@ Options:
   -v, --version                display the version and exit
   -p, --path PATH              local pdf filepath or directory  [required]
   -o, --output-dir TEXT        output local directory
-  -m, --method [ocr|txt|auto]  the method for parsing pdf.
+  -m, --method [ocr|txt|auto]  the method for parsing pdf.
                                ocr: using ocr technique to extract information from pdf,
                                txt: suitable for the text-based pdf only and outperform ocr,
                                auto: automatically choose the best method for parsing pdf
                                from ocr and txt.
-                               without method specified, auto will be used by default.
+                               without method specified, auto will be used by default.
   --help                       Show this message and exit.


@@ -282,13 +319,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:

 ```text
-├── some_pdf.md
-├── images
-├──
-├──
-├──
-├──
-└──
+├── some_pdf.md                 # markdown file
+├── images                      # directory for storing images
+├── some_pdf_layout.pdf         # layout diagram
+├── some_pdf_middle.json        # MinerU intermediate processing result
+├── some_pdf_model.json         # model inference result
+├── some_pdf_origin.pdf         # original PDF file
+└── some_pdf_spans.pdf          # smallest granularity bbox position information diagram
 ```

 For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -296,6 +333,7 @@ For more information about the output files, please refer to the [Output File De
 ### API

 Processing files from local disk
+
 ```python
 image_writer = DiskReaderWriter(local_image_dir)
 image_dir = str(os.path.basename(local_image_dir))
@@ -308,6 +346,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```

 Processing files from object storage
+
 ```python
 s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
 image_dir = "s3://img_bucket/"
@@ -322,10 +361,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```

 For detailed implementation, refer to:
+
 - [demo.py Simplest Processing Method](demo/demo.py)
 - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)

-
 ### Development Guide

 TODO
@@ -337,10 +376,11 @@ TODO
 - [ ] Code block recognition within the text
 - [ ] Table of contents recognition
 - [x] Table recognition
-- [ ] Chemical formula recognition
+- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
 - [ ] Geometric shape recognition

 # Known Issues
+
 - Reading order is segmented based on rules, which can cause disordered sequences in some cases
 - Vertical text is not supported
 - Lists, code blocks, and table of contents are not yet supported in the layout model
@@ -350,11 +390,11 @@ TODO


 # FAQ
+
 [FAQ in Chinese](docs/FAQ_zh_cn.md)

 [FAQ in English](docs/FAQ_en_us.md)

-
 # All Thanks To Our Contributors

 <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -367,8 +407,8 @@ TODO

 This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.

-
 # Acknowledgments
+
 - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
 - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
 - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -405,9 +445,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
 </a>

 # Magic-doc
+
 [Magic-Doc](https://github.com/InternLM/magic-doc) Fast speed ppt/pptx/doc/docx/pdf extraction tool

 # Magic-html
+
 [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool

 # Links