magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +130 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/boxbase.py +188 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +283 -166
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +105 -15
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/para/para_split_v2.py +26 -27
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
```diff
@@ -1,22 +1,25 @@
-import os
-import json as json_parse
 import copy
+import json as json_parse
+import os
+
 import click
 from loguru import logger
+
+import magic_pdf.model as model_config
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
+                                      drow_model_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
-from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.
+from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 
 
 def prepare_env(output_dir, pdf_file_name, method):
     local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
 
-    local_image_dir = os.path.join(str(local_parent_dir),
+    local_image_dir = os.path.join(str(local_parent_dir), 'images')
     local_md_dir = local_parent_dir
     os.makedirs(local_image_dir, exist_ok=True)
     os.makedirs(local_md_dir, exist_ok=True)
@@ -29,6 +32,7 @@ def do_parse(
     pdf_bytes,
     model_list,
     parse_method,
+    debug_able,
     f_draw_span_bbox=True,
     f_draw_layout_bbox=True,
     f_dump_md=True,
@@ -38,24 +42,34 @@ def do_parse(
     f_dump_content_list=False,
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
+    start_page_id=0,
+    end_page_id=None,
 ):
+    if debug_able:
+        logger.warning('debug mode is on')
+        f_dump_content_list = True
+        f_draw_model_bbox = True
+
     orig_model_list = copy.deepcopy(model_list)
-    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
+    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
+                                                parse_method)
 
-    image_writer, md_writer = DiskReaderWriter(
-        local_md_dir
-    )
+    image_writer, md_writer = DiskReaderWriter(
+        local_image_dir), DiskReaderWriter(local_md_dir)
     image_dir = str(os.path.basename(local_image_dir))
 
-    if parse_method ==
-        jso_useful_key = {
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True
-
-
-
-
+    if parse_method == 'auto':
+        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
+        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'txt':
+        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
+    elif parse_method == 'ocr':
+        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
+                       start_page_id=start_page_id, end_page_id=end_page_id)
     else:
-        logger.error(
+        logger.error('unknown parse method')
         exit(1)
 
     pipe.pipe_classify()
@@ -65,58 +79,64 @@ def do_parse(
         pipe.pipe_analyze()
         orig_model_list = copy.deepcopy(pipe.model_list)
     else:
-        logger.error(
+        logger.error('need model list input')
        exit(2)
 
     pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data[
+    pdf_info = pipe.pdf_mid_data['pdf_info']
     if f_draw_layout_bbox:
-        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_span_bbox:
-        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
     if f_draw_model_bbox:
-        drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
+        drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
 
-    md_content = pipe.pipe_mk_markdown(
-
-
+    md_content = pipe.pipe_mk_markdown(image_dir,
+                                       drop_mode=DropMode.NONE,
+                                       md_make_mode=f_make_md_mode)
     if f_dump_md:
         md_writer.write(
             content=md_content,
-            path=f
+            path=f'{pdf_file_name}.md',
             mode=AbsReaderWriter.MODE_TXT,
         )
 
     if f_dump_middle_json:
         md_writer.write(
-            content=json_parse.dumps(pipe.pdf_mid_data,
-
+            content=json_parse.dumps(pipe.pdf_mid_data,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_middle.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
 
     if f_dump_model_json:
         md_writer.write(
-            content=json_parse.dumps(orig_model_list,
-
+            content=json_parse.dumps(orig_model_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_model.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
 
     if f_dump_orig_pdf:
         md_writer.write(
             content=pdf_bytes,
-            path=
+            path=f'{pdf_file_name}_origin.pdf',
             mode=AbsReaderWriter.MODE_BIN,
         )
 
     content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
     if f_dump_content_list:
         md_writer.write(
-            content=json_parse.dumps(content_list,
-
+            content=json_parse.dumps(content_list,
+                                     ensure_ascii=False,
+                                     indent=4),
+            path=f'{pdf_file_name}_content_list.json',
             mode=AbsReaderWriter.MODE_TXT,
         )
 
-    logger.info(f
+    logger.info(f'local output dir is {local_md_dir}')
 
 
-parse_pdf_methods = click.Choice([
+parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
```
magic_pdf/user_api.py
CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
|
|
25
25
|
PARSE_TYPE_OCR = "ocr"
|
26
26
|
|
27
27
|
|
28
|
-
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
29
|
-
|
28
|
+
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
29
|
+
start_page_id=0, end_page_id=None,
|
30
|
+
*args, **kwargs):
|
30
31
|
"""
|
31
32
|
解析文本类pdf
|
32
33
|
"""
|
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
34
35
|
pdf_bytes,
|
35
36
|
pdf_models,
|
36
37
|
imageWriter,
|
37
|
-
start_page_id=
|
38
|
+
start_page_id=start_page_id,
|
39
|
+
end_page_id=end_page_id,
|
38
40
|
debug_mode=is_debug,
|
39
41
|
)
|
40
42
|
|
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
45
47
|
return pdf_info_dict
|
46
48
|
|
47
49
|
|
48
|
-
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
49
|
-
|
50
|
+
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
51
|
+
start_page_id=0, end_page_id=None,
|
52
|
+
*args, **kwargs):
|
50
53
|
"""
|
51
54
|
解析ocr类pdf
|
52
55
|
"""
|
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
54
57
|
pdf_bytes,
|
55
58
|
pdf_models,
|
56
59
|
imageWriter,
|
57
|
-
start_page_id=
|
60
|
+
start_page_id=start_page_id,
|
61
|
+
end_page_id=end_page_id,
|
58
62
|
debug_mode=is_debug,
|
59
63
|
)
|
60
64
|
|
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
65
69
|
return pdf_info_dict
|
66
70
|
|
67
71
|
|
68
|
-
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
72
|
+
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
69
73
|
input_model_is_empty: bool = False,
|
74
|
+
start_page_id=0, end_page_id=None,
|
70
75
|
*args, **kwargs):
|
71
76
|
"""
|
72
77
|
ocr和文本混合的pdf,全部解析出来
|
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
78
83
|
pdf_bytes,
|
79
84
|
pdf_models,
|
80
85
|
imageWriter,
|
81
|
-
start_page_id=
|
86
|
+
start_page_id=start_page_id,
|
87
|
+
end_page_id=end_page_id,
|
82
88
|
debug_mode=is_debug,
|
83
89
|
)
|
84
90
|
except Exception as e:
|
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
89
95
|
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
|
90
96
|
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
91
97
|
if input_model_is_empty:
|
92
|
-
pdf_models = doc_analyze(pdf_bytes, ocr=True
|
98
|
+
pdf_models = doc_analyze(pdf_bytes, ocr=True,
|
99
|
+
start_page_id=start_page_id,
|
100
|
+
end_page_id=end_page_id)
|
93
101
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
94
102
|
if pdf_info_dict is None:
|
95
103
|
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
|
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.7.1
+Version: 0.8.1
 Summary: A practical tool for converting PDF to Markdown
 Home-page: https://github.com/opendatalab/MinerU
 Requires-Python: >=3.9
@@ -9,13 +9,14 @@ License-File: LICENSE.md
 Requires-Dist: boto3>=1.28.43
 Requires-Dist: Brotli>=1.1.0
 Requires-Dist: click>=8.1.7
-Requires-Dist:
+Requires-Dist: fast-langdetect==0.2.0
 Requires-Dist: loguru>=0.6.0
 Requires-Dist: numpy<2.0.0,>=1.21.6
-Requires-Dist: fast-langdetect==0.2.0
-Requires-Dist: wordninja>=2.0.0
-Requires-Dist: scikit-learn>=1.0.2
 Requires-Dist: pdfminer.six==20231228
+Requires-Dist: pydantic<2.8.0,>=2.7.2
+Requires-Dist: PyMuPDF>=1.24.9
+Requires-Dist: scikit-learn>=1.0.2
+Requires-Dist: wordninja>=2.0.0
 Provides-Extra: full
 Requires-Dist: unimernet==0.1.6; extra == "full"
 Requires-Dist: ultralytics; extra == "full"
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </p>
 
 <!-- icon -->
+
 [](https://github.com/opendatalab/MinerU)
 [](https://github.com/opendatalab/MinerU)
 [](https://github.com/opendatalab/MinerU/issues)
@@ -46,17 +48,27 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 [](https://badge.fury.io/py/magic-pdf)
 [](https://pepy.tech/project/magic-pdf)
 [](https://pepy.tech/project/magic-pdf)
+
+[](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
+[](https://huggingface.co/spaces/opendatalab/MinerU)
+[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
+[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[](#)
+
 <a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 
 <!-- language -->
+
 [English](README.md) | [简体中文](README_zh-CN.md)
 
 <!-- hot link -->
+
 <p align="center">
 <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
 </p>
 
 <!-- join us -->
+
 <p align="center">
 👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
 </p>
@@ -64,12 +76,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </div>
 
 # Changelog
+- 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
 - 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
 - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
 - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
 - 2024/07/05: Initial open-source release
 
 <!-- TABLE OF CONTENT -->
+
 <details open="open">
 <summary><h2 style="display: inline-block">Table of Contents</h2></summary>
 <ol>
@@ -108,10 +122,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </ol>
 </details>
 
-
-
 # MinerU
+
 ## Project Introduction
+
 MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
 MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
 Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on [issue](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
@@ -135,6 +149,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
 If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
 There are three different ways to experience MinerU:
+
 - [Online Demo (No Installation Required)](#online-demo)
 - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
 - [Linux/Windows + CUDA](#Using-GPU)
@@ -192,44 +207,56 @@ In non-mainline environments, due to the diversity of hardware and software conf
 <tr>
 <td colspan="2">Recommended Configuration 16G+ VRAM</td>
 <td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
-16G or more can enable layout, formula recognition, and OCR acceleration simultaneously
+16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
+24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
+</td>
 </tr>
 </table>
 
 ### Online Demo
 
-[
+[](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
+[](https://huggingface.co/spaces/opendatalab/MinerU)
+[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
 
 ### Quick CPU Demo
 
 #### 1. Install magic-pdf
+
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
 pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
 ```
+
 #### 2. Download model weight files
 
 Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
+
 > ❗️After downloading the models, please make sure to verify the completeness of the model files.
->
+>
 > Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
 
 #### 3. Copy and configure the template file
+
 You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
+
 > ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
->
+>
 > The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
+
 ```bash
 cp magic-pdf.template.json ~/magic-pdf.json
 ```
 
 Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
+
 > ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
 >
 > On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
->
+>
 > For example: If the models are stored in the "models" directory at the root of the D drive, the "model-dir" value should be `D:/models`.
+
 ```json
 {
 // other config
@@ -242,13 +269,26 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
 }
 ```
 
-
 ### Using GPU
+
 If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
 
 - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
 - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
-
+- Quick Deployment with Docker
+  > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+  >
+  > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
+  >
+  > ```bash
+  > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+  > ```
+  ```bash
+  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
+  docker build -t mineru:latest .
+  docker run --rm -it --gpus=all mineru:latest /bin/bash
+  magic-pdf --help
+  ```
 
 ## Usage
 
@@ -262,12 +302,12 @@ Options:
   -v, --version                display the version and exit
   -p, --path PATH              local pdf filepath or directory  [required]
   -o, --output-dir TEXT        output local directory
-  -m, --method [ocr|txt|auto]  the method for parsing pdf.
+  -m, --method [ocr|txt|auto]  the method for parsing pdf.
                                ocr: using ocr technique to extract information from pdf,
                                txt: suitable for the text-based pdf only and outperform ocr,
                                auto: automatically choose the best method for parsing pdf
                                from ocr and txt.
-                               without method specified, auto will be used by default.
+                               without method specified, auto will be used by default.
   --help                       Show this message and exit.
 
 
@@ -282,13 +322,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
 The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
 
 ```text
-├── some_pdf.md
-├── images
-├──
-├──
-├──
-├──
-└──
+├── some_pdf.md                  # markdown file
+├── images                       # directory for storing images
+├── some_pdf_layout.pdf          # layout diagram
+├── some_pdf_middle.json         # MinerU intermediate processing result
+├── some_pdf_model.json          # model inference result
+├── some_pdf_origin.pdf          # original PDF file
+└── some_pdf_spans.pdf           # smallest granularity bbox position information diagram
 ```
 
 For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
@@ -296,6 +336,7 @@ For more information about the output files, please refer to the [Output File De
 ### API
 
 Processing files from local disk
+
 ```python
 image_writer = DiskReaderWriter(local_image_dir)
 image_dir = str(os.path.basename(local_image_dir))
@@ -308,6 +349,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```
 
 Processing files from object storage
+
 ```python
 s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
 image_dir = "s3://img_bucket/"
@@ -322,10 +364,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
 ```
 
 For detailed implementation, refer to:
+
 - [demo.py Simplest Processing Method](demo/demo.py)
 - [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
 
-
 ### Development Guide
 
 TODO
@@ -337,10 +379,11 @@ TODO
 - [ ] Code block recognition within the text
 - [ ] Table of contents recognition
 - [x] Table recognition
-- [ ] Chemical formula recognition
+- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
 - [ ] Geometric shape recognition
 
 # Known Issues
+
 - Reading order is segmented based on rules, which can cause disordered sequences in some cases
 - Vertical text is not supported
 - Lists, code blocks, and table of contents are not yet supported in the layout model
@@ -350,11 +393,11 @@ TODO
 
 
 # FAQ
+
 [FAQ in Chinese](docs/FAQ_zh_cn.md)
 
 [FAQ in English](docs/FAQ_en_us.md)
 
-
 # All Thanks To Our Contributors
 
 <a href="https://github.com/opendatalab/MinerU/graphs/contributors">
@@ -367,8 +410,8 @@ TODO
 
 This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
 
-
 # Acknowledgments
+
 - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
 - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
 - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
@@ -405,9 +448,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
 </a>
 
 # Magic-doc
+
 [Magic-Doc](https://github.com/InternLM/magic-doc) Fast speed ppt/pptx/doc/docx/pdf extraction tool
 
 # Magic-html
+
 [Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
 
 # Links
````