magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +134 -76
- magic_pdf/integrations/__init__.py +0 -0
- magic_pdf/integrations/rag/__init__.py +0 -0
- magic_pdf/integrations/rag/api.py +82 -0
- magic_pdf/integrations/rag/type.py +82 -0
- magic_pdf/integrations/rag/utils.py +285 -0
- magic_pdf/layout/layout_sort.py +472 -283
- magic_pdf/libs/Constants.py +27 -1
- magic_pdf/libs/boxbase.py +169 -149
- magic_pdf/libs/draw_bbox.py +113 -87
- magic_pdf/libs/ocr_content_type.py +21 -18
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
- magic_pdf/model/magic_model.py +230 -161
- magic_pdf/model/model_list.py +8 -0
- magic_pdf/model/pdf_extract_kit.py +135 -22
- magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
- magic_pdf/model/ppTableModel.py +67 -0
- magic_pdf/para/para_split_v2.py +76 -74
- magic_pdf/pdf_parse_union_core.py +34 -6
- magic_pdf/pipe/AbsPipe.py +4 -1
- magic_pdf/pipe/OCRPipe.py +7 -4
- magic_pdf/pipe/TXTPipe.py +7 -4
- magic_pdf/pipe/UNIPipe.py +11 -6
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
- magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
- magic_pdf/resources/model_config/model_configs.yaml +3 -1
- magic_pdf/tools/cli.py +56 -29
- magic_pdf/tools/cli_dev.py +61 -64
- magic_pdf/tools/common.py +57 -37
- magic_pdf/user_api.py +17 -9
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
@@ -1,22 +1,25 @@
|
|
1
|
-
import os
|
2
|
-
import json as json_parse
|
3
1
|
import copy
|
2
|
+
import json as json_parse
|
3
|
+
import os
|
4
|
+
|
4
5
|
import click
|
5
6
|
from loguru import logger
|
7
|
+
|
8
|
+
import magic_pdf.model as model_config
|
9
|
+
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
|
10
|
+
drow_model_bbox)
|
6
11
|
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
|
7
|
-
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
|
8
|
-
from magic_pdf.pipe.UNIPipe import UNIPipe
|
9
12
|
from magic_pdf.pipe.OCRPipe import OCRPipe
|
10
13
|
from magic_pdf.pipe.TXTPipe import TXTPipe
|
11
|
-
from magic_pdf.
|
14
|
+
from magic_pdf.pipe.UNIPipe import UNIPipe
|
12
15
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
13
|
-
|
16
|
+
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
14
17
|
|
15
18
|
|
16
19
|
def prepare_env(output_dir, pdf_file_name, method):
|
17
20
|
local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
|
18
21
|
|
19
|
-
local_image_dir = os.path.join(str(local_parent_dir), "images")
|
22
|
+
local_image_dir = os.path.join(str(local_parent_dir), 'images')
|
20
23
|
local_md_dir = local_parent_dir
|
21
24
|
os.makedirs(local_image_dir, exist_ok=True)
|
22
25
|
os.makedirs(local_md_dir, exist_ok=True)
|
@@ -29,6 +32,7 @@ def do_parse(
|
|
29
32
|
pdf_bytes,
|
30
33
|
model_list,
|
31
34
|
parse_method,
|
35
|
+
debug_able,
|
32
36
|
f_draw_span_bbox=True,
|
33
37
|
f_draw_layout_bbox=True,
|
34
38
|
f_dump_md=True,
|
@@ -38,24 +42,34 @@ def do_parse(
|
|
38
42
|
f_dump_content_list=False,
|
39
43
|
f_make_md_mode=MakeMode.MM_MD,
|
40
44
|
f_draw_model_bbox=False,
|
45
|
+
start_page_id=0,
|
46
|
+
end_page_id=None,
|
41
47
|
):
|
48
|
+
if debug_able:
|
49
|
+
logger.warning("debug mode is on")
|
50
|
+
f_dump_content_list = True
|
51
|
+
f_draw_model_bbox = True
|
52
|
+
|
42
53
|
orig_model_list = copy.deepcopy(model_list)
|
43
|
-
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
54
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
55
|
+
parse_method)
|
44
56
|
|
45
|
-
image_writer, md_writer = DiskReaderWriter(
|
46
|
-
local_md_dir
|
47
|
-
)
|
57
|
+
image_writer, md_writer = DiskReaderWriter(
|
58
|
+
local_image_dir), DiskReaderWriter(local_md_dir)
|
48
59
|
image_dir = str(os.path.basename(local_image_dir))
|
49
60
|
|
50
|
-
if parse_method ==
|
51
|
-
jso_useful_key = {
|
52
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
61
|
+
if parse_method == 'auto':
|
62
|
+
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
63
|
+
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
64
|
+
start_page_id=start_page_id, end_page_id=end_page_id)
|
65
|
+
elif parse_method == 'txt':
|
66
|
+
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
67
|
+
start_page_id=start_page_id, end_page_id=end_page_id)
|
68
|
+
elif parse_method == 'ocr':
|
69
|
+
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
70
|
+
start_page_id=start_page_id, end_page_id=end_page_id)
|
57
71
|
else:
|
58
|
-
logger.error(
|
72
|
+
logger.error('unknown parse method')
|
59
73
|
exit(1)
|
60
74
|
|
61
75
|
pipe.pipe_classify()
|
@@ -65,58 +79,64 @@ def do_parse(
|
|
65
79
|
pipe.pipe_analyze()
|
66
80
|
orig_model_list = copy.deepcopy(pipe.model_list)
|
67
81
|
else:
|
68
|
-
logger.error(
|
82
|
+
logger.error('need model list input')
|
69
83
|
exit(2)
|
70
84
|
|
71
85
|
pipe.pipe_parse()
|
72
|
-
pdf_info = pipe.pdf_mid_data["pdf_info"]
|
86
|
+
pdf_info = pipe.pdf_mid_data['pdf_info']
|
73
87
|
if f_draw_layout_bbox:
|
74
|
-
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
|
88
|
+
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
|
75
89
|
if f_draw_span_bbox:
|
76
|
-
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
|
90
|
+
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
|
77
91
|
if f_draw_model_bbox:
|
78
|
-
drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
|
92
|
+
drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
|
79
93
|
|
80
|
-
md_content = pipe.pipe_mk_markdown(
|
81
|
-
|
82
|
-
|
94
|
+
md_content = pipe.pipe_mk_markdown(image_dir,
|
95
|
+
drop_mode=DropMode.NONE,
|
96
|
+
md_make_mode=f_make_md_mode)
|
83
97
|
if f_dump_md:
|
84
98
|
md_writer.write(
|
85
99
|
content=md_content,
|
86
|
-
path=f
|
100
|
+
path=f'{pdf_file_name}.md',
|
87
101
|
mode=AbsReaderWriter.MODE_TXT,
|
88
102
|
)
|
89
103
|
|
90
104
|
if f_dump_middle_json:
|
91
105
|
md_writer.write(
|
92
|
-
content=json_parse.dumps(pipe.pdf_mid_data,
|
93
|
-
|
106
|
+
content=json_parse.dumps(pipe.pdf_mid_data,
|
107
|
+
ensure_ascii=False,
|
108
|
+
indent=4),
|
109
|
+
path=f'{pdf_file_name}_middle.json',
|
94
110
|
mode=AbsReaderWriter.MODE_TXT,
|
95
111
|
)
|
96
112
|
|
97
113
|
if f_dump_model_json:
|
98
114
|
md_writer.write(
|
99
|
-
content=json_parse.dumps(orig_model_list,
|
100
|
-
|
115
|
+
content=json_parse.dumps(orig_model_list,
|
116
|
+
ensure_ascii=False,
|
117
|
+
indent=4),
|
118
|
+
path=f'{pdf_file_name}_model.json',
|
101
119
|
mode=AbsReaderWriter.MODE_TXT,
|
102
120
|
)
|
103
121
|
|
104
122
|
if f_dump_orig_pdf:
|
105
123
|
md_writer.write(
|
106
124
|
content=pdf_bytes,
|
107
|
-
path=
|
125
|
+
path=f'{pdf_file_name}_origin.pdf',
|
108
126
|
mode=AbsReaderWriter.MODE_BIN,
|
109
127
|
)
|
110
128
|
|
111
129
|
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
112
130
|
if f_dump_content_list:
|
113
131
|
md_writer.write(
|
114
|
-
content=json_parse.dumps(content_list,
|
115
|
-
|
132
|
+
content=json_parse.dumps(content_list,
|
133
|
+
ensure_ascii=False,
|
134
|
+
indent=4),
|
135
|
+
path=f'{pdf_file_name}_content_list.json',
|
116
136
|
mode=AbsReaderWriter.MODE_TXT,
|
117
137
|
)
|
118
138
|
|
119
|
-
logger.info(f
|
139
|
+
logger.info(f'local output dir is {local_md_dir}')
|
120
140
|
|
121
141
|
|
122
|
-
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
|
142
|
+
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
|
magic_pdf/user_api.py
CHANGED
@@ -25,8 +25,9 @@ PARSE_TYPE_TXT = "txt"
|
|
25
25
|
PARSE_TYPE_OCR = "ocr"
|
26
26
|
|
27
27
|
|
28
|
-
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
29
|
-
|
28
|
+
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
29
|
+
start_page_id=0, end_page_id=None,
|
30
|
+
*args, **kwargs):
|
30
31
|
"""
|
31
32
|
解析文本类pdf
|
32
33
|
"""
|
@@ -34,7 +35,8 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
34
35
|
pdf_bytes,
|
35
36
|
pdf_models,
|
36
37
|
imageWriter,
|
37
|
-
start_page_id=
|
38
|
+
start_page_id=start_page_id,
|
39
|
+
end_page_id=end_page_id,
|
38
40
|
debug_mode=is_debug,
|
39
41
|
)
|
40
42
|
|
@@ -45,8 +47,9 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
45
47
|
return pdf_info_dict
|
46
48
|
|
47
49
|
|
48
|
-
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
49
|
-
|
50
|
+
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
51
|
+
start_page_id=0, end_page_id=None,
|
52
|
+
*args, **kwargs):
|
50
53
|
"""
|
51
54
|
解析ocr类pdf
|
52
55
|
"""
|
@@ -54,7 +57,8 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
54
57
|
pdf_bytes,
|
55
58
|
pdf_models,
|
56
59
|
imageWriter,
|
57
|
-
start_page_id=
|
60
|
+
start_page_id=start_page_id,
|
61
|
+
end_page_id=end_page_id,
|
58
62
|
debug_mode=is_debug,
|
59
63
|
)
|
60
64
|
|
@@ -65,8 +69,9 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
|
|
65
69
|
return pdf_info_dict
|
66
70
|
|
67
71
|
|
68
|
-
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
72
|
+
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
|
69
73
|
input_model_is_empty: bool = False,
|
74
|
+
start_page_id=0, end_page_id=None,
|
70
75
|
*args, **kwargs):
|
71
76
|
"""
|
72
77
|
ocr和文本混合的pdf,全部解析出来
|
@@ -78,7 +83,8 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
78
83
|
pdf_bytes,
|
79
84
|
pdf_models,
|
80
85
|
imageWriter,
|
81
|
-
start_page_id=
|
86
|
+
start_page_id=start_page_id,
|
87
|
+
end_page_id=end_page_id,
|
82
88
|
debug_mode=is_debug,
|
83
89
|
)
|
84
90
|
except Exception as e:
|
@@ -89,7 +95,9 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
89
95
|
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
|
90
96
|
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
91
97
|
if input_model_is_empty:
|
92
|
-
pdf_models = doc_analyze(pdf_bytes, ocr=True)
|
98
|
+
pdf_models = doc_analyze(pdf_bytes, ocr=True,
|
99
|
+
start_page_id=start_page_id,
|
100
|
+
end_page_id=end_page_id)
|
93
101
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
94
102
|
if pdf_info_dict is None:
|
95
103
|
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.7.0b1
|
3
|
+
Version: 0.8.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,13 +9,14 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist:
|
12
|
+
Requires-Dist: fast-langdetect==0.2.0
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
-
Requires-Dist: fast-langdetect==0.2.0
|
16
|
-
Requires-Dist: wordninja>=2.0.0
|
17
|
-
Requires-Dist: scikit-learn>=1.0.2
|
18
15
|
Requires-Dist: pdfminer.six==20231228
|
16
|
+
Requires-Dist: pydantic<2.8.0,>=2.7.2
|
17
|
+
Requires-Dist: PyMuPDF>=1.24.9
|
18
|
+
Requires-Dist: scikit-learn>=1.0.2
|
19
|
+
Requires-Dist: wordninja>=2.0.0
|
19
20
|
Provides-Extra: full
|
20
21
|
Requires-Dist: unimernet==0.1.6; extra == "full"
|
21
22
|
Requires-Dist: ultralytics; extra == "full"
|
@@ -39,6 +40,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
39
40
|
</p>
|
40
41
|
|
41
42
|
<!-- icon -->
|
43
|
+
|
42
44
|
[](https://github.com/opendatalab/MinerU)
|
43
45
|
[](https://github.com/opendatalab/MinerU)
|
44
46
|
[](https://github.com/opendatalab/MinerU/issues)
|
@@ -46,17 +48,26 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
46
48
|
[](https://badge.fury.io/py/magic-pdf)
|
47
49
|
[](https://pepy.tech/project/magic-pdf)
|
48
50
|
[](https://pepy.tech/project/magic-pdf)
|
51
|
+
|
52
|
+
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
53
|
+
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
54
|
+
[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
|
55
|
+
[](#)
|
56
|
+
|
49
57
|
<a href="https://trendshift.io/repositories/11174" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11174" alt="opendatalab%2FMinerU | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
50
58
|
|
51
59
|
<!-- language -->
|
60
|
+
|
52
61
|
[English](README.md) | [简体中文](README_zh-CN.md)
|
53
62
|
|
54
63
|
<!-- hot link -->
|
64
|
+
|
55
65
|
<p align="center">
|
56
66
|
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
|
57
67
|
</p>
|
58
68
|
|
59
69
|
<!-- join us -->
|
70
|
+
|
60
71
|
<p align="center">
|
61
72
|
👋 join us on <a href="https://discord.gg/Tdedn9GTXq" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
|
62
73
|
</p>
|
@@ -64,11 +75,14 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
64
75
|
</div>
|
65
76
|
|
66
77
|
# Changelog
|
78
|
+
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
|
79
|
+
- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
|
67
80
|
- 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
|
68
81
|
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
|
69
82
|
- 2024/07/05: Initial open-source release
|
70
83
|
|
71
84
|
<!-- TABLE OF CONTENT -->
|
85
|
+
|
72
86
|
<details open="open">
|
73
87
|
<summary><h2 style="display: inline-block">Table of Contents</h2></summary>
|
74
88
|
<ol>
|
@@ -107,10 +121,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
107
121
|
</ol>
|
108
122
|
</details>
|
109
123
|
|
110
|
-
|
111
|
-
|
112
124
|
# MinerU
|
125
|
+
|
113
126
|
## Project Introduction
|
127
|
+
|
114
128
|
MinerU is a tool that converts PDFs into machine-readable formats (e.g., markdown, JSON), allowing for easy extraction into any format.
|
115
129
|
MinerU was born during the pre-training process of [InternLM](https://github.com/InternLM/InternLM). We focus on solving symbol conversion issues in scientific literature and hope to contribute to technological development in the era of large models.
|
116
130
|
Compared to well-known commercial products, MinerU is still young. If you encounter any issues or if the results are not as expected, please submit an issue on [issue](https://github.com/opendatalab/MinerU/issues) and **attach the relevant PDF**.
|
@@ -134,6 +148,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
134
148
|
If you encounter any installation issues, please first consult the <a href="#faq">FAQ</a>. </br>
|
135
149
|
If the parsing results are not as expected, refer to the <a href="#known-issues">Known Issues</a>. </br>
|
136
150
|
There are three different ways to experience MinerU:
|
151
|
+
|
137
152
|
- [Online Demo (No Installation Required)](#online-demo)
|
138
153
|
- [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
|
139
154
|
- [Linux/Windows + CUDA](#Using-GPU)
|
@@ -191,7 +206,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
|
|
191
206
|
<tr>
|
192
207
|
<td colspan="2">Recommended Configuration 16G+ VRAM</td>
|
193
208
|
<td colspan="2">3090/3090ti/4070ti super/4080/4090<br>
|
194
|
-
16G or more can enable layout, formula recognition, and OCR acceleration simultaneously
|
209
|
+
16G or more can enable layout, formula recognition, and OCR acceleration simultaneously<br>
|
210
|
+
24G or more can enable layout, formula recognition, OCR acceleration and table recognition simultaneously
|
211
|
+
</td>
|
195
212
|
</tr>
|
196
213
|
</table>
|
197
214
|
|
@@ -202,51 +219,73 @@ In non-mainline environments, due to the diversity of hardware and software conf
|
|
202
219
|
### Quick CPU Demo
|
203
220
|
|
204
221
|
#### 1. Install magic-pdf
|
222
|
+
|
205
223
|
```bash
|
206
224
|
conda create -n MinerU python=3.10
|
207
225
|
conda activate MinerU
|
208
|
-
pip install magic-pdf[full]
|
226
|
+
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
|
209
227
|
```
|
228
|
+
|
210
229
|
#### 2. Download model weight files
|
211
230
|
|
212
231
|
Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for detailed instructions.
|
232
|
+
|
213
233
|
> ❗️After downloading the models, please make sure to verify the completeness of the model files.
|
214
|
-
>
|
234
|
+
>
|
215
235
|
> Check if the model file sizes match the description on the webpage. If possible, use sha256 to verify the integrity of the files.
|
216
236
|
|
217
237
|
#### 3. Copy and configure the template file
|
238
|
+
|
218
239
|
You can find the `magic-pdf.template.json` template configuration file in the root directory of the repository.
|
240
|
+
|
219
241
|
> ❗️Make sure to execute the following command to copy the configuration file to your **user directory**; otherwise, the program will not run.
|
220
|
-
>
|
242
|
+
>
|
221
243
|
> The user directory for Windows is `C:\Users\YourUsername`, for Linux it is `/home/YourUsername`, and for macOS it is `/Users/YourUsername`.
|
244
|
+
|
222
245
|
```bash
|
223
246
|
cp magic-pdf.template.json ~/magic-pdf.json
|
224
247
|
```
|
225
248
|
|
226
249
|
Find the `magic-pdf.json` file in your user directory and configure the "models-dir" path to point to the directory where the model weight files were downloaded in [Step 2](#2-download-model-weight-files).
|
250
|
+
|
227
251
|
> ❗️Make sure to correctly configure the **absolute path** to the model weight files directory, otherwise the program will not run because it can't find the model files.
|
228
252
|
>
|
229
253
|
> On Windows, this path should include the drive letter and all backslashes (`\`) in the path should be replaced with forward slashes (`/`) to avoid syntax errors in the JSON file due to escape sequences.
|
230
|
-
>
|
254
|
+
>
|
231
255
|
> For example: If the models are stored in the "models" directory at the root of the D drive, the "model-dir" value should be `D:/models`.
|
256
|
+
|
232
257
|
```json
|
233
258
|
{
|
234
259
|
// other config
|
235
260
|
"models-dir": "D:/models",
|
236
261
|
"table-config": {
|
262
|
+
"model": "TableMaster", // Another option of this value is 'struct_eqtable'
|
237
263
|
"is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
|
238
264
|
"max_time": 400
|
239
265
|
}
|
240
266
|
}
|
241
267
|
```
|
242
268
|
|
243
|
-
|
244
269
|
### Using GPU
|
270
|
+
|
245
271
|
If your device supports CUDA and meets the GPU requirements of the mainline environment, you can use GPU acceleration. Please select the appropriate guide based on your system:
|
246
272
|
|
247
273
|
- [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
|
248
274
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
249
|
-
|
275
|
+
- Quick Deployment with Docker
|
276
|
+
> Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
|
277
|
+
>
|
278
|
+
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
279
|
+
>
|
280
|
+
> ```bash
|
281
|
+
> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
|
282
|
+
> ```
|
283
|
+
```bash
|
284
|
+
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
|
285
|
+
docker build -t mineru:latest .
|
286
|
+
docker run --rm -it --gpus=all mineru:latest /bin/bash
|
287
|
+
magic-pdf --help
|
288
|
+
```
|
250
289
|
|
251
290
|
## Usage
|
252
291
|
|
@@ -260,12 +299,12 @@ Options:
|
|
260
299
|
-v, --version display the version and exit
|
261
300
|
-p, --path PATH local pdf filepath or directory [required]
|
262
301
|
-o, --output-dir TEXT output local directory
|
263
|
-
-m, --method [ocr|txt|auto] the method for parsing pdf.
|
302
|
+
-m, --method [ocr|txt|auto] the method for parsing pdf.
|
264
303
|
ocr: using ocr technique to extract information from pdf,
|
265
304
|
txt: suitable for the text-based pdf only and outperform ocr,
|
266
305
|
auto: automatically choose the best method for parsing pdf
|
267
306
|
from ocr and txt.
|
268
|
-
without method specified, auto will be used by default.
|
307
|
+
without method specified, auto will be used by default.
|
269
308
|
--help Show this message and exit.
|
270
309
|
|
271
310
|
|
@@ -280,13 +319,13 @@ magic-pdf -p {some_pdf} -o {some_output_dir} -m auto
|
|
280
319
|
The results will be saved in the `{some_output_dir}` directory. The output file list is as follows:
|
281
320
|
|
282
321
|
```text
|
283
|
-
├── some_pdf.md
|
284
|
-
├── images
|
285
|
-
├──
|
286
|
-
├──
|
287
|
-
├──
|
288
|
-
├──
|
289
|
-
└──
|
322
|
+
├── some_pdf.md # markdown file
|
323
|
+
├── images # directory for storing images
|
324
|
+
├── some_pdf_layout.pdf # layout diagram
|
325
|
+
├── some_pdf_middle.json # MinerU intermediate processing result
|
326
|
+
├── some_pdf_model.json # model inference result
|
327
|
+
├── some_pdf_origin.pdf # original PDF file
|
328
|
+
└── some_pdf_spans.pdf # smallest granularity bbox position information diagram
|
290
329
|
```
|
291
330
|
|
292
331
|
For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
@@ -294,6 +333,7 @@ For more information about the output files, please refer to the [Output File De
|
|
294
333
|
### API
|
295
334
|
|
296
335
|
Processing files from local disk
|
336
|
+
|
297
337
|
```python
|
298
338
|
image_writer = DiskReaderWriter(local_image_dir)
|
299
339
|
image_dir = str(os.path.basename(local_image_dir))
|
@@ -306,6 +346,7 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
|
306
346
|
```
|
307
347
|
|
308
348
|
Processing files from object storage
|
349
|
+
|
309
350
|
```python
|
310
351
|
s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
|
311
352
|
image_dir = "s3://img_bucket/"
|
@@ -320,10 +361,10 @@ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
|
|
320
361
|
```
|
321
362
|
|
322
363
|
For detailed implementation, refer to:
|
364
|
+
|
323
365
|
- [demo.py Simplest Processing Method](demo/demo.py)
|
324
366
|
- [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
|
325
367
|
|
326
|
-
|
327
368
|
### Development Guide
|
328
369
|
|
329
370
|
TODO
|
@@ -335,30 +376,25 @@ TODO
|
|
335
376
|
- [ ] Code block recognition within the text
|
336
377
|
- [ ] Table of contents recognition
|
337
378
|
- [x] Table recognition
|
338
|
-
- [ ] Chemical formula recognition
|
379
|
+
- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
|
339
380
|
- [ ] Geometric shape recognition
|
340
381
|
|
341
382
|
# Known Issues
|
383
|
+
|
342
384
|
- Reading order is segmented based on rules, which can cause disordered sequences in some cases
|
343
385
|
- Vertical text is not supported
|
344
386
|
- Lists, code blocks, and table of contents are not yet supported in the layout model
|
345
387
|
- Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
|
346
388
|
- Enabling OCR may produce better results in PDFs with a high density of formulas
|
347
389
|
- If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
|
348
|
-
- **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
|
349
390
|
|
350
|
-
| Table Size | Parsing Time |
|
351
|
-
|---------------|----------------------------|
|
352
|
-
| 6\*5 55kb | 37s |
|
353
|
-
| 16\*12 284kb | 3m18s |
|
354
|
-
| 44\*7 559kb | 4m12s |
|
355
391
|
|
356
392
|
# FAQ
|
393
|
+
|
357
394
|
[FAQ in Chinese](docs/FAQ_zh_cn.md)
|
358
395
|
|
359
396
|
[FAQ in English](docs/FAQ_en_us.md)
|
360
397
|
|
361
|
-
|
362
398
|
# All Thanks To Our Contributors
|
363
399
|
|
364
400
|
<a href="https://github.com/opendatalab/MinerU/graphs/contributors">
|
@@ -371,8 +407,8 @@ TODO
|
|
371
407
|
|
372
408
|
This project currently uses PyMuPDF to achieve advanced functionality. However, since it adheres to the AGPL license, it may impose restrictions on certain usage scenarios. In future iterations, we plan to explore and replace it with a more permissive PDF processing library to enhance user-friendliness and flexibility.
|
373
409
|
|
374
|
-
|
375
410
|
# Acknowledgments
|
411
|
+
|
376
412
|
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
377
413
|
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
|
378
414
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
@@ -409,9 +445,11 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
|
|
409
445
|
</a>
|
410
446
|
|
411
447
|
# Magic-doc
|
448
|
+
|
412
449
|
[Magic-Doc](https://github.com/InternLM/magic-doc) Fast speed ppt/pptx/doc/docx/pdf extraction tool
|
413
450
|
|
414
451
|
# Magic-html
|
452
|
+
|
415
453
|
[Magic-HTML](https://github.com/opendatalab/magic-html) Mixed web page extraction tool
|
416
454
|
|
417
455
|
# Links
|