magic-pdf 0.10.6__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +2 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +13 -1
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +14 -13
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +8 -12
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -125
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +33 -22
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +30 -4
- magic_pdf/model/sub_modules/model_utils.py +8 -2
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/{model/operators.py → operators/models.py} +2 -38
- magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
- magic_pdf/pdf_parse_union_core_v2.py +68 -17
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +28 -18
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +73 -23
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +50 -53
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -22
- magic_pdf/pdf_parse_by_txt.py +0 -23
- magic_pdf/pipe/AbsPipe.py +0 -99
- magic_pdf/pipe/OCRPipe.py +0 -80
- magic_pdf/pipe/TXTPipe.py +0 -42
- magic_pdf/pipe/UNIPipe.py +0 -150
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -144
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,14 @@ def remove_overlaps_low_confidence_spans(spans):
|
|
33
33
|
return spans, dropped_spans
|
34
34
|
|
35
35
|
|
36
|
+
def check_chars_is_overlap_in_span(chars):
    """Report whether any two characters of a span nearly coincide.

    Every unordered pair of entries in ``chars`` is compared once. A pair
    counts as overlapping when ``calculate_iou`` of their ``'bbox'`` values
    is greater than 0.9.

    Args:
        chars: Sequence of items that each expose a ``'bbox'`` key.

    Returns:
        True as soon as one overlapping pair is found, otherwise False.
    """
    return any(
        calculate_iou(first['bbox'], second['bbox']) > 0.9
        for idx, first in enumerate(chars)
        for second in chars[idx + 1:]
    )
|
42
|
+
|
43
|
+
|
36
44
|
def remove_overlaps_min_spans(spans):
|
37
45
|
dropped_spans = []
|
38
46
|
# 删除重叠spans中较小的那些
|
Binary file
|
magic_pdf/tools/cli.py
CHANGED
@@ -1,13 +1,20 @@
|
|
1
1
|
import os
|
2
|
-
|
3
|
-
|
2
|
+
import shutil
|
3
|
+
import tempfile
|
4
4
|
import click
|
5
|
+
import fitz
|
5
6
|
from loguru import logger
|
7
|
+
from pathlib import Path
|
6
8
|
|
7
9
|
import magic_pdf.model as model_config
|
8
10
|
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
9
11
|
from magic_pdf.libs.version import __version__
|
10
12
|
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
13
|
+
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
|
14
|
+
|
15
|
+
pdf_suffixes = ['.pdf']
|
16
|
+
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
|
17
|
+
image_suffixes = ['.png', '.jpeg', '.jpg']
|
11
18
|
|
12
19
|
|
13
20
|
@click.command()
|
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
|
21
28
|
'path',
|
22
29
|
type=click.Path(exists=True),
|
23
30
|
required=True,
|
24
|
-
help='local
|
31
|
+
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
|
25
32
|
)
|
26
33
|
@click.option(
|
27
34
|
'-o',
|
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
|
83
90
|
model_config.__use_inside_model__ = True
|
84
91
|
model_config.__model_mode__ = 'full'
|
85
92
|
os.makedirs(output_dir, exist_ok=True)
|
93
|
+
temp_dir = tempfile.mkdtemp()
|
94
|
+
def read_fn(path: Path):
    """Return the PDF bytes for ``path``, converting non-PDF inputs first.

    Office documents are converted with ``convert_file_to_pdf`` and images
    with ``fitz``; both results are written into ``temp_dir`` as
    ``<stem>.pdf``. PDF inputs are read directly from their own location.

    Args:
        path: File to load. Its suffix is matched case-insensitively against
            ``ms_office_suffixes``, ``image_suffixes`` and ``pdf_suffixes``.

    Returns:
        Whatever ``FileBasedDataReader.read`` returns for the resulting PDF.

    Raises:
        Exception: If the suffix is not one of the supported suffixes.
    """
    # Compare in lower case so that e.g. ``.PDF`` or ``.JPG`` are accepted too;
    # the suffix lists above only contain lower-case entries.
    suffix = path.suffix.lower()
    if suffix in ms_office_suffixes:
        convert_file_to_pdf(str(path), temp_dir)
        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
    elif suffix in image_suffixes:
        with open(str(path), 'rb') as f:
            bits = f.read()
        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
        fn = os.path.join(temp_dir, f"{path.stem}.pdf")
        with open(fn, 'wb') as f:
            f.write(pdf_bytes)
    elif suffix in pdf_suffixes:
        fn = str(path)
    else:
        raise Exception(f"Unknown file suffix: {path.suffix}")

    # Read through the project's reader, rooted at the directory holding the PDF.
    disk_rw = FileBasedDataReader(os.path.dirname(fn))
    return disk_rw.read(os.path.basename(fn))
|
86
112
|
|
87
|
-
def
|
88
|
-
disk_rw = FileBasedDataReader(os.path.dirname(path))
|
89
|
-
return disk_rw.read(os.path.basename(path))
|
90
|
-
|
91
|
-
def parse_doc(doc_path: str):
|
113
|
+
def parse_doc(doc_path: Path):
|
92
114
|
try:
|
93
115
|
file_name = str(Path(doc_path).stem)
|
94
116
|
pdf_data = read_fn(doc_path)
|
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
|
108
130
|
logger.exception(e)
|
109
131
|
|
110
132
|
if os.path.isdir(path):
|
111
|
-
for doc_path in Path(path).glob('
|
112
|
-
|
133
|
+
for doc_path in Path(path).glob('*'):
|
134
|
+
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
|
135
|
+
parse_doc(doc_path)
|
113
136
|
else:
|
114
|
-
parse_doc(path)
|
137
|
+
parse_doc(Path(path))
|
138
|
+
|
139
|
+
shutil.rmtree(temp_dir)
|
115
140
|
|
116
141
|
|
117
142
|
if __name__ == '__main__':
|
magic_pdf/tools/common.py
CHANGED
@@ -9,8 +9,9 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
|
|
9
9
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
10
10
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
11
11
|
from magic_pdf.data.dataset import PymuDocDataset
|
12
|
+
from magic_pdf.libs.draw_bbox import draw_char_bbox
|
12
13
|
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
13
|
-
from magic_pdf.
|
14
|
+
from magic_pdf.operators.models import InferenceResult
|
14
15
|
|
15
16
|
# from io import BytesIO
|
16
17
|
# from pypdf import PdfReader, PdfWriter
|
@@ -83,6 +84,7 @@ def do_parse(
|
|
83
84
|
f_make_md_mode=MakeMode.MM_MD,
|
84
85
|
f_draw_model_bbox=False,
|
85
86
|
f_draw_line_sort_bbox=False,
|
87
|
+
f_draw_char_bbox=False,
|
86
88
|
start_page_id=0,
|
87
89
|
end_page_id=None,
|
88
90
|
lang=None,
|
@@ -94,9 +96,7 @@ def do_parse(
|
|
94
96
|
logger.warning('debug mode is on')
|
95
97
|
f_draw_model_bbox = True
|
96
98
|
f_draw_line_sort_bbox = True
|
97
|
-
|
98
|
-
if lang == '':
|
99
|
-
lang = None
|
99
|
+
# f_draw_char_bbox = True
|
100
100
|
|
101
101
|
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
|
102
102
|
pdf_bytes, start_page_id, end_page_id
|
@@ -109,7 +109,7 @@ def do_parse(
|
|
109
109
|
)
|
110
110
|
image_dir = str(os.path.basename(local_image_dir))
|
111
111
|
|
112
|
-
ds = PymuDocDataset(pdf_bytes)
|
112
|
+
ds = PymuDocDataset(pdf_bytes, lang=lang)
|
113
113
|
|
114
114
|
if len(model_list) == 0:
|
115
115
|
if model_config.__use_inside_model__:
|
@@ -118,50 +118,50 @@ def do_parse(
|
|
118
118
|
infer_result = ds.apply(
|
119
119
|
doc_analyze,
|
120
120
|
ocr=False,
|
121
|
-
lang=
|
121
|
+
lang=ds._lang,
|
122
122
|
layout_model=layout_model,
|
123
123
|
formula_enable=formula_enable,
|
124
124
|
table_enable=table_enable,
|
125
125
|
)
|
126
126
|
pipe_result = infer_result.pipe_txt_mode(
|
127
|
-
image_writer, debug_mode=True, lang=
|
127
|
+
image_writer, debug_mode=True, lang=ds._lang
|
128
128
|
)
|
129
129
|
else:
|
130
130
|
infer_result = ds.apply(
|
131
131
|
doc_analyze,
|
132
132
|
ocr=True,
|
133
|
-
lang=
|
133
|
+
lang=ds._lang,
|
134
134
|
layout_model=layout_model,
|
135
135
|
formula_enable=formula_enable,
|
136
136
|
table_enable=table_enable,
|
137
137
|
)
|
138
138
|
pipe_result = infer_result.pipe_ocr_mode(
|
139
|
-
image_writer, debug_mode=True, lang=
|
139
|
+
image_writer, debug_mode=True, lang=ds._lang
|
140
140
|
)
|
141
141
|
|
142
142
|
elif parse_method == 'txt':
|
143
143
|
infer_result = ds.apply(
|
144
144
|
doc_analyze,
|
145
145
|
ocr=False,
|
146
|
-
lang=
|
146
|
+
lang=ds._lang,
|
147
147
|
layout_model=layout_model,
|
148
148
|
formula_enable=formula_enable,
|
149
149
|
table_enable=table_enable,
|
150
150
|
)
|
151
151
|
pipe_result = infer_result.pipe_txt_mode(
|
152
|
-
image_writer, debug_mode=True, lang=
|
152
|
+
image_writer, debug_mode=True, lang=ds._lang
|
153
153
|
)
|
154
154
|
elif parse_method == 'ocr':
|
155
155
|
infer_result = ds.apply(
|
156
156
|
doc_analyze,
|
157
157
|
ocr=True,
|
158
|
-
lang=
|
158
|
+
lang=ds._lang,
|
159
159
|
layout_model=layout_model,
|
160
160
|
formula_enable=formula_enable,
|
161
161
|
table_enable=table_enable,
|
162
162
|
)
|
163
163
|
pipe_result = infer_result.pipe_ocr_mode(
|
164
|
-
image_writer, debug_mode=True, lang=
|
164
|
+
image_writer, debug_mode=True, lang=ds._lang
|
165
165
|
)
|
166
166
|
else:
|
167
167
|
logger.error('unknown parse method')
|
@@ -170,19 +170,26 @@ def do_parse(
|
|
170
170
|
logger.error('need model list input')
|
171
171
|
exit(2)
|
172
172
|
else:
|
173
|
+
|
173
174
|
infer_result = InferenceResult(model_list, ds)
|
174
175
|
if parse_method == 'ocr':
|
175
176
|
pipe_result = infer_result.pipe_ocr_mode(
|
176
|
-
image_writer, debug_mode=True, lang=
|
177
|
+
image_writer, debug_mode=True, lang=ds._lang
|
177
178
|
)
|
178
179
|
elif parse_method == 'txt':
|
179
180
|
pipe_result = infer_result.pipe_txt_mode(
|
180
|
-
image_writer, debug_mode=True, lang=
|
181
|
+
image_writer, debug_mode=True, lang=ds._lang
|
181
182
|
)
|
182
183
|
else:
|
183
|
-
|
184
|
-
|
185
|
-
|
184
|
+
if ds.classify() == SupportedPdfParseMethod.TXT:
|
185
|
+
pipe_result = infer_result.pipe_txt_mode(
|
186
|
+
image_writer, debug_mode=True, lang=ds._lang
|
187
|
+
)
|
188
|
+
else:
|
189
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
190
|
+
image_writer, debug_mode=True, lang=ds._lang
|
191
|
+
)
|
192
|
+
|
186
193
|
|
187
194
|
if f_draw_model_bbox:
|
188
195
|
infer_result.draw_model(
|
@@ -201,6 +208,9 @@ def do_parse(
|
|
201
208
|
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
|
202
209
|
)
|
203
210
|
|
211
|
+
if f_draw_char_bbox:
|
212
|
+
draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
|
213
|
+
|
204
214
|
if f_dump_md:
|
205
215
|
pipe_result.dump_md(
|
206
216
|
md_writer,
|
@@ -0,0 +1,29 @@
|
|
1
|
+
import os
|
2
|
+
import subprocess
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
|
6
|
+
class ConvertToPdfError(Exception):
    """Error carrying a message about a failed conversion to PDF.

    The message is kept on ``msg`` and is also passed to ``Exception``.
    """

    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg
|
10
|
+
|
11
|
+
|
12
|
+
def convert_file_to_pdf(input_path, output_dir):
    """Convert a document to PDF by running headless ``soffice``.

    Args:
        input_path: Path of the file to convert; it must be an existing file.
        output_dir: Directory passed to ``soffice --outdir``; it is created
            if it does not exist yet.

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing file. The
            same exception type propagates from ``subprocess.run`` when the
            ``soffice`` executable cannot be found.
        ConvertToPdfError: If ``soffice`` exits with a non-zero status; the
            message is the captured stderr.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    # Argument list (no shell), so paths containing spaces are passed intact.
    cmd = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        # stderr is not guaranteed to be valid UTF-8; decode leniently so the
        # real failure is not masked by a UnicodeDecodeError.
        raise ConvertToPdfError(process.stderr.decode(errors='replace'))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.
|
3
|
+
Version: 1.0.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,17 +9,17 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist: fast-langdetect
|
12
|
+
Requires-Dist: fast-langdetect>=0.2.3
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
-
Requires-Dist: pydantic
|
15
|
+
Requires-Dist: pydantic>=2.7.2
|
16
16
|
Requires-Dist: PyMuPDF>=1.24.9
|
17
17
|
Requires-Dist: scikit-learn>=1.0.2
|
18
18
|
Requires-Dist: torch>=2.2.2
|
19
19
|
Requires-Dist: transformers
|
20
20
|
Requires-Dist: pdfminer.six==20231228
|
21
21
|
Provides-Extra: full
|
22
|
-
Requires-Dist: unimernet==0.2.
|
22
|
+
Requires-Dist: unimernet==0.2.3; extra == "full"
|
23
23
|
Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
|
24
24
|
Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
|
25
25
|
Requires-Dist: ultralytics>=8.3.48; extra == "full"
|
@@ -29,8 +29,10 @@ Requires-Dist: einops; extra == "full"
|
|
29
29
|
Requires-Dist: accelerate; extra == "full"
|
30
30
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
31
31
|
Requires-Dist: rapidocr-paddle; extra == "full"
|
32
|
-
Requires-Dist:
|
32
|
+
Requires-Dist: rapidocr-onnxruntime; extra == "full"
|
33
|
+
Requires-Dist: rapid-table==0.3.0; extra == "full"
|
33
34
|
Requires-Dist: PyYAML; extra == "full"
|
35
|
+
Requires-Dist: openai; extra == "full"
|
34
36
|
Requires-Dist: detectron2; extra == "full"
|
35
37
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
36
38
|
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
@@ -59,7 +61,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
59
61
|
[](https://pepy.tech/project/magic-pdf)
|
60
62
|
[](https://pepy.tech/project/magic-pdf)
|
61
63
|
|
62
|
-
[](https://
|
64
|
+
[](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
|
63
65
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
64
66
|
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
65
67
|
[](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
|
@@ -76,6 +78,11 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
76
78
|
|
77
79
|
<p align="center">
|
78
80
|
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
|
81
|
+
<br>
|
82
|
+
<br>
|
83
|
+
<a href="https://mineru.org.cn/client?source=github">
|
84
|
+
Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple interface and smooth interactions. Enjoy it without any fuss!</a>🚀🚀🚀
|
85
|
+
|
79
86
|
</p>
|
80
87
|
|
81
88
|
<!-- join us -->
|
@@ -87,6 +94,15 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
87
94
|
</div>
|
88
95
|
|
89
96
|
# Changelog
|
97
|
+
- 2025/01/10 1.0.0 released. This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:
|
98
|
+
- New API Interface
|
99
|
+
- For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.
|
100
|
+
- For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.
|
101
|
+
- Enhanced Compatibility
|
102
|
+
- By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.
|
103
|
+
- We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
|
104
|
+
- Automatic Language Identification
|
105
|
+
- By introducing a new language recognition model, setting the `lang` configuration to `auto` during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.
|
90
106
|
- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
|
91
107
|
- Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
|
92
108
|
- Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
|
@@ -126,6 +142,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
126
142
|
<li><a href="#online-demo">Online Demo</a></li>
|
127
143
|
<li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
|
128
144
|
<li><a href="#using-gpu">Using GPU</a></li>
|
145
|
+
<li><a href="#using-npu">Using NPU</a></li>
|
129
146
|
</ul>
|
130
147
|
</li>
|
131
148
|
<li><a href="#usage">Usage</a>
|
@@ -174,7 +191,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
174
191
|
- OCR supports detection and recognition of 84 languages.
|
175
192
|
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
|
176
193
|
- Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
|
177
|
-
- Supports
|
194
|
+
- Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration.
|
178
195
|
- Compatible with Windows, Linux, and Mac platforms.
|
179
196
|
|
180
197
|
## Quick Start
|
@@ -185,7 +202,10 @@ There are three different ways to experience MinerU:
|
|
185
202
|
|
186
203
|
- [Online Demo (No Installation Required)](#online-demo)
|
187
204
|
- [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
|
188
|
-
-
|
205
|
+
- Accelerate inference by using CUDA/CANN/MPS
|
206
|
+
- [Linux/Windows + CUDA](#using-gpu)
|
207
|
+
- [Linux + CANN](#using-npu)
|
208
|
+
- [MacOS + MPS](#using-mps)
|
189
209
|
|
190
210
|
> [!WARNING]
|
191
211
|
> **Pre-installation Notice—Hardware and Software Environment Support**
|
@@ -201,20 +221,24 @@ There are three different ways to experience MinerU:
|
|
201
221
|
<td colspan="3" rowspan="2">Operating System</td>
|
202
222
|
</tr>
|
203
223
|
<tr>
|
204
|
-
<td>
|
224
|
+
<td>Linux after 2019</td>
|
205
225
|
<td>Windows 10 / 11</td>
|
206
226
|
<td>macOS 11+</td>
|
207
227
|
</tr>
|
208
228
|
<tr>
|
209
229
|
<td colspan="3">CPU</td>
|
210
|
-
<td>x86_64
|
230
|
+
<td>x86_64 / arm64</td>
|
211
231
|
<td>x86_64(unsupported ARM Windows)</td>
|
212
232
|
<td>x86_64 / arm64</td>
|
213
233
|
</tr>
|
214
234
|
<tr>
|
215
|
-
<td colspan="3">Memory</td>
|
235
|
+
<td colspan="3">Memory Requirements</td>
|
216
236
|
<td colspan="3">16GB or more, recommended 32GB+</td>
|
217
237
|
</tr>
|
238
|
+
<tr>
|
239
|
+
<td colspan="3">Storage Requirements</td>
|
240
|
+
<td colspan="3">20GB or more, with a preference for SSD</td>
|
241
|
+
</tr>
|
218
242
|
<tr>
|
219
243
|
<td colspan="3">Python Version</td>
|
220
244
|
<td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
|
@@ -231,6 +255,12 @@ There are three different ways to experience MinerU:
|
|
231
255
|
<td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
|
232
256
|
<td>None</td>
|
233
257
|
</tr>
|
258
|
+
<tr>
|
259
|
+
<td colspan="3">CANN Environment(NPU support)</td>
|
260
|
+
<td>8.0+(Ascend 910b)</td>
|
261
|
+
<td>None</td>
|
262
|
+
<td>None</td>
|
263
|
+
</tr>
|
234
264
|
<tr>
|
235
265
|
<td rowspan="2">GPU Hardware Support List</td>
|
236
266
|
<td colspan="2">GPU VRAM 8GB or more</td>
|
@@ -243,7 +273,7 @@ There are three different ways to experience MinerU:
|
|
243
273
|
### Online Demo
|
244
274
|
|
245
275
|
Stable Version (Stable version verified by QA):
|
246
|
-
[](https://
|
276
|
+
[](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
|
247
277
|
|
248
278
|
Test Version (Synced with dev branch updates, testing new features):
|
249
279
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
@@ -256,7 +286,7 @@ Test Version (Synced with dev branch updates, testing new features):
|
|
256
286
|
```bash
|
257
287
|
conda create -n MinerU python=3.10
|
258
288
|
conda activate MinerU
|
259
|
-
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
|
289
|
+
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
|
260
290
|
```
|
261
291
|
|
262
292
|
#### 2. Download model weight files
|
@@ -281,7 +311,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
281
311
|
{
|
282
312
|
// other config
|
283
313
|
"layout-config": {
|
284
|
-
"model": "
|
314
|
+
"model": "doclayout_yolo" // Please change to "layoutlmv3" when using layoutlmv3.
|
285
315
|
},
|
286
316
|
"formula-config": {
|
287
317
|
"mfd_model": "yolo_v8_mfd",
|
@@ -290,7 +320,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
290
320
|
},
|
291
321
|
"table-config": {
|
292
322
|
"model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
|
293
|
-
"enable":
|
323
|
+
"enable": true, // The table recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
294
324
|
"max_time": 400
|
295
325
|
}
|
296
326
|
}
|
@@ -312,29 +342,49 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
312
342
|
> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
|
313
343
|
> ```
|
314
344
|
```bash
|
315
|
-
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
|
345
|
+
wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
|
316
346
|
docker build -t mineru:latest .
|
317
|
-
docker run --rm -it --gpus=all mineru:latest /bin/bash
|
347
|
+
docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
|
318
348
|
magic-pdf --help
|
319
349
|
```
|
320
350
|
|
351
|
+
### Using NPU
|
352
|
+
|
353
|
+
If your device has NPU acceleration hardware, you can follow the tutorial below to use NPU acceleration:
|
354
|
+
|
355
|
+
[Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
|
356
|
+
|
357
|
+
### Using MPS
|
358
|
+
|
359
|
+
If your device uses Apple silicon chips, you can enable MPS acceleration for certain supported tasks (such as layout detection and formula detection).
|
360
|
+
|
361
|
+
You can enable MPS acceleration by setting the `device-mode` parameter to `mps` in the `magic-pdf.json` configuration file.
|
362
|
+
|
363
|
+
```json
|
364
|
+
{
|
365
|
+
// other config
|
366
|
+
"device-mode": "mps"
|
367
|
+
}
|
368
|
+
```
|
369
|
+
|
370
|
+
> [!TIP]
|
371
|
+
> Since the formula recognition task cannot utilize MPS acceleration, you can disable the formula recognition feature in tasks where it is not needed to achieve optimal performance.
|
372
|
+
>
|
373
|
+
> You can disable the formula recognition feature by setting the `enable` parameter in the `formula-config` section to `false`.
|
374
|
+
|
321
375
|
## Usage
|
322
376
|
|
323
377
|
### Command Line
|
324
378
|
|
325
|
-
[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/
|
379
|
+
[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/usage/command_line.html)
|
326
380
|
|
327
381
|
> [!TIP]
|
328
382
|
> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
329
383
|
|
330
384
|
### API
|
331
385
|
|
332
|
-
[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/
|
333
|
-
|
334
|
-
For detailed implementation, refer to:
|
386
|
+
[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/usage/api.html)
|
335
387
|
|
336
|
-
- [demo.py Simplest Processing Method](demo/demo.py)
|
337
|
-
- [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
|
338
388
|
|
339
389
|
### Deploy Derived Projects
|
340
390
|
|