magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
import copy
|
2
|
-
import json as json_parse
|
3
1
|
import os
|
4
2
|
|
5
3
|
import click
|
@@ -7,13 +5,13 @@ import fitz
|
|
7
5
|
from loguru import logger
|
8
6
|
|
9
7
|
import magic_pdf.model as model_config
|
8
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
10
9
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
11
10
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
12
|
-
from magic_pdf.
|
13
|
-
|
14
|
-
from magic_pdf.
|
15
|
-
from magic_pdf.
|
16
|
-
from magic_pdf.pipe.UNIPipe import UNIPipe
|
11
|
+
from magic_pdf.data.dataset import PymuDocDataset
|
12
|
+
from magic_pdf.libs.draw_bbox import draw_char_bbox
|
13
|
+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
14
|
+
from magic_pdf.operators.models import InferenceResult
|
17
15
|
|
18
16
|
# from io import BytesIO
|
19
17
|
# from pypdf import PdfReader, PdfWriter
|
@@ -56,7 +54,11 @@ def prepare_env(output_dir, pdf_file_name, method):
|
|
56
54
|
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
57
55
|
document = fitz.open('pdf', pdf_bytes)
|
58
56
|
output_document = fitz.open()
|
59
|
-
end_page_id =
|
57
|
+
end_page_id = (
|
58
|
+
end_page_id
|
59
|
+
if end_page_id is not None and end_page_id >= 0
|
60
|
+
else len(document) - 1
|
61
|
+
)
|
60
62
|
if end_page_id > len(document) - 1:
|
61
63
|
logger.warning('end_page_id is out of range, use pdf_docs length')
|
62
64
|
end_page_id = len(document) - 1
|
@@ -82,6 +84,7 @@ def do_parse(
|
|
82
84
|
f_make_md_mode=MakeMode.MM_MD,
|
83
85
|
f_draw_model_bbox=False,
|
84
86
|
f_draw_line_sort_bbox=False,
|
87
|
+
f_draw_char_bbox=False,
|
85
88
|
start_page_id=0,
|
86
89
|
end_page_id=None,
|
87
90
|
lang=None,
|
@@ -93,79 +96,135 @@ def do_parse(
|
|
93
96
|
logger.warning('debug mode is on')
|
94
97
|
f_draw_model_bbox = True
|
95
98
|
f_draw_line_sort_bbox = True
|
99
|
+
# f_draw_char_bbox = True
|
96
100
|
|
97
|
-
|
98
|
-
|
101
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
|
102
|
+
pdf_bytes, start_page_id, end_page_id
|
103
|
+
)
|
99
104
|
|
100
|
-
|
105
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
101
106
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
image_writer, md_writer = FileBasedDataWriter(
|
107
|
-
local_image_dir), FileBasedDataWriter(local_md_dir)
|
107
|
+
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
108
|
+
local_md_dir
|
109
|
+
)
|
108
110
|
image_dir = str(os.path.basename(local_image_dir))
|
109
111
|
|
110
|
-
|
111
|
-
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
112
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
113
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
114
|
-
lang=lang,
|
115
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
116
|
-
elif parse_method == 'txt':
|
117
|
-
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
118
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
119
|
-
lang=lang,
|
120
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
121
|
-
elif parse_method == 'ocr':
|
122
|
-
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
123
|
-
# start_page_id=start_page_id, end_page_id=end_page_id,
|
124
|
-
lang=lang,
|
125
|
-
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
126
|
-
else:
|
127
|
-
logger.error('unknown parse method')
|
128
|
-
exit(1)
|
129
|
-
|
130
|
-
pipe.pipe_classify()
|
112
|
+
ds = PymuDocDataset(pdf_bytes, lang=lang)
|
131
113
|
|
132
114
|
if len(model_list) == 0:
|
133
115
|
if model_config.__use_inside_model__:
|
134
|
-
|
135
|
-
|
116
|
+
if parse_method == 'auto':
|
117
|
+
if ds.classify() == SupportedPdfParseMethod.TXT:
|
118
|
+
infer_result = ds.apply(
|
119
|
+
doc_analyze,
|
120
|
+
ocr=False,
|
121
|
+
lang=ds._lang,
|
122
|
+
layout_model=layout_model,
|
123
|
+
formula_enable=formula_enable,
|
124
|
+
table_enable=table_enable,
|
125
|
+
)
|
126
|
+
pipe_result = infer_result.pipe_txt_mode(
|
127
|
+
image_writer, debug_mode=True, lang=ds._lang
|
128
|
+
)
|
129
|
+
else:
|
130
|
+
infer_result = ds.apply(
|
131
|
+
doc_analyze,
|
132
|
+
ocr=True,
|
133
|
+
lang=ds._lang,
|
134
|
+
layout_model=layout_model,
|
135
|
+
formula_enable=formula_enable,
|
136
|
+
table_enable=table_enable,
|
137
|
+
)
|
138
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
139
|
+
image_writer, debug_mode=True, lang=ds._lang
|
140
|
+
)
|
141
|
+
|
142
|
+
elif parse_method == 'txt':
|
143
|
+
infer_result = ds.apply(
|
144
|
+
doc_analyze,
|
145
|
+
ocr=False,
|
146
|
+
lang=ds._lang,
|
147
|
+
layout_model=layout_model,
|
148
|
+
formula_enable=formula_enable,
|
149
|
+
table_enable=table_enable,
|
150
|
+
)
|
151
|
+
pipe_result = infer_result.pipe_txt_mode(
|
152
|
+
image_writer, debug_mode=True, lang=ds._lang
|
153
|
+
)
|
154
|
+
elif parse_method == 'ocr':
|
155
|
+
infer_result = ds.apply(
|
156
|
+
doc_analyze,
|
157
|
+
ocr=True,
|
158
|
+
lang=ds._lang,
|
159
|
+
layout_model=layout_model,
|
160
|
+
formula_enable=formula_enable,
|
161
|
+
table_enable=table_enable,
|
162
|
+
)
|
163
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
164
|
+
image_writer, debug_mode=True, lang=ds._lang
|
165
|
+
)
|
166
|
+
else:
|
167
|
+
logger.error('unknown parse method')
|
168
|
+
exit(1)
|
136
169
|
else:
|
137
170
|
logger.error('need model list input')
|
138
171
|
exit(2)
|
172
|
+
else:
|
173
|
+
|
174
|
+
infer_result = InferenceResult(model_list, ds)
|
175
|
+
if parse_method == 'ocr':
|
176
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
177
|
+
image_writer, debug_mode=True, lang=ds._lang
|
178
|
+
)
|
179
|
+
elif parse_method == 'txt':
|
180
|
+
pipe_result = infer_result.pipe_txt_mode(
|
181
|
+
image_writer, debug_mode=True, lang=ds._lang
|
182
|
+
)
|
183
|
+
else:
|
184
|
+
if ds.classify() == SupportedPdfParseMethod.TXT:
|
185
|
+
pipe_result = infer_result.pipe_txt_mode(
|
186
|
+
image_writer, debug_mode=True, lang=ds._lang
|
187
|
+
)
|
188
|
+
else:
|
189
|
+
pipe_result = infer_result.pipe_ocr_mode(
|
190
|
+
image_writer, debug_mode=True, lang=ds._lang
|
191
|
+
)
|
192
|
+
|
193
|
+
|
194
|
+
if f_draw_model_bbox:
|
195
|
+
infer_result.draw_model(
|
196
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf')
|
197
|
+
)
|
139
198
|
|
140
|
-
pipe.pipe_parse()
|
141
|
-
pdf_info = pipe.pdf_mid_data['pdf_info']
|
142
199
|
if f_draw_layout_bbox:
|
143
|
-
|
200
|
+
pipe_result.draw_layout(
|
201
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf')
|
202
|
+
)
|
144
203
|
if f_draw_span_bbox:
|
145
|
-
|
146
|
-
|
147
|
-
draw_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)
|
204
|
+
pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
|
205
|
+
|
148
206
|
if f_draw_line_sort_bbox:
|
149
|
-
|
207
|
+
pipe_result.draw_line_sort(
|
208
|
+
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
|
209
|
+
)
|
210
|
+
|
211
|
+
if f_draw_char_bbox:
|
212
|
+
draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
|
150
213
|
|
151
|
-
md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
|
152
214
|
if f_dump_md:
|
153
|
-
|
215
|
+
pipe_result.dump_md(
|
216
|
+
md_writer,
|
154
217
|
f'{pdf_file_name}.md',
|
155
|
-
|
218
|
+
image_dir,
|
219
|
+
drop_mode=DropMode.NONE,
|
220
|
+
md_make_mode=f_make_md_mode,
|
156
221
|
)
|
157
222
|
|
158
223
|
if f_dump_middle_json:
|
159
|
-
md_writer.
|
160
|
-
f'{pdf_file_name}_middle.json',
|
161
|
-
json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
|
162
|
-
)
|
224
|
+
pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')
|
163
225
|
|
164
226
|
if f_dump_model_json:
|
165
|
-
md_writer.
|
166
|
-
f'{pdf_file_name}_model.json',
|
167
|
-
json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
|
168
|
-
)
|
227
|
+
infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')
|
169
228
|
|
170
229
|
if f_dump_orig_pdf:
|
171
230
|
md_writer.write(
|
@@ -173,11 +232,11 @@ def do_parse(
|
|
173
232
|
pdf_bytes,
|
174
233
|
)
|
175
234
|
|
176
|
-
content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
|
177
235
|
if f_dump_content_list:
|
178
|
-
|
236
|
+
pipe_result.dump_content_list(
|
237
|
+
md_writer,
|
179
238
|
f'{pdf_file_name}_content_list.json',
|
180
|
-
|
239
|
+
image_dir
|
181
240
|
)
|
182
241
|
|
183
242
|
logger.info(f'local output dir is {local_md_dir}')
|
@@ -0,0 +1,29 @@
|
|
1
|
+
import os
|
2
|
+
import subprocess
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
|
6
|
+
class ConvertToPdfError(Exception):
|
7
|
+
def __init__(self, msg):
|
8
|
+
self.msg = msg
|
9
|
+
super().__init__(self.msg)
|
10
|
+
|
11
|
+
|
12
|
+
def convert_file_to_pdf(input_path, output_dir):
|
13
|
+
if not os.path.isfile(input_path):
|
14
|
+
raise FileNotFoundError(f"The input file {input_path} does not exist.")
|
15
|
+
|
16
|
+
os.makedirs(output_dir, exist_ok=True)
|
17
|
+
|
18
|
+
cmd = [
|
19
|
+
'soffice',
|
20
|
+
'--headless',
|
21
|
+
'--convert-to', 'pdf',
|
22
|
+
'--outdir', str(output_dir),
|
23
|
+
str(input_path)
|
24
|
+
]
|
25
|
+
|
26
|
+
process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
27
|
+
|
28
|
+
if process.returncode != 0:
|
29
|
+
raise ConvertToPdfError(process.stderr.decode())
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.10.5
|
3
|
+
Version: 1.0.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,25 +9,30 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist: fast-langdetect
|
12
|
+
Requires-Dist: fast-langdetect>=0.2.3
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
-
Requires-Dist: pydantic
|
15
|
+
Requires-Dist: pydantic>=2.7.2
|
16
16
|
Requires-Dist: PyMuPDF>=1.24.9
|
17
17
|
Requires-Dist: scikit-learn>=1.0.2
|
18
|
-
Requires-Dist: torch
|
18
|
+
Requires-Dist: torch>=2.2.2
|
19
19
|
Requires-Dist: transformers
|
20
|
+
Requires-Dist: pdfminer.six==20231228
|
20
21
|
Provides-Extra: full
|
21
|
-
Requires-Dist: unimernet==0.2.
|
22
|
-
Requires-Dist:
|
22
|
+
Requires-Dist: unimernet==0.2.3; extra == "full"
|
23
|
+
Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
|
24
|
+
Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
|
25
|
+
Requires-Dist: ultralytics>=8.3.48; extra == "full"
|
23
26
|
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
24
27
|
Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
25
28
|
Requires-Dist: einops; extra == "full"
|
26
29
|
Requires-Dist: accelerate; extra == "full"
|
27
30
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
28
31
|
Requires-Dist: rapidocr-paddle; extra == "full"
|
29
|
-
Requires-Dist:
|
32
|
+
Requires-Dist: rapidocr-onnxruntime; extra == "full"
|
33
|
+
Requires-Dist: rapid-table==0.3.0; extra == "full"
|
30
34
|
Requires-Dist: PyYAML; extra == "full"
|
35
|
+
Requires-Dist: openai; extra == "full"
|
31
36
|
Requires-Dist: detectron2; extra == "full"
|
32
37
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
33
38
|
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
@@ -56,7 +61,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
56
61
|
[](https://pepy.tech/project/magic-pdf)
|
57
62
|
[](https://pepy.tech/project/magic-pdf)
|
58
63
|
|
59
|
-
[](https://
|
64
|
+
[](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
|
60
65
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
61
66
|
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
62
67
|
[](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
|
@@ -73,6 +78,11 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
73
78
|
|
74
79
|
<p align="center">
|
75
80
|
<a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit: High-Quality PDF Extraction Toolkit</a>🔥🔥🔥
|
81
|
+
<br>
|
82
|
+
<br>
|
83
|
+
<a href="https://mineru.org.cn/client?source=github">
|
84
|
+
Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple interface and smooth interactions. Enjoy it without any fuss!</a>🚀🚀🚀
|
85
|
+
|
76
86
|
</p>
|
77
87
|
|
78
88
|
<!-- join us -->
|
@@ -84,6 +94,15 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
84
94
|
</div>
|
85
95
|
|
86
96
|
# Changelog
|
97
|
+
- 2025/01/10 1.0.0 released. This is our first official release. It introduces a completely new API, enhances compatibility through extensive refactoring, and adds a brand-new automatic language identification feature:
|
98
|
+
- New API Interface
|
99
|
+
- For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.
|
100
|
+
- For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.
|
101
|
+
- Enhanced Compatibility
|
102
|
+
- By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.
|
103
|
+
- We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
|
104
|
+
- Automatic Language Identification
|
105
|
+
- With the new language recognition model, setting the `lang` configuration to `auto` during document parsing automatically selects the appropriate OCR language model, improving the accuracy of scanned document parsing.
|
87
106
|
- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
|
88
107
|
- Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
|
89
108
|
- Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
|
@@ -123,6 +142,7 @@ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
|
123
142
|
<li><a href="#online-demo">Online Demo</a></li>
|
124
143
|
<li><a href="#quick-cpu-demo">Quick CPU Demo</a></li>
|
125
144
|
<li><a href="#using-gpu">Using GPU</a></li>
|
145
|
+
<li><a href="#using-npu">Using NPU</a></li>
|
126
146
|
</ul>
|
127
147
|
</li>
|
128
148
|
<li><a href="#usage">Usage</a>
|
@@ -171,7 +191,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
|
|
171
191
|
- OCR supports detection and recognition of 84 languages.
|
172
192
|
- Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
|
173
193
|
- Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
|
174
|
-
- Supports
|
194
|
+
- Supports running in a pure CPU environment, and also supports GPU (CUDA) / NPU (CANN) / MPS acceleration.
|
175
195
|
- Compatible with Windows, Linux, and Mac platforms.
|
176
196
|
|
177
197
|
## Quick Start
|
@@ -182,7 +202,10 @@ There are three different ways to experience MinerU:
|
|
182
202
|
|
183
203
|
- [Online Demo (No Installation Required)](#online-demo)
|
184
204
|
- [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
|
185
|
-
-
|
205
|
+
- Accelerate inference by using CUDA/CANN/MPS
|
206
|
+
- [Linux/Windows + CUDA](#using-gpu)
|
207
|
+
- [Linux + CANN](#using-npu)
|
208
|
+
- [MacOS + MPS](#using-mps)
|
186
209
|
|
187
210
|
> [!WARNING]
|
188
211
|
> **Pre-installation Notice—Hardware and Software Environment Support**
|
@@ -198,20 +221,24 @@ There are three different ways to experience MinerU:
|
|
198
221
|
<td colspan="3" rowspan="2">Operating System</td>
|
199
222
|
</tr>
|
200
223
|
<tr>
|
201
|
-
<td>
|
224
|
+
<td>Linux after 2019</td>
|
202
225
|
<td>Windows 10 / 11</td>
|
203
226
|
<td>macOS 11+</td>
|
204
227
|
</tr>
|
205
228
|
<tr>
|
206
229
|
<td colspan="3">CPU</td>
|
207
|
-
<td>x86_64
|
230
|
+
<td>x86_64 / arm64</td>
|
208
231
|
<td>x86_64(unsupported ARM Windows)</td>
|
209
232
|
<td>x86_64 / arm64</td>
|
210
233
|
</tr>
|
211
234
|
<tr>
|
212
|
-
<td colspan="3">Memory</td>
|
235
|
+
<td colspan="3">Memory Requirements</td>
|
213
236
|
<td colspan="3">16GB or more, recommended 32GB+</td>
|
214
237
|
</tr>
|
238
|
+
<tr>
|
239
|
+
<td colspan="3">Storage Requirements</td>
|
240
|
+
<td colspan="3">20GB or more, with a preference for SSD</td>
|
241
|
+
</tr>
|
215
242
|
<tr>
|
216
243
|
<td colspan="3">Python Version</td>
|
217
244
|
<td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
|
@@ -228,6 +255,12 @@ There are three different ways to experience MinerU:
|
|
228
255
|
<td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
|
229
256
|
<td>None</td>
|
230
257
|
</tr>
|
258
|
+
<tr>
|
259
|
+
<td colspan="3">CANN Environment(NPU support)</td>
|
260
|
+
<td>8.0+(Ascend 910b)</td>
|
261
|
+
<td>None</td>
|
262
|
+
<td>None</td>
|
263
|
+
</tr>
|
231
264
|
<tr>
|
232
265
|
<td rowspan="2">GPU Hardware Support List</td>
|
233
266
|
<td colspan="2">GPU VRAM 8GB or more</td>
|
@@ -240,7 +273,7 @@ There are three different ways to experience MinerU:
|
|
240
273
|
### Online Demo
|
241
274
|
|
242
275
|
Stable Version (Stable version verified by QA):
|
243
|
-
[](https://
|
276
|
+
[](https://mineru.org.cn/OpenSourceTools/Extractor?source=github)
|
244
277
|
|
245
278
|
Test Version (Synced with dev branch updates, testing new features):
|
246
279
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
@@ -253,7 +286,7 @@ Test Version (Synced with dev branch updates, testing new features):
|
|
253
286
|
```bash
|
254
287
|
conda create -n MinerU python=3.10
|
255
288
|
conda activate MinerU
|
256
|
-
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
|
289
|
+
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
|
257
290
|
```
|
258
291
|
|
259
292
|
#### 2. Download model weight files
|
@@ -278,7 +311,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
278
311
|
{
|
279
312
|
// other config
|
280
313
|
"layout-config": {
|
281
|
-
"model": "
|
314
|
+
"model": "doclayout_yolo" // Please change to "layoutlmv3" when using layoutlmv3.
|
282
315
|
},
|
283
316
|
"formula-config": {
|
284
317
|
"mfd_model": "yolo_v8_mfd",
|
@@ -287,7 +320,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
287
320
|
},
|
288
321
|
"table-config": {
|
289
322
|
"model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
|
290
|
-
"enable":
|
323
|
+
"enable": true, // The table recognition feature is enabled by default. To disable it, change this value to false (a JSON boolean, without quotes).
|
291
324
|
"max_time": 400
|
292
325
|
}
|
293
326
|
}
|
@@ -309,29 +342,49 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
309
342
|
> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
|
310
343
|
> ```
|
311
344
|
```bash
|
312
|
-
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
|
345
|
+
wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
|
313
346
|
docker build -t mineru:latest .
|
314
|
-
docker run --rm -it --gpus=all mineru:latest /bin/bash
|
347
|
+
docker run --rm -it --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
|
315
348
|
magic-pdf --help
|
316
349
|
```
|
317
350
|
|
351
|
+
### Using NPU
|
352
|
+
|
353
|
+
If your device has NPU acceleration hardware, you can follow the tutorial below to use NPU acceleration:
|
354
|
+
|
355
|
+
[Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
|
356
|
+
|
357
|
+
### Using MPS
|
358
|
+
|
359
|
+
If your device uses Apple silicon chips, you can enable MPS acceleration for certain supported tasks (such as layout detection and formula detection).
|
360
|
+
|
361
|
+
You can enable MPS acceleration by setting the `device-mode` parameter to `mps` in the `magic-pdf.json` configuration file.
|
362
|
+
|
363
|
+
```json
|
364
|
+
{
|
365
|
+
// other config
|
366
|
+
"device-mode": "mps"
|
367
|
+
}
|
368
|
+
```
|
369
|
+
|
370
|
+
> [!TIP]
|
371
|
+
> Since the formula recognition task cannot utilize MPS acceleration, you can disable the formula recognition feature in tasks where it is not needed to achieve optimal performance.
|
372
|
+
>
|
373
|
+
> You can disable the formula recognition feature by setting the `enable` parameter in the `formula-config` section to `false`.
|
374
|
+
|
318
375
|
## Usage
|
319
376
|
|
320
377
|
### Command Line
|
321
378
|
|
322
|
-
[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/
|
379
|
+
[Using MinerU via Command Line](https://mineru.readthedocs.io/en/latest/user_guide/usage/command_line.html)
|
323
380
|
|
324
381
|
> [!TIP]
|
325
382
|
> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
326
383
|
|
327
384
|
### API
|
328
385
|
|
329
|
-
[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/
|
330
|
-
|
331
|
-
For detailed implementation, refer to:
|
386
|
+
[Using MinerU via Python API](https://mineru.readthedocs.io/en/latest/user_guide/usage/api.html)
|
332
387
|
|
333
|
-
- [demo.py Simplest Processing Method](demo/demo.py)
|
334
|
-
- [magic_pdf_parse_main.py More Detailed Processing Workflow](demo/magic_pdf_parse_main.py)
|
335
388
|
|
336
389
|
### Deploy Derived Projects
|
337
390
|
|