magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/batch_build_dataset.py +156 -0
- magic_pdf/data/dataset.py +44 -24
- magic_pdf/data/utils.py +108 -9
- magic_pdf/dict2md/ocr_mkcontent.py +4 -3
- magic_pdf/libs/pdf_image_tools.py +11 -6
- magic_pdf/libs/performance_stats.py +12 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +175 -201
- magic_pdf/model/doc_analyze_by_custom_model.py +137 -92
- magic_pdf/model/pdf_extract_kit.py +5 -38
- magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
- magic_pdf/model/sub_modules/model_init.py +50 -37
- magic_pdf/model/sub_modules/model_utils.py +17 -11
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +10 -18
- magic_pdf/pdf_parse_union_core_v2.py +112 -74
- magic_pdf/post_proc/para_split_v3.py +16 -13
- magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
- magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
- magic_pdf/resources/model_config/model_configs.yaml +1 -1
- magic_pdf/tools/cli.py +30 -12
- magic_pdf/tools/common.py +90 -12
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/METADATA +51 -41
- magic_pdf-1.3.0.dist-info/RECORD +202 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
- magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
- magic_pdf-1.2.1.dist-info/RECORD +0 -147
- /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
- /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
- /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/WHEEL +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.1.dist-info → magic_pdf-1.3.0.dist-info}/top_level.txt +0 -0
@@ -41,6 +41,57 @@ def check_chars_is_overlap_in_span(chars):
|
|
41
41
|
return False
|
42
42
|
|
43
43
|
|
44
|
+
def remove_x_overlapping_chars(span, median_width):
    """
    Remove characters from a span that overlap significantly on the x-axis.

    Neighbouring characters are compared pairwise, left to right. A pair is
    collapsed only when its horizontal overlap is wider than 30% of
    ``median_width`` AND the two characters are identical or one of them is a
    space. The narrower character of the pair is dropped (the later one when
    both widths are equal). ``span['chars']`` is modified in place.

    Args:
        span (dict): A span containing a list of chars under ``'chars'``, each
            with its text under ``'c'`` and bbox coordinates under ``'bbox'``
            in the format [x0, y0, x1, y1]
        median_width (float): Reference character width; 30% of it is the
            minimum overlap that counts as significant

    Returns:
        dict: The same span object, with overlapping characters removed
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    overlap_threshold = median_width * 0.3
    chars = span['chars']

    # Single pass instead of list.pop() inside a loop: `kept[-1]` always plays
    # the role of the left character of the pair currently being checked.
    kept = [chars[0]]
    for candidate in chars[1:]:
        previous = kept[-1]

        # Calculate overlap width
        x_left = max(previous['bbox'][0], candidate['bbox'][0])
        x_right = min(previous['bbox'][2], candidate['bbox'][2])
        significant_overlap = (
            x_right > x_left and (x_right - x_left) > overlap_threshold
        )

        if significant_overlap and (
            previous['c'] == candidate['c']
            or previous['c'] == ' '
            or candidate['c'] == ' '
        ):
            previous_width = previous['bbox'][2] - previous['bbox'][0]
            candidate_width = candidate['bbox'][2] - candidate['bbox'][0]
            if previous_width < candidate_width:
                # The left character is the narrower one: the right one takes
                # its place and is compared against the following character.
                kept[-1] = candidate
            # Otherwise the right character is dropped and the left one is
            # compared against the following character.
        else:
            kept.append(candidate)

    # Slice assignment keeps the original list object, so any other reference
    # to span['chars'] observes the filtered result as before.
    chars[:] = kept
    return span
|
93
|
+
|
94
|
+
|
44
95
|
def remove_overlaps_min_spans(spans):
|
45
96
|
dropped_spans = []
|
46
97
|
# 删除重叠spans中较小的那些
|
@@ -2,7 +2,7 @@ weights:
|
|
2
2
|
layoutlmv3: Layout/LayoutLMv3/model_final.pth
|
3
3
|
doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
|
4
4
|
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
|
5
|
-
unimernet_small: MFR/
|
5
|
+
unimernet_small: MFR/unimernet_hf_small_2503
|
6
6
|
struct_eqtable: TabRec/StructEqTable
|
7
7
|
tablemaster: TabRec/TableMaster
|
8
8
|
rapid_table: TabRec/RapidTable
|
magic_pdf/tools/cli.py
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
import os
|
2
2
|
import shutil
|
3
3
|
import tempfile
|
4
|
+
from pathlib import Path
|
5
|
+
|
4
6
|
import click
|
5
7
|
import fitz
|
6
8
|
from loguru import logger
|
7
|
-
from pathlib import Path
|
8
9
|
|
9
10
|
import magic_pdf.model as model_config
|
11
|
+
from magic_pdf.data.batch_build_dataset import batch_build_dataset
|
10
12
|
from magic_pdf.data.data_reader_writer import FileBasedDataReader
|
13
|
+
from magic_pdf.data.dataset import Dataset
|
11
14
|
from magic_pdf.libs.version import __version__
|
12
|
-
from magic_pdf.tools.common import do_parse, parse_pdf_methods
|
15
|
+
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
|
13
16
|
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
|
14
17
|
|
15
18
|
pdf_suffixes = ['.pdf']
|
@@ -87,37 +90,38 @@ without method specified, auto will be used by default.""",
|
|
87
90
|
default=None,
|
88
91
|
)
|
89
92
|
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
90
|
-
model_config.__use_inside_model__ = True
|
91
|
-
model_config.__model_mode__ = 'full'
|
92
93
|
os.makedirs(output_dir, exist_ok=True)
|
93
94
|
temp_dir = tempfile.mkdtemp()
|
94
95
|
def read_fn(path: Path):
|
95
96
|
if path.suffix in ms_office_suffixes:
|
96
97
|
convert_file_to_pdf(str(path), temp_dir)
|
97
|
-
fn = os.path.join(temp_dir, f
|
98
|
+
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
|
98
99
|
elif path.suffix in image_suffixes:
|
99
100
|
with open(str(path), 'rb') as f:
|
100
101
|
bits = f.read()
|
101
102
|
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
102
|
-
fn = os.path.join(temp_dir, f
|
103
|
+
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
|
103
104
|
with open(fn, 'wb') as f:
|
104
105
|
f.write(pdf_bytes)
|
105
106
|
elif path.suffix in pdf_suffixes:
|
106
107
|
fn = str(path)
|
107
108
|
else:
|
108
|
-
raise Exception(f
|
109
|
-
|
109
|
+
raise Exception(f'Unknown file suffix: {path.suffix}')
|
110
|
+
|
110
111
|
disk_rw = FileBasedDataReader(os.path.dirname(fn))
|
111
112
|
return disk_rw.read(os.path.basename(fn))
|
112
113
|
|
113
|
-
def parse_doc(doc_path: Path):
|
114
|
+
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
|
114
115
|
try:
|
115
116
|
file_name = str(Path(doc_path).stem)
|
116
|
-
|
117
|
+
if dataset is None:
|
118
|
+
pdf_data_or_dataset = read_fn(doc_path)
|
119
|
+
else:
|
120
|
+
pdf_data_or_dataset = dataset
|
117
121
|
do_parse(
|
118
122
|
output_dir,
|
119
123
|
file_name,
|
120
|
-
|
124
|
+
pdf_data_or_dataset,
|
121
125
|
[],
|
122
126
|
method,
|
123
127
|
debug_able,
|
@@ -130,9 +134,23 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
|
|
130
134
|
logger.exception(e)
|
131
135
|
|
132
136
|
if os.path.isdir(path):
|
137
|
+
doc_paths = []
|
133
138
|
for doc_path in Path(path).glob('*'):
|
134
139
|
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
|
135
|
-
|
140
|
+
if doc_path.suffix in ms_office_suffixes:
|
141
|
+
convert_file_to_pdf(str(doc_path), temp_dir)
|
142
|
+
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
|
143
|
+
elif doc_path.suffix in image_suffixes:
|
144
|
+
with open(str(doc_path), 'rb') as f:
|
145
|
+
bits = f.read()
|
146
|
+
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
|
147
|
+
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
|
148
|
+
with open(fn, 'wb') as f:
|
149
|
+
f.write(pdf_bytes)
|
150
|
+
doc_path = Path(fn)
|
151
|
+
doc_paths.append(doc_path)
|
152
|
+
datasets = batch_build_dataset(doc_paths, 4, lang)
|
153
|
+
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
|
136
154
|
else:
|
137
155
|
parse_doc(Path(path))
|
138
156
|
|
magic_pdf/tools/common.py
CHANGED
@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
|
|
8
8
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
9
9
|
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
10
10
|
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
|
11
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
11
|
+
from magic_pdf.data.dataset import Dataset, PymuDocDataset
|
12
12
|
from magic_pdf.libs.draw_bbox import draw_char_bbox
|
13
|
-
from magic_pdf.model.doc_analyze_by_custom_model import
|
14
|
-
|
13
|
+
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
|
14
|
+
doc_analyze)
|
15
15
|
|
16
16
|
# from io import BytesIO
|
17
17
|
# from pypdf import PdfReader, PdfWriter
|
@@ -67,13 +67,13 @@ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_i
|
|
67
67
|
return output_bytes
|
68
68
|
|
69
69
|
|
70
|
-
def
|
70
|
+
def _do_parse(
|
71
71
|
output_dir,
|
72
72
|
pdf_file_name,
|
73
|
-
|
73
|
+
pdf_bytes_or_dataset,
|
74
74
|
model_list,
|
75
75
|
parse_method,
|
76
|
-
debug_able,
|
76
|
+
debug_able=False,
|
77
77
|
f_draw_span_bbox=True,
|
78
78
|
f_draw_layout_bbox=True,
|
79
79
|
f_dump_md=True,
|
@@ -92,16 +92,21 @@ def do_parse(
|
|
92
92
|
formula_enable=None,
|
93
93
|
table_enable=None,
|
94
94
|
):
|
95
|
+
from magic_pdf.operators.models import InferenceResult
|
95
96
|
if debug_able:
|
96
97
|
logger.warning('debug mode is on')
|
97
98
|
f_draw_model_bbox = True
|
98
99
|
f_draw_line_sort_bbox = True
|
99
100
|
# f_draw_char_bbox = True
|
100
101
|
|
101
|
-
|
102
|
-
pdf_bytes
|
103
|
-
|
104
|
-
|
102
|
+
if isinstance(pdf_bytes_or_dataset, bytes):
|
103
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
|
104
|
+
pdf_bytes_or_dataset, start_page_id, end_page_id
|
105
|
+
)
|
106
|
+
ds = PymuDocDataset(pdf_bytes, lang=lang)
|
107
|
+
else:
|
108
|
+
ds = pdf_bytes_or_dataset
|
109
|
+
pdf_bytes = ds._raw_data
|
105
110
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
106
111
|
|
107
112
|
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
@@ -109,8 +114,6 @@ def do_parse(
|
|
109
114
|
)
|
110
115
|
image_dir = str(os.path.basename(local_image_dir))
|
111
116
|
|
112
|
-
ds = PymuDocDataset(pdf_bytes, lang=lang)
|
113
|
-
|
114
117
|
if len(model_list) == 0:
|
115
118
|
if model_config.__use_inside_model__:
|
116
119
|
if parse_method == 'auto':
|
@@ -241,5 +244,80 @@ def do_parse(
|
|
241
244
|
|
242
245
|
logger.info(f'local output dir is {local_md_dir}')
|
243
246
|
|
247
|
+
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one document, choosing between the batch and the single path.

    When the environment variable ``MINERU_PARALLEL_INFERENCE_COUNT`` holds an
    integer greater than 1, the document is wrapped into a one-element batch
    and handed to ``batch_do_parse``; otherwise every argument is forwarded
    unchanged to ``_do_parse``.

    Args:
        output_dir: Passed through to the parse routine.
        pdf_file_name: Name of the document, passed through.
        pdf_bytes_or_dataset: Raw PDF ``bytes`` or an already built dataset
            object.
        model_list: Forwarded to ``_do_parse`` on the single path only.
        parse_method: Passed through to the parse routine.
        debug_able: Passed through to the parse routine.
        f_draw_span_bbox, f_draw_layout_bbox, f_dump_md, f_dump_middle_json,
        f_dump_model_json, f_dump_orig_pdf, f_dump_content_list,
        f_make_md_mode, f_draw_model_bbox, f_draw_line_sort_bbox,
        f_draw_char_bbox: Output switches, passed through on both paths.
        start_page_id, end_page_id: Page range. On the batch path it is
            applied only when the input is ``bytes``.
        lang: Passed through on both paths.
        layout_model, formula_enable, table_enable: Model settings, passed
            through on both paths.

    Raises:
        ValueError: If ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a
            non-empty value that is not an integer.
    """
    # An unset or empty variable means "no parallel inference".
    parallel_count = int(os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT') or 1)

    # Output switches shared by both code paths, collected once so the two
    # calls below cannot drift apart.
    output_options = dict(
        f_draw_span_bbox=f_draw_span_bbox,
        f_draw_layout_bbox=f_draw_layout_bbox,
        f_dump_md=f_dump_md,
        f_dump_middle_json=f_dump_middle_json,
        f_dump_model_json=f_dump_model_json,
        f_dump_orig_pdf=f_dump_orig_pdf,
        f_dump_content_list=f_dump_content_list,
        f_make_md_mode=f_make_md_mode,
        f_draw_model_bbox=f_draw_model_bbox,
        f_draw_line_sort_bbox=f_draw_line_sort_bbox,
        f_draw_char_bbox=f_draw_char_bbox,
    )

    if parallel_count > 1:
        # NOTE(review): model_list is not used on this path, so the batch
        # routine always runs its own inference - confirm this is intended
        # for callers that pass precomputed results.
        if isinstance(pdf_bytes_or_dataset, bytes):
            ds = PymuDocDataset(
                convert_pdf_bytes_to_bytes_by_pymupdf(
                    pdf_bytes_or_dataset, start_page_id, end_page_id
                ),
                lang=lang,
            )
        else:
            ds = pdf_bytes_or_dataset
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            lang=lang,
            # These three were previously dropped on the batch path, so the
            # caller's model settings were ignored in parallel mode.
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            **output_options,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            **output_options,
        )
|
287
|
+
|
288
|
+
|
289
|
+
def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    # Quoted so the union is not evaluated at definition time: `bytes | Dataset`
    # raises TypeError on Python 3.9.
    pdf_bytes_or_datasets: 'list[bytes | Dataset]',
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference over several documents at once, then write each result.

    Every ``bytes`` entry is wrapped in a ``PymuDocDataset``; other entries are
    used as they are. All datasets go through a single ``batch_doc_analyze``
    call, and each returned result is then passed to ``_do_parse`` together
    with the name and dataset at the same index.

    Args:
        output_dir: Passed through to ``_do_parse``.
        pdf_file_names: One name per document, index-aligned with
            ``pdf_bytes_or_datasets``.
        pdf_bytes_or_datasets: Raw PDF ``bytes`` or dataset objects.
        parse_method: Passed to ``batch_doc_analyze`` and ``_do_parse``.
        debug_able: Passed through to ``_do_parse``.
        f_draw_span_bbox, f_draw_layout_bbox, f_dump_md, f_dump_middle_json,
        f_dump_model_json, f_dump_orig_pdf, f_dump_content_list,
        f_make_md_mode, f_draw_model_bbox, f_draw_line_sort_bbox,
        f_draw_char_bbox: Output switches, passed through to ``_do_parse``.
        lang: Used when building datasets and passed to both calls.
        layout_model, formula_enable, table_enable: Model settings, passed to
            ``batch_doc_analyze`` and to ``_do_parse``.

    Raises:
        IndexError: If more inference results come back than there are entries
            in ``pdf_file_names``.
    """
    dss = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]

    infer_results = batch_doc_analyze(
        dss,
        parse_method,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir,
            pdf_file_names[idx],
            dss[idx],
            infer_result.get_infer_res(),
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            # Forwarded so that any inference _do_parse performs itself uses
            # the same settings as the batch call above.
            # NOTE(review): presumably only relevant when the result list for
            # a document is empty - confirm against _do_parse.
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
|
321
|
+
|
244
322
|
|
245
323
|
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.3.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,35 +9,30 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist: fast-langdetect
|
12
|
+
Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
|
-
Requires-Dist: numpy
|
15
|
-
Requires-Dist: pydantic
|
16
|
-
Requires-Dist: PyMuPDF
|
14
|
+
Requires-Dist: numpy>=1.21.6
|
15
|
+
Requires-Dist: pydantic<2.11,>=2.7.2
|
16
|
+
Requires-Dist: PyMuPDF<1.25.0,>=1.24.9
|
17
17
|
Requires-Dist: scikit-learn>=1.0.2
|
18
|
-
Requires-Dist: torch
|
19
|
-
Requires-Dist:
|
18
|
+
Requires-Dist: torch!=2.5.0,!=2.5.1,<=2.6.0,>=2.2.2
|
19
|
+
Requires-Dist: torchvision
|
20
|
+
Requires-Dist: transformers<5.0.0,>=4.49.0
|
20
21
|
Requires-Dist: pdfminer.six==20231228
|
22
|
+
Requires-Dist: tqdm>=4.67.1
|
21
23
|
Provides-Extra: full
|
22
|
-
Requires-Dist: unimernet==0.2.3; extra == "full"
|
23
|
-
Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
|
24
|
-
Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
|
25
24
|
Requires-Dist: ultralytics>=8.3.48; extra == "full"
|
26
|
-
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
27
|
-
Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
28
|
-
Requires-Dist: einops; extra == "full"
|
29
|
-
Requires-Dist: accelerate; extra == "full"
|
30
25
|
Requires-Dist: doclayout-yolo==0.0.2b1; extra == "full"
|
31
|
-
Requires-Dist:
|
32
|
-
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.4; extra == "full"
|
26
|
+
Requires-Dist: dill<1,>=0.3.9; extra == "full"
|
33
27
|
Requires-Dist: rapid-table<2.0.0,>=1.0.3; extra == "full"
|
34
|
-
Requires-Dist: PyYAML; extra == "full"
|
35
|
-
Requires-Dist:
|
36
|
-
Requires-Dist:
|
37
|
-
Requires-Dist:
|
38
|
-
Requires-Dist:
|
28
|
+
Requires-Dist: PyYAML<7,>=6.0.2; extra == "full"
|
29
|
+
Requires-Dist: ftfy<7,>=6.3.1; extra == "full"
|
30
|
+
Requires-Dist: openai<2,>=1.70.0; extra == "full"
|
31
|
+
Requires-Dist: shapely<3,>=2.0.7; extra == "full"
|
32
|
+
Requires-Dist: pyclipper<2,>=1.3.0; extra == "full"
|
33
|
+
Requires-Dist: omegaconf<3,>=2.3.0; extra == "full"
|
34
|
+
Requires-Dist: matplotlib>=3.10; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
39
35
|
Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
|
40
|
-
Requires-Dist: paddlepaddle==2.6.1; platform_system == "Windows" and extra == "full"
|
41
36
|
Provides-Extra: lite
|
42
37
|
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
43
38
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
@@ -94,6 +89,23 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
94
89
|
</div>
|
95
90
|
|
96
91
|
# Changelog
|
92
|
+
- 2025/04/03 1.3.0 released. This version brings many optimizations and improvements:
|
93
|
+
- Installation and compatibility optimization
|
94
|
+
- By removing the use of `layoutlmv3` in layout, resolved compatibility issues caused by `detectron2`.
|
95
|
+
- Torch version compatibility extended to 2.2~2.6 (excluding 2.5).
|
96
|
+
- CUDA compatibility supports 11.8/12.4/12.6 (CUDA version determined by torch), resolving compatibility issues for some users with 50-series and H-series GPUs.
|
97
|
+
- Python compatible versions expanded to 3.10~3.12, solving the problem of automatic downgrade to 0.6.1 during installation in non-3.10 environments.
|
98
|
+
- Offline deployment process optimized; no internet connection required after successful deployment to download any model files.
|
99
|
+
- Performance optimization
|
100
|
+
- By supporting batch processing of multiple PDF files ([script example](demo/batch_demo.py)), improved parsing speed for small files in batches (compared to version 1.0.1, formula parsing speed increased by over 1400%, overall parsing speed increased by over 500%).
|
101
|
+
- Optimized loading and usage of the mfr model, reducing GPU memory usage and improving parsing speed (requires re-execution of the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files).
|
102
|
+
- Optimized GPU memory usage, requiring only a minimum of 6GB to run this project.
|
103
|
+
- Improved running speed on MPS devices.
|
104
|
+
- Parsing effect optimization
|
105
|
+
- Updated the mfr model to `unimernet(2503)`, solving the issue of lost line breaks in multi-line formulas.
|
106
|
+
- Usability Optimization
|
107
|
+
- By using `paddleocr2torch`, completely replaced the use of the `paddle` framework and `paddleocr` in the project, resolving conflicts between `paddle` and `torch`, as well as thread safety issues caused by the `paddle` framework.
|
108
|
+
- Added a real-time progress bar during the parsing process to accurately track progress, making the wait less painful.
|
97
109
|
- 2025/03/03 1.2.1 released, fixed several bugs:
|
98
110
|
- Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
|
99
111
|
- Fixed caption matching inaccuracies in certain scenarios
|
@@ -262,7 +274,7 @@ There are three different ways to experience MinerU:
|
|
262
274
|
</tr>
|
263
275
|
<tr>
|
264
276
|
<td colspan="3">Python Version</td>
|
265
|
-
<td colspan="3">3.10
|
277
|
+
<td colspan="3">3.10~3.12</td>
|
266
278
|
</tr>
|
267
279
|
<tr>
|
268
280
|
<td colspan="3">Nvidia Driver Version</td>
|
@@ -272,8 +284,8 @@ There are three different ways to experience MinerU:
|
|
272
284
|
</tr>
|
273
285
|
<tr>
|
274
286
|
<td colspan="3">CUDA Environment</td>
|
275
|
-
<td>
|
276
|
-
<td>11.8
|
287
|
+
<td>11.8/12.4/12.6</td>
|
288
|
+
<td>11.8/12.4/12.6</td>
|
277
289
|
<td>None</td>
|
278
290
|
</tr>
|
279
291
|
<tr>
|
@@ -283,11 +295,11 @@ There are three different ways to experience MinerU:
|
|
283
295
|
<td>None</td>
|
284
296
|
</tr>
|
285
297
|
<tr>
|
286
|
-
<td rowspan="2">GPU Hardware Support List</td>
|
287
|
-
<td colspan="2">GPU VRAM
|
288
|
-
<td colspan="2">
|
289
|
-
|
290
|
-
<td rowspan="2">
|
298
|
+
<td rowspan="2">GPU/MPS Hardware Support List</td>
|
299
|
+
<td colspan="2">GPU VRAM 6GB or more</td>
|
300
|
+
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
|
301
|
+
More than 6GB VRAM </td>
|
302
|
+
<td rowspan="2">Apple silicon</td>
|
291
303
|
</tr>
|
292
304
|
</table>
|
293
305
|
|
@@ -304,9 +316,9 @@ Synced with dev branch updates:
|
|
304
316
|
#### 1. Install magic-pdf
|
305
317
|
|
306
318
|
```bash
|
307
|
-
conda create -n mineru python
|
319
|
+
conda create -n mineru 'python<3.13' -y
|
308
320
|
conda activate mineru
|
309
|
-
pip install -U "magic-pdf[full]"
|
321
|
+
pip install -U "magic-pdf[full]"
|
310
322
|
```
|
311
323
|
|
312
324
|
#### 2. Download model weight files
|
@@ -331,7 +343,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
331
343
|
{
|
332
344
|
// other config
|
333
345
|
"layout-config": {
|
334
|
-
"model": "doclayout_yolo"
|
346
|
+
"model": "doclayout_yolo"
|
335
347
|
},
|
336
348
|
"formula-config": {
|
337
349
|
"mfd_model": "yolo_v8_mfd",
|
@@ -339,8 +351,8 @@ You can modify certain configurations in this file to enable or disable features
|
|
339
351
|
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
340
352
|
},
|
341
353
|
"table-config": {
|
342
|
-
"model": "rapid_table",
|
343
|
-
"sub_model": "slanet_plus",
|
354
|
+
"model": "rapid_table",
|
355
|
+
"sub_model": "slanet_plus",
|
344
356
|
"enable": true, // The table recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
345
357
|
"max_time": 400
|
346
358
|
}
|
@@ -355,7 +367,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
355
367
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
356
368
|
- Quick Deployment with Docker
|
357
369
|
> [!IMPORTANT]
|
358
|
-
> Docker requires a GPU with at least
|
370
|
+
> Docker requires a GPU with at least 6GB of VRAM, and all acceleration features are enabled by default.
|
359
371
|
>
|
360
372
|
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
361
373
|
>
|
@@ -377,7 +389,7 @@ If your device has NPU acceleration hardware, you can follow the tutorial below
|
|
377
389
|
|
378
390
|
### Using MPS
|
379
391
|
|
380
|
-
If your device uses Apple silicon chips, you can enable MPS acceleration for
|
392
|
+
If your device uses Apple silicon chips, you can enable MPS acceleration for your tasks.
|
381
393
|
|
382
394
|
You can enable MPS acceleration by setting the `device-mode` parameter to `mps` in the `magic-pdf.json` configuration file.
|
383
395
|
|
@@ -388,10 +400,6 @@ You can enable MPS acceleration by setting the `device-mode` parameter to `mps`
|
|
388
400
|
}
|
389
401
|
```
|
390
402
|
|
391
|
-
> [!TIP]
|
392
|
-
> Since the formula recognition task cannot utilize MPS acceleration, you can disable the formula recognition feature in tasks where it is not needed to achieve optimal performance.
|
393
|
-
>
|
394
|
-
> You can disable the formula recognition feature by setting the `enable` parameter in the `formula-config` section to `false`.
|
395
403
|
|
396
404
|
## Usage
|
397
405
|
|
@@ -465,6 +473,8 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
|
|
465
473
|
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
|
466
474
|
- [RapidTable](https://github.com/RapidAI/RapidTable)
|
467
475
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
476
|
+
- [RapidOCR](https://github.com/RapidAI/RapidOCR)
|
477
|
+
- [PaddleOCR2Pytorch](https://github.com/frotms/PaddleOCR2Pytorch)
|
468
478
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
469
479
|
- [layoutreader](https://github.com/ppaanngggg/layoutreader)
|
470
480
|
- [fast-langdetect](https://github.com/LlmKira/fast-langdetect)
|