magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,31 @@
|
|
1
|
+
import os
|
1
2
|
import time
|
2
3
|
|
3
|
-
|
4
|
-
import
|
4
|
+
# 关闭paddle的信号处理
|
5
|
+
import paddle
|
5
6
|
from loguru import logger
|
6
7
|
|
8
|
+
paddle.disable_signal_handler()
|
9
|
+
|
10
|
+
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
11
|
+
|
12
|
+
try:
|
13
|
+
import torchtext
|
14
|
+
|
15
|
+
if torchtext.__version__ >= '0.18.0':
|
16
|
+
torchtext.disable_torchtext_deprecation_warning()
|
17
|
+
except ImportError:
|
18
|
+
pass
|
19
|
+
|
20
|
+
import magic_pdf.model as model_config
|
21
|
+
from magic_pdf.data.dataset import Dataset
|
7
22
|
from magic_pdf.libs.clean_memory import clean_memory
|
8
|
-
from magic_pdf.libs.config_reader import
|
9
|
-
|
23
|
+
from magic_pdf.libs.config_reader import (get_device, get_formula_config,
|
24
|
+
get_layout_config,
|
25
|
+
get_local_models_dir,
|
26
|
+
get_table_recog_config)
|
10
27
|
from magic_pdf.model.model_list import MODEL
|
11
|
-
|
28
|
+
from magic_pdf.operators.models import InferenceResult
|
12
29
|
|
13
30
|
|
14
31
|
def dict_compare(d1, d2):
|
@@ -19,47 +36,12 @@ def remove_duplicates_dicts(lst):
|
|
19
36
|
unique_dicts = []
|
20
37
|
for dict_item in lst:
|
21
38
|
if not any(
|
22
|
-
|
39
|
+
dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
|
23
40
|
):
|
24
41
|
unique_dicts.append(dict_item)
|
25
42
|
return unique_dicts
|
26
43
|
|
27
44
|
|
28
|
-
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
|
29
|
-
try:
|
30
|
-
from PIL import Image
|
31
|
-
except ImportError:
|
32
|
-
logger.error("Pillow not installed, please install by pip.")
|
33
|
-
exit(1)
|
34
|
-
|
35
|
-
images = []
|
36
|
-
with fitz.open("pdf", pdf_bytes) as doc:
|
37
|
-
pdf_page_num = doc.page_count
|
38
|
-
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
39
|
-
if end_page_id > pdf_page_num - 1:
|
40
|
-
logger.warning("end_page_id is out of range, use images length")
|
41
|
-
end_page_id = pdf_page_num - 1
|
42
|
-
|
43
|
-
for index in range(0, doc.page_count):
|
44
|
-
if start_page_id <= index <= end_page_id:
|
45
|
-
page = doc[index]
|
46
|
-
mat = fitz.Matrix(dpi / 72, dpi / 72)
|
47
|
-
pm = page.get_pixmap(matrix=mat, alpha=False)
|
48
|
-
|
49
|
-
# If the width or height exceeds 4500 after scaling, do not scale further.
|
50
|
-
if pm.width > 4500 or pm.height > 4500:
|
51
|
-
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
52
|
-
|
53
|
-
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
|
54
|
-
img = np.array(img)
|
55
|
-
img_dict = {"img": img, "width": pm.width, "height": pm.height}
|
56
|
-
else:
|
57
|
-
img_dict = {"img": [], "width": 0, "height": 0}
|
58
|
-
|
59
|
-
images.append(img_dict)
|
60
|
-
return images
|
61
|
-
|
62
|
-
|
63
45
|
class ModelSingleton:
|
64
46
|
_instance = None
|
65
47
|
_models = {}
|
@@ -69,117 +51,147 @@ class ModelSingleton:
|
|
69
51
|
cls._instance = super().__new__(cls)
|
70
52
|
return cls._instance
|
71
53
|
|
72
|
-
def get_model(
|
54
|
+
def get_model(
|
55
|
+
self,
|
56
|
+
ocr: bool,
|
57
|
+
show_log: bool,
|
58
|
+
lang=None,
|
59
|
+
layout_model=None,
|
60
|
+
formula_enable=None,
|
61
|
+
table_enable=None,
|
62
|
+
):
|
73
63
|
key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
|
74
64
|
if key not in self._models:
|
75
|
-
self._models[key] = custom_model_init(
|
76
|
-
|
65
|
+
self._models[key] = custom_model_init(
|
66
|
+
ocr=ocr,
|
67
|
+
show_log=show_log,
|
68
|
+
lang=lang,
|
69
|
+
layout_model=layout_model,
|
70
|
+
formula_enable=formula_enable,
|
71
|
+
table_enable=table_enable,
|
72
|
+
)
|
77
73
|
return self._models[key]
|
78
74
|
|
79
75
|
|
80
|
-
def custom_model_init(
|
81
|
-
|
76
|
+
def custom_model_init(
|
77
|
+
ocr: bool = False,
|
78
|
+
show_log: bool = False,
|
79
|
+
lang=None,
|
80
|
+
layout_model=None,
|
81
|
+
formula_enable=None,
|
82
|
+
table_enable=None,
|
83
|
+
):
|
82
84
|
|
83
85
|
model = None
|
84
86
|
|
85
|
-
if model_config.__model_mode__ ==
|
86
|
-
logger.warning(
|
87
|
-
|
87
|
+
if model_config.__model_mode__ == 'lite':
|
88
|
+
logger.warning(
|
89
|
+
'The Lite mode is provided for developers to conduct testing only, and the output quality is '
|
90
|
+
'not guaranteed to be reliable.'
|
91
|
+
)
|
88
92
|
model = MODEL.Paddle
|
89
|
-
elif model_config.__model_mode__ ==
|
93
|
+
elif model_config.__model_mode__ == 'full':
|
90
94
|
model = MODEL.PEK
|
91
95
|
|
92
96
|
if model_config.__use_inside_model__:
|
93
97
|
model_init_start = time.time()
|
94
98
|
if model == MODEL.Paddle:
|
95
99
|
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
|
100
|
+
|
96
101
|
custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
|
97
102
|
elif model == MODEL.PEK:
|
98
103
|
from magic_pdf.model.pdf_extract_kit import CustomPEKModel
|
104
|
+
|
99
105
|
# 从配置文件读取model-dir和device
|
100
106
|
local_models_dir = get_local_models_dir()
|
101
107
|
device = get_device()
|
102
108
|
|
103
109
|
layout_config = get_layout_config()
|
104
110
|
if layout_model is not None:
|
105
|
-
layout_config[
|
111
|
+
layout_config['model'] = layout_model
|
106
112
|
|
107
113
|
formula_config = get_formula_config()
|
108
114
|
if formula_enable is not None:
|
109
|
-
formula_config[
|
115
|
+
formula_config['enable'] = formula_enable
|
110
116
|
|
111
117
|
table_config = get_table_recog_config()
|
112
118
|
if table_enable is not None:
|
113
|
-
table_config[
|
119
|
+
table_config['enable'] = table_enable
|
114
120
|
|
115
121
|
model_input = {
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
122
|
+
'ocr': ocr,
|
123
|
+
'show_log': show_log,
|
124
|
+
'models_dir': local_models_dir,
|
125
|
+
'device': device,
|
126
|
+
'table_config': table_config,
|
127
|
+
'layout_config': layout_config,
|
128
|
+
'formula_config': formula_config,
|
129
|
+
'lang': lang,
|
124
130
|
}
|
125
131
|
|
126
132
|
custom_model = CustomPEKModel(**model_input)
|
127
133
|
else:
|
128
|
-
logger.error(
|
134
|
+
logger.error('Not allow model_name!')
|
129
135
|
exit(1)
|
130
136
|
model_init_cost = time.time() - model_init_start
|
131
|
-
logger.info(f
|
137
|
+
logger.info(f'model init cost: {model_init_cost}')
|
132
138
|
else:
|
133
|
-
logger.error(
|
139
|
+
logger.error('use_inside_model is False, not allow to use inside model')
|
134
140
|
exit(1)
|
135
141
|
|
136
142
|
return custom_model
|
137
143
|
|
138
144
|
|
139
|
-
def doc_analyze(
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
+
def doc_analyze(
|
146
|
+
dataset: Dataset,
|
147
|
+
ocr: bool = False,
|
148
|
+
show_log: bool = False,
|
149
|
+
start_page_id=0,
|
150
|
+
end_page_id=None,
|
151
|
+
lang=None,
|
152
|
+
layout_model=None,
|
153
|
+
formula_enable=None,
|
154
|
+
table_enable=None,
|
155
|
+
) -> InferenceResult:
|
145
156
|
|
146
157
|
model_manager = ModelSingleton()
|
147
|
-
custom_model = model_manager.get_model(
|
148
|
-
|
149
|
-
|
150
|
-
pdf_page_num = doc.page_count
|
151
|
-
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
|
152
|
-
if end_page_id > pdf_page_num - 1:
|
153
|
-
logger.warning("end_page_id is out of range, use images length")
|
154
|
-
end_page_id = pdf_page_num - 1
|
155
|
-
|
156
|
-
images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)
|
158
|
+
custom_model = model_manager.get_model(
|
159
|
+
ocr, show_log, lang, layout_model, formula_enable, table_enable
|
160
|
+
)
|
157
161
|
|
158
162
|
model_json = []
|
159
163
|
doc_analyze_start = time.time()
|
160
164
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
+
if end_page_id is None:
|
166
|
+
end_page_id = len(dataset)
|
167
|
+
|
168
|
+
for index in range(len(dataset)):
|
169
|
+
page_data = dataset.get_page(index)
|
170
|
+
img_dict = page_data.get_image()
|
171
|
+
img = img_dict['img']
|
172
|
+
page_width = img_dict['width']
|
173
|
+
page_height = img_dict['height']
|
165
174
|
if start_page_id <= index <= end_page_id:
|
166
175
|
page_start = time.time()
|
167
176
|
result = custom_model(img)
|
168
177
|
logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
|
169
178
|
else:
|
170
179
|
result = []
|
171
|
-
|
172
|
-
|
180
|
+
|
181
|
+
page_info = {'page_no': index, 'height': page_height, 'width': page_width}
|
182
|
+
page_dict = {'layout_dets': result, 'page_info': page_info}
|
173
183
|
model_json.append(page_dict)
|
174
184
|
|
175
185
|
gc_start = time.time()
|
176
|
-
clean_memory()
|
186
|
+
clean_memory(get_device())
|
177
187
|
gc_time = round(time.time() - gc_start, 2)
|
178
|
-
logger.info(f
|
188
|
+
logger.info(f'gc time: {gc_time}')
|
179
189
|
|
180
190
|
doc_analyze_time = round(time.time() - doc_analyze_start, 2)
|
181
|
-
doc_analyze_speed = round(
|
182
|
-
logger.info(
|
183
|
-
|
191
|
+
doc_analyze_speed = round((end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
|
192
|
+
logger.info(
|
193
|
+
f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
|
194
|
+
f' speed: {doc_analyze_speed} pages/second'
|
195
|
+
)
|
184
196
|
|
185
|
-
return model_json
|
197
|
+
return InferenceResult(model_json, dataset)
|