magic-pdf 0.10.6__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +2 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +13 -1
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +14 -13
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +8 -12
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -125
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +33 -22
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +30 -4
- magic_pdf/model/sub_modules/model_utils.py +8 -2
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/{model/operators.py → operators/models.py} +2 -38
- magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
- magic_pdf/pdf_parse_union_core_v2.py +71 -17
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +28 -18
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/METADATA +73 -23
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/RECORD +50 -53
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -22
- magic_pdf/pdf_parse_by_txt.py +0 -23
- magic_pdf/pipe/AbsPipe.py +0 -99
- magic_pdf/pipe/OCRPipe.py +0 -80
- magic_pdf/pipe/TXTPipe.py +0 -42
- magic_pdf/pipe/UNIPipe.py +0 -150
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -144
- magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,275 @@
|
|
1
|
+
import time
|
2
|
+
|
3
|
+
import cv2
|
4
|
+
import numpy as np
|
5
|
+
import torch
|
6
|
+
from loguru import logger
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
from magic_pdf.config.constants import MODEL_NAME
|
10
|
+
from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE
|
11
|
+
from magic_pdf.data.dataset import Dataset
|
12
|
+
from magic_pdf.libs.clean_memory import clean_memory
|
13
|
+
from magic_pdf.libs.config_reader import get_device
|
14
|
+
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
|
15
|
+
from magic_pdf.model.pdf_extract_kit import CustomPEKModel
|
16
|
+
from magic_pdf.model.sub_modules.model_utils import (
|
17
|
+
clean_vram, crop_img, get_res_list_from_layout_res)
|
18
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
|
19
|
+
get_adjusted_mfdetrec_res, get_ocr_result_list)
|
20
|
+
from magic_pdf.operators.models import InferenceResult
|
21
|
+
|
22
|
+
# Base batch size for DocLayout-YOLO layout detection; BatchAnalyze multiplies it by batch_ratio.
YOLO_LAYOUT_BASE_BATCH_SIZE = 4
|
23
|
+
# Base batch size for the formula-detection (mfd) model; BatchAnalyze multiplies it by batch_ratio.
MFD_BASE_BATCH_SIZE = 1
|
24
|
+
# Base batch size for the formula-recognition (mfr) model; BatchAnalyze multiplies it by batch_ratio.
MFR_BASE_BATCH_SIZE = 16
|
25
|
+
|
26
|
+
|
27
|
+
class BatchAnalyze:
    """Run layout, formula, OCR and table analysis over a batch of page images.

    Attributes:
        model: Model bundle providing the sub-models (``layout_model``,
            ``mfd_model``, ``mfr_model``, ``ocr_model``, ``table_model``) and
            the ``apply_*`` switches consulted by each stage.
        batch_ratio: Multiplier applied to the module-level
            ``*_BASE_BATCH_SIZE`` constants to size the batched stages.
    """

    def __init__(self, model: 'CustomPEKModel', batch_ratio: int):
        self.model = model
        self.batch_ratio = batch_ratio

    def __call__(self, images: list) -> list:
        """Analyze every image and return one detection list per image.

        Args:
            images: Page images, each accepted by ``Image.fromarray``
                (presumably RGB numpy arrays -- confirm against the caller).

        Returns:
            A list parallel to ``images``. Each entry is that page's layout
            result list, extended with formula results (when
            ``model.apply_formula`` is set) and OCR results. Table regions
            gain an ``'html'`` key when table recognition succeeds.
        """
        images_layout_res = self._detect_layout(images)

        if self.model.apply_formula:
            self._detect_and_recognize_formulas(images, images_layout_res)

        # Release GPU memory before the per-page OCR / table stages.
        clean_vram(self.model.device, vram_threshold=8)

        ocr_time = 0
        ocr_count = 0
        table_time = 0
        table_count = 0
        # reference: magic_pdf/model/doc_analyze_by_custom_model.py:doc_analyze
        for index, image in enumerate(images):
            layout_res = images_layout_res[index]
            pil_img = Image.fromarray(image)

            ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                get_res_list_from_layout_res(layout_res)
            )

            # OCR (or detection only when apply_ocr is off) on every text region.
            ocr_start = time.time()
            self._ocr_regions(
                pil_img, layout_res, ocr_res_list, single_page_mfdetrec_res
            )
            ocr_time += time.time() - ocr_start
            ocr_count += len(ocr_res_list)

            # Table recognition.
            if self.model.apply_table:
                table_start = time.time()
                self._recognize_tables(pil_img, table_res_list)
                table_time += time.time() - table_start
                table_count += len(table_res_list)

        if self.model.apply_ocr:
            logger.info(f'ocr time: {round(ocr_time, 2)}, image num: {ocr_count}')
        else:
            logger.info(f'det time: {round(ocr_time, 2)}, image num: {ocr_count}')
        if self.model.apply_table:
            logger.info(f'table time: {round(table_time, 2)}, image num: {table_count}')

        return images_layout_res

    def _detect_layout(self, images: list) -> list:
        """Run the configured layout model and return one result list per image.

        NOTE(review): when ``layout_model_name`` matches neither supported
        model, the returned list stays empty, so the caller's later per-image
        indexing fails for non-empty input.
        """
        images_layout_res = []
        layout_start_time = time.time()

        if self.model.layout_model_name == MODEL_NAME.LAYOUTLMv3:
            # layoutlmv3: no batched entry point is used, predict page by page.
            images_layout_res.extend(
                self.model.layout_model(image, ignore_catids=[])
                for image in images
            )
        elif self.model.layout_model_name == MODEL_NAME.DocLayout_YOLO:
            # doclayout_yolo
            layout_images = []
            modified_images = []
            for image_index, image in enumerate(images):
                pil_img = Image.fromarray(image)
                width, height = pil_img.size
                if height <= width:
                    layout_images.append(pil_img)
                    continue
                # Portrait pages go through crop_img with a horizontal paste
                # offset of half the page width; remember the geometry so the
                # predicted polygons can be shifted back afterwards.
                input_res = {'poly': [0, 0, width, 0, width, height, 0, height]}
                new_image, useful_list = crop_img(
                    input_res, pil_img, crop_paste_x=width // 2, crop_paste_y=0
                )
                layout_images.append(new_image)
                modified_images.append([image_index, useful_list])

            images_layout_res += self.model.layout_model.batch_predict(
                layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE
            )

            for image_index, useful_list in modified_images:
                for res in images_layout_res[image_index]:
                    self._shift_poly_back(res['poly'], useful_list)

        logger.info(
            f'layout time: {round(time.time() - layout_start_time, 2)}, image num: {len(images)}'
        )
        return images_layout_res

    @staticmethod
    def _shift_poly_back(poly: list, useful_list: list) -> None:
        """Translate a flat ``[x0, y0, x1, y1, ...]`` polygon in place.

        Even positions get ``- useful_list[0] + useful_list[2]`` and odd
        positions get ``- useful_list[1] + useful_list[3]``.
        NOTE(review): presumably [0]/[1] are the paste offsets and [2]/[3] the
        crop origin produced by crop_img -- confirm against crop_img.
        """
        for i, coord in enumerate(poly):
            if i % 2 == 0:
                poly[i] = coord - useful_list[0] + useful_list[2]
            else:
                poly[i] = coord - useful_list[1] + useful_list[3]

    def _detect_and_recognize_formulas(
        self, images: list, images_layout_res: list
    ) -> None:
        """Run formula detection then recognition, appending results per page."""
        # Formula detection.
        mfd_start_time = time.time()
        images_mfd_res = self.model.mfd_model.batch_predict(
            images, self.batch_ratio * MFD_BASE_BATCH_SIZE
        )
        logger.info(
            f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}'
        )

        # Formula recognition.
        mfr_start_time = time.time()
        images_formula_list = self.model.mfr_model.batch_predict(
            images_mfd_res,
            images,
            batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
        )
        # Index-based on purpose: a length mismatch raises instead of being
        # silently truncated.
        for image_index in range(len(images)):
            images_layout_res[image_index] += images_formula_list[image_index]
        logger.info(
            f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {len(images)}'
        )

    def _ocr_regions(
        self,
        pil_img,
        layout_res: list,
        ocr_res_list: list,
        single_page_mfdetrec_res: list,
    ) -> None:
        """OCR each region of one page and extend ``layout_res`` with the output."""
        # Process each area that requires OCR processing
        for res in ocr_res_list:
            new_image, useful_list = crop_img(
                res, pil_img, crop_paste_x=50, crop_paste_y=50
            )
            adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                single_page_mfdetrec_res, useful_list
            )

            # The crop is converted from RGB to BGR before being handed to the OCR model.
            new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)

            if self.model.apply_ocr:
                ocr_res = self.model.ocr_model.ocr(
                    new_image, mfd_res=adjusted_mfdetrec_res
                )[0]
            else:
                # Recognition is switched off with rec=False.
                ocr_res = self.model.ocr_model.ocr(
                    new_image, mfd_res=adjusted_mfdetrec_res, rec=False
                )[0]

            # Integration results
            if ocr_res:
                layout_res.extend(get_ocr_result_list(ocr_res, useful_list))

    def _recognize_tables(self, pil_img, table_res_list: list) -> None:
        """Recognize each table region and store valid HTML under ``res['html']``."""
        for res in table_res_list:
            new_image, _ = crop_img(res, pil_img)
            single_table_start_time = time.time()
            html_code = None
            if self.model.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                with torch.no_grad():
                    table_result = self.model.table_model.predict(
                        new_image, 'html'
                    )
                    if len(table_result) > 0:
                        html_code = table_result[0]
            elif self.model.table_model_name == MODEL_NAME.TABLE_MASTER:
                html_code = self.model.table_model.img2html(new_image)
            elif self.model.table_model_name == MODEL_NAME.RAPID_TABLE:
                # predict() returns three values; only the HTML is used here.
                html_code, _, _ = self.model.table_model.predict(new_image)

            run_time = time.time() - single_table_start_time
            if run_time > self.model.table_max_time:
                logger.warning(
                    f'table recognition processing exceeds max time {self.model.table_max_time}s'
                )

            # Check that the model returned something that looks complete.
            if not html_code:
                logger.warning(
                    'table recognition processing fails, not get html return'
                )
            elif self._has_expected_html_ending(html_code):
                res['html'] = html_code
            else:
                logger.warning(
                    'table recognition processing fails, not found expected HTML table end'
                )

    @staticmethod
    def _has_expected_html_ending(html_code: str) -> bool:
        """Return True when the stripped HTML ends with ``</html>`` or ``</table>``."""
        return html_code.strip().endswith(('</html>', '</table>'))
|
196
|
+
|
197
|
+
|
198
|
+
def doc_batch_analyze(
|
199
|
+
dataset: Dataset,
|
200
|
+
ocr: bool = False,
|
201
|
+
show_log: bool = False,
|
202
|
+
start_page_id=0,
|
203
|
+
end_page_id=None,
|
204
|
+
lang=None,
|
205
|
+
layout_model=None,
|
206
|
+
formula_enable=None,
|
207
|
+
table_enable=None,
|
208
|
+
batch_ratio: int | None = None,
|
209
|
+
) -> InferenceResult:
|
210
|
+
"""Perform batch analysis on a document dataset.
|
211
|
+
|
212
|
+
Args:
|
213
|
+
dataset (Dataset): The dataset containing document pages to be analyzed.
|
214
|
+
ocr (bool, optional): Flag to enable OCR (Optical Character Recognition). Defaults to False.
|
215
|
+
show_log (bool, optional): Flag to enable logging. Defaults to False.
|
216
|
+
start_page_id (int, optional): The starting page ID for analysis. Defaults to 0.
|
217
|
+
end_page_id (int, optional): The ending page ID for analysis. Defaults to None, which means analyze till the last page.
|
218
|
+
lang (str, optional): Language for OCR. Defaults to None.
|
219
|
+
layout_model (optional): Layout model to be used for analysis. Defaults to None.
|
220
|
+
formula_enable (optional): Flag to enable formula detection. Defaults to None.
|
221
|
+
table_enable (optional): Flag to enable table detection. Defaults to None.
|
222
|
+
batch_ratio (int | None, optional): Ratio for batch processing. Defaults to None, which sets it to 1.
|
223
|
+
|
224
|
+
Raises:
|
225
|
+
CUDA_NOT_AVAILABLE: If CUDA is not available, raises an exception as batch analysis is not supported in CPU mode.
|
226
|
+
|
227
|
+
Returns:
|
228
|
+
InferenceResult: The result of the batch analysis containing the analyzed data and the dataset.
|
229
|
+
"""
|
230
|
+
|
231
|
+
if not torch.cuda.is_available():
|
232
|
+
raise CUDA_NOT_AVAILABLE('batch analyze not support in CPU mode')
|
233
|
+
|
234
|
+
lang = None if lang == '' else lang
|
235
|
+
# TODO: auto detect batch size
|
236
|
+
batch_ratio = 1 if batch_ratio is None else batch_ratio
|
237
|
+
end_page_id = end_page_id if end_page_id else len(dataset)
|
238
|
+
|
239
|
+
model_manager = ModelSingleton()
|
240
|
+
custom_model: CustomPEKModel = model_manager.get_model(
|
241
|
+
ocr, show_log, lang, layout_model, formula_enable, table_enable
|
242
|
+
)
|
243
|
+
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
|
244
|
+
|
245
|
+
model_json = []
|
246
|
+
|
247
|
+
# batch analyze
|
248
|
+
images = []
|
249
|
+
for index in range(len(dataset)):
|
250
|
+
if start_page_id <= index <= end_page_id:
|
251
|
+
page_data = dataset.get_page(index)
|
252
|
+
img_dict = page_data.get_image()
|
253
|
+
images.append(img_dict['img'])
|
254
|
+
analyze_result = batch_model(images)
|
255
|
+
|
256
|
+
for index in range(len(dataset)):
|
257
|
+
page_data = dataset.get_page(index)
|
258
|
+
img_dict = page_data.get_image()
|
259
|
+
page_width = img_dict['width']
|
260
|
+
page_height = img_dict['height']
|
261
|
+
if start_page_id <= index <= end_page_id:
|
262
|
+
result = analyze_result.pop(0)
|
263
|
+
else:
|
264
|
+
result = []
|
265
|
+
|
266
|
+
page_info = {'page_no': index, 'height': page_height, 'width': page_width}
|
267
|
+
page_dict = {'layout_dets': result, 'page_info': page_info}
|
268
|
+
model_json.append(page_dict)
|
269
|
+
|
270
|
+
# TODO: clean memory when gpu memory is not enough
|
271
|
+
clean_memory_start_time = time.time()
|
272
|
+
clean_memory(get_device())
|
273
|
+
logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}')
|
274
|
+
|
275
|
+
return InferenceResult(model_json, dataset)
|
@@ -1,16 +1,13 @@
|
|
1
1
|
import os
|
2
2
|
import time
|
3
3
|
|
4
|
-
import fitz
|
5
|
-
import numpy as np
|
6
|
-
from loguru import logger
|
7
|
-
|
8
4
|
# 关闭paddle的信号处理
|
9
5
|
import paddle
|
6
|
+
from loguru import logger
|
7
|
+
|
10
8
|
paddle.disable_signal_handler()
|
11
9
|
|
12
10
|
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
13
|
-
os.environ['YOLO_VERBOSE'] = 'False' # disable yolo logger
|
14
11
|
|
15
12
|
try:
|
16
13
|
import torchtext
|
@@ -28,7 +25,7 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
|
|
28
25
|
get_local_models_dir,
|
29
26
|
get_table_recog_config)
|
30
27
|
from magic_pdf.model.model_list import MODEL
|
31
|
-
from magic_pdf.
|
28
|
+
from magic_pdf.operators.models import InferenceResult
|
32
29
|
|
33
30
|
|
34
31
|
def dict_compare(d1, d2):
|
@@ -45,47 +42,6 @@ def remove_duplicates_dicts(lst):
|
|
45
42
|
return unique_dicts
|
46
43
|
|
47
44
|
|
48
|
-
def load_images_from_pdf(
    pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None
) -> list:
    """Render the pages of an in-memory PDF to RGB numpy arrays.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Target rendering resolution; pages are scaled by ``dpi / 72``.
        start_page_id: Index of the first page to render.
        end_page_id: Index of the last page to render (inclusive). ``None`` or
            a negative value selects the last page; a value past the end is
            clamped with a warning.

    Returns:
        One dict per page of the document with keys ``'img'``, ``'width'`` and
        ``'height'``. Pages outside the requested range get an empty ``'img'``
        list and zero width/height.

    Raises:
        SystemExit: If Pillow cannot be imported.
    """
    try:
        from PIL import Image
    except ImportError:
        # sys.exit rather than the bare exit(): the latter is injected by the
        # site module and is not guaranteed to exist.
        import sys

        logger.error('Pillow not installed, please install by pip.')
        sys.exit(1)

    images = []
    with fitz.open('pdf', pdf_bytes) as doc:
        last_page_id = doc.page_count - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = last_page_id

        # The scaling matrix does not depend on the page; build it once.
        scale = fitz.Matrix(dpi / 72, dpi / 72)
        for index in range(doc.page_count):
            if not start_page_id <= index <= end_page_id:
                images.append({'img': [], 'width': 0, 'height': 0})
                continue

            page = doc[index]
            pm = page.get_pixmap(matrix=scale, alpha=False)

            # If the width or height exceeds 4500 after scaling, do not scale further.
            if pm.width > 4500 or pm.height > 4500:
                pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            img = np.array(Image.frombytes('RGB', (pm.width, pm.height), pm.samples))
            images.append({'img': img, 'width': pm.width, 'height': pm.height})
    return images
|
87
|
-
|
88
|
-
|
89
45
|
class ModelSingleton:
|
90
46
|
_instance = None
|
91
47
|
_models = {}
|
@@ -198,9 +154,6 @@ def doc_analyze(
|
|
198
154
|
table_enable=None,
|
199
155
|
) -> InferenceResult:
|
200
156
|
|
201
|
-
if lang == '':
|
202
|
-
lang = None
|
203
|
-
|
204
157
|
model_manager = ModelSingleton()
|
205
158
|
custom_model = model_manager.get_model(
|
206
159
|
ocr, show_log, lang, layout_model, formula_enable, table_enable
|
@@ -230,7 +183,7 @@ def doc_analyze(
|
|
230
183
|
model_json.append(page_dict)
|
231
184
|
|
232
185
|
gc_start = time.time()
|
233
|
-
clean_memory()
|
186
|
+
clean_memory(get_device())
|
234
187
|
gc_time = round(time.time() - gc_start, 2)
|
235
188
|
logger.info(f'gc time: {gc_time}')
|
236
189
|
|