magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/batch_build_dataset.py +156 -0
- magic_pdf/data/dataset.py +56 -25
- magic_pdf/data/utils.py +108 -9
- magic_pdf/dict2md/ocr_mkcontent.py +4 -3
- magic_pdf/libs/pdf_image_tools.py +11 -6
- magic_pdf/libs/performance_stats.py +12 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +175 -201
- magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
- magic_pdf/model/pdf_extract_kit.py +5 -38
- magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
- magic_pdf/model/sub_modules/model_init.py +50 -37
- magic_pdf/model/sub_modules/model_utils.py +18 -12
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
- magic_pdf/pdf_parse_union_core_v2.py +112 -74
- magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
- magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
- magic_pdf/resources/model_config/model_configs.yaml +1 -1
- magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
- magic_pdf/tools/cli.py +30 -12
- magic_pdf/tools/common.py +90 -12
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
- magic_pdf-1.3.1.dist-info/RECORD +203 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
- magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
- magic_pdf-1.2.2.dist-info/RECORD +0 -147
- /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
- /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
- /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,27 +1,19 @@
|
|
1
1
|
import os
|
2
2
|
import time
|
3
|
+
|
4
|
+
import numpy as np
|
3
5
|
import torch
|
4
6
|
|
5
7
|
os.environ['FLAGS_npu_jit_compile'] = '0' # 关闭paddle的jit编译
|
6
8
|
os.environ['FLAGS_use_stride_kernel'] = '0'
|
7
9
|
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
|
8
10
|
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
9
|
-
|
10
|
-
import paddle
|
11
|
-
paddle.disable_signal_handler()
|
11
|
+
|
12
12
|
|
13
13
|
from loguru import logger
|
14
14
|
|
15
|
-
from magic_pdf.model.batch_analyze import BatchAnalyze
|
16
15
|
from magic_pdf.model.sub_modules.model_utils import get_vram
|
17
|
-
|
18
|
-
try:
|
19
|
-
import torchtext
|
20
|
-
if torchtext.__version__ >= '0.18.0':
|
21
|
-
torchtext.disable_torchtext_deprecation_warning()
|
22
|
-
except ImportError:
|
23
|
-
pass
|
24
|
-
|
16
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
25
17
|
import magic_pdf.model as model_config
|
26
18
|
from magic_pdf.data.dataset import Dataset
|
27
19
|
from magic_pdf.libs.clean_memory import clean_memory
|
@@ -30,8 +22,6 @@ from magic_pdf.libs.config_reader import (get_device, get_formula_config,
|
|
30
22
|
get_local_models_dir,
|
31
23
|
get_table_recog_config)
|
32
24
|
from magic_pdf.model.model_list import MODEL
|
33
|
-
from magic_pdf.operators.models import InferenceResult
|
34
|
-
|
35
25
|
|
36
26
|
class ModelSingleton:
|
37
27
|
_instance = None
|
@@ -72,9 +62,7 @@ def custom_model_init(
|
|
72
62
|
formula_enable=None,
|
73
63
|
table_enable=None,
|
74
64
|
):
|
75
|
-
|
76
65
|
model = None
|
77
|
-
|
78
66
|
if model_config.__model_mode__ == 'lite':
|
79
67
|
logger.warning(
|
80
68
|
'The Lite mode is provided for developers to conduct testing only, and the output quality is '
|
@@ -132,7 +120,6 @@ def custom_model_init(
|
|
132
120
|
|
133
121
|
return custom_model
|
134
122
|
|
135
|
-
|
136
123
|
def doc_analyze(
|
137
124
|
dataset: Dataset,
|
138
125
|
ocr: bool = False,
|
@@ -143,102 +130,165 @@ def doc_analyze(
|
|
143
130
|
layout_model=None,
|
144
131
|
formula_enable=None,
|
145
132
|
table_enable=None,
|
146
|
-
)
|
147
|
-
|
133
|
+
):
|
148
134
|
end_page_id = (
|
149
135
|
end_page_id
|
150
136
|
if end_page_id is not None and end_page_id >= 0
|
151
137
|
else len(dataset) - 1
|
152
138
|
)
|
153
139
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
)
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
140
|
+
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
|
141
|
+
images = []
|
142
|
+
page_wh_list = []
|
143
|
+
for index in range(len(dataset)):
|
144
|
+
if start_page_id <= index <= end_page_id:
|
145
|
+
page_data = dataset.get_page(index)
|
146
|
+
img_dict = page_data.get_image()
|
147
|
+
images.append(img_dict['img'])
|
148
|
+
page_wh_list.append((img_dict['width'], img_dict['height']))
|
149
|
+
if lang is None or lang == 'auto':
|
150
|
+
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
|
151
|
+
else:
|
152
|
+
images_with_extra_info = [(images[index], ocr, lang) for index in range(len(dataset))]
|
162
153
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
154
|
+
if len(images) >= MIN_BATCH_INFERENCE_SIZE:
|
155
|
+
batch_size = MIN_BATCH_INFERENCE_SIZE
|
156
|
+
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
|
157
|
+
else:
|
158
|
+
batch_images = [images_with_extra_info]
|
168
159
|
|
169
|
-
|
170
|
-
|
171
|
-
|
160
|
+
results = []
|
161
|
+
for sn, batch_image in enumerate(batch_images):
|
162
|
+
_, result = may_batch_image_analyze(batch_image, sn, ocr, show_log,layout_model, formula_enable, table_enable)
|
163
|
+
results.extend(result)
|
172
164
|
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
165
|
+
model_json = []
|
166
|
+
for index in range(len(dataset)):
|
167
|
+
if start_page_id <= index <= end_page_id:
|
168
|
+
result = results.pop(0)
|
169
|
+
page_width, page_height = page_wh_list.pop(0)
|
170
|
+
else:
|
171
|
+
result = []
|
172
|
+
page_height = 0
|
173
|
+
page_width = 0
|
179
174
|
|
180
|
-
|
181
|
-
|
175
|
+
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
|
176
|
+
page_dict = {'layout_dets': result, 'page_info': page_info}
|
177
|
+
model_json.append(page_dict)
|
182
178
|
|
183
|
-
|
184
|
-
|
179
|
+
from magic_pdf.operators.models import InferenceResult
|
180
|
+
return InferenceResult(model_json, dataset)
|
185
181
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
182
|
+
def batch_doc_analyze(
|
183
|
+
datasets: list[Dataset],
|
184
|
+
parse_method: str,
|
185
|
+
show_log: bool = False,
|
186
|
+
lang=None,
|
187
|
+
layout_model=None,
|
188
|
+
formula_enable=None,
|
189
|
+
table_enable=None,
|
190
|
+
):
|
191
|
+
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
|
192
|
+
batch_size = MIN_BATCH_INFERENCE_SIZE
|
193
|
+
images = []
|
194
|
+
page_wh_list = []
|
198
195
|
|
196
|
+
images_with_extra_info = []
|
197
|
+
for dataset in datasets:
|
199
198
|
for index in range(len(dataset)):
|
200
|
-
if
|
201
|
-
|
202
|
-
page_width, page_height = page_wh_list.pop(0)
|
199
|
+
if lang is None or lang == 'auto':
|
200
|
+
_lang = dataset._lang
|
203
201
|
else:
|
204
|
-
|
205
|
-
page_height = 0
|
206
|
-
page_width = 0
|
202
|
+
_lang = lang
|
207
203
|
|
208
|
-
|
204
|
+
page_data = dataset.get_page(index)
|
205
|
+
img_dict = page_data.get_image()
|
206
|
+
images.append(img_dict['img'])
|
207
|
+
page_wh_list.append((img_dict['width'], img_dict['height']))
|
208
|
+
if parse_method == 'auto':
|
209
|
+
images_with_extra_info.append((images[-1], dataset.classify() == SupportedPdfParseMethod.OCR, _lang))
|
210
|
+
else:
|
211
|
+
images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
|
212
|
+
|
213
|
+
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
|
214
|
+
results = []
|
215
|
+
for sn, batch_image in enumerate(batch_images):
|
216
|
+
_, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
|
217
|
+
results.extend(result)
|
218
|
+
|
219
|
+
infer_results = []
|
220
|
+
from magic_pdf.operators.models import InferenceResult
|
221
|
+
for index in range(len(datasets)):
|
222
|
+
dataset = datasets[index]
|
223
|
+
model_json = []
|
224
|
+
for i in range(len(dataset)):
|
225
|
+
result = results.pop(0)
|
226
|
+
page_width, page_height = page_wh_list.pop(0)
|
227
|
+
page_info = {'page_no': i, 'width': page_width, 'height': page_height}
|
209
228
|
page_dict = {'layout_dets': result, 'page_info': page_info}
|
210
229
|
model_json.append(page_dict)
|
230
|
+
infer_results.append(InferenceResult(model_json, dataset))
|
231
|
+
return infer_results
|
211
232
|
|
212
|
-
else:
|
213
|
-
# single analyze
|
214
233
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
234
|
+
def may_batch_image_analyze(
|
235
|
+
images_with_extra_info: list[(np.ndarray, bool, str)],
|
236
|
+
idx: int,
|
237
|
+
ocr: bool,
|
238
|
+
show_log: bool = False,
|
239
|
+
layout_model=None,
|
240
|
+
formula_enable=None,
|
241
|
+
table_enable=None):
|
242
|
+
# os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)
|
243
|
+
|
244
|
+
from magic_pdf.model.batch_analyze import BatchAnalyze
|
245
|
+
|
246
|
+
model_manager = ModelSingleton()
|
247
|
+
|
248
|
+
# images = [image for image, _, _ in images_with_extra_info]
|
249
|
+
batch_ratio = 1
|
250
|
+
device = get_device()
|
251
|
+
|
252
|
+
if str(device).startswith('npu'):
|
253
|
+
import torch_npu
|
254
|
+
if torch_npu.npu.is_available():
|
255
|
+
torch.npu.set_compile_mode(jit_compile=False)
|
256
|
+
|
257
|
+
if str(device).startswith('npu') or str(device).startswith('cuda'):
|
258
|
+
vram = get_vram(device)
|
259
|
+
if vram is not None:
|
260
|
+
gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram)))
|
261
|
+
if gpu_memory >= 16:
|
262
|
+
batch_ratio = 16
|
263
|
+
elif gpu_memory >= 12:
|
264
|
+
batch_ratio = 8
|
265
|
+
elif gpu_memory >= 8:
|
266
|
+
batch_ratio = 4
|
267
|
+
elif gpu_memory >= 6:
|
268
|
+
batch_ratio = 2
|
225
269
|
else:
|
226
|
-
|
270
|
+
batch_ratio = 1
|
271
|
+
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
|
272
|
+
else:
|
273
|
+
# Default batch_ratio when VRAM can't be determined
|
274
|
+
batch_ratio = 1
|
275
|
+
logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')
|
227
276
|
|
228
|
-
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
|
229
|
-
page_dict = {'layout_dets': result, 'page_info': page_info}
|
230
|
-
model_json.append(page_dict)
|
231
277
|
|
232
|
-
|
233
|
-
clean_memory(get_device())
|
234
|
-
gc_time = round(time.time() - gc_start, 2)
|
235
|
-
logger.info(f'gc time: {gc_time}')
|
236
|
-
|
237
|
-
doc_analyze_time = round(time.time() - doc_analyze_start, 2)
|
238
|
-
doc_analyze_speed = round((end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
|
239
|
-
logger.info(
|
240
|
-
f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
|
241
|
-
f' speed: {doc_analyze_speed} pages/second'
|
242
|
-
)
|
278
|
+
# doc_analyze_start = time.time()
|
243
279
|
|
244
|
-
|
280
|
+
batch_model = BatchAnalyze(model_manager, batch_ratio, show_log, layout_model, formula_enable, table_enable)
|
281
|
+
results = batch_model(images_with_extra_info)
|
282
|
+
|
283
|
+
# gc_start = time.time()
|
284
|
+
clean_memory(get_device())
|
285
|
+
# gc_time = round(time.time() - gc_start, 2)
|
286
|
+
# logger.debug(f'gc time: {gc_time}')
|
287
|
+
|
288
|
+
# doc_analyze_time = round(time.time() - doc_analyze_start, 2)
|
289
|
+
# doc_analyze_speed = round(len(images) / doc_analyze_time, 2)
|
290
|
+
# logger.debug(
|
291
|
+
# f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
|
292
|
+
# f' speed: {doc_analyze_speed} pages/second'
|
293
|
+
# )
|
294
|
+
return idx, results
|
@@ -3,28 +3,18 @@ import os
|
|
3
3
|
import time
|
4
4
|
|
5
5
|
import cv2
|
6
|
-
import numpy as np
|
7
6
|
import torch
|
8
7
|
import yaml
|
9
8
|
from loguru import logger
|
10
|
-
from PIL import Image
|
11
9
|
|
12
10
|
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
13
11
|
|
14
|
-
try:
|
15
|
-
import torchtext
|
16
|
-
|
17
|
-
if torchtext.__version__ >= '0.18.0':
|
18
|
-
torchtext.disable_torchtext_deprecation_warning()
|
19
|
-
except ImportError:
|
20
|
-
pass
|
21
|
-
|
22
12
|
from magic_pdf.config.constants import *
|
23
13
|
from magic_pdf.model.model_list import AtomicModel
|
24
14
|
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
|
25
15
|
from magic_pdf.model.sub_modules.model_utils import (
|
26
16
|
clean_vram, crop_img, get_res_list_from_layout_res)
|
27
|
-
from magic_pdf.model.sub_modules.ocr.
|
17
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
|
28
18
|
get_adjusted_mfdetrec_res, get_ocr_result_list)
|
29
19
|
|
30
20
|
|
@@ -120,7 +110,7 @@ class CustomPEKModel:
|
|
120
110
|
atom_model_name=AtomicModel.MFR,
|
121
111
|
mfr_weight_dir=mfr_weight_dir,
|
122
112
|
mfr_cfg_path=mfr_cfg_path,
|
123
|
-
device=
|
113
|
+
device=self.device,
|
124
114
|
)
|
125
115
|
|
126
116
|
# 初始化layout模型
|
@@ -174,11 +164,6 @@ class CustomPEKModel:
|
|
174
164
|
logger.info('DocAnalysis init done!')
|
175
165
|
|
176
166
|
def __call__(self, image):
|
177
|
-
|
178
|
-
pil_img = Image.fromarray(image)
|
179
|
-
width, height = pil_img.size
|
180
|
-
# logger.info(f'width: {width}, height: {height}')
|
181
|
-
|
182
167
|
# layout检测
|
183
168
|
layout_start = time.time()
|
184
169
|
layout_res = []
|
@@ -186,24 +171,6 @@ class CustomPEKModel:
|
|
186
171
|
# layoutlmv3
|
187
172
|
layout_res = self.layout_model(image, ignore_catids=[])
|
188
173
|
elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
|
189
|
-
# doclayout_yolo
|
190
|
-
# if height > width:
|
191
|
-
# input_res = {"poly":[0,0,width,0,width,height,0,height]}
|
192
|
-
# new_image, useful_list = crop_img(input_res, pil_img, crop_paste_x=width//2, crop_paste_y=0)
|
193
|
-
# paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
|
194
|
-
# layout_res = self.layout_model.predict(new_image)
|
195
|
-
# for res in layout_res:
|
196
|
-
# p1, p2, p3, p4, p5, p6, p7, p8 = res['poly']
|
197
|
-
# p1 = p1 - paste_x + xmin
|
198
|
-
# p2 = p2 - paste_y + ymin
|
199
|
-
# p3 = p3 - paste_x + xmin
|
200
|
-
# p4 = p4 - paste_y + ymin
|
201
|
-
# p5 = p5 - paste_x + xmin
|
202
|
-
# p6 = p6 - paste_y + ymin
|
203
|
-
# p7 = p7 - paste_x + xmin
|
204
|
-
# p8 = p8 - paste_y + ymin
|
205
|
-
# res['poly'] = [p1, p2, p3, p4, p5, p6, p7, p8]
|
206
|
-
# else:
|
207
174
|
layout_res = self.layout_model.predict(image)
|
208
175
|
|
209
176
|
layout_cost = round(time.time() - layout_start, 2)
|
@@ -234,11 +201,11 @@ class CustomPEKModel:
|
|
234
201
|
ocr_start = time.time()
|
235
202
|
# Process each area that requires OCR processing
|
236
203
|
for res in ocr_res_list:
|
237
|
-
new_image, useful_list = crop_img(res,
|
204
|
+
new_image, useful_list = crop_img(res, image, crop_paste_x=50, crop_paste_y=50)
|
238
205
|
adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list)
|
239
206
|
|
240
207
|
# OCR recognition
|
241
|
-
new_image = cv2.cvtColor(
|
208
|
+
new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
|
242
209
|
|
243
210
|
if self.apply_ocr:
|
244
211
|
ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
|
@@ -260,7 +227,7 @@ class CustomPEKModel:
|
|
260
227
|
if self.apply_table:
|
261
228
|
table_start = time.time()
|
262
229
|
for res in table_res_list:
|
263
|
-
new_image, _ = crop_img(res,
|
230
|
+
new_image, _ = crop_img(res, image)
|
264
231
|
single_table_start_time = time.time()
|
265
232
|
html_code = None
|
266
233
|
if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
|
@@ -3,8 +3,6 @@ import os
|
|
3
3
|
from pathlib import Path
|
4
4
|
|
5
5
|
import yaml
|
6
|
-
from PIL import Image
|
7
|
-
|
8
6
|
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
9
7
|
|
10
8
|
from magic_pdf.config.constants import MODEL_NAME
|
@@ -42,7 +40,7 @@ def get_text_images(simple_images):
|
|
42
40
|
)
|
43
41
|
text_images = []
|
44
42
|
for simple_image in simple_images:
|
45
|
-
image =
|
43
|
+
image = simple_image['img']
|
46
44
|
layout_res = temp_layout_model.predict(image)
|
47
45
|
# 给textblock截图
|
48
46
|
for res in layout_res:
|
@@ -51,7 +49,7 @@ def get_text_images(simple_images):
|
|
51
49
|
# 初步清洗(宽和高都小于100)
|
52
50
|
if x2 - x1 < 100 and y2 - y1 < 100:
|
53
51
|
continue
|
54
|
-
text_images.append(image
|
52
|
+
text_images.append(image[y1:y2, x1:x2])
|
55
53
|
return text_images
|
56
54
|
|
57
55
|
|
@@ -2,9 +2,9 @@
|
|
2
2
|
import time
|
3
3
|
from collections import Counter
|
4
4
|
from uuid import uuid4
|
5
|
-
|
5
|
+
import cv2
|
6
|
+
import numpy as np
|
6
7
|
import torch
|
7
|
-
from PIL import Image
|
8
8
|
from loguru import logger
|
9
9
|
from ultralytics import YOLO
|
10
10
|
|
@@ -29,7 +29,7 @@ def split_images(image, result_images=None):
|
|
29
29
|
if result_images is None:
|
30
30
|
result_images = []
|
31
31
|
|
32
|
-
|
32
|
+
height, width = image.shape[:2]
|
33
33
|
long_side = max(width, height) # 获取较长边长度
|
34
34
|
|
35
35
|
if long_side <= 400:
|
@@ -44,16 +44,14 @@ def split_images(image, result_images=None):
|
|
44
44
|
# 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作
|
45
45
|
if x + new_long_side > width:
|
46
46
|
continue
|
47
|
-
|
48
|
-
sub_image = image.crop(box)
|
47
|
+
sub_image = image[0:height, x:x + new_long_side]
|
49
48
|
sub_images.append(sub_image)
|
50
49
|
else: # 如果高度是较长边
|
51
50
|
for y in range(0, height, new_long_side):
|
52
51
|
# 判断裁剪区域是否超出图片范围,如果超出则不进行裁剪保存操作
|
53
52
|
if y + new_long_side > height:
|
54
53
|
continue
|
55
|
-
|
56
|
-
sub_image = image.crop(box)
|
54
|
+
sub_image = image[y:y + new_long_side, 0:width]
|
57
55
|
sub_images.append(sub_image)
|
58
56
|
|
59
57
|
for sub_image in sub_images:
|
@@ -64,24 +62,32 @@ def split_images(image, result_images=None):
|
|
64
62
|
|
65
63
|
def resize_images_to_224(image):
|
66
64
|
"""
|
67
|
-
若分辨率小于224则用黑色背景补齐到224*224大小,若大于等于224则调整为224*224
|
65
|
+
若分辨率小于224则用黑色背景补齐到224*224大小,若大于等于224则调整为224*224大小。
|
66
|
+
Works directly with NumPy arrays.
|
68
67
|
"""
|
69
68
|
try:
|
70
|
-
|
69
|
+
height, width = image.shape[:2]
|
70
|
+
|
71
71
|
if width < 224 or height < 224:
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
72
|
+
# Create black background
|
73
|
+
new_image = np.zeros((224, 224, 3), dtype=np.uint8)
|
74
|
+
# Calculate paste position (ensure they're not negative)
|
75
|
+
paste_x = max(0, (224 - width) // 2)
|
76
|
+
paste_y = max(0, (224 - height) // 2)
|
77
|
+
# Make sure we don't exceed the boundaries of new_image
|
78
|
+
paste_width = min(width, 224)
|
79
|
+
paste_height = min(height, 224)
|
80
|
+
# Paste original image onto black background
|
81
|
+
new_image[paste_y:paste_y + paste_height, paste_x:paste_x + paste_width] = image[:paste_height, :paste_width]
|
76
82
|
image = new_image
|
77
83
|
else:
|
78
|
-
|
84
|
+
# Resize using cv2
|
85
|
+
image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_LANCZOS4)
|
79
86
|
|
80
|
-
# uuid = str(uuid4())
|
81
|
-
# image.save(f"/tmp/{uuid}.jpg")
|
82
87
|
return image
|
83
88
|
except Exception as e:
|
84
|
-
logger.exception(e)
|
89
|
+
logger.exception(f"Error in resize_images_to_224: {e}")
|
90
|
+
return None
|
85
91
|
|
86
92
|
|
87
93
|
class YOLOv11LangDetModel(object):
|
@@ -96,8 +102,7 @@ class YOLOv11LangDetModel(object):
|
|
96
102
|
def do_detect(self, images: list):
|
97
103
|
all_images = []
|
98
104
|
for image in images:
|
99
|
-
|
100
|
-
# logger.info(f"image size: {width} x {height}")
|
105
|
+
height, width = image.shape[:2]
|
101
106
|
if width < 100 and height < 100:
|
102
107
|
continue
|
103
108
|
temp_images = split_images(image)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
from doclayout_yolo import YOLOv10
|
2
|
+
from tqdm import tqdm
|
2
3
|
|
3
4
|
|
4
5
|
class DocLayoutYOLOModel(object):
|
@@ -31,7 +32,8 @@ class DocLayoutYOLOModel(object):
|
|
31
32
|
|
32
33
|
def batch_predict(self, images: list, batch_size: int) -> list:
|
33
34
|
images_layout_res = []
|
34
|
-
for index in range(0, len(images), batch_size):
|
35
|
+
# for index in range(0, len(images), batch_size):
|
36
|
+
for index in tqdm(range(0, len(images), batch_size), desc="Layout Predict"):
|
35
37
|
doclayout_yolo_res = [
|
36
38
|
image_res.cpu()
|
37
39
|
for image_res in self.model.predict(
|
@@ -1,3 +1,4 @@
|
|
1
|
+
from tqdm import tqdm
|
1
2
|
from ultralytics import YOLO
|
2
3
|
|
3
4
|
|
@@ -14,7 +15,8 @@ class YOLOv8MFDModel(object):
|
|
14
15
|
|
15
16
|
def batch_predict(self, images: list, batch_size: int) -> list:
|
16
17
|
images_mfd_res = []
|
17
|
-
for index in range(0, len(images), batch_size):
|
18
|
+
# for index in range(0, len(images), batch_size):
|
19
|
+
for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"):
|
18
20
|
mfd_res = [
|
19
21
|
image_res.cpu()
|
20
22
|
for image_res in self.mfd_model.predict(
|