magic-pdf 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +24 -0
- magic_pdf/filter/__init__.py +1 -1
- magic_pdf/filter/pdf_classify_by_type.py +6 -4
- magic_pdf/filter/pdf_meta_scan.py +4 -4
- magic_pdf/libs/pdf_check.py +11 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +31 -39
- magic_pdf/model/magic_model.py +161 -4
- magic_pdf/model/pdf_extract_kit.py +0 -7
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +4 -3
- magic_pdf/model/sub_modules/model_init.py +28 -14
- magic_pdf/pdf_parse_union_core_v2.py +45 -32
- magic_pdf/post_proc/llm_aided.py +14 -16
- magic_pdf/pre_proc/ocr_dict_merge.py +14 -2
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/METADATA +49 -41
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/RECORD +20 -21
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/WHEEL +1 -1
- magic_pdf/post_proc/llm_aided_ocr.py +0 -689
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/top_level.txt +0 -0
@@ -126,11 +126,35 @@ def detect_language(text):
|
|
126
126
|
return 'empty'
|
127
127
|
|
128
128
|
|
129
|
+
def full_to_half(text: str) -> str:
|
130
|
+
"""Convert full-width characters to half-width characters using code point manipulation.
|
131
|
+
|
132
|
+
Args:
|
133
|
+
text: String containing full-width characters
|
134
|
+
|
135
|
+
Returns:
|
136
|
+
String with full-width characters converted to half-width
|
137
|
+
"""
|
138
|
+
result = []
|
139
|
+
for char in text:
|
140
|
+
code = ord(char)
|
141
|
+
# Full-width ASCII variants (FF01-FF5E)
|
142
|
+
if 0xFF01 <= code <= 0xFF5E:
|
143
|
+
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
|
144
|
+
# Full-width space
|
145
|
+
elif code == 0x3000:
|
146
|
+
result.append(' ')
|
147
|
+
else:
|
148
|
+
result.append(char)
|
149
|
+
return ''.join(result)
|
150
|
+
|
151
|
+
|
129
152
|
def merge_para_with_text(para_block):
|
130
153
|
block_text = ''
|
131
154
|
for line in para_block['lines']:
|
132
155
|
for span in line['spans']:
|
133
156
|
if span['type'] in [ContentType.Text]:
|
157
|
+
span['content'] = full_to_half(span['content'])
|
134
158
|
block_text += span['content']
|
135
159
|
block_lang = detect_lang(block_text)
|
136
160
|
|
magic_pdf/filter/__init__.py
CHANGED
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
|
|
23
23
|
pdf_meta['image_info_per_page'],
|
24
24
|
pdf_meta['text_len_per_page'],
|
25
25
|
pdf_meta['imgs_per_page'],
|
26
|
-
pdf_meta['text_layout_per_page'],
|
26
|
+
# pdf_meta['text_layout_per_page'],
|
27
27
|
pdf_meta['invalid_chars'],
|
28
28
|
)
|
29
29
|
if is_text_pdf:
|
@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
|
305
305
|
|
306
306
|
|
307
307
|
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
|
308
|
-
text_layout_list: list,
|
308
|
+
# text_layout_list: list,
|
309
|
+
invalid_chars: bool):
|
309
310
|
"""
|
310
311
|
这里的图片和页面长度单位是pts
|
311
312
|
:param total_page:
|
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
|
321
322
|
'by_text_len': classify_by_text_len(text_len_list, total_page),
|
322
323
|
'by_avg_words': classify_by_avg_words(text_len_list),
|
323
324
|
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
|
324
|
-
'by_text_layout': classify_by_text_layout(text_layout_list),
|
325
|
+
# 'by_text_layout': classify_by_text_layout(text_layout_list),
|
325
326
|
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
|
326
327
|
'by_invalid_chars': invalid_chars,
|
327
328
|
}
|
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
|
332
333
|
return False, results
|
333
334
|
else:
|
334
335
|
logger.warning(
|
335
|
-
f"
|
336
|
+
f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
|
336
337
|
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
|
337
|
-
f" by_text_layout: {results['by_text_layout']},
|
338
|
+
# f" by_text_layout: {results['by_text_layout']},"
|
339
|
+
f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
|
338
340
|
f" by_invalid_chars: {results['by_invalid_chars']}",
|
339
341
|
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
340
342
|
return False, results
|
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
356
356
|
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
|
357
357
|
text_len_per_page = get_pdf_textlen_per_page(doc)
|
358
358
|
# logger.info(f"text_len_per_page: {text_len_per_page}")
|
359
|
-
text_layout_per_page = get_pdf_text_layout_per_page(doc)
|
359
|
+
# text_layout_per_page = get_pdf_text_layout_per_page(doc)
|
360
360
|
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
|
361
|
-
text_language = get_language(doc)
|
361
|
+
# text_language = get_language(doc)
|
362
362
|
# logger.info(f"text_language: {text_language}")
|
363
363
|
invalid_chars = check_invalid_chars(pdf_bytes)
|
364
364
|
# logger.info(f"invalid_chars: {invalid_chars}")
|
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
372
372
|
'page_height_pts': int(page_height_pts),
|
373
373
|
'image_info_per_page': image_info_per_page,
|
374
374
|
'text_len_per_page': text_len_per_page,
|
375
|
-
'text_layout_per_page': text_layout_per_page,
|
376
|
-
'text_language': text_language,
|
375
|
+
# 'text_layout_per_page': text_layout_per_page,
|
376
|
+
# 'text_language': text_language,
|
377
377
|
# "svgs_per_page": svgs_per_page,
|
378
378
|
'imgs_per_page': imgs_per_page, # 增加每页img数量list
|
379
379
|
'junk_img_bojids': junk_img_bojids, # 增加垃圾图片的bojid list
|
magic_pdf/libs/pdf_check.py
CHANGED
@@ -4,6 +4,7 @@ from loguru import logger
|
|
4
4
|
import re
|
5
5
|
from io import BytesIO
|
6
6
|
from pdfminer.high_level import extract_text
|
7
|
+
from pdfminer.layout import LAParams
|
7
8
|
|
8
9
|
|
9
10
|
def calculate_sample_count(total_page: int):
|
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
|
41
42
|
sample_docs = extract_pages(src_pdf_bytes)
|
42
43
|
sample_pdf_bytes = sample_docs.tobytes()
|
43
44
|
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
44
|
-
|
45
|
+
laparams = LAParams(
|
46
|
+
line_overlap=0.5,
|
47
|
+
char_margin=2.0,
|
48
|
+
line_margin=0.5,
|
49
|
+
word_margin=0.1,
|
50
|
+
boxes_flow=None,
|
51
|
+
detect_vertical=False,
|
52
|
+
all_texts=False,
|
53
|
+
)
|
54
|
+
text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
|
45
55
|
text = text.replace("\n", "")
|
46
56
|
# logger.info(text)
|
47
57
|
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.
|
1
|
+
__version__ = "1.2.0"
|
@@ -1,21 +1,22 @@
|
|
1
1
|
import os
|
2
2
|
import time
|
3
|
+
import torch
|
3
4
|
|
5
|
+
os.environ['FLAGS_npu_jit_compile'] = '0' # 关闭paddle的jit编译
|
6
|
+
os.environ['FLAGS_use_stride_kernel'] = '0'
|
7
|
+
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback
|
8
|
+
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
4
9
|
# 关闭paddle的信号处理
|
5
10
|
import paddle
|
6
|
-
|
11
|
+
paddle.disable_signal_handler()
|
12
|
+
|
7
13
|
from loguru import logger
|
8
14
|
|
9
15
|
from magic_pdf.model.batch_analyze import BatchAnalyze
|
10
16
|
from magic_pdf.model.sub_modules.model_utils import get_vram
|
11
17
|
|
12
|
-
paddle.disable_signal_handler()
|
13
|
-
|
14
|
-
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
|
15
|
-
|
16
18
|
try:
|
17
19
|
import torchtext
|
18
|
-
|
19
20
|
if torchtext.__version__ >= '0.18.0':
|
20
21
|
torchtext.disable_torchtext_deprecation_warning()
|
21
22
|
except ImportError:
|
@@ -32,20 +33,6 @@ from magic_pdf.model.model_list import MODEL
|
|
32
33
|
from magic_pdf.operators.models import InferenceResult
|
33
34
|
|
34
35
|
|
35
|
-
def dict_compare(d1, d2):
|
36
|
-
return d1.items() == d2.items()
|
37
|
-
|
38
|
-
|
39
|
-
def remove_duplicates_dicts(lst):
|
40
|
-
unique_dicts = []
|
41
|
-
for dict_item in lst:
|
42
|
-
if not any(
|
43
|
-
dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
|
44
|
-
):
|
45
|
-
unique_dicts.append(dict_item)
|
46
|
-
return unique_dicts
|
47
|
-
|
48
|
-
|
49
36
|
class ModelSingleton:
|
50
37
|
_instance = None
|
51
38
|
_models = {}
|
@@ -158,7 +145,11 @@ def doc_analyze(
|
|
158
145
|
table_enable=None,
|
159
146
|
) -> InferenceResult:
|
160
147
|
|
161
|
-
end_page_id =
|
148
|
+
end_page_id = (
|
149
|
+
end_page_id
|
150
|
+
if end_page_id is not None and end_page_id >= 0
|
151
|
+
else len(dataset) - 1
|
152
|
+
)
|
162
153
|
|
163
154
|
model_manager = ModelSingleton()
|
164
155
|
custom_model = model_manager.get_model(
|
@@ -166,6 +157,7 @@ def doc_analyze(
|
|
166
157
|
)
|
167
158
|
|
168
159
|
batch_analyze = False
|
160
|
+
batch_ratio = 1
|
169
161
|
device = get_device()
|
170
162
|
|
171
163
|
npu_support = False
|
@@ -178,21 +170,19 @@ def doc_analyze(
|
|
178
170
|
gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
|
179
171
|
if gpu_memory is not None and gpu_memory >= 8:
|
180
172
|
|
181
|
-
if
|
182
|
-
batch_ratio =
|
183
|
-
elif
|
184
|
-
batch_ratio = 4
|
185
|
-
elif 12 < gpu_memory <= 16:
|
186
|
-
batch_ratio = 8
|
187
|
-
elif 16 < gpu_memory <= 24:
|
173
|
+
if gpu_memory >= 40:
|
174
|
+
batch_ratio = 32
|
175
|
+
elif gpu_memory >=20:
|
188
176
|
batch_ratio = 16
|
177
|
+
elif gpu_memory >= 16:
|
178
|
+
batch_ratio = 8
|
179
|
+
elif gpu_memory >= 10:
|
180
|
+
batch_ratio = 4
|
189
181
|
else:
|
190
|
-
batch_ratio =
|
182
|
+
batch_ratio = 2
|
191
183
|
|
192
|
-
|
193
|
-
|
194
|
-
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
|
195
|
-
batch_analyze = True
|
184
|
+
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
|
185
|
+
batch_analyze = True
|
196
186
|
|
197
187
|
model_json = []
|
198
188
|
doc_analyze_start = time.time()
|
@@ -200,24 +190,26 @@ def doc_analyze(
|
|
200
190
|
if batch_analyze:
|
201
191
|
# batch analyze
|
202
192
|
images = []
|
193
|
+
page_wh_list = []
|
203
194
|
for index in range(len(dataset)):
|
204
195
|
if start_page_id <= index <= end_page_id:
|
205
196
|
page_data = dataset.get_page(index)
|
206
197
|
img_dict = page_data.get_image()
|
207
198
|
images.append(img_dict['img'])
|
199
|
+
page_wh_list.append((img_dict['width'], img_dict['height']))
|
200
|
+
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
|
208
201
|
analyze_result = batch_model(images)
|
209
202
|
|
210
203
|
for index in range(len(dataset)):
|
211
|
-
page_data = dataset.get_page(index)
|
212
|
-
img_dict = page_data.get_image()
|
213
|
-
page_width = img_dict['width']
|
214
|
-
page_height = img_dict['height']
|
215
204
|
if start_page_id <= index <= end_page_id:
|
216
205
|
result = analyze_result.pop(0)
|
206
|
+
page_width, page_height = page_wh_list.pop(0)
|
217
207
|
else:
|
218
208
|
result = []
|
209
|
+
page_height = 0
|
210
|
+
page_width = 0
|
219
211
|
|
220
|
-
page_info = {'page_no': index, '
|
212
|
+
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
|
221
213
|
page_dict = {'layout_dets': result, 'page_info': page_info}
|
222
214
|
model_json.append(page_dict)
|
223
215
|
|
@@ -237,7 +229,7 @@ def doc_analyze(
|
|
237
229
|
else:
|
238
230
|
result = []
|
239
231
|
|
240
|
-
page_info = {'page_no': index, '
|
232
|
+
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
|
241
233
|
page_dict = {'layout_dets': result, 'page_info': page_info}
|
242
234
|
model_json.append(page_dict)
|
243
235
|
|
magic_pdf/model/magic_model.py
CHANGED
@@ -450,11 +450,168 @@ class MagicModel:
|
|
450
450
|
)
|
451
451
|
return ret
|
452
452
|
|
453
|
+
|
454
|
+
def __tie_up_category_by_distance_v3(
|
455
|
+
self,
|
456
|
+
page_no: int,
|
457
|
+
subject_category_id: int,
|
458
|
+
object_category_id: int,
|
459
|
+
priority_pos: PosRelationEnum,
|
460
|
+
):
|
461
|
+
subjects = self.__reduct_overlap(
|
462
|
+
list(
|
463
|
+
map(
|
464
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
465
|
+
filter(
|
466
|
+
lambda x: x['category_id'] == subject_category_id,
|
467
|
+
self.__model_list[page_no]['layout_dets'],
|
468
|
+
),
|
469
|
+
)
|
470
|
+
)
|
471
|
+
)
|
472
|
+
objects = self.__reduct_overlap(
|
473
|
+
list(
|
474
|
+
map(
|
475
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
476
|
+
filter(
|
477
|
+
lambda x: x['category_id'] == object_category_id,
|
478
|
+
self.__model_list[page_no]['layout_dets'],
|
479
|
+
),
|
480
|
+
)
|
481
|
+
)
|
482
|
+
)
|
483
|
+
|
484
|
+
ret = []
|
485
|
+
N, M = len(subjects), len(objects)
|
486
|
+
subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
487
|
+
objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
|
488
|
+
|
489
|
+
OBJ_IDX_OFFSET = 10000
|
490
|
+
SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1
|
491
|
+
|
492
|
+
all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET , OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)]
|
493
|
+
seen_idx = set()
|
494
|
+
seen_sub_idx = set()
|
495
|
+
|
496
|
+
while N > len(seen_sub_idx):
|
497
|
+
candidates = []
|
498
|
+
for idx, kind, x0, y0 in all_boxes_with_idx:
|
499
|
+
if idx in seen_idx:
|
500
|
+
continue
|
501
|
+
candidates.append((idx, kind, x0, y0))
|
502
|
+
|
503
|
+
if len(candidates) == 0:
|
504
|
+
break
|
505
|
+
left_x = min([v[2] for v in candidates])
|
506
|
+
top_y = min([v[3] for v in candidates])
|
507
|
+
|
508
|
+
candidates.sort(key=lambda x: (x[2]-left_x) ** 2 + (x[3] - top_y) ** 2)
|
509
|
+
|
510
|
+
|
511
|
+
fst_idx, fst_kind, left_x, top_y = candidates[0]
|
512
|
+
candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y)**2)
|
513
|
+
nxt = None
|
514
|
+
|
515
|
+
for i in range(1, len(candidates)):
|
516
|
+
if candidates[i][1] ^ fst_kind == 1:
|
517
|
+
nxt = candidates[i]
|
518
|
+
break
|
519
|
+
if nxt is None:
|
520
|
+
break
|
521
|
+
|
522
|
+
if fst_kind == SUB_BIT_KIND:
|
523
|
+
sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET
|
524
|
+
|
525
|
+
else:
|
526
|
+
sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET
|
527
|
+
|
528
|
+
pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
|
529
|
+
nearest_dis = float('inf')
|
530
|
+
for i in range(N):
|
531
|
+
if i in seen_idx:continue
|
532
|
+
nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
|
533
|
+
|
534
|
+
if pair_dis >= 3*nearest_dis:
|
535
|
+
seen_idx.add(sub_idx)
|
536
|
+
continue
|
537
|
+
|
538
|
+
|
539
|
+
seen_idx.add(sub_idx)
|
540
|
+
seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
|
541
|
+
seen_sub_idx.add(sub_idx)
|
542
|
+
|
543
|
+
ret.append(
|
544
|
+
{
|
545
|
+
'sub_bbox': {
|
546
|
+
'bbox': subjects[sub_idx]['bbox'],
|
547
|
+
'score': subjects[sub_idx]['score'],
|
548
|
+
},
|
549
|
+
'obj_bboxes': [
|
550
|
+
{'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']}
|
551
|
+
],
|
552
|
+
'sub_idx': sub_idx,
|
553
|
+
}
|
554
|
+
)
|
555
|
+
|
556
|
+
for i in range(len(objects)):
|
557
|
+
j = i + OBJ_IDX_OFFSET
|
558
|
+
if j in seen_idx:
|
559
|
+
continue
|
560
|
+
seen_idx.add(j)
|
561
|
+
nearest_dis, nearest_sub_idx = float('inf'), -1
|
562
|
+
for k in range(len(subjects)):
|
563
|
+
dis = bbox_distance(objects[i]['bbox'], subjects[k]['bbox'])
|
564
|
+
if dis < nearest_dis:
|
565
|
+
nearest_dis = dis
|
566
|
+
nearest_sub_idx = k
|
567
|
+
|
568
|
+
for k in range(len(subjects)):
|
569
|
+
if k != nearest_sub_idx: continue
|
570
|
+
if k in seen_sub_idx:
|
571
|
+
for kk in range(len(ret)):
|
572
|
+
if ret[kk]['sub_idx'] == k:
|
573
|
+
ret[kk]['obj_bboxes'].append({'score': objects[i]['score'], 'bbox': objects[i]['bbox']})
|
574
|
+
break
|
575
|
+
else:
|
576
|
+
ret.append(
|
577
|
+
{
|
578
|
+
'sub_bbox': {
|
579
|
+
'bbox': subjects[k]['bbox'],
|
580
|
+
'score': subjects[k]['score'],
|
581
|
+
},
|
582
|
+
'obj_bboxes': [
|
583
|
+
{'score': objects[i]['score'], 'bbox': objects[i]['bbox']}
|
584
|
+
],
|
585
|
+
'sub_idx': k,
|
586
|
+
}
|
587
|
+
)
|
588
|
+
seen_sub_idx.add(k)
|
589
|
+
seen_idx.add(k)
|
590
|
+
|
591
|
+
|
592
|
+
for i in range(len(subjects)):
|
593
|
+
if i in seen_sub_idx:
|
594
|
+
continue
|
595
|
+
ret.append(
|
596
|
+
{
|
597
|
+
'sub_bbox': {
|
598
|
+
'bbox': subjects[i]['bbox'],
|
599
|
+
'score': subjects[i]['score'],
|
600
|
+
},
|
601
|
+
'obj_bboxes': [],
|
602
|
+
'sub_idx': i,
|
603
|
+
}
|
604
|
+
)
|
605
|
+
|
606
|
+
|
607
|
+
return ret
|
608
|
+
|
609
|
+
|
453
610
|
def get_imgs_v2(self, page_no: int):
|
454
|
-
with_captions = self.
|
611
|
+
with_captions = self.__tie_up_category_by_distance_v3(
|
455
612
|
page_no, 3, 4, PosRelationEnum.BOTTOM
|
456
613
|
)
|
457
|
-
with_footnotes = self.
|
614
|
+
with_footnotes = self.__tie_up_category_by_distance_v3(
|
458
615
|
page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
|
459
616
|
)
|
460
617
|
ret = []
|
@@ -470,10 +627,10 @@ class MagicModel:
|
|
470
627
|
return ret
|
471
628
|
|
472
629
|
def get_tables_v2(self, page_no: int) -> list:
|
473
|
-
with_captions = self.
|
630
|
+
with_captions = self.__tie_up_category_by_distance_v3(
|
474
631
|
page_no, 5, 6, PosRelationEnum.UP
|
475
632
|
)
|
476
|
-
with_footnotes = self.
|
633
|
+
with_footnotes = self.__tie_up_category_by_distance_v3(
|
477
634
|
page_no, 5, 7, PosRelationEnum.ALL
|
478
635
|
)
|
479
636
|
ret = []
|
@@ -89,13 +89,6 @@ class CustomPEKModel:
|
|
89
89
|
# 初始化解析方案
|
90
90
|
self.device = kwargs.get('device', 'cpu')
|
91
91
|
|
92
|
-
if str(self.device).startswith("npu"):
|
93
|
-
import torch_npu
|
94
|
-
os.environ['FLAGS_npu_jit_compile'] = '0'
|
95
|
-
os.environ['FLAGS_use_stride_kernel'] = '0'
|
96
|
-
elif str(self.device).startswith("mps"):
|
97
|
-
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
|
98
|
-
|
99
92
|
logger.info('using device: {}'.format(self.device))
|
100
93
|
models_dir = kwargs.get(
|
101
94
|
'models_dir', os.path.join(root_dir, 'resources', 'models')
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
2
|
+
import time
|
2
3
|
from collections import Counter
|
3
4
|
from uuid import uuid4
|
4
5
|
|
@@ -102,9 +103,9 @@ class YOLOv11LangDetModel(object):
|
|
102
103
|
temp_images = split_images(image)
|
103
104
|
for temp_image in temp_images:
|
104
105
|
all_images.append(resize_images_to_224(temp_image))
|
105
|
-
|
106
|
-
images_lang_res = self.batch_predict(all_images, batch_size=
|
107
|
-
# logger.info(f"
|
106
|
+
# langdetect_start = time.time()
|
107
|
+
images_lang_res = self.batch_predict(all_images, batch_size=256)
|
108
|
+
# logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
|
108
109
|
if len(images_lang_res) > 0:
|
109
110
|
count_dict = Counter(images_lang_res)
|
110
111
|
language = max(count_dict, key=count_dict.get)
|
@@ -4,22 +4,37 @@ from loguru import logger
|
|
4
4
|
from magic_pdf.config.constants import MODEL_NAME
|
5
5
|
from magic_pdf.model.model_list import AtomicModel
|
6
6
|
from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
|
7
|
-
from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import
|
8
|
-
|
9
|
-
from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
|
10
|
-
Layoutlmv3_Predictor
|
7
|
+
from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
|
8
|
+
from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
|
11
9
|
from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
|
12
10
|
from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
|
13
|
-
from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import \
|
14
|
-
ModifiedPaddleOCR
|
15
|
-
from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import \
|
16
|
-
RapidTableModel
|
17
|
-
# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
|
18
|
-
from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
|
19
|
-
StructTableModel
|
20
|
-
from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
|
21
|
-
TableMasterPaddleModel
|
22
11
|
|
12
|
+
try:
|
13
|
+
from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
|
14
|
+
from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
|
15
|
+
from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
|
16
|
+
license_key = load_license()
|
17
|
+
logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
|
18
|
+
f' License expired at {license_key["payload"]["date"]["end_date"]}')
|
19
|
+
except Exception as e:
|
20
|
+
if isinstance(e, ImportError):
|
21
|
+
pass
|
22
|
+
elif isinstance(e, LicenseFormatError):
|
23
|
+
logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
|
24
|
+
elif isinstance(e, LicenseSignatureError):
|
25
|
+
logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
|
26
|
+
elif isinstance(e, LicenseExpiredError):
|
27
|
+
logger.error("Ascend Plugin: License has expired. Please renew your license.")
|
28
|
+
elif isinstance(e, FileNotFoundError):
|
29
|
+
logger.error("Ascend Plugin: Not found License file.")
|
30
|
+
else:
|
31
|
+
logger.error(f"Ascend Plugin: {e}")
|
32
|
+
from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
|
33
|
+
# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
|
34
|
+
from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
|
35
|
+
|
36
|
+
from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
|
37
|
+
from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
|
23
38
|
|
24
39
|
def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
|
25
40
|
if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
|
@@ -76,7 +91,6 @@ def ocr_model_init(show_log: bool = False,
|
|
76
91
|
use_dilation=True,
|
77
92
|
det_db_unclip_ratio=1.8,
|
78
93
|
):
|
79
|
-
|
80
94
|
if lang is not None and lang != '':
|
81
95
|
model = ModifiedPaddleOCR(
|
82
96
|
show_log=show_log,
|