magic-pdf 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -126,11 +126,35 @@ def detect_language(text):
         return 'empty'
 
 
+def full_to_half(text: str) -> str:
+    """Convert full-width characters to half-width characters using code point manipulation.
+
+    Args:
+        text: String containing full-width characters
+
+    Returns:
+        String with full-width characters converted to half-width
+    """
+    result = []
+    for char in text:
+        code = ord(char)
+        # Full-width ASCII variants (FF01-FF5E)
+        if 0xFF01 <= code <= 0xFF5E:
+            result.append(chr(code - 0xFEE0))  # Shift to ASCII range
+        # Full-width space
+        elif code == 0x3000:
+            result.append(' ')
+        else:
+            result.append(char)
+    return ''.join(result)
+
+
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:
         for span in line['spans']:
             if span['type'] in [ContentType.Text]:
+                span['content'] = full_to_half(span['content'])
                 block_text += span['content']
     block_lang = detect_lang(block_text)
 
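For reference, a minimal sketch of what the new full_to_half helper does (the sample strings below are illustrative, not taken from the package):

    # Full-width ASCII variants (U+FF01-U+FF5E) map onto U+0021-U+007E by subtracting 0xFEE0.
    print(full_to_half('ＡＢＣ１２３！'))        # -> 'ABC123!'
    # The ideographic space U+3000 becomes a plain ASCII space.
    print(full_to_half('ｈｅｌｌｏ　ｗｏｒｌｄ'))  # -> 'hello world'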
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
         pdf_meta['image_info_per_page'],
         pdf_meta['text_len_per_page'],
         pdf_meta['imgs_per_page'],
-        pdf_meta['text_layout_per_page'],
+        # pdf_meta['text_layout_per_page'],
         pdf_meta['invalid_chars'],
     )
     if is_text_pdf:
@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list, invalid_chars: bool):
+             # text_layout_list: list,
+             invalid_chars: bool):
     """
    Image and page dimensions here are in pts
    :param total_page:
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_text_len': classify_by_text_len(text_len_list, total_page),
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
-        'by_text_layout': classify_by_text_layout(text_layout_list),
+        # 'by_text_layout': classify_by_text_layout(text_layout_list),
         'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
         'by_invalid_chars': invalid_chars,
     }
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
             f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
-            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            # f" by_text_layout: {results['by_text_layout']},"
+            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
             f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # use this case to quickly spot unusual PDFs and tune the classification algorithm for them
         return False, results
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
     # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
     text_len_per_page = get_pdf_textlen_per_page(doc)
     # logger.info(f"text_len_per_page: {text_len_per_page}")
-    text_layout_per_page = get_pdf_text_layout_per_page(doc)
+    # text_layout_per_page = get_pdf_text_layout_per_page(doc)
     # logger.info(f"text_layout_per_page: {text_layout_per_page}")
-    text_language = get_language(doc)
+    # text_language = get_language(doc)
     # logger.info(f"text_language: {text_language}")
     invalid_chars = check_invalid_chars(pdf_bytes)
     # logger.info(f"invalid_chars: {invalid_chars}")
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         'page_height_pts': int(page_height_pts),
         'image_info_per_page': image_info_per_page,
         'text_len_per_page': text_len_per_page,
-        'text_layout_per_page': text_layout_per_page,
-        'text_language': text_language,
+        # 'text_layout_per_page': text_layout_per_page,
+        # 'text_language': text_language,
         # "svgs_per_page": svgs_per_page,
         'imgs_per_page': imgs_per_page,  # add per-page img count list
         'junk_img_bojids': junk_img_bojids,  # add list of junk-image bojids
@@ -4,6 +4,7 @@ from loguru import logger
 import re
 from io import BytesIO
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
 
 
 def calculate_sample_count(total_page: int):
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
     sample_docs = extract_pages(src_pdf_bytes)
     sample_pdf_bytes = sample_docs.tobytes()
     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
+    laparams = LAParams(
+        line_overlap=0.5,
+        char_margin=2.0,
+        line_margin=0.5,
+        word_margin=0.1,
+        boxes_flow=None,
+        detect_vertical=False,
+        all_texts=False,
+    )
+    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
     text = text.replace("\n", "")
     # logger.info(text)
     '''Garbled text extracted with pdfminer shows up as (cid:xxx)'''
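The sampled text is then checked for (cid:xxx) markers. A standalone sketch of that kind of check, assuming a simple occurrence-ratio threshold (the helper name and 0.01 cutoff below are illustrative, not the package's actual constants):

    import re

    def looks_garbled(sample_text: str, cid_ratio_threshold: float = 0.01) -> bool:
        # pdfminer renders unmapped glyphs as "(cid:123)"; count them against the text length.
        cid_count = len(re.findall(r'\(cid:\d+\)', sample_text))
        return cid_count > 0 and cid_count / max(len(sample_text), 1) > cid_ratio_threshold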
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.2.0"
@@ -1,21 +1,22 @@
 import os
 import time
+import torch
 
+os.environ['FLAGS_npu_jit_compile'] = '0'  # disable paddle's JIT compilation
+os.environ['FLAGS_use_stride_kernel'] = '0'
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow MPS to fall back to CPU
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable albumentations update check
 # disable paddle's signal handler
 import paddle
-import torch
+paddle.disable_signal_handler()
+
 from loguru import logger
 
 from magic_pdf.model.batch_analyze import BatchAnalyze
 from magic_pdf.model.sub_modules.model_utils import get_vram
 
-paddle.disable_signal_handler()
-
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable albumentations update check
-
 try:
     import torchtext
-
     if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
@@ -32,20 +33,6 @@ from magic_pdf.model.model_list import MODEL
 from magic_pdf.operators.models import InferenceResult
 
 
-def dict_compare(d1, d2):
-    return d1.items() == d2.items()
-
-
-def remove_duplicates_dicts(lst):
-    unique_dicts = []
-    for dict_item in lst:
-        if not any(
-            dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
-        ):
-            unique_dicts.append(dict_item)
-    return unique_dicts
-
-
 class ModelSingleton:
     _instance = None
     _models = {}
@@ -158,7 +145,11 @@ def doc_analyze(
     table_enable=None,
 ) -> InferenceResult:
 
-    end_page_id = end_page_id if end_page_id else len(dataset) - 1
+    end_page_id = (
+        end_page_id
+        if end_page_id is not None and end_page_id >= 0
+        else len(dataset) - 1
+    )
 
     model_manager = ModelSingleton()
     custom_model = model_manager.get_model(
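The new expression treats None and negative values as "up to the last page", whereas the old truthiness check also reset a legitimate end_page_id of 0. A quick illustration (len(dataset) assumed to be 10 for the example):

    # old: end_page_id if end_page_id else len(dataset) - 1
    #   end_page_id = 0          -> 9   (0 is falsy, so the whole document is analyzed)
    # new: end_page_id if end_page_id is not None and end_page_id >= 0 else len(dataset) - 1
    #   end_page_id = 0          -> 0   (only the first page is analyzed)
    #   end_page_id = None or -1 -> 9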
@@ -166,6 +157,7 @@ def doc_analyze(
     )
 
     batch_analyze = False
+    batch_ratio = 1
     device = get_device()
 
     npu_support = False
@@ -178,21 +170,19 @@ def doc_analyze(
         gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
         if gpu_memory is not None and gpu_memory >= 8:
 
-            if 8 <= gpu_memory < 10:
-                batch_ratio = 2
-            elif 10 <= gpu_memory <= 12:
-                batch_ratio = 4
-            elif 12 < gpu_memory <= 16:
-                batch_ratio = 8
-            elif 16 < gpu_memory <= 24:
+            if gpu_memory >= 40:
+                batch_ratio = 32
+            elif gpu_memory >= 20:
                 batch_ratio = 16
+            elif gpu_memory >= 16:
+                batch_ratio = 8
+            elif gpu_memory >= 10:
+                batch_ratio = 4
             else:
-                batch_ratio = 32
+                batch_ratio = 2
 
-            if batch_ratio >= 1:
-                logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
-                batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
-                batch_analyze = True
+            logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
+            batch_analyze = True
 
     model_json = []
     doc_analyze_start = time.time()
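The net effect of the new thresholds, paraphrased as a standalone helper (a sketch for illustration only; the package keeps this logic inline in doc_analyze):

    def pick_batch_ratio(gpu_memory_gb: int) -> int:
        # Mirrors the thresholds in the hunk above: larger cards get larger batches.
        if gpu_memory_gb < 8:
            return 1   # batch mode stays off below 8 GB
        if gpu_memory_gb >= 40:
            return 32
        if gpu_memory_gb >= 20:
            return 16
        if gpu_memory_gb >= 16:
            return 8
        if gpu_memory_gb >= 10:
            return 4
        return 2       # 8-9 GB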
@@ -200,24 +190,26 @@ def doc_analyze(
     if batch_analyze:
         # batch analyze
         images = []
+        page_wh_list = []
         for index in range(len(dataset)):
             if start_page_id <= index <= end_page_id:
                 page_data = dataset.get_page(index)
                 img_dict = page_data.get_image()
                 images.append(img_dict['img'])
+                page_wh_list.append((img_dict['width'], img_dict['height']))
+        batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
         analyze_result = batch_model(images)
 
         for index in range(len(dataset)):
-            page_data = dataset.get_page(index)
-            img_dict = page_data.get_image()
-            page_width = img_dict['width']
-            page_height = img_dict['height']
             if start_page_id <= index <= end_page_id:
                 result = analyze_result.pop(0)
+                page_width, page_height = page_wh_list.pop(0)
             else:
                 result = []
+                page_height = 0
+                page_width = 0
 
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
             page_dict = {'layout_dets': result, 'page_info': page_info}
             model_json.append(page_dict)
 
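With the page_wh_list caching above, pages outside the selected range are no longer re-rendered just to read their dimensions; they simply get zeroed sizes. Each page still contributes one entry to model_json shaped roughly like this (values illustrative):

    {'layout_dets': [...], 'page_info': {'page_no': 0, 'width': 612, 'height': 792}}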
@@ -237,7 +229,7 @@ def doc_analyze(
             else:
                 result = []
 
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
             page_dict = {'layout_dets': result, 'page_info': page_info}
             model_json.append(page_dict)
 
@@ -450,11 +450,168 @@ class MagicModel:
         )
         return ret
 
+
+    def __tie_up_category_by_distance_v3(
+        self,
+        page_no: int,
+        subject_category_id: int,
+        object_category_id: int,
+        priority_pos: PosRelationEnum,
+    ):
+        subjects = self.__reduct_overlap(
+            list(
+                map(
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
+                    filter(
+                        lambda x: x['category_id'] == subject_category_id,
+                        self.__model_list[page_no]['layout_dets'],
+                    ),
+                )
+            )
+        )
+        objects = self.__reduct_overlap(
+            list(
+                map(
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
+                    filter(
+                        lambda x: x['category_id'] == object_category_id,
+                        self.__model_list[page_no]['layout_dets'],
+                    ),
+                )
+            )
+        )
+
+        ret = []
+        N, M = len(subjects), len(objects)
+        subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
+        objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2)
+
+        OBJ_IDX_OFFSET = 10000
+        SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1
+
+        all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET, OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)]
+        seen_idx = set()
+        seen_sub_idx = set()
+
+        while N > len(seen_sub_idx):
+            candidates = []
+            for idx, kind, x0, y0 in all_boxes_with_idx:
+                if idx in seen_idx:
+                    continue
+                candidates.append((idx, kind, x0, y0))
+
+            if len(candidates) == 0:
+                break
+            left_x = min([v[2] for v in candidates])
+            top_y = min([v[3] for v in candidates])
+
+            candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)
+
+            fst_idx, fst_kind, left_x, top_y = candidates[0]
+            candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)
+            nxt = None
+
+            for i in range(1, len(candidates)):
+                if candidates[i][1] ^ fst_kind == 1:
+                    nxt = candidates[i]
+                    break
+            if nxt is None:
+                break
+
+            if fst_kind == SUB_BIT_KIND:
+                sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET
+            else:
+                sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET
+
+            pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
+            nearest_dis = float('inf')
+            for i in range(N):
+                if i in seen_idx: continue
+                nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
+
+            if pair_dis >= 3 * nearest_dis:
+                seen_idx.add(sub_idx)
+                continue
+
+            seen_idx.add(sub_idx)
+            seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
+            seen_sub_idx.add(sub_idx)
+
+            ret.append(
+                {
+                    'sub_bbox': {
+                        'bbox': subjects[sub_idx]['bbox'],
+                        'score': subjects[sub_idx]['score'],
+                    },
+                    'obj_bboxes': [
+                        {'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']}
+                    ],
+                    'sub_idx': sub_idx,
+                }
+            )
+
+        for i in range(len(objects)):
+            j = i + OBJ_IDX_OFFSET
+            if j in seen_idx:
+                continue
+            seen_idx.add(j)
+            nearest_dis, nearest_sub_idx = float('inf'), -1
+            for k in range(len(subjects)):
+                dis = bbox_distance(objects[i]['bbox'], subjects[k]['bbox'])
+                if dis < nearest_dis:
+                    nearest_dis = dis
+                    nearest_sub_idx = k
+
+            for k in range(len(subjects)):
+                if k != nearest_sub_idx: continue
+                if k in seen_sub_idx:
+                    for kk in range(len(ret)):
+                        if ret[kk]['sub_idx'] == k:
+                            ret[kk]['obj_bboxes'].append({'score': objects[i]['score'], 'bbox': objects[i]['bbox']})
+                            break
+                else:
+                    ret.append(
+                        {
+                            'sub_bbox': {
+                                'bbox': subjects[k]['bbox'],
+                                'score': subjects[k]['score'],
+                            },
+                            'obj_bboxes': [
+                                {'score': objects[i]['score'], 'bbox': objects[i]['bbox']}
+                            ],
+                            'sub_idx': k,
+                        }
+                    )
+                    seen_sub_idx.add(k)
+                    seen_idx.add(k)
+
+        for i in range(len(subjects)):
+            if i in seen_sub_idx:
+                continue
+            ret.append(
+                {
+                    'sub_bbox': {
+                        'bbox': subjects[i]['bbox'],
+                        'score': subjects[i]['score'],
+                    },
+                    'obj_bboxes': [],
+                    'sub_idx': i,
+                }
+            )
+
+        return ret
+
+
     def get_imgs_v2(self, page_no: int):
-        with_captions = self.__tie_up_category_by_distance_v2(
+        with_captions = self.__tie_up_category_by_distance_v3(
             page_no, 3, 4, PosRelationEnum.BOTTOM
         )
-        with_footnotes = self.__tie_up_category_by_distance_v2(
+        with_footnotes = self.__tie_up_category_by_distance_v3(
             page_no, 3, CategoryId.ImageFootnote, PosRelationEnum.ALL
         )
         ret = []
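In plain terms, the v3 pairing sweeps subject boxes (image/table bodies) and object boxes (captions/footnotes) together from the page's top-left, greedily pairs the two nearest boxes of opposite kinds, rejects a candidate pair when some other unmatched subject is at least three times closer to that object, then attaches leftover objects to their nearest subject and emits empty entries for subjects with no objects. A deliberately simplified sketch of the greedy-nearest-pair idea on plain points (not the package's implementation, which works on bboxes via bbox_distance and adds the rejection and leftover passes):

    import math

    def greedy_pair(subjects, objects):
        """Pair subject points with their nearest object points, closest pairs first.

        subjects/objects are lists of (x, y) tuples; returns {subject_index: object_index}.
        Illustrative only.
        """
        pairs = {}
        used_objs = set()
        # Enumerate all subject/object combinations, closest first.
        candidates = sorted(
            (math.dist(s, o), si, oi)
            for si, s in enumerate(subjects)
            for oi, o in enumerate(objects)
        )
        for _, si, oi in candidates:
            if si in pairs or oi in used_objs:
                continue
            pairs[si] = oi
            used_objs.add(oi)
        return pairs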
@@ -470,10 +627,10 @@ class MagicModel:
         return ret
 
     def get_tables_v2(self, page_no: int) -> list:
-        with_captions = self.__tie_up_category_by_distance_v2(
+        with_captions = self.__tie_up_category_by_distance_v3(
             page_no, 5, 6, PosRelationEnum.UP
         )
-        with_footnotes = self.__tie_up_category_by_distance_v2(
+        with_footnotes = self.__tie_up_category_by_distance_v3(
             page_no, 5, 7, PosRelationEnum.ALL
         )
         ret = []
@@ -89,13 +89,6 @@ class CustomPEKModel:
         # initialize the parsing configuration
         self.device = kwargs.get('device', 'cpu')
 
-        if str(self.device).startswith("npu"):
-            import torch_npu
-            os.environ['FLAGS_npu_jit_compile'] = '0'
-            os.environ['FLAGS_use_stride_kernel'] = '0'
-        elif str(self.device).startswith("mps"):
-            os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
-
         logger.info('using device: {}'.format(self.device))
         models_dir = kwargs.get(
             'models_dir', os.path.join(root_dir, 'resources', 'models')
@@ -1,4 +1,5 @@
 # Copyright (c) Opendatalab. All rights reserved.
+import time
 from collections import Counter
 from uuid import uuid4
 
@@ -102,9 +103,9 @@ class YOLOv11LangDetModel(object):
             temp_images = split_images(image)
             for temp_image in temp_images:
                 all_images.append(resize_images_to_224(temp_image))
-
-        images_lang_res = self.batch_predict(all_images, batch_size=8)
-        # logger.info(f"images_lang_res: {images_lang_res}")
+        # langdetect_start = time.time()
+        images_lang_res = self.batch_predict(all_images, batch_size=256)
+        # logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
         if len(images_lang_res) > 0:
             count_dict = Counter(images_lang_res)
             language = max(count_dict, key=count_dict.get)
@@ -4,22 +4,37 @@ from loguru import logger
 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.model_list import AtomicModel
 from magic_pdf.model.sub_modules.language_detection.yolov11.YOLOv11 import YOLOv11LangDetModel
-from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import \
-    DocLayoutYOLOModel
-from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import \
-    Layoutlmv3_Predictor
+from magic_pdf.model.sub_modules.layout.doclayout_yolo.DocLayoutYOLO import DocLayoutYOLOModel
+from magic_pdf.model.sub_modules.layout.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.sub_modules.mfd.yolov8.YOLOv8 import YOLOv8MFDModel
 from magic_pdf.model.sub_modules.mfr.unimernet.Unimernet import UnimernetModel
-from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import \
-    ModifiedPaddleOCR
-from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import \
-    RapidTableModel
-# from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
-from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import \
-    StructTableModel
-from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import \
-    TableMasterPaddleModel
 
+try:
+    from magic_pdf_ascend_plugin.libs.license_verifier import load_license, LicenseFormatError, LicenseSignatureError, LicenseExpiredError
+    from magic_pdf_ascend_plugin.model_plugin.ocr.paddleocr.ppocr_273_npu import ModifiedPaddleOCR
+    from magic_pdf_ascend_plugin.model_plugin.table.rapidtable.rapid_table_npu import RapidTableModel
+    license_key = load_license()
+    logger.info(f'Using Ascend Plugin Success, License id is {license_key["payload"]["id"]},'
+                f' License expired at {license_key["payload"]["date"]["end_date"]}')
+except Exception as e:
+    if isinstance(e, ImportError):
+        pass
+    elif isinstance(e, LicenseFormatError):
+        logger.error("Ascend Plugin: Invalid license format. Please check the license file.")
+    elif isinstance(e, LicenseSignatureError):
+        logger.error("Ascend Plugin: Invalid signature. The license may be tampered with.")
+    elif isinstance(e, LicenseExpiredError):
+        logger.error("Ascend Plugin: License has expired. Please renew your license.")
+    elif isinstance(e, FileNotFoundError):
+        logger.error("Ascend Plugin: Not found License file.")
+    else:
+        logger.error(f"Ascend Plugin: {e}")
+    from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_273_mod import ModifiedPaddleOCR
+    # from magic_pdf.model.sub_modules.ocr.paddleocr.ppocr_291_mod import ModifiedPaddleOCR
+    from magic_pdf.model.sub_modules.table.rapidtable.rapid_table import RapidTableModel
+
+from magic_pdf.model.sub_modules.table.structeqtable.struct_eqtable import StructTableModel
+from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import TableMasterPaddleModel
 
 def table_model_init(table_model_type, model_path, max_time, _device_='cpu', ocr_engine=None, table_sub_model_name=None):
     if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
@@ -76,7 +91,6 @@ def ocr_model_init(show_log: bool = False,
                    use_dilation=True,
                    det_db_unclip_ratio=1.8,
                    ):
-
     if lang is not None and lang != '':
         model = ModifiedPaddleOCR(
             show_log=show_log,