deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
deepdoc/vision/ocr.py ADDED
@@ -0,0 +1,757 @@
1
+ #
2
+ # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ import logging
17
+ import copy
18
+ import time
19
+ import os
20
+
21
+ from ..common.model_store import resolve_vision_model_dir
22
+ from ..common.misc_utils import pip_install_torch
23
+ from ..common import settings
24
+ from .operators import * # noqa: F403
25
+ from . import operators
26
+ import math
27
+ import numpy as np
28
+ import cv2
29
+ import onnxruntime as ort
30
+
31
+ from .postprocess import build_post_process
32
+
33
# Module-level cache filled by load_model(): maps the model file path (with the
# device id appended when one is given) to its (session, run_options) pair.
loaded_models = dict()
34
+
35
def transform(data, ops=None):
    """Feed ``data`` through every operator in ``ops``, in order.

    Args:
        data: Value handed to the first operator.
        ops: Iterable of callables; ``None`` means "no operators".

    Returns:
        The output of the last operator (``data`` itself when there are no
        operators), or ``None`` as soon as any operator returns ``None``.
    """
    result = data
    for apply_op in (ops if ops is not None else ()):
        result = apply_op(result)
        if result is None:
            # An operator rejected the sample; skip the remaining ones.
            break
    return result
44
+
45
+
46
def create_operators(op_param_list, global_config=None):
    """Instantiate preprocessing operators from a list of one-key dicts.

    Each entry has the form ``{"OperatorName": {kwargs} | None}``. The operator
    class is looked up by name on the ``operators`` module and constructed with
    the entry's kwargs, overlaid with ``global_config`` when that is given.

    Args:
        op_param_list (list): Operator configs, one single-key dict each.
        global_config (dict | None): Extra keyword arguments applied to every
            operator; on key clashes these win over the per-operator values.

    Returns:
        list: The constructed operator instances, in config order.

    Raises:
        AssertionError: If ``op_param_list`` is not a list or an entry is not
            a single-key dict.
        AttributeError: If an operator name does not exist on ``operators``.
    """
    assert isinstance(
        op_param_list, list), ('operator config should be a list')
    ops = []
    for operator in op_param_list:
        assert isinstance(operator,
                          dict) and len(operator) == 1, "yaml format error"
        op_name = list(operator)[0]
        op_kwargs = operator[op_name]
        # Build a fresh dict so the caller's config is never mutated when the
        # global settings are merged in (the config may be reused elsewhere).
        param = {} if op_kwargs is None else dict(op_kwargs)
        if global_config is not None:
            param.update(global_config)
        ops.append(getattr(operators, op_name)(**param))
    return ops
66
+
67
+
68
def load_model(model_dir, nm, device_id: int | None = None):
    """Return the ONNX Runtime session for ``<model_dir>/<nm>.onnx``, cached.

    Sessions are cached in the module-level ``loaded_models`` dict, keyed by
    the model file path with ``str(device_id)`` appended when a device id is
    given, so repeated calls for the same model/device reuse one session.

    Args:
        model_dir: Directory that contains the ``.onnx`` file.
        nm: Model file name without the ``.onnx`` extension.
        device_id: CUDA device index to run on. ``None`` targets device 0 when
            CUDA is usable.

    Returns:
        tuple: ``(session, run_options)`` where ``session`` is an
        ``ort.InferenceSession`` and ``run_options`` an ``ort.RunOptions`` to
        pass to ``session.run``.

    Raises:
        ValueError: If the model file does not exist (checked on cache miss).
    """
    model_file_path = os.path.join(model_dir, nm + ".onnx")
    if device_id is None:
        model_cached_tag = model_file_path
    else:
        model_cached_tag = model_file_path + str(device_id)

    loaded_model = loaded_models.get(model_cached_tag)
    if loaded_model:
        logging.info(f"load_model {model_file_path} reuses cached model")
        return loaded_model

    if not os.path.exists(model_file_path):
        raise ValueError("not find model file path {}".format(
            model_file_path))

    provider_device_id = 0 if device_id is None else device_id

    def cuda_is_available():
        """Report whether torch sees a CUDA device with the requested index."""
        try:
            pip_install_torch()
            import torch
            return bool(torch.cuda.is_available()
                        and torch.cuda.device_count() > provider_device_id)
        except Exception:
            # Best effort: any failure while probing means "use the CPU", but
            # leave a trace so a broken torch install can be diagnosed.
            logging.debug("CUDA probe failed, falling back to CPU", exc_info=True)
            return False

    options = ort.SessionOptions()
    options.enable_cpu_mem_arena = False
    options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    options.intra_op_num_threads = 2
    options.inter_op_num_threads = 2

    # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
    # Shrink the memory arena after execution.
    run_options = ort.RunOptions()
    if cuda_is_available():
        raw_limit = os.environ.get("OCR_GPU_MEM_LIMIT_MB", "2048")
        try:
            gpu_mem_limit_mb = int(raw_limit)
        except ValueError:
            # A malformed environment value should not prevent model loading.
            logging.warning(
                f"Invalid OCR_GPU_MEM_LIMIT_MB={raw_limit!r}, falling back to 2048")
            gpu_mem_limit_mb = 2048
        arena_strategy = os.environ.get("OCR_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo")
        cuda_provider_options = {
            "device_id": provider_device_id,  # Use specific GPU
            "gpu_mem_limit": max(gpu_mem_limit_mb, 0) * 1024 * 1024,
            "arena_extend_strategy": arena_strategy,  # gpu memory allocation strategy
        }
        sess = ort.InferenceSession(
            model_file_path,
            options=options,
            providers=['CUDAExecutionProvider'],
            provider_options=[cuda_provider_options]
        )
        # NOTE(review): the GPU branch previously registered no shrinkage entry
        # even though the comment above asks for GPU memory to be shrunk after
        # execution; request it for the device this session runs on.
        run_options.add_run_config_entry(
            "memory.enable_memory_arena_shrinkage", f"gpu:{provider_device_id}")
        logging.info(f"load_model {model_file_path} uses GPU (device {provider_device_id}, gpu_mem_limit={cuda_provider_options['gpu_mem_limit']}, arena_strategy={arena_strategy})")
    else:
        sess = ort.InferenceSession(
            model_file_path,
            options=options,
            providers=['CPUExecutionProvider'])
        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
        logging.info(f"load_model {model_file_path} uses CPU")
    loaded_model = (sess, run_options)
    loaded_models[model_cached_tag] = loaded_model
    return loaded_model
128
+
129
+
130
+ class TextRecognizer:
131
+ def __init__(self, model_dir, device_id: int | None = None):
132
+ self.rec_image_shape = [int(v) for v in "3, 48, 320".split(",")]
133
+ self.rec_batch_num = 16
134
+ postprocess_params = {
135
+ 'name': 'CTCLabelDecode',
136
+ "character_dict_path": os.path.join(model_dir, "ocr.res"),
137
+ "use_space_char": True
138
+ }
139
+ self.postprocess_op = build_post_process(postprocess_params)
140
+ self.predictor, self.run_options = load_model(model_dir, 'rec', device_id)
141
+ self.input_tensor = self.predictor.get_inputs()[0]
142
+
143
+ def resize_norm_img(self, img, max_wh_ratio):
144
+ imgC, imgH, imgW = self.rec_image_shape
145
+
146
+ assert imgC == img.shape[2]
147
+ imgW = int((imgH * max_wh_ratio))
148
+ w = self.input_tensor.shape[3:][0]
149
+ if isinstance(w, str):
150
+ pass
151
+ elif w is not None and w > 0:
152
+ imgW = w
153
+ h, w = img.shape[:2]
154
+ ratio = w / float(h)
155
+ if math.ceil(imgH * ratio) > imgW:
156
+ resized_w = imgW
157
+ else:
158
+ resized_w = int(math.ceil(imgH * ratio))
159
+
160
+ resized_image = cv2.resize(img, (resized_w, imgH))
161
+ resized_image = resized_image.astype('float32')
162
+ resized_image = resized_image.transpose((2, 0, 1)) / 255
163
+ resized_image -= 0.5
164
+ resized_image /= 0.5
165
+ padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
166
+ padding_im[:, :, 0:resized_w] = resized_image
167
+ return padding_im
168
+
169
+ def resize_norm_img_vl(self, img, image_shape):
170
+
171
+ imgC, imgH, imgW = image_shape
172
+ img = img[:, :, ::-1] # bgr2rgb
173
+ resized_image = cv2.resize(
174
+ img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
175
+ resized_image = resized_image.astype('float32')
176
+ resized_image = resized_image.transpose((2, 0, 1)) / 255
177
+ return resized_image
178
+
179
+ def resize_norm_img_srn(self, img, image_shape):
180
+ imgC, imgH, imgW = image_shape
181
+
182
+ img_black = np.zeros((imgH, imgW))
183
+ im_hei = img.shape[0]
184
+ im_wid = img.shape[1]
185
+
186
+ if im_wid <= im_hei * 1:
187
+ img_new = cv2.resize(img, (imgH * 1, imgH))
188
+ elif im_wid <= im_hei * 2:
189
+ img_new = cv2.resize(img, (imgH * 2, imgH))
190
+ elif im_wid <= im_hei * 3:
191
+ img_new = cv2.resize(img, (imgH * 3, imgH))
192
+ else:
193
+ img_new = cv2.resize(img, (imgW, imgH))
194
+
195
+ img_np = np.asarray(img_new)
196
+ img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
197
+ img_black[:, 0:img_np.shape[1]] = img_np
198
+ img_black = img_black[:, :, np.newaxis]
199
+
200
+ row, col, c = img_black.shape
201
+ c = 1
202
+
203
+ return np.reshape(img_black, (c, row, col)).astype(np.float32)
204
+
205
+ def srn_other_inputs(self, image_shape, num_heads, max_text_length):
206
+
207
+ imgC, imgH, imgW = image_shape
208
+ feature_dim = int((imgH / 8) * (imgW / 8))
209
+
210
+ encoder_word_pos = np.array(range(0, feature_dim)).reshape(
211
+ (feature_dim, 1)).astype('int64')
212
+ gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
213
+ (max_text_length, 1)).astype('int64')
214
+
215
+ gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
216
+ gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
217
+ [-1, 1, max_text_length, max_text_length])
218
+ gsrm_slf_attn_bias1 = np.tile(
219
+ gsrm_slf_attn_bias1,
220
+ [1, num_heads, 1, 1]).astype('float32') * [-1e9]
221
+
222
+ gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
223
+ [-1, 1, max_text_length, max_text_length])
224
+ gsrm_slf_attn_bias2 = np.tile(
225
+ gsrm_slf_attn_bias2,
226
+ [1, num_heads, 1, 1]).astype('float32') * [-1e9]
227
+
228
+ encoder_word_pos = encoder_word_pos[np.newaxis, :]
229
+ gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
230
+
231
+ return [
232
+ encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
233
+ gsrm_slf_attn_bias2
234
+ ]
235
+
236
+ def process_image_srn(self, img, image_shape, num_heads, max_text_length):
237
+ norm_img = self.resize_norm_img_srn(img, image_shape)
238
+ norm_img = norm_img[np.newaxis, :]
239
+
240
+ [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
241
+ self.srn_other_inputs(image_shape, num_heads, max_text_length)
242
+
243
+ gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
244
+ gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
245
+ encoder_word_pos = encoder_word_pos.astype(np.int64)
246
+ gsrm_word_pos = gsrm_word_pos.astype(np.int64)
247
+
248
+ return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
249
+ gsrm_slf_attn_bias2)
250
+
251
+ def resize_norm_img_sar(self, img, image_shape,
252
+ width_downsample_ratio=0.25):
253
+ imgC, imgH, imgW_min, imgW_max = image_shape
254
+ h = img.shape[0]
255
+ w = img.shape[1]
256
+ valid_ratio = 1.0
257
+ # make sure new_width is an integral multiple of width_divisor.
258
+ width_divisor = int(1 / width_downsample_ratio)
259
+ # resize
260
+ ratio = w / float(h)
261
+ resize_w = math.ceil(imgH * ratio)
262
+ if resize_w % width_divisor != 0:
263
+ resize_w = round(resize_w / width_divisor) * width_divisor
264
+ if imgW_min is not None:
265
+ resize_w = max(imgW_min, resize_w)
266
+ if imgW_max is not None:
267
+ valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
268
+ resize_w = min(imgW_max, resize_w)
269
+ resized_image = cv2.resize(img, (resize_w, imgH))
270
+ resized_image = resized_image.astype('float32')
271
+ # norm
272
+ if image_shape[0] == 1:
273
+ resized_image = resized_image / 255
274
+ resized_image = resized_image[np.newaxis, :]
275
+ else:
276
+ resized_image = resized_image.transpose((2, 0, 1)) / 255
277
+ resized_image -= 0.5
278
+ resized_image /= 0.5
279
+ resize_shape = resized_image.shape
280
+ padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
281
+ padding_im[:, :, 0:resize_w] = resized_image
282
+ pad_shape = padding_im.shape
283
+
284
+ return padding_im, resize_shape, pad_shape, valid_ratio
285
+
286
+ def resize_norm_img_spin(self, img):
287
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
288
+ # return padding_im
289
+ img = cv2.resize(img, tuple([100, 32]), cv2.INTER_CUBIC)
290
+ img = np.array(img, np.float32)
291
+ img = np.expand_dims(img, -1)
292
+ img = img.transpose((2, 0, 1))
293
+ mean = [127.5]
294
+ std = [127.5]
295
+ mean = np.array(mean, dtype=np.float32)
296
+ std = np.array(std, dtype=np.float32)
297
+ mean = np.float32(mean.reshape(1, -1))
298
+ stdinv = 1 / np.float32(std.reshape(1, -1))
299
+ img -= mean
300
+ img *= stdinv
301
+ return img
302
+
303
+ def resize_norm_img_svtr(self, img, image_shape):
304
+
305
+ imgC, imgH, imgW = image_shape
306
+ resized_image = cv2.resize(
307
+ img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
308
+ resized_image = resized_image.astype('float32')
309
+ resized_image = resized_image.transpose((2, 0, 1)) / 255
310
+ resized_image -= 0.5
311
+ resized_image /= 0.5
312
+ return resized_image
313
+
314
+ def resize_norm_img_abinet(self, img, image_shape):
315
+
316
+ imgC, imgH, imgW = image_shape
317
+
318
+ resized_image = cv2.resize(
319
+ img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
320
+ resized_image = resized_image.astype('float32')
321
+ resized_image = resized_image / 255.
322
+
323
+ mean = np.array([0.485, 0.456, 0.406])
324
+ std = np.array([0.229, 0.224, 0.225])
325
+ resized_image = (
326
+ resized_image - mean[None, None, ...]) / std[None, None, ...]
327
+ resized_image = resized_image.transpose((2, 0, 1))
328
+ resized_image = resized_image.astype('float32')
329
+
330
+ return resized_image
331
+
332
+ def norm_img_can(self, img, image_shape):
333
+
334
+ img = cv2.cvtColor(
335
+ img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
336
+
337
+ if self.rec_image_shape[0] == 1:
338
+ h, w = img.shape
339
+ _, imgH, imgW = self.rec_image_shape
340
+ if h < imgH or w < imgW:
341
+ padding_h = max(imgH - h, 0)
342
+ padding_w = max(imgW - w, 0)
343
+ img_padded = np.pad(img, ((0, padding_h), (0, padding_w)),
344
+ 'constant',
345
+ constant_values=(255))
346
+ img = img_padded
347
+
348
+ img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w
349
+ img = img.astype('float32')
350
+
351
+ return img
352
+
353
+ def close(self):
354
+ # close session and release manually
355
+ # NOTE: `__del__` can run during interpreter shutdown when module
356
+ # globals (including `logging`/`gc`) may already be cleared to None.
357
+ try:
358
+ import logging as _logging
359
+ _logging.info("Close text recognizer.")
360
+ except Exception:
361
+ pass
362
+ if hasattr(self, "predictor"):
363
+ del self.predictor
364
+ try:
365
+ import gc as _gc
366
+ _gc.collect()
367
+ except Exception:
368
+ pass
369
+
370
+ def __call__(self, img_list):
371
+ img_num = len(img_list)
372
+ # Calculate the aspect ratio of all text bars
373
+ width_list = []
374
+ for img in img_list:
375
+ width_list.append(img.shape[1] / float(img.shape[0]))
376
+ # Sorting can speed up the recognition process
377
+ indices = np.argsort(np.array(width_list))
378
+ rec_res = [['', 0.0]] * img_num
379
+ batch_num = self.rec_batch_num
380
+ st = time.time()
381
+
382
+ for beg_img_no in range(0, img_num, batch_num):
383
+ end_img_no = min(img_num, beg_img_no + batch_num)
384
+ norm_img_batch = []
385
+ imgC, imgH, imgW = self.rec_image_shape[:3]
386
+ max_wh_ratio = imgW / imgH
387
+ # max_wh_ratio = 0
388
+ for ino in range(beg_img_no, end_img_no):
389
+ h, w = img_list[indices[ino]].shape[0:2]
390
+ wh_ratio = w * 1.0 / h
391
+ max_wh_ratio = max(max_wh_ratio, wh_ratio)
392
+ for ino in range(beg_img_no, end_img_no):
393
+ norm_img = self.resize_norm_img(img_list[indices[ino]],
394
+ max_wh_ratio)
395
+ norm_img = norm_img[np.newaxis, :]
396
+ norm_img_batch.append(norm_img)
397
+ norm_img_batch = np.concatenate(norm_img_batch)
398
+ norm_img_batch = norm_img_batch.copy()
399
+
400
+ input_dict = {}
401
+ input_dict[self.input_tensor.name] = norm_img_batch
402
+ for i in range(100000):
403
+ try:
404
+ outputs = self.predictor.run(None, input_dict, self.run_options)
405
+ break
406
+ except Exception as e:
407
+ if i >= 3:
408
+ raise e
409
+ time.sleep(5)
410
+ preds = outputs[0]
411
+ rec_result = self.postprocess_op(preds)
412
+ for rno in range(len(rec_result)):
413
+ rec_res[indices[beg_img_no + rno]] = rec_result[rno]
414
+
415
+ return rec_res, time.time() - st
416
+
417
+ def __del__(self):
418
+ try:
419
+ self.close()
420
+ except Exception:
421
+ # Destructors must never raise.
422
+ pass
423
+
424
+
425
+ class TextDetector:
426
+ def __init__(self, model_dir, device_id: int | None = None):
427
+ pre_process_list = [{
428
+ 'DetResizeForTest': {
429
+ 'limit_side_len': 960,
430
+ 'limit_type': "max",
431
+ }
432
+ }, {
433
+ 'NormalizeImage': {
434
+ 'std': [0.229, 0.224, 0.225],
435
+ 'mean': [0.485, 0.456, 0.406],
436
+ 'scale': '1./255.',
437
+ 'order': 'hwc'
438
+ }
439
+ }, {
440
+ 'ToCHWImage': None
441
+ }, {
442
+ 'KeepKeys': {
443
+ 'keep_keys': ['image', 'shape']
444
+ }
445
+ }]
446
+ postprocess_params = {"name": "DBPostProcess", "thresh": 0.3, "box_thresh": 0.5, "max_candidates": 1000,
447
+ "unclip_ratio": 1.5, "use_dilation": False, "score_mode": "fast", "box_type": "quad"}
448
+
449
+ self.postprocess_op = build_post_process(postprocess_params)
450
+ self.predictor, self.run_options = load_model(model_dir, 'det', device_id)
451
+ self.input_tensor = self.predictor.get_inputs()[0]
452
+
453
+ img_h, img_w = self.input_tensor.shape[2:]
454
+ if isinstance(img_h, str) or isinstance(img_w, str):
455
+ pass
456
+ elif img_h is not None and img_w is not None and img_h > 0 and img_w > 0:
457
+ pre_process_list[0] = {
458
+ 'DetResizeForTest': {
459
+ 'image_shape': [img_h, img_w]
460
+ }
461
+ }
462
+ self.preprocess_op = create_operators(pre_process_list)
463
+
464
+ def order_points_clockwise(self, pts):
465
+ rect = np.zeros((4, 2), dtype="float32")
466
+ s = pts.sum(axis=1)
467
+ rect[0] = pts[np.argmin(s)]
468
+ rect[2] = pts[np.argmax(s)]
469
+ tmp = np.delete(pts, (np.argmin(s), np.argmax(s)), axis=0)
470
+ diff = np.diff(np.array(tmp), axis=1)
471
+ rect[1] = tmp[np.argmin(diff)]
472
+ rect[3] = tmp[np.argmax(diff)]
473
+ return rect
474
+
475
+ def clip_det_res(self, points, img_height, img_width):
476
+ for pno in range(points.shape[0]):
477
+ points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
478
+ points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
479
+ return points
480
+
481
+ def filter_tag_det_res(self, dt_boxes, image_shape):
482
+ img_height, img_width = image_shape[0:2]
483
+ dt_boxes_new = []
484
+ for box in dt_boxes:
485
+ if isinstance(box, list):
486
+ box = np.array(box)
487
+ box = self.order_points_clockwise(box)
488
+ box = self.clip_det_res(box, img_height, img_width)
489
+ rect_width = int(np.linalg.norm(box[0] - box[1]))
490
+ rect_height = int(np.linalg.norm(box[0] - box[3]))
491
+ if rect_width <= 3 or rect_height <= 3:
492
+ continue
493
+ dt_boxes_new.append(box)
494
+ dt_boxes = np.array(dt_boxes_new)
495
+ return dt_boxes
496
+
497
+ def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
498
+ img_height, img_width = image_shape[0:2]
499
+ dt_boxes_new = []
500
+ for box in dt_boxes:
501
+ if isinstance(box, list):
502
+ box = np.array(box)
503
+ box = self.clip_det_res(box, img_height, img_width)
504
+ dt_boxes_new.append(box)
505
+ dt_boxes = np.array(dt_boxes_new)
506
+ return dt_boxes
507
+
508
+ def close(self):
509
+ # NOTE: `__del__` can run during interpreter shutdown when module
510
+ # globals (including `logging`/`gc`) may already be cleared to None.
511
+ try:
512
+ import logging as _logging
513
+ _logging.info("Close text detector.")
514
+ except Exception:
515
+ pass
516
+ if hasattr(self, "predictor"):
517
+ del self.predictor
518
+ try:
519
+ import gc as _gc
520
+ _gc.collect()
521
+ except Exception:
522
+ pass
523
+
524
+ def __call__(self, img):
525
+ ori_im = img.copy()
526
+ data = {'image': img}
527
+
528
+ st = time.time()
529
+ data = transform(data, self.preprocess_op)
530
+ img, shape_list = data
531
+ if img is None:
532
+ return None, 0
533
+ img = np.expand_dims(img, axis=0)
534
+ shape_list = np.expand_dims(shape_list, axis=0)
535
+ img = img.copy()
536
+ input_dict = {}
537
+ input_dict[self.input_tensor.name] = img
538
+ for i in range(100000):
539
+ try:
540
+ outputs = self.predictor.run(None, input_dict, self.run_options)
541
+ break
542
+ except Exception as e:
543
+ if i >= 3:
544
+ raise e
545
+ time.sleep(5)
546
+
547
+ post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
548
+ dt_boxes = post_result[0]['points']
549
+ dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
550
+
551
+ return dt_boxes, time.time() - st
552
+
553
+ def __del__(self):
554
+ try:
555
+ self.close()
556
+ except Exception:
557
+ # Destructors must never raise.
558
+ pass
559
+
560
+
561
+ class OCR:
562
+ def __init__(
563
+ self,
564
+ model_dir=None,
565
+ model_home: str | None = None,
566
+ model_provider: str | None = None,
567
+ offline: bool | None = None,
568
+ ):
569
+ if not model_dir:
570
+ model_dir = resolve_vision_model_dir(
571
+ model_home=model_home,
572
+ provider=model_provider,
573
+ offline=offline,
574
+ )
575
+
576
+ # Append multi-GPU tasks to the list
577
+ if settings.PARALLEL_DEVICES > 0:
578
+ self.text_detector = []
579
+ self.text_recognizer = []
580
+ for device_id in range(settings.PARALLEL_DEVICES):
581
+ self.text_detector.append(TextDetector(model_dir, device_id))
582
+ self.text_recognizer.append(TextRecognizer(model_dir, device_id))
583
+ else:
584
+ self.text_detector = [TextDetector(model_dir)]
585
+ self.text_recognizer = [TextRecognizer(model_dir)]
586
+
587
+ self.drop_score = 0.5
588
+ self.crop_image_res_index = 0
589
+
590
+ def get_rotate_crop_image(self, img, points):
591
+ """
592
+ img_height, img_width = img.shape[0:2]
593
+ left = int(np.min(points[:, 0]))
594
+ right = int(np.max(points[:, 0]))
595
+ top = int(np.min(points[:, 1]))
596
+ bottom = int(np.max(points[:, 1]))
597
+ img_crop = img[top:bottom, left:right, :].copy()
598
+ points[:, 0] = points[:, 0] - left
599
+ points[:, 1] = points[:, 1] - top
600
+ """
601
+ assert len(points) == 4, "shape of points must be 4*2"
602
+ img_crop_width = int(
603
+ max(
604
+ np.linalg.norm(points[0] - points[1]),
605
+ np.linalg.norm(points[2] - points[3])))
606
+ img_crop_height = int(
607
+ max(
608
+ np.linalg.norm(points[0] - points[3]),
609
+ np.linalg.norm(points[1] - points[2])))
610
+ pts_std = np.float32([[0, 0], [img_crop_width, 0],
611
+ [img_crop_width, img_crop_height],
612
+ [0, img_crop_height]])
613
+ M = cv2.getPerspectiveTransform(points, pts_std)
614
+ dst_img = cv2.warpPerspective(
615
+ img,
616
+ M, (img_crop_width, img_crop_height),
617
+ borderMode=cv2.BORDER_REPLICATE,
618
+ flags=cv2.INTER_CUBIC)
619
+ dst_img_height, dst_img_width = dst_img.shape[0:2]
620
+ if dst_img_height * 1.0 / dst_img_width >= 1.5:
621
+ # Try original orientation
622
+ rec_result = self.text_recognizer[0]([dst_img])
623
+ text, score = rec_result[0][0]
624
+ best_score = score
625
+ best_img = dst_img
626
+
627
+ # Try clockwise 90° rotation
628
+ rotated_cw = np.rot90(dst_img, k=3)
629
+ rec_result = self.text_recognizer[0]([rotated_cw])
630
+ rotated_cw_text, rotated_cw_score = rec_result[0][0]
631
+ if rotated_cw_score > best_score:
632
+ best_score = rotated_cw_score
633
+ best_img = rotated_cw
634
+
635
+ # Try counter-clockwise 90° rotation
636
+ rotated_ccw = np.rot90(dst_img, k=1)
637
+ rec_result = self.text_recognizer[0]([rotated_ccw])
638
+ rotated_ccw_text, rotated_ccw_score = rec_result[0][0]
639
+ if rotated_ccw_score > best_score:
640
+ best_img = rotated_ccw
641
+
642
+ # Use the best image
643
+ dst_img = best_img
644
+ return dst_img
645
+
646
+ def sorted_boxes(self, dt_boxes):
647
+ """
648
+ Sort text boxes in order from top to bottom, left to right
649
+ args:
650
+ dt_boxes(array):detected text boxes with shape [4, 2]
651
+ return:
652
+ sorted boxes(array) with shape [4, 2]
653
+ """
654
+ num_boxes = dt_boxes.shape[0]
655
+ sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
656
+ _boxes = list(sorted_boxes)
657
+
658
+ for i in range(num_boxes - 1):
659
+ for j in range(i, -1, -1):
660
+ if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \
661
+ (_boxes[j + 1][0][0] < _boxes[j][0][0]):
662
+ tmp = _boxes[j]
663
+ _boxes[j] = _boxes[j + 1]
664
+ _boxes[j + 1] = tmp
665
+ else:
666
+ break
667
+ return _boxes
668
+
669
+ def detect(self, img, device_id: int | None = None):
670
+ if device_id is None:
671
+ device_id = 0
672
+
673
+ time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
674
+
675
+ if img is None:
676
+ return None, None, time_dict
677
+
678
+ start = time.time()
679
+ dt_boxes, elapse = self.text_detector[device_id](img)
680
+ time_dict['det'] = elapse
681
+
682
+ if dt_boxes is None:
683
+ end = time.time()
684
+ time_dict['all'] = end - start
685
+ return None, None, time_dict
686
+
687
+ return zip(self.sorted_boxes(dt_boxes), [
688
+ ("", 0) for _ in range(len(dt_boxes))])
689
+
690
+ def recognize(self, ori_im, box, device_id: int | None = None):
691
+ if device_id is None:
692
+ device_id = 0
693
+
694
+ img_crop = self.get_rotate_crop_image(ori_im, box)
695
+
696
+ rec_res, elapse = self.text_recognizer[device_id]([img_crop])
697
+ text, score = rec_res[0]
698
+ if score < self.drop_score:
699
+ return ""
700
+ return text
701
+
702
+ def recognize_batch(self, img_list, device_id: int | None = None):
703
+ if device_id is None:
704
+ device_id = 0
705
+ rec_res, elapse = self.text_recognizer[device_id](img_list)
706
+ texts = []
707
+ for i in range(len(rec_res)):
708
+ text, score = rec_res[i]
709
+ if score < self.drop_score:
710
+ text = ""
711
+ texts.append(text)
712
+ return texts
713
+
714
+ def __call__(self, img, device_id = 0, cls=True):
715
+ time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
716
+ if device_id is None:
717
+ device_id = 0
718
+
719
+ if img is None:
720
+ return None, None, time_dict
721
+
722
+ start = time.time()
723
+ ori_im = img.copy()
724
+ dt_boxes, elapse = self.text_detector[device_id](img)
725
+ time_dict['det'] = elapse
726
+
727
+ if dt_boxes is None:
728
+ end = time.time()
729
+ time_dict['all'] = end - start
730
+ return None, None, time_dict
731
+
732
+ img_crop_list = []
733
+
734
+ dt_boxes = self.sorted_boxes(dt_boxes)
735
+
736
+ for bno in range(len(dt_boxes)):
737
+ tmp_box = copy.deepcopy(dt_boxes[bno])
738
+ img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
739
+ img_crop_list.append(img_crop)
740
+
741
+ rec_res, elapse = self.text_recognizer[device_id](img_crop_list)
742
+
743
+ time_dict['rec'] = elapse
744
+
745
+ filter_boxes, filter_rec_res = [], []
746
+ for box, rec_result in zip(dt_boxes, rec_res):
747
+ text, score = rec_result
748
+ if score >= self.drop_score:
749
+ filter_boxes.append(box)
750
+ filter_rec_res.append(rec_result)
751
+ end = time.time()
752
+ time_dict['all'] = end - start
753
+
754
+ # for bno in range(len(img_crop_list)):
755
+ # print(f"{bno}, {rec_res[bno]}")
756
+
757
+ return list(zip([a.tolist() for a in filter_boxes], filter_rec_res))