deepdoc-lib 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. deepdoc/README.md +122 -0
  2. deepdoc/README_zh.md +116 -0
  3. deepdoc/__init__.py +43 -0
  4. deepdoc/_version.py +34 -0
  5. deepdoc/common/__init__.py +52 -0
  6. deepdoc/common/config_utils.py +63 -0
  7. deepdoc/common/connection_utils.py +73 -0
  8. deepdoc/common/file_utils.py +19 -0
  9. deepdoc/common/misc_utils.py +44 -0
  10. deepdoc/common/model_store.py +369 -0
  11. deepdoc/common/settings.py +42 -0
  12. deepdoc/common/tiktoken_cache.py +84 -0
  13. deepdoc/common/token_utils.py +96 -0
  14. deepdoc/config.py +149 -0
  15. deepdoc/depend/find_codec.py +42 -0
  16. deepdoc/depend/nltk_manager.py +114 -0
  17. deepdoc/depend/prompts/vision_llm_describe_prompt.md +23 -0
  18. deepdoc/depend/prompts/vision_llm_figure_describe_prompt.md +24 -0
  19. deepdoc/depend/prompts.py +35 -0
  20. deepdoc/depend/rag_tokenizer.py +578 -0
  21. deepdoc/depend/simple_cv_model.py +469 -0
  22. deepdoc/depend/surname.py +91 -0
  23. deepdoc/depend/timeout.py +73 -0
  24. deepdoc/depend/vision_llm_chunk.py +35 -0
  25. deepdoc/dict/README.md +19 -0
  26. deepdoc/dict/huqie.txt +555629 -0
  27. deepdoc/download_models.py +169 -0
  28. deepdoc/llm_adapter/__init__.py +15 -0
  29. deepdoc/llm_adapter/adapter.py +223 -0
  30. deepdoc/llm_adapter/utils.py +104 -0
  31. deepdoc/llm_adapter/vision.py +163 -0
  32. deepdoc/parser/__init__.py +42 -0
  33. deepdoc/parser/docling_parser.py +889 -0
  34. deepdoc/parser/docx_parser.py +150 -0
  35. deepdoc/parser/excel_parser.py +270 -0
  36. deepdoc/parser/figure_parser.py +182 -0
  37. deepdoc/parser/html_parser.py +221 -0
  38. deepdoc/parser/json_parser.py +179 -0
  39. deepdoc/parser/markdown_parser.py +321 -0
  40. deepdoc/parser/mineru_parser.py +646 -0
  41. deepdoc/parser/pdf_parser.py +1591 -0
  42. deepdoc/parser/ppt_parser.py +96 -0
  43. deepdoc/parser/resume/__init__.py +109 -0
  44. deepdoc/parser/resume/entities/__init__.py +15 -0
  45. deepdoc/parser/resume/entities/corporations.py +128 -0
  46. deepdoc/parser/resume/entities/degrees.py +44 -0
  47. deepdoc/parser/resume/entities/industries.py +712 -0
  48. deepdoc/parser/resume/entities/regions.py +789 -0
  49. deepdoc/parser/resume/entities/res/corp.tks.freq.json +65 -0
  50. deepdoc/parser/resume/entities/res/corp_baike_len.csv +31480 -0
  51. deepdoc/parser/resume/entities/res/corp_tag.json +14939 -0
  52. deepdoc/parser/resume/entities/res/good_corp.json +911 -0
  53. deepdoc/parser/resume/entities/res/good_sch.json +595 -0
  54. deepdoc/parser/resume/entities/res/school.rank.csv +1627 -0
  55. deepdoc/parser/resume/entities/res/schools.csv +5713 -0
  56. deepdoc/parser/resume/entities/schools.py +91 -0
  57. deepdoc/parser/resume/step_one.py +189 -0
  58. deepdoc/parser/resume/step_two.py +692 -0
  59. deepdoc/parser/tcadp_parser.py +538 -0
  60. deepdoc/parser/txt_parser.py +64 -0
  61. deepdoc/parser/utils.py +33 -0
  62. deepdoc/vision/__init__.py +90 -0
  63. deepdoc/vision/layout_recognizer.py +481 -0
  64. deepdoc/vision/ocr.py +757 -0
  65. deepdoc/vision/operators.py +733 -0
  66. deepdoc/vision/postprocess.py +370 -0
  67. deepdoc/vision/recognizer.py +451 -0
  68. deepdoc/vision/seeit.py +87 -0
  69. deepdoc/vision/t_ocr.py +101 -0
  70. deepdoc/vision/t_recognizer.py +186 -0
  71. deepdoc/vision/table_structure_recognizer.py +617 -0
  72. deepdoc_lib-0.2.0.dist-info/METADATA +246 -0
  73. deepdoc_lib-0.2.0.dist-info/RECORD +78 -0
  74. deepdoc_lib-0.2.0.dist-info/WHEEL +5 -0
  75. deepdoc_lib-0.2.0.dist-info/entry_points.txt +2 -0
  76. deepdoc_lib-0.2.0.dist-info/licenses/LICENSE +201 -0
  77. deepdoc_lib-0.2.0.dist-info/top_level.txt +2 -0
  78. scripts/download_models.py +10 -0
@@ -0,0 +1,733 @@
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import sys
19
+ import ast
20
+ import six
21
+ import cv2
22
+ import numpy as np
23
+ import math
24
+ from PIL import Image
25
+
26
+
27
class DecodeImage:
    """Decode an encoded image buffer stored under ``data['image']``.

    Args:
        img_mode (str): 'RGB' reverses the channel axis of the decoded
            image; 'GRAY' applies ``cv2.COLOR_GRAY2BGR``; any other value
            leaves the decoded image untouched.
        channel_first (bool): transpose the result from HWC to CHW.
        ignore_orientation (bool): decode with
            ``cv2.IMREAD_IGNORE_ORIENTATION | cv2.IMREAD_COLOR`` instead of
            the plain color flag.
        **kwargs: accepted and ignored.
    """

    def __init__(self,
                 img_mode='RGB',
                 channel_first=False,
                 ignore_orientation=False,
                 **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first
        self.ignore_orientation = ignore_orientation

    def __call__(self, data):
        """Replace ``data['image']`` (encoded bytes) with the decoded array.

        Args:
            data (dict): must hold a non-empty byte string under 'image'.
        Returns:
            dict | None: ``data`` with 'image' replaced by the decoded
            array, or None when ``cv2.imdecode`` cannot decode the buffer.
        Raises:
            AssertionError: if the input is not a non-empty byte string.
        """
        img = data['image']
        if six.PY2:
            assert isinstance(img, str) and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        else:
            assert isinstance(img, bytes) and len(
                img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
        if self.ignore_orientation:
            img = cv2.imdecode(img, cv2.IMREAD_IGNORE_ORIENTATION |
                               cv2.IMREAD_COLOR)
        else:
            img = cv2.imdecode(img, 1)
        if img is None:
            return None
        if self.img_mode == 'GRAY':
            # NOTE(review): both decode calls above request a color image,
            # so this GRAY2BGR conversion presumably receives 3 channels --
            # confirm that GRAY mode behaves as intended.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif self.img_mode == 'RGB':
            # The shape is wrapped in a one-element tuple so the message
            # formats correctly; a bare tuple would be spread over the
            # single %s and raise TypeError instead of AssertionError.
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (
                img.shape,)
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))

        data['image'] = img
        return data
67
+
68
+
69
class StandardizeImag:
    """Scale and standardize an image.

    Args:
        mean (list): per-channel value subtracted from the image
        std (list): per-channel value the image is divided by
        is_scale (bool): multiply the image by 1/255 first
        norm_type (str): 'mean_std' applies mean/std; any other value
            skips that step
    """

    def __init__(self, mean, std, is_scale=True, norm_type='mean_std'):
        self.mean, self.std = mean, std
        self.is_scale, self.norm_type = is_scale, norm_type

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image; mean/std are broadcast along its last axis
            im_info (dict): info of image, returned untouched
        Returns:
            im (np.ndarray): float32 image (the same buffer when the input
                was already float32, since the arithmetic is in place)
            im_info (dict): the same ``im_info`` object
        """
        im = im.astype(np.float32, copy=False)

        if self.is_scale:
            im *= 1.0 / 255.0

        if self.norm_type != 'mean_std':
            return im, im_info

        # Leading singleton axes let the statistics broadcast over H and W.
        channel_mean = np.array(self.mean)[None, None, :]
        channel_std = np.array(self.std)[None, None, :]
        im -= channel_mean
        im /= channel_std
        return im, im_info
104
+
105
+
106
class NormalizeImage:
    """ normalize image: multiply by scale, subtract mean, divide by std
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        scale = self._coerce_scale(scale)
        if scale is None:
            scale = 1.0 / 255.0
        self.scale = np.float32(scale)

        if mean is None:
            mean = [0.485, 0.456, 0.406]
        if std is None:
            std = [0.229, 0.224, 0.225]

        # Statistics are stored pre-shaped so they broadcast against the
        # image layout selected by ``order``.
        stat_shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(stat_shape).astype('float32')
        self.std = np.array(std).reshape(stat_shape).astype('float32')

    @staticmethod
    def _coerce_scale(scale):
        """Turn a string scale such as '1./255.' into a number.

        Non-string values are returned unchanged. Strings are tried as a
        plain float first, then as 'a/b' (each side parsed with
        ``ast.literal_eval``), then as a single literal.
        """
        if not isinstance(scale, str):
            return scale
        try:
            return float(scale)
        except ValueError:
            pass
        if '/' not in scale:
            return ast.literal_eval(scale)
        parts = scale.split('/')
        return ast.literal_eval(parts[0]) / ast.literal_eval(parts[1])

    def __call__(self, data):
        """Normalize ``data['image']`` in the dict and return the dict."""
        from PIL import Image

        img = data['image']
        if isinstance(img, Image.Image):
            img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        scaled = img.astype('float32') * self.scale
        data['image'] = (scaled - self.mean) / self.std
        return data
138
+
139
+
140
class ToCHWImage:
    """ move the last axis of an hwc image to the front (chw)
    """

    def __init__(self, **kwargs):
        # No configuration; keyword arguments are accepted and ignored.
        pass

    def __call__(self, data):
        """Transpose ``data['image']`` with axes (2, 0, 1) and return ``data``."""
        from PIL import Image

        hwc = data['image']
        if isinstance(hwc, Image.Image):
            # PIL images are converted to arrays before transposing.
            hwc = np.array(hwc)
        data['image'] = hwc.transpose((2, 0, 1))
        return data
154
+
155
+
156
class KeepKeys:
    """Pick selected entries of a dict and return them as a list."""

    def __init__(self, keep_keys, **kwargs):
        self.keep_keys = keep_keys

    def __call__(self, data):
        """Return ``[data[k] for k in keep_keys]``, in ``keep_keys`` order."""
        return [data[name] for name in self.keep_keys]
165
+
166
+
167
class Pad:
    """Pad an image on its bottom and right edges with zeros.

    Args:
        size (int | list | tuple | None): target (h, w); an int is expanded
            to ``[size, size]``. When falsy, each side is instead rounded up
            to a multiple of ``size_div``.
        size_div (int): divisor used when ``size`` is not given.
    """

    def __init__(self, size=None, size_div=32, **kwargs):
        size_type_ok = size is None or isinstance(size, (int, list, tuple))
        if not size_type_ok:
            raise TypeError("Type of target_size is invalid. Now is {}".format(
                type(size)))
        self.size = [size, size] if isinstance(size, int) else size
        self.size_div = size_div

    def _round_up(self, length):
        """Smallest multiple of ``size_div`` >= length, but at least ``size_div``."""
        multiple = int(math.ceil(length / self.size_div) * self.size_div)
        return max(multiple, self.size_div)

    def __call__(self, data):
        """Pad ``data['image']`` in the dict and return the dict."""
        img = data['image']
        img_h, img_w = img.shape[0], img.shape[1]

        if self.size:
            target_h, target_w = self.size
            assert (
                img_h < target_h and img_w < target_w
            ), '(h, w) of target size should be greater than (img_h, img_w)'
        else:
            target_h = self._round_up(img.shape[0])
            target_w = self._round_up(img.shape[1])

        # Only the bottom and right borders grow; top/left stay at 0.
        data['image'] = cv2.copyMakeBorder(
            img,
            0,
            target_h - img_h,
            0,
            target_w - img_w,
            cv2.BORDER_CONSTANT,
            value=0)
        return data
203
+
204
+
205
class LinearResize:
    """Resize an image towards ``target_size``.

    Args:
        target_size (int | list): target size; an int is expanded to
            ``[target_size, target_size]``
        keep_ratio (bool): use one scale for both axes, default true
        interp (int): interpolation flag forwarded to ``cv2.resize``
    """

    def __init__(self, target_size, keep_ratio=True, interp=cv2.INTER_LINEAR):
        self.target_size = (
            [target_size, target_size]
            if isinstance(target_size, int) else target_size)
        self.keep_ratio = keep_ratio
        self.interp = interp

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image, indexed as (h, w, c)
            im_info (dict): info of image; 'im_shape' and 'scale_factor'
                are overwritten
        Returns:
            im (np.ndarray): resized image
            im_info (dict): the updated ``im_info``
        """
        assert len(self.target_size) == 2
        assert self.target_size[0] > 0 and self.target_size[1] > 0
        _ = im.shape[2]  # touching the third axis rejects non-3-D inputs
        scale_y, scale_x = self.generate_scale(im)
        resized = cv2.resize(
            im,
            None,
            None,
            fx=scale_x,
            fy=scale_y,
            interpolation=self.interp)
        im_info['im_shape'] = np.array(resized.shape[:2]).astype('float32')
        im_info['scale_factor'] = np.array(
            [scale_y, scale_x]).astype('float32')
        return resized, im_info

    def generate_scale(self, im):
        """
        Args:
            im (np.ndarray): image, indexed as (h, w, c)
        Returns:
            im_scale_y: the resize ratio of Y
            im_scale_x: the resize ratio of X
        """
        src_hw = im.shape[:2]
        _ = im.shape[2]  # touching the third axis rejects non-3-D inputs

        if not self.keep_ratio:
            dst_h, dst_w = self.target_size
            return dst_h / float(src_hw[0]), dst_w / float(src_hw[1])

        short_src, long_src = np.min(src_hw), np.max(src_hw)
        short_dst, long_dst = np.min(self.target_size), np.max(
            self.target_size)
        # Fit the short side first; fall back to fitting the long side when
        # that would push the long side past the larger target value.
        ratio = float(short_dst) / float(short_src)
        if np.round(ratio * long_src) > long_dst:
            ratio = float(long_dst) / float(long_src)
        return ratio, ratio
270
+
271
+
272
class Resize:
    """Resize an image to a fixed (h, w) and rescale polygons to match."""

    def __init__(self, size=(640, 640), **kwargs):
        self.size = size

    def resize_image(self, img):
        """Resize ``img`` to ``self.size``.

        Returns:
            (resized image, [ratio_h, ratio_w]) where each ratio is the
            target length divided by the original length.
        """
        target_h, target_w = self.size
        src_h, src_w = img.shape[:2]  # (h, w, c)
        ratios = [float(target_h) / src_h, float(target_w) / src_w]
        # cv2.resize takes the destination size as (width, height).
        resized = cv2.resize(img, (int(target_w), int(target_h)))
        return resized, ratios

    def __call__(self, data):
        """Resize ``data['image']`` and, when present, scale ``data['polys']``."""
        img = data['image']
        has_polys = 'polys' in data
        if has_polys:
            text_polys = data['polys']

        img_resize, [ratio_h, ratio_w] = self.resize_image(img)
        if has_polys:
            scaled = [
                [[pt[0] * ratio_w, pt[1] * ratio_h] for pt in poly]
                for poly in text_polys
            ]
            data['polys'] = np.array(scaled, dtype=np.float32)
        data['image'] = img_resize
        return data
300
+
301
+
302
class DetResizeForTest:
    """Resize an image before text detection.

    The keyword arguments select one of three strategies:

    * ``image_shape`` (optionally ``keep_ratio``): resize type 1, resize to
      the given (h, w).
    * ``limit_side_len`` (optionally ``limit_type``, default 'min'): resize
      type 0, bound one side and round both sides to a multiple of 32.
    * ``resize_long``: resize type 2, scale the long side to this value and
      round both sides up to a multiple of 128.

    With none of these, type 0 is used with ``limit_side_len=736`` and
    ``limit_type='min'``.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        self.keep_ratio = False
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
            if 'keep_ratio' in kwargs:
                self.keep_ratio = kwargs['keep_ratio']
        elif 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        elif 'resize_long' in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get('resize_long', 960)
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def __call__(self, data):
        """Resize ``data['image']`` and record the geometry in ``data['shape']``.

        ``data['shape']`` becomes ``[src_h, src_w, ratio_h, ratio_w]`` where
        src_h/src_w are the sizes read before any padding.
        """
        img = data['image']
        src_h, src_w, _ = img.shape
        # Very small images are first padded up to at least 32x32.
        if sum([src_h, src_w]) < 64:
            img = self.image_padding(img)

        if self.resize_type == 0:
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
        data['image'] = img
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def image_padding(self, im, value=0):
        """Return a copy of ``im`` padded bottom/right to at least 32x32.

        The padded area is filled with ``value``.
        """
        h, w, c = im.shape
        im_pad = np.zeros((max(32, h), max(32, w), c), np.uint8) + value
        im_pad[:h, :w, :] = im
        return im_pad

    def resize_image_type1(self, img):
        """Resize to ``self.image_shape``.

        With ``keep_ratio`` set, the width is derived from the target
        height and the source aspect ratio, rounded up to a multiple of 32.

        Returns:
            (resized image, [ratio_h, ratio_w])
        """
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        if self.keep_ratio is True:
            resize_w = ori_w * resize_h / ori_h
            N = math.ceil(resize_w / 32)
            resize_w = N * 32
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """
        resize image to a size multiple of 32 which is required by the network
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, [ratio_h, ratio_w]
        raises:
            Exception: if ``limit_type`` is not 'max', 'min' or 'resize_long'.
            Any error raised by ``cv2.resize`` is logged and re-raised.
        """
        limit_side_len = self.limit_side_len
        h, w, c = img.shape

        if self.limit_type == 'max':
            # shrink only when the longer side exceeds the limit
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'min':
            # enlarge only when the shorter side is below the limit
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'resize_long':
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise Exception('not support limit type, image ')
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Round to the nearest multiple of 32 and never go below 32, so both
        # sizes are always positive here.
        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except Exception:
            # Log the offending geometry and let the caller see the failure.
            # Library code must not terminate the host process, least of all
            # with a success exit code.
            logging.exception("{} {} {}".format(img.shape, resize_w, resize_h))
            raise
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        """Scale the longer side to ``self.resize_long``.

        Both sides are then rounded up to a multiple of 128.

        Returns:
            (resized image, [ratio_h, ratio_w])
        """
        h, w, _ = img.shape

        resize_w = w
        resize_h = h

        if resize_h > resize_w:
            ratio = float(self.resize_long) / resize_h
        else:
            ratio = float(self.resize_long) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return img, [ratio_h, ratio_w]
430
+
431
+
432
class E2EResizeForTest:
    """Resize an image so both sides are multiples of 128.

    Required keyword arguments:
        max_side_len: side-length limit used to derive the scale
        valid_set: 'totaltext' selects ``resize_image_for_totaltext``; any
            other value selects ``resize_image``
    """

    def __init__(self, **kwargs):
        super(E2EResizeForTest, self).__init__()
        self.max_side_len = kwargs['max_side_len']
        self.valid_set = kwargs['valid_set']

    def __call__(self, data):
        """Resize ``data['image']`` and store the geometry in ``data['shape']``.

        ``data['shape']`` becomes ``[src_h, src_w, ratio_h, ratio_w]``.
        """
        img = data['image']
        src_h, src_w, _ = img.shape
        if self.valid_set == 'totaltext':
            resizer = self.resize_image_for_totaltext
        else:
            resizer = self.resize_image
        im_resized, (ratio_h, ratio_w) = resizer(
            img, max_side_len=self.max_side_len)
        data['image'] = im_resized
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    @staticmethod
    def _resize_to_stride(im, h, w, ratio, max_stride=128):
        """Scale (h, w) by ``ratio``, round each side up to a multiple of
        ``max_stride`` and resize ``im`` to that size."""
        new_h = int(h * ratio)
        new_w = int(w * ratio)
        new_h = (new_h + max_stride - 1) // max_stride * max_stride
        new_w = (new_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(im, (int(new_w), int(new_h)))
        return im, (new_h / float(h), new_w / float(w))

    def resize_image_for_totaltext(self, im, max_side_len=512):
        """Upscale by 1.25 unless that would push the height past
        ``max_side_len``, in which case the height is scaled to the limit.

        :return: the resized image and (ratio_h, ratio_w)
        """
        h, w, _ = im.shape
        ratio = 1.25
        if h * ratio > max_side_len:
            ratio = float(max_side_len) / h
        return self._resize_to_stride(im, h, w, ratio)

    def resize_image(self, im, max_side_len=512):
        """
        resize image to a size multiple of max_stride which is required by the network
        :param im: the image to resize
        :param max_side_len: target length of the longer side before rounding
        :return: the resized image and the resize ratio
        """
        h, w, _ = im.shape
        # Fix the longer side
        longer = h if h > w else w
        ratio = float(max_side_len) / longer
        return self._resize_to_stride(im, h, w, ratio)
498
+
499
+
500
class KieResize:
    """Resize an image onto a 1024x1024 canvas and scale its boxes.

    Required keyword argument:
        img_scale: two-item sequence stored as (max_side, min_side)
    """

    def __init__(self, **kwargs):
        super(KieResize, self).__init__()
        img_scale = kwargs['img_scale']
        self.max_side, self.min_side = img_scale[0], img_scale[1]

    def __call__(self, data):
        """Resize ``data['image']`` and ``data['points']``.

        The originals are kept under 'ori_image' / 'ori_boxes', and
        ``data['shape']`` is set to the resized (h, w).
        """
        img = data['image']
        points = data['points']
        src_h, src_w, _ = img.shape
        canvas, scale_factor, _ratios, new_hw = self.resize_image(img)
        new_h, new_w = new_hw
        scaled_points = self.resize_boxes(img, points, scale_factor)
        data['ori_image'] = img
        data['ori_boxes'] = points
        data['points'] = scaled_points
        data['image'] = canvas
        data['shape'] = np.array([new_h, new_w])
        return data

    def resize_image(self, img):
        """Resize ``img`` and paste it at the top-left of a zero canvas.

        Returns:
            (float32 1024x1024x3 canvas, [w, h, w, h] scale array,
             [h_scale, w_scale], [new_h, new_w])
        """
        canvas = np.zeros([1024, 1024, 3], dtype='float32')
        scale = [512, 1024]
        h, w = img.shape[:2]
        long_limit, short_limit = max(scale), min(scale)
        fit = min(long_limit / max(h, w), short_limit / min(h, w))
        target_w = int(w * float(fit) + 0.5)
        target_h = int(h * float(fit) + 0.5)
        # Round each side up to a multiple of 32.
        max_stride = 32
        target_h = (target_h + max_stride - 1) // max_stride * max_stride
        target_w = (target_w + max_stride - 1) // max_stride * max_stride
        im = cv2.resize(img, (target_w, target_h))
        new_h, new_w = im.shape[:2]
        w_scale = new_w / w
        h_scale = new_h / h
        scale_factor = np.array(
            [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
        canvas[:new_h, :new_w, :] = im
        return canvas, scale_factor, [h_scale, w_scale], [new_h, new_w]

    def resize_boxes(self, im, points, scale_factor):
        """Scale ``points`` and clip them to the (h, w) of ``im``.

        Even columns are clipped to ``[0, w]`` and odd columns to
        ``[0, h]``. The input ``points`` is not modified.
        """
        scaled = points * scale_factor
        height, width = im.shape[:2]
        scaled[:, 0::2] = np.clip(scaled[:, 0::2], 0, width)
        scaled[:, 1::2] = np.clip(scaled[:, 1::2], 0, height)
        return scaled
548
+
549
+
550
class SRResize:
    """Resize the low-/high-resolution image pair with ``ResizeNormalize``.

    The low-resolution image is resized to
    ``(imgW // down_sample_scale, imgH // down_sample_scale)`` and the
    high-resolution one to ``(imgW, imgH)``. ``keep_ratio``, ``min_ratio``
    and ``mask`` are stored but not read by ``__call__``.
    """

    def __init__(self,
                 imgH=32,
                 imgW=128,
                 down_sample_scale=4,
                 keep_ratio=False,
                 min_ratio=1,
                 mask=False,
                 infer_mode=False,
                 **kwargs):
        self.imgH, self.imgW = imgH, imgW
        self.keep_ratio = keep_ratio
        self.min_ratio = min_ratio
        self.down_sample_scale = down_sample_scale
        self.mask = mask
        self.infer_mode = infer_mode

    def __call__(self, data):
        """Add 'img_lr' (and, outside infer mode, 'img_hr') to ``data``."""
        height, width = self.imgH, self.imgW
        factor = self.down_sample_scale

        low_res = data["image_lr"]
        shrink = ResizeNormalize((width // factor, height // factor))
        data["img_lr"] = shrink(low_res)
        if self.infer_mode:
            return data

        high_res = data["image_hr"]
        # The label is looked up but unused; a missing key still raises.
        _ = data["label"]
        data["img_hr"] = ResizeNormalize((width, height))(high_res)
        return data
585
+
586
+
587
class ResizeNormalize:
    """Resize an image and return it as a float32 CHW array divided by 255.

    Args:
        size: target size forwarded to ``img.resize``
        interpolation: resampling filter forwarded to ``img.resize``
    """

    def __init__(self, size, interpolation=Image.BICUBIC):
        self.size, self.interpolation = size, interpolation

    def __call__(self, img):
        """Resize ``img``, move the last axis first and divide by 255."""
        resized = img.resize(self.size, self.interpolation)
        chw = np.array(resized).astype("float32").transpose((2, 0, 1))
        return chw / 255
597
+
598
+
599
class GrayImageChannelFormat:
    """
    convert the image to a single gray channel with a leading axis of size 1
    Args:
        inverse: store ``abs(gray - 1)`` instead of the gray image
    """

    def __init__(self, inverse=False, **kwargs):
        self.inverse = inverse

    def __call__(self, data):
        """Replace ``data['image']`` with its gray version.

        The input image is kept under ``data['src_image']``.
        """
        source = data['image']
        gray = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
        stacked = np.expand_dims(gray, 0)

        data['image'] = np.abs(stacked - 1) if self.inverse else stacked
        data['src_image'] = source
        return data
621
+
622
+
623
class Permute:
    """permute image axes from (0, 1, 2) to (2, 0, 1), i.e. HWC to CHW

    Takes no configuration.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image with three axes
            im_info (dict): info of image, returned untouched
        Returns:
            im (np.ndarray): transposed copy of the image
            im_info (dict): the same ``im_info`` object
        """
        channel_first = np.transpose(im, (2, 0, 1)).copy()
        return channel_first, im_info
644
+
645
+
646
class PadStride:
    """ zero-pad a CHW image so that height and width are multiples of stride
    Args:
        stride (int): required divisor of the padded height/width; values
            <= 0 disable padding
    """

    def __init__(self, stride=0):
        self.coarsest_stride = stride

    def __call__(self, im, im_info):
        """
        Args:
            im (np.ndarray): image laid out as (c, h, w)
            im_info (dict): info of image, returned untouched
        Returns:
            im (np.ndarray): the input itself when stride <= 0, otherwise a
                new float32 array with the image in its top-left corner
            im_info (dict): the same ``im_info`` object
        """
        stride = self.coarsest_stride
        if stride <= 0:
            return im, im_info

        channels, height, width = im.shape
        padded_h, padded_w = (
            int(np.ceil(float(side) / stride) * stride)
            for side in (height, width))
        canvas = np.zeros((channels, padded_h, padded_w), dtype=np.float32)
        canvas[:, :height, :width] = im
        return canvas, im_info
673
+
674
+
675
def decode_image(im_file, im_info):
    """load an image and initialise its info dict
    Args:
        im_file (str|np.ndarray): a path is read from disk, decoded with
            OpenCV and converted from BGR to RGB; any other value is used
            as the image as-is
        im_info (dict): info of image; 'im_shape' and 'scale_factor' are
            overwritten
    Returns:
        im (np.ndarray): the image
        im_info (dict): the updated ``im_info``
    """
    if not isinstance(im_file, str):
        im = im_file
    else:
        with open(im_file, 'rb') as handle:
            raw = handle.read()
        encoded = np.frombuffer(raw, dtype='uint8')
        bgr = cv2.imdecode(encoded, 1)  # BGR mode, but need RGB mode
        im = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    im_info['im_shape'] = np.array(im.shape[:2], dtype=np.float32)
    im_info['scale_factor'] = np.array([1., 1.], dtype=np.float32)
    return im, im_info
695
+
696
+
697
def preprocess(im, preprocess_ops):
    """Load ``im`` with ``decode_image`` and run it through ``preprocess_ops``.

    Args:
        im: image input accepted by ``decode_image``
        preprocess_ops: iterable of callables, each taking and returning
            ``(im, im_info)``; applied in order
    Returns:
        the final ``(im, im_info)`` pair
    """
    initial_info = {
        'scale_factor': np.array([1., 1.], dtype=np.float32),
        'im_shape': None,
    }
    state = decode_image(im, initial_info)
    for op in preprocess_ops:
        state = op(*state)
    im, im_info = state
    return im, im_info
708
+
709
+
710
def nms(bboxes, scores, iou_thresh):
    """Greedy non-maximum suppression.

    Boxes are visited in descending score order. Each kept box removes
    every remaining box whose IoU with it is greater than ``iou_thresh``.

    Box areas and intersections both use the inclusive pixel convention
    (``x2 - x1 + 1``). Mixing conventions can make the union zero or
    negative for small boxes, so that even identical boxes would survive.

    Args:
        bboxes: array-like of shape (n, >=4); the first four columns are
            read as x1, y1, x2, y2
        scores: array-like of n scores
        iou_thresh: boxes with IoU <= this value are kept
    Returns:
        list: indices into ``bboxes`` of the kept boxes, best score first
    """
    bboxes = np.asarray(bboxes)
    scores = np.asarray(scores)
    if scores.size == 0:
        # Nothing to rank; this also avoids column-indexing an empty input.
        return []

    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)

    indices = []
    index = scores.argsort()[::-1]
    while index.size > 0:
        i = index[0]
        indices.append(i)
        rest = index[1:]
        # Intersection rectangle of the current box with every remaining box.
        x11 = np.maximum(x1[i], x1[rest])
        y11 = np.maximum(y1[i], y1[rest])
        x22 = np.minimum(x2[i], x2[rest])
        y22 = np.minimum(y2[i], y2[rest])
        w = np.maximum(0, x22 - x11 + 1)
        h = np.maximum(0, y22 - y11 + 1)
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[rest] - overlaps)
        index = rest[ious <= iou_thresh]
    return indices