openocr-python 0.0.9__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. openocr/__init__.py +35 -1
  2. openocr/configs/dataset/rec/evaluation.yaml +41 -0
  3. openocr/configs/dataset/rec/ltb.yaml +9 -0
  4. openocr/configs/dataset/rec/mjsynth.yaml +11 -0
  5. openocr/configs/dataset/rec/openvino.yaml +25 -0
  6. openocr/configs/dataset/rec/ost.yaml +17 -0
  7. openocr/configs/dataset/rec/synthtext.yaml +7 -0
  8. openocr/configs/dataset/rec/test.yaml +77 -0
  9. openocr/configs/dataset/rec/textocr.yaml +13 -0
  10. openocr/configs/dataset/rec/textocr_horizontal.yaml +13 -0
  11. openocr/configs/dataset/rec/union14m_b.yaml +47 -0
  12. openocr/configs/dataset/rec/union14m_l_filtered.yaml +35 -0
  13. openocr/configs/rec/cmer/cmer.yml +127 -0
  14. openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_base.yml +152 -0
  15. openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_small.yml +152 -0
  16. openocr/configs/rec/unirec/focalsvtr_ardecoder_unirec.yml +114 -0
  17. openocr/configs/rec/unirec/opendoc_pipeline.yml +105 -0
  18. openocr/demo_gradio.py +28 -8
  19. openocr/demo_opendoc.py +572 -0
  20. openocr/demo_unirec.py +392 -0
  21. openocr/opendet/losses/__init__.py +5 -7
  22. openocr/opendet/preprocess/crop_resize.py +2 -1
  23. openocr/openocr.py +685 -0
  24. openocr/openrec/losses/__init__.py +8 -3
  25. openocr/openrec/losses/cmer_loss.py +12 -0
  26. openocr/openrec/losses/mdiff_loss.py +11 -0
  27. openocr/openrec/losses/unirec_loss.py +12 -0
  28. openocr/openrec/metrics/__init__.py +4 -1
  29. openocr/openrec/metrics/rec_metric_cmer.py +328 -0
  30. openocr/openrec/modeling/cmer_modeling/modeling_cmer.py +643 -0
  31. openocr/openrec/modeling/decoders/__init__.py +1 -0
  32. openocr/openrec/modeling/decoders/ctc_decoder.py +1 -1
  33. openocr/openrec/modeling/decoders/dan_decoder.py +4 -4
  34. openocr/openrec/modeling/decoders/dptr_parseq_clip_b_decoder.py +1563 -1398
  35. openocr/openrec/modeling/decoders/mdiff_decoder.py +587 -0
  36. openocr/openrec/modeling/decoders/smtr_decoder.py +99 -48
  37. openocr/openrec/modeling/unirec_modeling/configuration_unirec.py +166 -0
  38. openocr/openrec/modeling/unirec_modeling/modeling_unirec.py +433 -0
  39. openocr/openrec/optimizer/__init__.py +4 -3
  40. openocr/openrec/optimizer/lr.py +49 -0
  41. openocr/openrec/postprocess/__init__.py +2 -0
  42. openocr/openrec/postprocess/abinet_postprocess.py +1 -1
  43. openocr/openrec/postprocess/ar_postprocess.py +1 -1
  44. openocr/openrec/postprocess/cmer_postprocess.py +86 -0
  45. openocr/openrec/postprocess/cppd_postprocess.py +1 -1
  46. openocr/openrec/postprocess/igtr_postprocess.py +1 -1
  47. openocr/openrec/postprocess/lister_postprocess.py +1 -1
  48. openocr/openrec/postprocess/mgp_postprocess.py +1 -1
  49. openocr/openrec/postprocess/nrtr_postprocess.py +2 -2
  50. openocr/openrec/postprocess/smtr_postprocess.py +1 -1
  51. openocr/openrec/postprocess/srn_postprocess.py +1 -1
  52. openocr/openrec/postprocess/unirec_postprocess.py +58 -0
  53. openocr/openrec/postprocess/visionlan_postprocess.py +1 -1
  54. openocr/openrec/preprocess/__init__.py +5 -0
  55. openocr/openrec/preprocess/ce_label_encode.py +1 -1
  56. openocr/openrec/preprocess/cmer_label_encode.py +1025 -0
  57. openocr/openrec/preprocess/ctc_label_encode.py +1 -1
  58. openocr/openrec/preprocess/dptr_label_encode.py +177 -157
  59. openocr/openrec/preprocess/igtr_label_encode.py +4 -2
  60. openocr/openrec/preprocess/mdiff_label_encode.py +312 -0
  61. openocr/openrec/preprocess/rec_aug.py +128 -2
  62. openocr/openrec/preprocess/resize.py +57 -0
  63. openocr/openrec/preprocess/unirec_label_encode.py +62 -0
  64. openocr/tools/data/__init__.py +78 -55
  65. openocr/tools/data/cmer_web_dataset.py +310 -0
  66. openocr/tools/data/native_size_dataset.py +753 -0
  67. openocr/tools/data/native_size_sampler.py +158 -0
  68. openocr/tools/data/ratio_dataset_tvresize.py +2 -0
  69. openocr/tools/data/ratio_sampler.py +2 -1
  70. openocr/tools/download/download_dataset.py +38 -0
  71. openocr/tools/download/utils.py +28 -0
  72. openocr/tools/download_example_images.py +236 -0
  73. openocr/tools/engine/trainer.py +155 -39
  74. openocr/tools/eval_rec_all_ch.py +2 -2
  75. openocr/tools/infer_det.py +20 -2
  76. openocr/tools/infer_doc.py +898 -0
  77. openocr/tools/infer_doc_onnx.py +1172 -0
  78. openocr/tools/infer_e2e.py +27 -10
  79. openocr/tools/infer_rec.py +64 -15
  80. openocr/tools/infer_unirec_onnx.py +730 -0
  81. openocr/tools/to_markdown.py +468 -0
  82. openocr/tools/utils/ckpt.py +17 -5
  83. openocr/tools/utils/opendoc_onnx_utils/utils.py +1052 -0
  84. openocr_python-0.1.0.dev0.dist-info/METADATA +324 -0
  85. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/RECORD +89 -45
  86. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/WHEEL +1 -1
  87. openocr_python-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  88. openocr_python-0.0.9.dist-info/METADATA +0 -149
  89. /openocr_python-0.0.9.dist-info/LICENCE → /openocr_python-0.1.0.dev0.dist-info/licenses/LICENSE +0 -0
  90. {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,312 @@
1
+ import random
2
+ import numpy as np
3
+
4
+ from openrec.preprocess.ctc_label_encode import BaseRecLabelEncode
5
+
6
+
7
+ class MDiffLabelEncode(BaseRecLabelEncode):
8
+ """Convert between text-label and text-index."""
9
+
10
+ MASK = '<mask>'
11
+ EOS = '</s>'
12
+ PAD = '<pad>'
13
+
14
def __init__(self,
             max_text_length,
             character_dict_path=None,
             use_space_char=False,
             semi_ar=False,
             mask_tpye=[0, 1, 2, 3, 4, 5],
             train_all_layer=False,
             sample_num=1,
             **kwargs):
    """Configure the masking behaviour of the label encoder.

    Args:
        max_text_length: Forwarded to ``BaseRecLabelEncode.__init__``.
        character_dict_path: Forwarded to ``BaseRecLabelEncode.__init__``.
        use_space_char: Forwarded to ``BaseRecLabelEncode.__init__``.
        semi_ar: When true, ``__call__`` corrupts labels with
            ``forward_process_semi_ar`` instead of ``forward_process``.
        mask_tpye: Strategy ids that ``forward_process`` samples from:
            0 = mask everything, 1 = left-to-right, 2 = right-to-left,
            3 = block mask, 4 = cloze (a single masked token); any other
            id selects the random mask. (The misspelled name is part of
            the public signature and is kept.)
        train_all_layer: When false, exactly one noisy sample is produced
            per label and ``sample_num`` is ignored.
        sample_num: Number of noisy samples per label when
            ``train_all_layer`` is true.
        **kwargs: Accepted and ignored.
    """
    super(MDiffLabelEncode,
          self).__init__(max_text_length, character_dict_path,
                         use_space_char)

    self.semi_ar = semi_ar
    # Copy so instances never alias the shared default list (or a list the
    # caller keeps mutating).
    self.mask_tpye = list(mask_tpye)
    self.train_all_layer = train_all_layer
    self.sample_num = sample_num if self.train_all_layer else 1
40
+
41
def random_mask(self, text):
    """Mask each non-PAD token independently with a random probability.

    A masking ratio is drawn once, then one uniform draw is made per
    position. PAD positions are rewritten to MASK in the noisy output but
    are never flagged as masked. If no token ended up masked and the input
    has at least one non-PAD token, one such token is masked at random.

    Returns:
        ``(noisy_tokens, masked_flags)``, both with ``len(text)`` entries.
    """
    pad_id = self.dict[self.PAD]
    mask_id = self.dict[self.MASK]

    ratio = random.random()
    noisy = text[:]
    flags = [False] * len(text)
    real_positions = []

    for pos, token in enumerate(text):
        # The draw happens for every position, PAD or not.
        draw = random.random()
        if token == pad_id:
            noisy[pos] = mask_id
            continue
        real_positions.append(pos)
        if draw < ratio:
            noisy[pos] = mask_id
            flags[pos] = True

    if real_positions and not any(flags):
        pick = random.choice(real_positions)
        noisy[pick] = mask_id
        flags[pick] = True
    return noisy, flags
62
+
63
def full_mask(self, text):
    """Mask every position (fully parallel target).

    Returns:
        A noisy sequence of ``max_text_len + 1`` MASK ids, and flags that
        are true for the first ``len(text)`` positions only.
    """
    total_len = self.max_text_len + 1
    n_tokens = len(text)
    mask_id = self.dict[self.MASK]

    noisy = [mask_id] * total_len
    flags = [True] * n_tokens
    flags = flags + [False] * (total_len - n_tokens)
    return noisy, flags
68
+
69
def left_to_right_mask(self, text):
    """Keep a random-length prefix and mask everything after it.

    The split point is drawn from ``[1, len(text) - 1]``, so ``text`` needs
    at least two tokens (``random.randint`` raises otherwise). Positions
    beyond ``len(text)`` are MASK in the noisy output but not flagged.
    """
    total_len = self.max_text_len + 1
    n_tokens = len(text)
    keep = random.randint(1, n_tokens - 1)

    visible = text[:keep]
    noisy = visible + [self.dict[self.MASK]] * (total_len - keep)

    flags = [False] * keep
    flags = flags + [True] * (n_tokens - keep)
    flags = flags + [False] * (total_len - n_tokens)
    return noisy, flags
76
+
77
def semi_left_to_right_mask(self, text, block_size=5, step=3, i=0):
    """Build the noisy input and mask flags for step ``i`` of block decoding.

    ``text`` is first padded with PAD to ``max_text_len + 1``. The sequence
    is then viewed as consecutive blocks of ``block_size`` tokens:

    * ``i == 0``: the first block is entirely MASK.
    * ``i >= 1``: blocks before ``i - 1`` are shown as-is (PAD shown as
      MASK), block ``i - 1`` is corrupted with ``random_mask``, and block
      ``i`` is entirely MASK. On the last step (``i == step - 1``) the
      masked region extends to the end of the sequence.

    Everything after the masked region is PAD in the noisy output. A
    position is flagged only if it is masked and its target is not PAD.

    Args:
        text: Encoded label (list of token ids), unpadded.
        block_size: Number of tokens per block.
        step: Total number of decoding steps.
        i: Index of the step to build.

    Returns:
        ``(noisy_tokens, masked_flags)``, both of length
        ``max_text_len + 1``. A negative ``i`` yields two empty lists.
    """
    if i < 0:
        # No step matches; same empty result as before.
        return [], []

    pad_id = self.dict[self.PAD]
    mask_id = self.dict[self.MASK]
    total_len = self.max_text_len + 1
    text = text + [pad_id] * (total_len - len(text))

    if i == 0:
        first_block = text[:block_size]
        noisy_batch_step_i = [mask_id] * len(first_block)
        masked_indices_step_i = [t != pad_id for t in first_block]
    else:
        # Blocks already decoded: show the token itself. (The previous code
        # used the token id as an index into ``text``.)
        decoded = text[:(i - 1) * block_size]
        noisy_batch_step_i = [
            mask_id if t == pad_id else t for t in decoded
        ]
        masked_indices_step_i = [False] * len(decoded)

        # Most recent block: partially revealed.
        noisy_r, masked_r = self.random_mask(
            text[(i - 1) * block_size:i * block_size])
        noisy_batch_step_i = noisy_batch_step_i + noisy_r
        masked_indices_step_i = masked_indices_step_i + masked_r

        # Block being predicted now: fully masked.
        if i == step - 1:
            upcoming = text[i * block_size:]
        else:
            upcoming = text[i * block_size:(i + 1) * block_size]
        noisy_batch_step_i = noisy_batch_step_i + [mask_id] * len(upcoming)
        masked_indices_step_i = masked_indices_step_i + [
            t != pad_id for t in upcoming
        ]

    # Pad each list by its OWN remaining length so both end at total_len.
    noisy_batch_step_i = noisy_batch_step_i + [pad_id] * (
        total_len - len(noisy_batch_step_i))
    masked_indices_step_i = masked_indices_step_i + [False] * (
        total_len - len(masked_indices_step_i))
    return noisy_batch_step_i, masked_indices_step_i
155
+
156
def forward_process_semi_ar(self, text):
    """Produce a stack of semi-autoregressive training views of ``text``.

    Seven views are generated, in this order:

    1. the five steps of a fixed 5-step block schedule,
    2. one random step of a schedule with a randomly chosen step count
       (one of 2, 3, 4, 6, 7, 8),
    3. one ``random_mask`` view, padded to ``max_text_len + 1``.

    Returns:
        ``(noisy_batches, masked_flags)``: two parallel lists, one entry
        per view.
    """
    total_len = self.max_text_len + 1
    noisy_batch_semi_ar = []
    masked_indices_semi_ar = []

    # Fixed schedule: every step of a 5-step decode.
    step = 5
    block_size = total_len // step
    for i in range(step):
        noisy_batch, masked_indices = self.semi_left_to_right_mask(
            text, block_size=block_size, step=step, i=i)
        noisy_batch_semi_ar.append(noisy_batch)
        masked_indices_semi_ar.append(masked_indices)

    # One step from a randomly sized schedule.
    rd_step = random.choice([2, 3, 4, 6, 7, 8])
    block_size = total_len // rd_step
    rd_step_i = random.randint(0, rd_step - 1)
    noisy_batch, masked_indices = self.semi_left_to_right_mask(
        text, block_size=block_size, step=rd_step, i=rd_step_i)
    noisy_batch_semi_ar.append(noisy_batch)
    masked_indices_semi_ar.append(masked_indices)

    # Randomly mask some of the tokens in text.
    tail = total_len - len(text)
    noisy_batch, masked_indices = self.random_mask(text)
    noisy_batch_semi_ar.append(noisy_batch + [self.dict[self.MASK]] * tail)
    masked_indices_semi_ar.append(masked_indices + [False] * tail)

    # Previously the noisy batches were returned twice and the flags were
    # discarded.
    return noisy_batch_semi_ar, masked_indices_semi_ar
186
+
187
def right_to_left_mask(self, text):
    """Mask a random-length prefix and keep the remaining suffix visible.

    The split point is drawn from ``[1, len(text) - 1]``, so ``text`` needs
    at least two tokens. Positions beyond ``len(text)`` are MASK in the
    noisy output but not flagged.
    """
    mask_id = self.dict[self.MASK]
    n_tokens = len(text)
    tail = self.max_text_len + 1 - n_tokens
    hidden = random.randint(1, n_tokens - 1)

    noisy = [mask_id] * hidden
    noisy = noisy + text[hidden:]
    noisy = noisy + [mask_id] * tail

    flags = [True] * hidden
    flags = flags + [False] * (n_tokens - hidden)
    flags = flags + [False] * tail
    return noisy, flags
196
+
197
def forward_process(self, text):
    """Corrupt ``text`` with one masking strategy sampled from ``mask_tpye``.

    Strategy ids: 0 = full mask, 1 = left-to-right, 2 = right-to-left,
    3 = block mask, 4 = cloze (one masked token). Strategies 1-4 apply only
    when ``len(text) > 2``; every other case falls back to ``random_mask``.

    Returns:
        ``(noisy_tokens, masked_flags)``, each ``max_text_len + 1`` long.
    """
    mask_id = self.dict[self.MASK]
    total_len = self.max_text_len + 1
    n_tokens = len(text)
    tail = total_len - n_tokens

    strategy = random.choice(self.mask_tpye)
    if strategy == 0:
        return self.full_mask(text)

    long_enough = n_tokens > 2
    if long_enough and strategy == 1:
        return self.left_to_right_mask(text)
    if long_enough and strategy == 2:
        return self.right_to_left_mask(text)

    if long_enough and strategy == 3:
        # n_tokens > 2 here, so num_blocks is always at least 2.
        num_blocks = min(random.randint(2, 6), n_tokens)
        block_size = n_tokens // num_blocks
        if block_size == 1:
            if random.random() < 0.5:
                return self.left_to_right_mask(text)
            return self.right_to_left_mask(text)
        # A remainder forms one extra, shorter block.
        if n_tokens % num_blocks != 0:
            num_blocks += 1
        # This draw is discarded; it is kept only so the sequence of random
        # numbers consumed stays the same.
        random.randint(2, num_blocks)
        if num_blocks == 2:
            first_masked_block = 1
        else:
            first_masked_block = random.randint(2, num_blocks)

        kept = block_size * (first_masked_block - 1)
        noisy = text[:kept] + [mask_id] * (total_len - kept)
        flags = [False] * kept + [True] * (n_tokens - kept) + [False] * tail
        return noisy, flags

    if long_enough and strategy == 4:
        noisy = text[:]
        flags = [False] * n_tokens
        hole = random.randint(0, n_tokens - 1)
        noisy[hole] = mask_id
        flags[hole] = True
        return noisy + [mask_id] * tail, flags + [False] * tail

    # Fallback: randomly mask some of the tokens in text.
    noisy, flags = self.random_mask(text)
    return noisy + [mask_id] * tail, flags + [False] * tail
253
+
254
def reflect_random_idices(self, text, eps=1e-3):
    """Return a copy of ``text`` with some tokens swapped for random ids.

    A replacement probability in ``[eps, 1)`` is drawn once; each position
    is then independently replaced, with that probability, by an id drawn
    uniformly from ``[0, len(self.dict) - 1]``. ``text`` is not modified.
    """
    t = random.random()
    p_mask = (1 - eps) * t + eps
    highest_id = len(self.dict) - 1

    reflect_ids = text[:]
    for pos, _ in enumerate(text):
        if random.random() < p_mask:
            reflect_ids[pos] = random.randint(0, highest_id)
    return reflect_ids
263
+
264
def __call__(self, data):
    """Encode ``data['label']`` and attach noisy training views to ``data``.

    Returns ``None`` when ``self.encode`` rejects the label. Otherwise sets
    ``data['length']`` (token count including EOS) and ``data['label']``
    (ids + EOS, PAD-padded to ``max_text_len + 1``). When ``semi_ar`` is
    false it also sets ``p_mask``, ``noisy_batch``, ``masked_indices`` and
    ``reflect_ids``: stacked over ``sample_num`` samples when
    ``train_all_layer`` is true, otherwise the single first sample.
    """
    encoded = self.encode(data['label'])
    if encoded is None:
        return None

    data['length'] = np.array(len(encoded) + 1)
    text = encoded + [self.dict[self.EOS]]
    total_len = self.max_text_len + 1

    p_mask_list = []
    noisy_batch_list = []
    masked_indices_list = []
    reflect_ids_list = []
    for _ in range(self.sample_num):
        reflect_ids = self.reflect_random_idices(text)
        reflect_ids = reflect_ids + [self.dict[self.MASK]] * (
            total_len - len(reflect_ids))

        # NOTE(review): forward_process_semi_ar returns nested per-step
        # lists, so with semi_ar=True the sum() below raises TypeError, and
        # no semi-AR outputs are stored on `data` further down. The
        # intended semi-AR contract should be confirmed.
        corrupt = (self.forward_process_semi_ar
                   if self.semi_ar else self.forward_process)
        noisy_batch, masked_indices = corrupt(text)

        p_mask = float(sum(masked_indices)) / float(len(text))
        p_mask_list.append(np.array(p_mask))
        noisy_batch_list.append(np.array(noisy_batch))
        masked_indices_list.append(np.array(masked_indices))
        reflect_ids_list.append(np.array(reflect_ids))

    if not self.semi_ar:
        collected = (
            ('p_mask', p_mask_list),
            ('noisy_batch', noisy_batch_list),
            ('masked_indices', masked_indices_list),
            ('reflect_ids', reflect_ids_list),
        )
        for key, samples in collected:
            if self.train_all_layer:
                data[key] = np.array(samples)
            else:
                data[key] = np.array(samples[0])

    padded = text + [self.dict[self.PAD]] * (total_len - len(text))
    data['label'] = np.array(padded)
    return data
309
+
310
def add_special_char(self, dict_character):
    """Return the vocabulary with EOS first and MASK, PAD appended last."""
    leading = [self.EOS]
    trailing = [self.MASK, self.PAD]
    return leading + dict_character + trailing
@@ -2,13 +2,13 @@ import random
2
2
 
3
3
  import cv2
4
4
  import numpy as np
5
- from PIL import Image
6
- from .parseq_aug import rand_augment_transform
5
+ from PIL import Image, ImageOps
7
6
 
8
7
 
9
8
  class PARSeqAugPIL(object):
10
9
 
11
10
def __init__(self, **kwargs):
    """Create the transform returned by ``rand_augment_transform()``.

    ``**kwargs`` is accepted but not used in this constructor.
    """
    # Imported here, at construction time, rather than at module import.
    from .parseq_aug import rand_augment_transform
    self.transforms = rand_augment_transform()
13
13
 
14
14
  def __call__(self, data):
@@ -21,6 +21,7 @@ class PARSeqAugPIL(object):
21
21
  class PARSeqAug(object):
22
22
 
23
23
def __init__(self, **kwargs):
    """Create the transform returned by ``rand_augment_transform()``.

    ``**kwargs`` is accepted but not used in this constructor.
    """
    # Imported here, at construction time, rather than at module import.
    from .parseq_aug import rand_augment_transform
    self.transforms = rand_augment_transform()
25
26
 
26
27
  def __call__(self, data):
@@ -149,6 +150,131 @@ class BaseDataAugmentation(object):
149
150
  return data
150
151
 
151
152
 
153
+ class DocAug(object):
154
+
155
def __init__(self, **kwargs):
    """Assemble the albumentations pipeline used to degrade document images.

    The pipeline is stored in ``self.aug``. ``**kwargs`` is accepted but
    not used. ``albumentations`` is imported here, at construction time.
    """
    import albumentations as A

    white = [255, 255, 255]

    shadow = A.RandomShadow(
        shadow_roi=(0, 0.7, 1, 1), num_shadows_upper=2, p=0.5)

    # Geometric distortion: at most one of the three is applied.
    geometric = A.OneOf(
        [
            # Perspective warp to imitate a photo taken at an angle.
            A.Perspective(
                scale=(0.05, 0.1),  # strength of the deformation
                keep_size=False,
                fit_output=True,
                pad_mode=cv2.BORDER_CONSTANT,
                pad_val=white,  # pad with white
                p=0.3),
            A.GridDistortion(distort_limit=0.1,
                             border_mode=0,
                             interpolation=3,
                             value=white,
                             p=0.3),
            # Mild elastic transform.
            A.ElasticTransform(
                alpha=0.5,  # low deformation strength
                sigma=30,  # high smoothing
                border_mode=cv2.BORDER_REPLICATE,  # replicate edge pixels
                p=0.3),
        ],
        p=0.3)

    colour_shift = A.RGBShift(r_shift_limit=15,
                              g_shift_limit=15,
                              b_shift_limit=15,
                              p=0.3)

    # Optical blur.
    blur = A.OneOf(
        [
            A.MotionBlur(blur_limit=(3, 5), p=0.7),
            A.GaussianBlur(blur_limit=(3, 5), p=0.7),
        ],
        p=0.6)

    # Tone / colour-space changes.
    brightness_contrast = A.RandomBrightnessContrast(
        brightness_limit=(-0.2, 0.2), contrast_limit=(-0.1, 0.1), p=0.5)
    gamma = A.RandomGamma(gamma_limit=(80, 120), p=0.3)
    clahe = A.CLAHE(clip_limit=2.0, p=0.2)

    # Compression artefacts.
    compression = A.ImageCompression(
        quality_lower=5, quality_upper=90, p=0.8)

    # Sensor noise.
    noise = A.OneOf(
        [
            A.GaussNoise(var_limit=(10.0, 30.0), p=0.5),
            A.ISONoise(
                color_shift=(0.01, 0.05), intensity=(0.1, 0.3), p=0.3),
            A.MultiplicativeNoise(multiplier=(0.9, 1.1), p=0.2),
        ],
        p=0.8)

    fog = A.RandomFog(fog_coef_lower=0.1, fog_coef_upper=0.3, p=0.5)

    # Random resolution reduction.
    downscale = A.Downscale(scale_min=0.5,
                            scale_max=0.8,
                            interpolation=cv2.INTER_LINEAR,
                            p=0.25)
    pixel_dropout = A.PixelDropout(dropout_prob=0.01, p=0.2)

    stages = [
        shadow,
        geometric,
        colour_shift,
        blur,
        brightness_contrast,
        gamma,
        clahe,
        compression,
        noise,
        fog,
        downscale,
        pixel_dropout,
    ]
    # p=1: the pipeline as a whole is always entered.
    self.aug = A.Compose(stages, p=1)
227
+
228
def biased_random_int(self, max_value: int, exponent: float = 3.0) -> int:
    """Draw a non-negative integer skewed towards small values.

    A uniform sample from ``[0, 1)`` is raised to ``exponent``, scaled by
    ``max_value`` and truncated, so for a positive ``max_value`` the result
    lies in ``[0, max_value - 1]``. An ``exponent`` above 1 favours small
    values; the larger it is, the stronger the bias.
    """
    uniform = random.random()
    skewed = uniform**exponent
    scaled = skewed * max_value
    return int(scaled)
236
+
237
def random_pad(self, img: Image.Image) -> Image.Image:
    """Add a white border of random, independently sized margins.

    Each margin is drawn with ``biased_random_int``, capped at 20% of the
    image width (left/right) or height (top/bottom).
    """
    width, height = img.size
    horizontal_cap = int(width * 0.2)
    vertical_cap = int(height * 0.2)

    # Draw order matters for reproducibility: left, right, top, bottom.
    left = self.biased_random_int(horizontal_cap)
    right = self.biased_random_int(horizontal_cap)
    top = self.biased_random_int(vertical_cap)
    bottom = self.biased_random_int(vertical_cap)

    margins = (left, top, right, bottom)
    return ImageOps.expand(img, border=margins, fill=(255, 255, 255))
258
+
259
def __call__(self, data):
    """Augment ``data['image']`` in place and return ``data``.

    Reads ``data['image']`` and ``data['arxiv']``. With probability 0.2,
    and only when ``arxiv`` is truthy, the image is first padded with
    ``random_pad``. The result of ``self.aug`` is stored back as a PIL
    image. Any exception raised while augmenting is printed and the sample
    is returned with whatever ``data['image']`` held at that point.
    """
    source = data['image']
    from_arxiv = data['arxiv']
    try:
        wants_pad = random.random() < 0.2
        if wants_pad and from_arxiv:
            source = self.random_pad(source)

        if isinstance(source, np.ndarray):
            pixels = source
        else:
            pixels = np.array(source)

        result = self.aug(image=pixels)
        data['image'] = Image.fromarray(result['image'])
    except Exception as e:
        # Best effort: a failed augmentation must not drop the sample.
        print(e)
    return data
276
+
277
+
152
278
  def hsv_aug(img):
153
279
  """cvtColor."""
154
280
  hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
@@ -304,6 +304,63 @@ class RecDynamicResize(object):
304
304
  return data
305
305
 
306
306
 
307
+ class NaSizeResize(object):
308
+
309
def __init__(self,
             max_side=[64 * 15, 64 * 22],
             divided_factor=[64, 64],
             **kwargs):
    """Configure native-size resizing.

    Args:
        max_side: ``[max_width, max_height]``; ``__call__`` passes them to
            ``resize_image`` in that order.
        divided_factor: ``[width_factor, height_factor]``; ``__call__``
            rounds the target width/height down to multiples of these.
        **kwargs: Accepted and ignored.
    """
    from torchvision import transforms as T
    from torchvision.transforms import functional as F

    self.F = F
    # Copy so instances never alias the shared default lists.
    self.max_side = list(max_side)
    self.divided_factor = list(divided_factor)
    self.interpolation = T.InterpolationMode.BICUBIC
    self.transforms = T.Compose([
        T.ToTensor(),
        T.Normalize(0.5, 0.5),
    ])
325
+
326
def resize_image(self, original_width, original_height, max_width,
                 max_height):
    """Return the largest aspect-preserving size that fits the limits.

    An image already within ``max_width`` x ``max_height`` keeps its size
    (it is never enlarged). Otherwise it is shrunk until the tighter of the
    two limits is met; the other dimension is truncated to an int.

    Returns:
        ``(new_width, new_height)``.
    """
    aspect_ratio = original_width / original_height

    within_limits = (original_width <= max_width
                     and original_height <= max_height)
    if within_limits:
        return original_width, original_height

    if (max_width / max_height) >= aspect_ratio:
        # The height limit is the binding one.
        return int(max_height * aspect_ratio), max_height
    # The width limit is the binding one.
    return max_width, int(max_width / aspect_ratio)
345
+
346
def __call__(self, data):
    """Resize ``data['image']`` near its native size and normalise it.

    The image is fitted inside ``max_side`` via ``resize_image``, each side
    is rounded down to a multiple of its ``divided_factor`` (never below
    64), then resized and passed through ``self.transforms``. Writes
    ``data['image']`` and ``data['valid_ratio']`` (output width divided by
    input width, capped at 1.0).
    """
    image = data['image']
    src_w, src_h = image.size
    fit_w, fit_h = self.resize_image(src_w, src_h, self.max_side[0],
                                     self.max_side[1])

    step_w = self.divided_factor[0]
    step_h = self.divided_factor[1]
    out_h = max(int(fit_h // step_h * step_h), 64)
    out_w = max(int(fit_w // step_w * step_w), 64)

    resized = self.F.resize(image, (out_h, out_w),
                            interpolation=self.interpolation)
    tensor = self.transforms(resized)
    valid_ratio = min(1.0, float(out_w / src_w))

    data['image'] = tensor
    data['valid_ratio'] = valid_ratio
    return data
362
+
363
+
307
364
  def resize_norm_img_slice(
308
365
  img,
309
366
  image_shape,
@@ -0,0 +1,62 @@
1
+ import numpy as np
2
+ from openrec.preprocess.ctc_label_encode import BaseRecLabelEncode
3
+
4
+ import os
5
+
6
# Set at import time, before any tokenizer is created below.
# NOTE(review): presumably meant to silence/avoid tokenizers' parallelism
# issues under multi-process data loading; confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
7
+
8
+
9
+ class UniRecLabelEncode(BaseRecLabelEncode):
10
+ """ Convert between text-label and text-index """
11
+ SPACE = '[s]'
12
+ GO = '[GO]'
13
+ list_token = [GO, SPACE]
14
+
15
def __init__(self,
             max_text_length,
             character_dict_path=None,
             use_space_char=False,
             vlmocr=False,
             tokenizer_path='./galactica',
             **kwargs):
    """Set special-token ids and load the BPE tokenizer.

    Args:
        max_text_length: Forwarded to ``BaseRecLabelEncode``; also fixes
            ``batch_max_length`` to ``max_text_length + 3``.
        character_dict_path: Forwarded to ``BaseRecLabelEncode``.
        use_space_char: Forwarded to ``BaseRecLabelEncode``.
        vlmocr: Selects the id layout: ``bos=0, pad=1, eos=2`` when true,
            ``bos=1, pad=0, eos=2`` otherwise.
        tokenizer_path: Passed to ``AutoTokenizer.from_pretrained``.
        **kwargs: Accepted and ignored.
    """
    super(UniRecLabelEncode,
          self).__init__(max_text_length, character_dict_path,
                         use_space_char)

    if vlmocr:
        self.padding_idx, self.eos_idx, self.bos_idx = 1, 2, 0
    else:
        self.padding_idx, self.eos_idx, self.bos_idx = 0, 2, 1
    self.batch_max_length = max_text_length + 3

    # Imported lazily, when an encoder is built.
    from transformers import AutoTokenizer  # transformers==4.2.1
    self.bpe_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
38
+
39
def __call__(self, data):
    """Replace ``data['label']`` with padded BPE ids and set ``data['length']``.

    Returns ``None`` when ``bpe_encode`` rejects the label; otherwise
    returns the updated ``data``.
    """
    token_ids, token_count = self.bpe_encode(data['label'])
    if token_ids is None:
        return None
    data['label'] = np.array(token_ids)
    data['length'] = np.array(token_count)
    return data
47
+
48
def add_special_char(self, dict_character):
    """Return the vocabulary with the ``list_token`` entries placed first."""
    return [*self.list_token, *dict_character]
51
+
52
def bpe_encode(self, text):
    """Tokenize ``text`` and wrap it as ``[bos] + ids + [eos]`` plus padding.

    Args:
        text: Label string to tokenize.

    Returns:
        ``(token_ids, length)``: ``token_ids`` is padded with
        ``padding_idx`` to ``batch_max_length``; ``length`` is the number
        of tokenizer ids, excluding BOS/EOS. ``(None, None)`` when ``text``
        is empty or the wrapped sequence exceeds ``batch_max_length``.
    """
    if not text:
        return None, None

    token = self.bpe_tokenizer(text)['input_ids']
    length = len(token)
    # Use the configured EOS id rather than a literal 2 so the value has a
    # single source of truth (it is 2 in both constructor layouts).
    text_list = [self.bos_idx] + token + [self.eos_idx]
    if len(text_list) > self.batch_max_length:
        return None, None

    padding = [self.padding_idx] * (self.batch_max_length - len(text_list))
    return text_list + padding, length