openocr-python 0.0.9__py3-none-any.whl → 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openocr/__init__.py +35 -1
- openocr/configs/dataset/rec/evaluation.yaml +41 -0
- openocr/configs/dataset/rec/ltb.yaml +9 -0
- openocr/configs/dataset/rec/mjsynth.yaml +11 -0
- openocr/configs/dataset/rec/openvino.yaml +25 -0
- openocr/configs/dataset/rec/ost.yaml +17 -0
- openocr/configs/dataset/rec/synthtext.yaml +7 -0
- openocr/configs/dataset/rec/test.yaml +77 -0
- openocr/configs/dataset/rec/textocr.yaml +13 -0
- openocr/configs/dataset/rec/textocr_horizontal.yaml +13 -0
- openocr/configs/dataset/rec/union14m_b.yaml +47 -0
- openocr/configs/dataset/rec/union14m_l_filtered.yaml +35 -0
- openocr/configs/rec/cmer/cmer.yml +127 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_base.yml +152 -0
- openocr/configs/rec/mdiff4str/svtrv2_mdiffdecoder_small.yml +152 -0
- openocr/configs/rec/unirec/focalsvtr_ardecoder_unirec.yml +114 -0
- openocr/configs/rec/unirec/opendoc_pipeline.yml +105 -0
- openocr/demo_gradio.py +28 -8
- openocr/demo_opendoc.py +572 -0
- openocr/demo_unirec.py +392 -0
- openocr/opendet/losses/__init__.py +5 -7
- openocr/opendet/preprocess/crop_resize.py +2 -1
- openocr/openocr.py +685 -0
- openocr/openrec/losses/__init__.py +8 -3
- openocr/openrec/losses/cmer_loss.py +12 -0
- openocr/openrec/losses/mdiff_loss.py +11 -0
- openocr/openrec/losses/unirec_loss.py +12 -0
- openocr/openrec/metrics/__init__.py +4 -1
- openocr/openrec/metrics/rec_metric_cmer.py +328 -0
- openocr/openrec/modeling/cmer_modeling/modeling_cmer.py +643 -0
- openocr/openrec/modeling/decoders/__init__.py +1 -0
- openocr/openrec/modeling/decoders/ctc_decoder.py +1 -1
- openocr/openrec/modeling/decoders/dan_decoder.py +4 -4
- openocr/openrec/modeling/decoders/dptr_parseq_clip_b_decoder.py +1563 -1398
- openocr/openrec/modeling/decoders/mdiff_decoder.py +587 -0
- openocr/openrec/modeling/decoders/smtr_decoder.py +99 -48
- openocr/openrec/modeling/unirec_modeling/configuration_unirec.py +166 -0
- openocr/openrec/modeling/unirec_modeling/modeling_unirec.py +433 -0
- openocr/openrec/optimizer/__init__.py +4 -3
- openocr/openrec/optimizer/lr.py +49 -0
- openocr/openrec/postprocess/__init__.py +2 -0
- openocr/openrec/postprocess/abinet_postprocess.py +1 -1
- openocr/openrec/postprocess/ar_postprocess.py +1 -1
- openocr/openrec/postprocess/cmer_postprocess.py +86 -0
- openocr/openrec/postprocess/cppd_postprocess.py +1 -1
- openocr/openrec/postprocess/igtr_postprocess.py +1 -1
- openocr/openrec/postprocess/lister_postprocess.py +1 -1
- openocr/openrec/postprocess/mgp_postprocess.py +1 -1
- openocr/openrec/postprocess/nrtr_postprocess.py +2 -2
- openocr/openrec/postprocess/smtr_postprocess.py +1 -1
- openocr/openrec/postprocess/srn_postprocess.py +1 -1
- openocr/openrec/postprocess/unirec_postprocess.py +58 -0
- openocr/openrec/postprocess/visionlan_postprocess.py +1 -1
- openocr/openrec/preprocess/__init__.py +5 -0
- openocr/openrec/preprocess/ce_label_encode.py +1 -1
- openocr/openrec/preprocess/cmer_label_encode.py +1025 -0
- openocr/openrec/preprocess/ctc_label_encode.py +1 -1
- openocr/openrec/preprocess/dptr_label_encode.py +177 -157
- openocr/openrec/preprocess/igtr_label_encode.py +4 -2
- openocr/openrec/preprocess/mdiff_label_encode.py +312 -0
- openocr/openrec/preprocess/rec_aug.py +128 -2
- openocr/openrec/preprocess/resize.py +57 -0
- openocr/openrec/preprocess/unirec_label_encode.py +62 -0
- openocr/tools/data/__init__.py +78 -55
- openocr/tools/data/cmer_web_dataset.py +310 -0
- openocr/tools/data/native_size_dataset.py +753 -0
- openocr/tools/data/native_size_sampler.py +158 -0
- openocr/tools/data/ratio_dataset_tvresize.py +2 -0
- openocr/tools/data/ratio_sampler.py +2 -1
- openocr/tools/download/download_dataset.py +38 -0
- openocr/tools/download/utils.py +28 -0
- openocr/tools/download_example_images.py +236 -0
- openocr/tools/engine/trainer.py +155 -39
- openocr/tools/eval_rec_all_ch.py +2 -2
- openocr/tools/infer_det.py +20 -2
- openocr/tools/infer_doc.py +898 -0
- openocr/tools/infer_doc_onnx.py +1172 -0
- openocr/tools/infer_e2e.py +27 -10
- openocr/tools/infer_rec.py +64 -15
- openocr/tools/infer_unirec_onnx.py +730 -0
- openocr/tools/to_markdown.py +468 -0
- openocr/tools/utils/ckpt.py +17 -5
- openocr/tools/utils/opendoc_onnx_utils/utils.py +1052 -0
- openocr_python-0.1.0.dev0.dist-info/METADATA +324 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/RECORD +89 -45
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/WHEEL +1 -1
- openocr_python-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- openocr_python-0.0.9.dist-info/METADATA +0 -149
- /openocr_python-0.0.9.dist-info/LICENCE → /openocr_python-0.1.0.dev0.dist-info/licenses/LICENSE +0 -0
- {openocr_python-0.0.9.dist-info → openocr_python-0.1.0.dev0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
from openrec.preprocess.ctc_label_encode import BaseRecLabelEncode
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MDiffLabelEncode(BaseRecLabelEncode):
    """Encode a text label and build masked training views of it.

    The label is converted to token ids with ``self.encode`` and terminated
    with ``</s>``.  For every sample a "noisy" copy of the ids is produced in
    which some positions are replaced by ``<mask>``, together with a boolean
    list flagging the positions that were masked.

    ``self.dict``, ``self.max_text_len`` and ``self.encode`` are not defined
    in this class; they are expected to be provided by
    ``BaseRecLabelEncode``.  All sequences built here are laid out for a
    length of ``max_text_len + 1`` (assumes the encoded label plus ``</s>``
    never exceeds that length -- TODO confirm against the base encoder).
    """

    MASK = '<mask>'
    EOS = '</s>'
    PAD = '<pad>'

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 use_space_char=False,
                 semi_ar=False,
                 mask_tpye=None,
                 train_all_layer=False,
                 sample_num=1,
                 **kwargs):
        """Configure the masking strategy.

        Args:
            max_text_length: forwarded to the base encoder.
            character_dict_path: forwarded to the base encoder.
            use_space_char: forwarded to the base encoder.
            semi_ar: when true, ``__call__`` uses
                ``forward_process_semi_ar`` instead of ``forward_process``.
            mask_tpye: (sic -- the spelling is part of the public interface)
                ids of the mask strategies ``forward_process`` may draw from:
                0 full mask, 1 left-to-right, 2 right-to-left, 3 block mask,
                4 single-token (cloze) mask, any other value random mask.
                ``None`` selects ``[0, 1, 2, 3, 4, 5]``.
            train_all_layer: when true, ``sample_num`` views are generated
                per label and stacked; otherwise exactly one view is used.
            sample_num: number of views per label; only honoured when
                ``train_all_layer`` is true.
            **kwargs: accepted and ignored.
        """
        super(MDiffLabelEncode,
              self).__init__(max_text_length, character_dict_path,
                             use_space_char)

        self.semi_ar = semi_ar
        # A fresh list per instance: a list literal in the signature would be
        # shared by every encoder created without an explicit value.
        if mask_tpye is None:
            mask_tpye = [0, 1, 2, 3, 4, 5]
        self.mask_tpye = list(mask_tpye)
        self.train_all_layer = train_all_layer
        self.sample_num = sample_num if self.train_all_layer else 1

    def _pad_to_full(self, seq, fill):
        """Return ``seq`` right-padded with ``fill`` to ``max_text_len + 1``.

        A sequence that is already that long (or longer) is returned
        unchanged in content, because a non-positive repeat count yields an
        empty list.
        """
        return seq + [fill] * (self.max_text_len + 1 - len(seq))

    def random_mask(self, text):
        """Mask each non-pad token independently with a random probability.

        The probability is drawn once per call from ``random.random()``.
        ``<pad>`` positions are written as ``<mask>`` in the noisy copy but
        are never flagged.  If no position ended up flagged and at least one
        non-pad token exists, one non-pad position is masked at random.

        Returns:
            ``(noisy, masked)``, both with the same length as ``text``
            (no padding is added here).
        """
        pad_id = self.dict[self.PAD]
        mask_id = self.dict[self.MASK]
        p_mask = random.random()

        noisy_batch = text[:]
        masked_indices = [False] * len(text)
        none_pad_indices = []
        for idx, token in enumerate(text):
            # The random draw comes first so that one number is consumed per
            # position, pad or not.
            if random.random() < p_mask and token != pad_id:
                noisy_batch[idx] = mask_id
                masked_indices[idx] = True
            if token != pad_id:
                none_pad_indices.append(idx)
            if noisy_batch[idx] == pad_id:
                noisy_batch[idx] = mask_id

        if not any(masked_indices) and len(none_pad_indices) > 0:
            forced = random.choice(none_pad_indices)
            noisy_batch[forced] = mask_id
            masked_indices[forced] = True
        return noisy_batch, masked_indices

    def full_mask(self, text):
        """Mask everything: the noisy copy is all ``<mask>``.

        Only the first ``len(text)`` positions are flagged as masked.
        """
        full_len = self.max_text_len + 1
        noisy_batch = [self.dict[self.MASK]] * full_len
        masked_indices = [True] * len(text) + [False] * (full_len - len(text))
        return noisy_batch, masked_indices

    def left_to_right_mask(self, text):
        """Keep a random-length prefix of ``text`` and mask the rest.

        The split point is drawn from ``[1, len(text) - 1]``, so ``text``
        must hold at least two tokens.
        """
        full_len = self.max_text_len + 1
        rand_split = random.randint(1, len(text) - 1)
        noisy_batch = self._pad_to_full(text[:rand_split],
                                        self.dict[self.MASK])
        masked_indices = ([False] * rand_split + [True] *
                          (len(text) - rand_split) + [False] *
                          (full_len - len(text)))
        return noisy_batch, masked_indices

    def semi_left_to_right_mask(self, text, block_size=5, step=3, i=0):
        """Build the view for step ``i`` of a block-wise left-to-right pass.

        ``text`` is first padded with ``<pad>`` to full length and then read
        as consecutive blocks of ``block_size`` tokens:

        * blocks before ``i - 1`` are shown as they are (``<pad>`` is shown
          as ``<mask>``) and not flagged;
        * block ``i - 1`` goes through ``random_mask``;
        * block ``i`` (or everything from block ``i`` onwards when
          ``i == step - 1``) is shown as ``<mask>`` and flagged wherever the
          token is not ``<pad>``;
        * whatever follows is ``<pad>`` and not flagged.

        Step 0 only covers the first block.  ``i`` is expected to lie in
        ``[0, step)``.

        Returns:
            ``(noisy, masked)``, both of length ``max_text_len + 1``.
        """
        pad_id = self.dict[self.PAD]
        mask_id = self.dict[self.MASK]
        text = self._pad_to_full(text, pad_id)
        noisy_batch_step_i = []
        masked_indices_step_i = []

        if i == 0:
            masked_indices_step_i = [t != pad_id for t in text[:block_size]]
            noisy_batch_step_i = [mask_id] * block_size
        elif i >= 1:
            visible_end = (i - 1) * block_size
            # Earlier blocks are revealed; copy the token itself (the
            # previous code used the token id as an index into ``text``).
            noisy_batch_step_i = [
                mask_id if t == pad_id else t for t in text[:visible_end]
            ]
            masked_indices_step_i = [False] * visible_end

            noisy_batch_r, masked_indices_r = self.random_mask(
                text[visible_end:i * block_size])
            noisy_batch_step_i = noisy_batch_step_i + noisy_batch_r
            masked_indices_step_i = masked_indices_step_i + masked_indices_r

            if i == step - 1:
                # The last step also takes the remainder of the sequence.
                current = text[i * block_size:]
            else:
                current = text[i * block_size:(i + 1) * block_size]
            masked_indices_step_i += [t != pad_id for t in current]
            noisy_batch_step_i += [mask_id] * len(current)

        # Each list is padded against its OWN length so both always come out
        # at full length.
        noisy_batch_step_i = self._pad_to_full(noisy_batch_step_i, pad_id)
        masked_indices_step_i = self._pad_to_full(masked_indices_step_i,
                                                  False)
        return noisy_batch_step_i, masked_indices_step_i

    def _padded_random_mask(self, text):
        """Run ``random_mask`` on ``text`` and pad the result to full length.

        The noisy copy is padded with ``<mask>``, the flags with ``False``.
        """
        noisy_batch, masked_indices = self.random_mask(text)
        noisy_batch = self._pad_to_full(noisy_batch, self.dict[self.MASK])
        masked_indices = self._pad_to_full(masked_indices, False)
        return noisy_batch, masked_indices

    def forward_process_semi_ar(self, text):
        """Build the stack of views used in semi-autoregressive mode.

        The stack holds, in order: all five steps of a 5-step block pass,
        one step of a pass whose step count is drawn from
        ``[2, 3, 4, 6, 7, 8]``, and one plain random mask.

        Returns:
            ``(noisy_views, masked_views)``: two lists with one entry per
            view, each entry of length ``max_text_len + 1``.
        """
        step = 5
        block_size = (self.max_text_len + 1) // step
        noisy_batch_semi_ar = []
        masked_indices_semi_ar = []
        for i in range(step):
            noisy_batch_step_i, masked_indices_step_i = \
                self.semi_left_to_right_mask(
                    text, block_size=block_size, step=step, i=i)
            noisy_batch_semi_ar.append(noisy_batch_step_i)
            masked_indices_semi_ar.append(masked_indices_step_i)

        rd_step = random.choice([2, 3, 4, 6, 7, 8])
        block_size = (self.max_text_len + 1) // rd_step
        rd_step_i = random.randint(0, rd_step - 1)
        noisy_batch, masked_indices = self.semi_left_to_right_mask(
            text, block_size=block_size, step=rd_step, i=rd_step_i)
        noisy_batch_semi_ar.append(noisy_batch)
        masked_indices_semi_ar.append(masked_indices)

        # Randomly mask part of the tokens in ``text``.
        noisy_batch, masked_indices = self._padded_random_mask(text)
        noisy_batch_semi_ar.append(noisy_batch)
        masked_indices_semi_ar.append(masked_indices)

        # The second element must be the mask flags, not the noisy views
        # again.
        return noisy_batch_semi_ar, masked_indices_semi_ar

    def right_to_left_mask(self, text):
        """Mask a random-length prefix of ``text`` and keep the rest.

        The split point is drawn from ``[1, len(text) - 1]``, so ``text``
        must hold at least two tokens.  Positions after the label are
        ``<mask>`` in the noisy copy and not flagged.
        """
        full_len = self.max_text_len + 1
        mask_id = self.dict[self.MASK]
        rand_split = random.randint(1, len(text) - 1)
        noisy_batch = ([mask_id] * rand_split + text[rand_split:] +
                       [mask_id] * (full_len - len(text)))
        masked_indices = ([True] * rand_split + [False] *
                          (len(text) - rand_split) + [False] *
                          (full_len - len(text)))
        return noisy_batch, masked_indices

    def _block_mask(self, text):
        """Keep a whole number of leading blocks and mask everything after.

        ``text`` is cut into 2-6 blocks (one more when the length does not
        divide evenly); a random block boundary is chosen and every label
        token from there on is masked.  When blocks would be a single token
        wide, a left-to-right or right-to-left mask is used instead.
        """
        full_len = self.max_text_len + 1
        rand_step = min(random.randint(2, 6), len(text))
        if rand_step <= 1:  # only possible when len(text) <= 1
            return self.full_mask(text)
        block_size = len(text) // rand_step
        if block_size == 1:
            if random.random() < 0.5:
                return self.left_to_right_mask(text)
            return self.right_to_left_mask(text)
        # Account for the remainder with one extra (shorter) block.
        if len(text) % rand_step != 0:
            rand_step += 1
        # 1-based index of the first block that gets masked.
        if rand_step == 2:
            rand_step_from_mask = 1
        else:
            rand_step_from_mask = random.randint(2, rand_step)

        kept = block_size * (rand_step_from_mask - 1)
        noisy_batch = self._pad_to_full(text[:kept], self.dict[self.MASK])
        masked_indices = ([False] * kept + [True] * (len(text) - kept) +
                          [False] * (full_len - len(text)))
        return noisy_batch, masked_indices

    def _cloze_mask(self, text):
        """Mask exactly one randomly chosen token of ``text``."""
        noisy_batch = text[:]
        masked_indices = [False] * len(text)
        rand_index = random.randint(0, len(text) - 1)
        noisy_batch[rand_index] = self.dict[self.MASK]
        masked_indices[rand_index] = True
        noisy_batch = self._pad_to_full(noisy_batch, self.dict[self.MASK])
        masked_indices = self._pad_to_full(masked_indices, False)
        return noisy_batch, masked_indices

    def forward_process(self, text):
        """Draw one strategy from ``self.mask_tpye`` and apply it.

        Strategies 1-4 need more than two tokens; with a shorter ``text``
        (or any other strategy id) the random mask is used.

        Returns:
            ``(noisy, masked)``, both of length ``max_text_len + 1``.
        """
        rand_choice = random.choice(self.mask_tpye)
        long_enough = len(text) > 2
        if rand_choice == 0:  # fully parallel: mask everything
            return self.full_mask(text)
        if rand_choice == 1 and long_enough:  # forward autoregressive
            return self.left_to_right_mask(text)
        if rand_choice == 2 and long_enough:  # backward autoregressive
            return self.right_to_left_mask(text)
        if rand_choice == 3 and long_enough:  # block mask
            return self._block_mask(text)
        if rand_choice == 4 and long_enough:  # cloze mask
            return self._cloze_mask(text)
        # Random mask: randomly mask part of the tokens in ``text``.
        return self._padded_random_mask(text)

    def reflect_random_idices(self, text, eps=1e-3):
        """Return a copy of ``text`` with some ids replaced by random ids.

        Each position is replaced with probability
        ``(1 - eps) * t + eps`` (``t`` drawn once per call) by an id drawn
        uniformly from ``[0, len(self.dict) - 1]``.
        """
        t = random.random()
        p_mask = (1 - eps) * t + eps
        reflect_ids = text[:]
        for idx in range(len(text)):
            if random.random() < p_mask:
                reflect_ids[idx] = random.randint(0, len(self.dict) - 1)
        return reflect_ids

    def __call__(self, data):
        """Encode ``data['label']`` and attach the masked training views.

        Writes ``length``, ``noisy_batch``, ``masked_indices``,
        ``reflect_ids`` and ``label`` into ``data``; ``p_mask`` (fraction of
        label tokens that were masked) is written only when ``semi_ar`` is
        off, since the semi-AR path yields a stack of views rather than a
        single mask.  With ``train_all_layer`` the per-sample results are
        stacked, otherwise only the first sample is stored.

        Returns:
            ``data``, or ``None`` when ``self.encode`` rejects the label.
        """
        text = self.encode(data['label'])
        if text is None:
            return None
        data['length'] = np.array(len(text) + 1)
        text = text + [self.dict[self.EOS]]

        p_mask_list = []
        noisy_batch_list = []
        masked_indices_list = []
        reflect_ids_list = []
        for _ in range(self.sample_num):
            reflect_ids = self._pad_to_full(self.reflect_random_idices(text),
                                            self.dict[self.MASK])
            if self.semi_ar:
                noisy_batch, masked_indices = self.forward_process_semi_ar(
                    text)
            else:
                noisy_batch, masked_indices = self.forward_process(text)
                # Only meaningful for a single flat mask; summing the
                # semi-AR list of lists would raise a TypeError.
                p_mask = float(sum(masked_indices)) / float(len(text))
                p_mask_list.append(np.array(p_mask))
            noisy_batch_list.append(np.array(noisy_batch))
            masked_indices_list.append(np.array(masked_indices))
            reflect_ids_list.append(np.array(reflect_ids))

        def _collect(samples):
            # Stack every sample, or keep just the first one.
            if self.train_all_layer:
                return np.array(samples)
            return np.array(samples[0])

        if not self.semi_ar:
            data['p_mask'] = _collect(p_mask_list)
        data['noisy_batch'] = _collect(noisy_batch_list)
        data['masked_indices'] = _collect(masked_indices_list)
        data['reflect_ids'] = _collect(reflect_ids_list)

        data['label'] = np.array(
            self._pad_to_full(text, self.dict[self.PAD]))

        return data

    def add_special_char(self, dict_character):
        """Put ``</s>`` first and append ``<mask>``, ``<pad>`` at the end."""
        dict_character = [self.EOS] + dict_character + [self.MASK, self.PAD]
        return dict_character
|
|
@@ -2,13 +2,13 @@ import random
|
|
|
2
2
|
|
|
3
3
|
import cv2
|
|
4
4
|
import numpy as np
|
|
5
|
-
from PIL import Image
|
|
6
|
-
from .parseq_aug import rand_augment_transform
|
|
5
|
+
from PIL import Image, ImageOps
|
|
7
6
|
|
|
8
7
|
|
|
9
8
|
class PARSeqAugPIL(object):
|
|
10
9
|
|
|
11
10
|
def __init__(self, **kwargs):
    # Build the augmentation callable once and keep it on the instance.
    # ``kwargs`` is accepted but not used here.
    # The import is done inside the constructor instead of at module level
    # (presumably so importing this module does not require parseq_aug's
    # own dependencies -- confirm).
    from .parseq_aug import rand_augment_transform
    self.transforms = rand_augment_transform()
|
|
13
13
|
|
|
14
14
|
def __call__(self, data):
|
|
@@ -21,6 +21,7 @@ class PARSeqAugPIL(object):
|
|
|
21
21
|
class PARSeqAug(object):
|
|
22
22
|
|
|
23
23
|
def __init__(self, **kwargs):
    # Build the augmentation callable once and keep it on the instance.
    # ``kwargs`` is accepted but not used here.
    # The import is done inside the constructor instead of at module level
    # (presumably so importing this module does not require parseq_aug's
    # own dependencies -- confirm).
    from .parseq_aug import rand_augment_transform
    self.transforms = rand_augment_transform()
|
|
25
26
|
|
|
26
27
|
def __call__(self, data):
|
|
@@ -149,6 +150,131 @@ class BaseDataAugmentation(object):
|
|
|
149
150
|
return data
|
|
150
151
|
|
|
151
152
|
|
|
153
|
+
class DocAug(object):
    """Degrade a document image with an albumentations pipeline.

    ``__call__`` reads ``data['image']`` and ``data['arxiv']``, optionally
    adds a white border, runs the pipeline and stores the result back into
    ``data['image']`` as a PIL image.
    """

    def __init__(self, **kwargs):
        """Assemble the pipeline; ``kwargs`` is accepted but not used."""
        import albumentations as A

        stages = []

        stages.append(
            A.RandomShadow(
                shadow_roi=(0, 0.7, 1, 1), num_shadows_upper=2, p=0.5))

        # Geometric distortion.  The perspective warp mimics the angle of a
        # photographed page; it is somewhat costly but important.
        stages.append(
            A.OneOf(
                [
                    A.Perspective(
                        scale=(0.05, 0.1),  # strength of the warp
                        keep_size=False,
                        fit_output=True,
                        pad_mode=cv2.BORDER_CONSTANT,
                        pad_val=[255, 255, 255],  # white fill
                        p=0.3),
                    A.GridDistortion(distort_limit=0.1,
                                     border_mode=0,
                                     interpolation=3,
                                     value=[255, 255, 255],
                                     p=0.3),
                    # Conservative elastic transform: small deformation
                    # strength, large smoothing, edges replicated.
                    A.ElasticTransform(alpha=0.5,
                                       sigma=30,
                                       border_mode=cv2.BORDER_REPLICATE,
                                       p=0.3)
                ],
                p=0.3))

        stages.append(
            A.RGBShift(r_shift_limit=15,
                       g_shift_limit=15,
                       b_shift_limit=15,
                       p=0.3))

        # Optical effects.
        stages.append(
            A.OneOf(
                [
                    A.MotionBlur(blur_limit=(3, 5), p=0.7),
                    A.GaussianBlur(blur_limit=(3, 5), p=0.7),
                    # A.GlassBlur(sigma=0.05, max_delta=1, iterations=1, p=0.2)
                ],
                p=0.6))

        # Brightness / contrast / tone changes.
        stages.append(
            A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2),
                                       contrast_limit=(-0.1, 0.1),
                                       p=0.5))
        stages.append(A.RandomGamma(gamma_limit=(80, 120), p=0.3))
        stages.append(A.CLAHE(clip_limit=2.0, p=0.2))

        # Compression artifacts.
        stages.append(
            A.ImageCompression(quality_lower=5, quality_upper=90, p=0.8))

        # Sensor noise.
        stages.append(
            A.OneOf([
                A.GaussNoise(var_limit=(10.0, 30.0), p=0.5),
                A.ISONoise(
                    color_shift=(0.01, 0.05), intensity=(0.1, 0.3), p=0.3),
                A.MultiplicativeNoise(multiplier=(0.9, 1.1), p=0.2)
            ],
                    p=0.8))
        stages.append(
            A.RandomFog(fog_coef_lower=0.1, fog_coef_upper=0.3, p=0.5))

        # Random resolution reduction.
        stages.append(
            A.Downscale(scale_min=0.5,
                        scale_max=0.8,
                        interpolation=cv2.INTER_LINEAR,
                        p=0.25))
        stages.append(A.PixelDropout(dropout_prob=0.01, p=0.2))

        # p=1: the pipeline as a whole is always applied.
        self.aug = A.Compose(stages, p=1)

    def biased_random_int(self, max_value: int, exponent: float = 3.0) -> int:
        """Return a random int skewed towards 0.

        A uniform draw from ``[0, 1)`` is raised to ``exponent``, scaled by
        ``max_value`` and truncated, so for ``max_value > 0`` the result is
        below ``max_value``.  A larger ``exponent`` (> 1) favours small
        values more strongly.
        """
        return int((random.random()**exponent) * max_value)

    def random_pad(self, img: Image.Image) -> Image.Image:
        """Add a white border of random width on each side of ``img``.

        Each side is padded by at most 20% of the corresponding image
        dimension, with small amounts being more likely.
        """
        width, height = img.size

        # Upper bounds for horizontal and vertical padding.
        limit_w = int(width * 0.2)
        limit_h = int(height * 0.2)

        # Draw order is left, right, top, bottom.
        left = self.biased_random_int(limit_w)
        right = self.biased_random_int(limit_w)
        top = self.biased_random_int(limit_h)
        bottom = self.biased_random_int(limit_h)

        return ImageOps.expand(img,
                               border=(left, top, right, bottom),
                               fill=(255, 255, 255))

    def __call__(self, data):
        """Augment ``data['image']`` in place and return ``data``.

        Any exception raised while augmenting is printed and the sample is
        returned with its image untouched.
        """
        picture = data['image']
        is_arxiv = data['arxiv']
        try:
            # With probability 0.2, samples flagged as arxiv get a random
            # white border first.
            wants_pad = random.random() < 0.2
            if wants_pad and is_arxiv:
                picture = self.random_pad(picture)
            pixels = picture if isinstance(picture,
                                           np.ndarray) else np.array(picture)

            data['image'] = Image.fromarray(self.aug(image=pixels)['image'])
        except Exception as err:
            print(err)

        return data
|
|
276
|
+
|
|
277
|
+
|
|
152
278
|
def hsv_aug(img):
|
|
153
279
|
"""cvtColor."""
|
|
154
280
|
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
|
|
@@ -304,6 +304,63 @@ class RecDynamicResize(object):
|
|
|
304
304
|
return data
|
|
305
305
|
|
|
306
306
|
|
|
307
|
+
class NaSizeResize(object):
    """Resize an image close to its native size, then convert to a tensor.

    The image is only shrunk (aspect ratio preserved) when it exceeds
    ``max_side``; afterwards both sides are rounded down to a multiple of
    ``divided_factor`` with a floor of 64 pixels.
    """

    def __init__(self, max_side=None, divided_factor=None, **kwargs):
        """Set size limits and build the tensor conversion.

        Args:
            max_side: ``[max_width, max_height]``; ``None`` selects
                ``[64 * 15, 64 * 22]``.
            divided_factor: ``[width_factor, height_factor]``; ``None``
                selects ``[64, 64]``.
            **kwargs: accepted and ignored.
        """
        from torchvision import transforms as T
        from torchvision.transforms import functional as F
        self.F = F
        # Defaults are created per instance; a list literal in the signature
        # would be one object shared by every instance that stores it.
        self.max_side = [64 * 15, 64 * 22] if max_side is None else max_side
        self.divided_factor = ([64, 64]
                               if divided_factor is None else divided_factor)
        self.interpolation = T.InterpolationMode.BICUBIC
        self.transforms = T.Compose([
            T.ToTensor(),
            T.Normalize(0.5, 0.5),
        ])

    def resize_image(self, original_width, original_height, max_width,
                     max_height):
        """Fit a size inside ``max_width`` x ``max_height``.

        Sizes already within both limits are returned unchanged; larger
        ones are scaled down with the aspect ratio kept (the dependent side
        is truncated to an int).

        Returns:
            ``(new_width, new_height)``.
        """
        aspect_ratio = original_width / original_height

        if original_width <= max_width and original_height <= max_height:
            # Already within the limits: nothing to do.
            return original_width, original_height

        if (max_width / max_height) >= aspect_ratio:
            # The height is the binding limit.
            new_height = max_height
            new_width = int(new_height * aspect_ratio)
        else:
            # The width is the binding limit.
            new_width = max_width
            new_height = int(new_width / aspect_ratio)
        return new_width, new_height

    def __call__(self, data):
        """Resize and normalise ``data['image']``.

        Reads ``.size`` from the image as ``(width, height)``, stores the
        transformed image back in ``data['image']`` and the width scale
        (capped at 1.0) in ``data['valid_ratio']``.
        """
        img = data['image']
        w, h = img.size
        w_r, h_r = self.resize_image(w, h, self.max_side[0], self.max_side[1])
        # Round down to a multiple of the factor, but never below 64.
        h_r = max(int(h_r // self.divided_factor[1] * self.divided_factor[1]),
                  64)
        w_r = max(int(w_r // self.divided_factor[0] * self.divided_factor[0]),
                  64)
        resized_image = self.F.resize(img, (h_r, w_r),
                                      interpolation=self.interpolation)
        img = self.transforms(resized_image)
        valid_ratio = min(1.0, float(w_r / w))
        data['image'] = img
        data['valid_ratio'] = valid_ratio
        return data
|
|
362
|
+
|
|
363
|
+
|
|
307
364
|
def resize_norm_img_slice(
|
|
308
365
|
img,
|
|
309
366
|
image_shape,
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from openrec.preprocess.ctc_label_encode import BaseRecLabelEncode
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
# Set at import time, i.e. before any tokenizer is created below.
# NOTE(review): presumably meant to keep the HuggingFace tokenizers library
# from using its own thread pool inside data-loader workers -- confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class UniRecLabelEncode(BaseRecLabelEncode):
    """Encode a text label into a fixed-length list of tokenizer ids.

    The label is tokenized with a pretrained tokenizer loaded from
    ``tokenizer_path``, wrapped in begin/end ids and padded to
    ``max_text_length + 3``.
    """

    # Per the original note: [GO] is the start token of the attention
    # decoder and [s] the end-of-sentence token.
    SPACE = '[s]'
    GO = '[GO]'
    list_token = [GO, SPACE]

    def __init__(self,
                 max_text_length,
                 character_dict_path=None,
                 use_space_char=False,
                 vlmocr=False,
                 tokenizer_path='./galactica',
                 **kwargs):
        """Choose the special-token ids and load the tokenizer.

        Args:
            max_text_length: forwarded to the base encoder; also fixes the
                output length (``max_text_length + 3``).
            character_dict_path: forwarded to the base encoder.
            use_space_char: forwarded to the base encoder.
            vlmocr: selects the id layout -- pad/eos/bos = 1/2/0 when true,
                0/2/1 otherwise.
            tokenizer_path: passed to ``AutoTokenizer.from_pretrained``.
            **kwargs: accepted and ignored.
        """
        super(UniRecLabelEncode,
              self).__init__(max_text_length, character_dict_path,
                             use_space_char)
        if vlmocr:
            self.padding_idx = 1
            self.eos_idx = 2
            self.bos_idx = 0
        else:
            self.padding_idx = 0
            self.eos_idx = 2
            self.bos_idx = 1
        self.batch_max_length = max_text_length + 3
        # Imported here so the module can be imported without transformers.
        from transformers import AutoTokenizer  # transformers==4.2.1
        self.bpe_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def __call__(self, data):
        """Replace ``data['label']`` with its ids and set ``data['length']``.

        Returns:
            ``data``, or ``None`` when the label is empty or too long.
        """
        bpe_text, length = self.bpe_encode(data['label'])
        if bpe_text is None:
            return None
        data['label'] = np.array(bpe_text)
        data['length'] = np.array(length)
        return data

    def add_special_char(self, dict_character):
        """Prepend ``[GO]`` and ``[s]`` to the character list."""
        dict_character = self.list_token + dict_character
        return dict_character

    def bpe_encode(self, text):
        """Tokenize ``text`` and pad it to ``batch_max_length``.

        Returns:
            ``(ids, length)`` where ``ids`` is
            ``[bos] + tokens + [eos] + padding`` and ``length`` is the
            number of tokens without bos/eos; ``(None, None)`` when ``text``
            is empty or the wrapped sequence exceeds ``batch_max_length``.
        """
        if not text:
            return None, None
        token = self.bpe_tokenizer(text)['input_ids']
        length = len(token)
        # Use the configured end id rather than a bare literal.
        text_list = [self.bos_idx] + token + [self.eos_idx]
        if len(text_list) > self.batch_max_length:
            return None, None
        text_list = text_list + [self.padding_idx] * (self.batch_max_length -
                                                      len(text_list))
        return text_list, length
|