wolof_translate-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/models/transformers/main_2.py
@@ -0,0 +1,362 @@
+ from wolof_translate.models.transformers.position import PositionalEncoding
+ from wolof_translate.models.transformers.size import SizePredict
+ from torch.nn.utils.rnn import pad_sequence
+ from torch import nn
+ from typing import *
+ import torch
+ import copy
+
+ # new exceptions for this transformer
+ class TargetException(Exception):
+     def __init__(self, error):
+
+         print(error)
+
+
+ class GenerationException(Exception):
+     def __init__(self, error):
+
+         print(error)
+
+
+ class Transformer(nn.Module):
+     def __init__(
+         self,
+         vocab_size: int,
+         encoder,
+         decoder,
+         class_criterion=nn.CrossEntropyLoss(label_smoothing=0.1),
+         size_criterion=nn.MSELoss(),
+         n_features: int = 100,
+         n_layers: int = 2,
+         n_poses_max: int = 500,
+         projection_type: str = "embedding",
+         max_len: Union[int, None] = None,
+         share_weight: bool = False,
+     ):
+
+         super(Transformer, self).__init__()
+
+         assert len(encoder.layers) > 0 and len(decoder.layers) > 0
+
+         self.dropout = encoder.layers._modules["0"].dropout.p
+
+         self.enc_embed_dim = encoder.layers._modules["0"].linear1.in_features
+
+         self.dec_embed_dim = decoder.layers._modules["0"].linear1.in_features
+
+         # initiate the positional encoding module
+         self.pe = PositionalEncoding(n_poses_max, self.enc_embed_dim)
+
+         if projection_type == "embedding":
+
+             self.embedding_layer = nn.Embedding(vocab_size, self.enc_embed_dim)
+
+         elif projection_type == "linear":
+
+             self.embedding_layer = nn.Linear(vocab_size, self.enc_embed_dim)
+
+         # initialize the encoder and decoder
+         self.encoder = encoder
+
+         self.decoder = decoder
+
+         self.class_criterion = class_criterion
+
+         self.size_criterion = size_criterion
+
+         # initiate the MLP that predicts the target size
+         self.size_prediction = SizePredict(
+             self.enc_embed_dim,
+             n_features=n_features,
+             n_layers=n_layers,
+             normalization=True,  # we always use normalization
+             drop_out=self.dropout,
+         )
+
+         self.classifier = nn.Linear(self.dec_embed_dim, vocab_size)
+
+         # share the weights between the embedding layer and the classification
+         # linear layer if requested
+         if share_weight:
+
+             self.classifier.weight.data = self.embedding_layer.weight.data
+
+         self.max_len = max_len
+
+     def forward(
+         self,
+         input_,
+         input_mask=None,
+         target=None,
+         target_mask=None,
+         pad_token_id: int = 3,
+     ):
+
+         # ---> Encoder prediction
+         input_embed = self.embedding_layer(input_)
+
+         # recuperate the last input embedding (before adding positions)
+         last_input = input_embed[:, -1:]
+
+         # add positions to the input embedding
+         input_embed = self.pe(input_embed)
+
+         # recuperate the input padding mask for the pytorch encoder
+         pad_mask1 = (
+             (input_mask == 0).to(next(self.parameters()).device, dtype=torch.bool)
+             if input_mask is not None
+             else None
+         )
+
+         # compute the encoder states
+         input_embed = input_embed.type_as(next(self.encoder.parameters()))
+
+         states = self.encoder(input_embed, src_key_padding_mask=pad_mask1)
+
+         # ---> Decoder prediction
+         # predict the size of the target
+         target_size = self.size_prediction(states).mean(axis=1)
+
+         target_embed = self.embedding_layer(target)
+
+         # recuperate the target padding mask for the pytorch decoder
+         pad_mask2 = (
+             (target_mask == 0).to(next(self.parameters()).device, dtype=torch.bool)
+             if target_mask is not None
+             else None
+         )
+
+         # define the causal attention mask
+         targ_mask = self.get_target_mask(target_embed.size(1))
+
+         # concatenate the last input and the target shifted one position to the right
+         # (the new sequence length equals the target sequence length)
+         target_embed = torch.cat((last_input, target_embed[:, :-1]), dim=1)
+
+         # add positions to the target embedding
+         target_embed = self.pe(target_embed)
+
+         # in training mode we pass the whole shifted target sequence to the decoder
+         if self.training:
+
+             target_embed = target_embed.type_as(next(self.encoder.parameters()))
+
+             outputs = self.decoder(
+                 target_embed, states, tgt_mask=targ_mask, tgt_key_padding_mask=pad_mask2
+             )
+
+         else:  ## This part was understood with the help of Professor Bousso.
+
+             # in evaluation mode we do not use the target but the previous outputs to
+             # make the predictions, and this is done sequentially (see comments)
+
+             # recuperate the last input as the current outputs
+             outputs = last_input.type_as(next(self.encoder.parameters()))
+
+             # for each target token that we want to predict
+             for t in range(target.size(1)):
+
+                 # recuperate the attention mask of the current decoder input
+                 current_targ_mask = targ_mask[
+                     : t + 1, : t + 1
+                 ]  # all attentions between the elements before the last target
+
+                 # we do the same for the padding mask
+                 current_pad_mask = None
+
+                 if pad_mask2 is not None:
+
+                     current_pad_mask = pad_mask2[:, : t + 1]
+
+                 # make new predictions
+                 out = self.decoder(
+                     outputs,
+                     states,
+                     tgt_mask=current_targ_mask,
+                     tgt_key_padding_mask=current_pad_mask,
+                 )
+
+                 # add the last new prediction to the decoder inputs
+                 outputs = torch.cat(
+                     (outputs, out[:, -1:]), dim=1
+                 )  # the prediction of the last output is the last one added (!)
+
+             # keep only the predictions (the last input is not kept)
+             outputs = outputs[:, 1:]
+
+         # replace the padded target positions with the ignore index (-100)
+         if target_mask is not None:
+             target = copy.deepcopy(target.cpu())
+             target = target.to(target_mask.device).masked_fill_(target_mask == 0, -100)
+
+         # ---> Loss calculation
+         # calculate the loss of the size prediction
+         size_loss = 0
+         if self.size_criterion is not None:
+
+             size_loss = self.size_criterion(
+                 target_size,
+                 target_mask.sum(axis=-1).unsqueeze(1).type_as(next(self.parameters())),
+             )
+
+         outputs = self.classifier(outputs)
+
+         # permute the two last dimensions of the outputs
+         outputs_ = outputs.permute(0, -1, -2)
+
+         # calculate the loss
+         loss = self.class_criterion(outputs_, target)
+
+         outputs = torch.softmax(outputs, dim=-1)
+
+         # calculate the predictions
+         outputs = copy.deepcopy(outputs.detach().cpu())
+         predictions = (
+             torch.argmax(outputs, dim=-1)
+             .to(target_mask.device)
+             .masked_fill_(target_mask == 0, pad_token_id)
+         )
+
+         return {"loss": loss + size_loss, "preds": predictions}
+
+     def generate(
+         self, input_, input_mask=None, temperature: float = 0, pad_token_id: int = 3
+     ):
+
+         if self.training:
+
+             raise GenerationException(
+                 "You cannot generate when the model is in training mode!"
+             )
+
+         # ---> Encoder prediction
+         input_embed = self.embedding_layer(input_)
+
+         # recuperate the last input embedding (before adding positions)
+         last_input = input_embed[:, -1:]
+
+         # add positions to the input embedding
+         input_embed = self.pe(input_embed)
+
+         # recuperate the input padding mask for the pytorch encoder
+         pad_mask1 = (
+             (input_mask == 0).bool().to(next(self.parameters()).device)
+             if input_mask is not None
+             else None
+         )
+
+         # compute the encoder states
+         input_embed = input_embed.type_as(next(self.encoder.parameters()))
+
+         states = self.encoder(input_embed, src_key_padding_mask=pad_mask1)
+
+         # ---> Decoder prediction
+         # recuperate the maximum length
+         max_len = self.max_len if self.max_len is not None else 0
+
+         # predict the size of the target and build the target mask
+         if max_len > 0:
+
+             target_size = (
+                 self.size_prediction(states).mean(axis=1).round().clip(1, max_len)
+             )
+
+         else:
+
+             target_size = torch.max(
+                 self.size_prediction(states).mean(axis=1).round(), torch.tensor(1.0)
+             )
+
+         target_ = copy.deepcopy(target_size.cpu())
+
+         target_mask = [
+             torch.tensor(int(size[0]) * [1] + [0] * max(max_len - int(size[0]), 0))
+             for size in target_.tolist()
+         ]
+
+         if max_len > 0:
+
+             target_mask = torch.stack(target_mask).to(
+                 next(self.parameters()).device, dtype=torch.bool
+             )
+
+         else:
+
+             target_mask = pad_sequence(target_mask, batch_first=True).to(
+                 next(self.parameters()).device, dtype=torch.bool
+             )
+
+         # recuperate the target padding mask for the pytorch decoder
+         pad_mask2 = (
+             (target_mask == 0).to(next(self.parameters()).device, dtype=torch.bool)
+             if target_mask is not None
+             else None
+         )
+
+         # define the causal attention mask
+         targ_mask = self.get_target_mask(target_mask.size(1))
+
+         # we do not use the target but the previous outputs to make the predictions,
+         # and this is done sequentially (see comments)
+
+         # recuperate the last input as the current outputs
+         outputs = last_input.type_as(next(self.encoder.parameters()))
+
+         # for each target token that we want to predict
+         for t in range(target_mask.size(1)):
+
+             # recuperate the attention mask of the current decoder input
+             current_targ_mask = targ_mask[
+                 : t + 1, : t + 1
+             ]  # all attentions between the elements before the last target
+
+             # we do the same for the padding mask
+             current_pad_mask = None
+
+             if pad_mask2 is not None:
+
+                 current_pad_mask = pad_mask2[:, : t + 1]
+
+             # make new predictions
+             out = self.decoder(
+                 outputs,
+                 states,
+                 tgt_mask=current_targ_mask,
+                 tgt_key_padding_mask=current_pad_mask,
+             )
+
+             # add the last new prediction to the decoder inputs
+             outputs = torch.cat(
+                 (outputs, out[:, -1:]), dim=1
+             )  # the prediction of the last output is the last one added (!)
+
+         # keep only the predictions (the last input is not kept)
+         outputs = outputs[:, 1:]
+
+         # ---> Predictions
+         outputs = self.classifier(outputs)
+
+         # compute the output distribution, applying the temperature if it is provided
+         if temperature > 0:
+
+             outputs = torch.softmax(outputs / temperature, dim=-1)
+
+         else:
+
+             outputs = torch.softmax(outputs, dim=-1)
+
+         # calculate the predictions
+         outputs = copy.deepcopy(outputs.detach().cpu())
+         predictions = (
+             torch.argmax(outputs, dim=-1)
+             .to(target_mask.device)
+             .masked_fill_(target_mask == 0, pad_token_id)
+         )
+
+         return predictions
+
+     def get_target_mask(self, attention_size: int):
+
+         return torch.triu(torch.ones((attention_size, attention_size)), diagonal=1).to(
+             next(self.parameters()).device, dtype=torch.bool
+         )
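For context, here is a minimal usage sketch of the Transformer class above. It is not part of the package: the vocabulary size, model width, head count, and layer counts are illustrative assumptions. Because the model indexes tensors as (batch, sequence, features) and reads dropout.p and linear1.in_features from the first layer, the sketch assumes the encoder and decoder are built from nn.TransformerEncoderLayer / nn.TransformerDecoderLayer with batch_first=True, and that the class is importable from wolof_translate.models.transformers.main_2 (the +362-line file in the listing above).

import torch
from torch import nn
from wolof_translate.models.transformers.main_2 import Transformer

d_model, vocab_size = 512, 8000

enc_layer = nn.TransformerEncoderLayer(d_model, nhead=8, dropout=0.1, batch_first=True)
dec_layer = nn.TransformerDecoderLayer(d_model, nhead=8, dropout=0.1, batch_first=True)

model = Transformer(
    vocab_size,
    nn.TransformerEncoder(enc_layer, num_layers=6),
    nn.TransformerDecoder(dec_layer, num_layers=6),
    max_len=100,
)

# dummy token ids and attention masks (1 = real token, 0 = padding)
src = torch.randint(0, vocab_size, (2, 10))
tgt = torch.randint(0, vocab_size, (2, 12))
src_mask = torch.ones_like(src)
tgt_mask = torch.ones_like(tgt)

model.train()
out = model(src, src_mask, tgt, tgt_mask)  # returns {"loss": ..., "preds": ...}
out["loss"].backward()

model.eval()
with torch.no_grad():
    preds = model.generate(src, src_mask)  # sequential greedy decoding, up to max_len tokens

In training mode, forward returns the combined token and size loss together with masked predictions; in evaluation mode, generate decodes token by token up to the length estimated by the size-prediction head.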
wolof_translate/models/transformers/optimization.py
@@ -0,0 +1,41 @@
+ """This custom learning rate scheduler applies a warmup learning rate schedule following the paper [Deep Transformer Models for Time Series Forecasting: The Influenza Prevalence Case](https://arxiv.org/pdf/2001.08317)
+ """
+
+ from torch.optim.optimizer import Optimizer
+ from torch.optim.lr_scheduler import _LRScheduler
+
+
+ class TransformerScheduler(_LRScheduler):
+     def __init__(
+         self, optimizer: Optimizer, d_model=100, lr_warmup_step=5000, **kwargs
+     ):
+         """Initialize the main attributes of the scheduler
+
+         Args:
+             optimizer (Optimizer): The optimizer
+             d_model (int, optional): The embedding layer feature dimension. Defaults to 100.
+             lr_warmup_step (int, optional): The number of warmup steps. Defaults to 5000.
+         """
+
+         self.d_model = d_model
+
+         self.lr_warmup = lr_warmup_step
+
+         self.len_param_groups = len(optimizer.param_groups)
+
+         super().__init__(optimizer, **kwargs)
+
+     def get_lr(self):
+         """Get the new learning rates
+
+         Returns:
+             list: The learning rates of the different parameter groups
+         """
+
+         step_num = self.last_epoch + 1  # step numbers start at 1
+
+         lr = self.d_model**-0.5 * min(
+             step_num**-0.5, step_num * self.lr_warmup**-1.5
+         )
+
+         return [lr] * self.len_param_groups
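A small usage sketch of the scheduler (the optimizer settings and warmup count are illustrative assumptions). get_lr returns absolute values, d_model**-0.5 * min(step**-0.5, step * lr_warmup**-1.5), so the learning rate grows linearly during the first lr_warmup_step steps and then decays as the inverse square root of the step number; the lr initially passed to the optimizer is effectively a placeholder.

import torch
from wolof_translate.models.transformers.optimization import TransformerScheduler

model = torch.nn.Linear(512, 512)
optimizer = torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)
scheduler = TransformerScheduler(optimizer, d_model=512, lr_warmup_step=4000)

for step in range(100):
    optimizer.step()   # the usual optimization step (loss.backward() omitted here)
    scheduler.step()   # sets every parameter group's lr for the next step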
wolof_translate/models/transformers/position.py
@@ -0,0 +1,46 @@
+ from torch import nn
+ import numpy as np
+ import torch
+
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, n_poses_max: int = 500, d_model: int = 512):
+         super(PositionalEncoding, self).__init__()
+
+         self.n_poses = n_poses_max
+
+         self.n_dims = d_model
+
+         # the angle is calculated as follows
+         angle = lambda pos, i: pos / 10000 ** (i / self.n_dims)
+
+         # let's initialize the different token positions
+         poses = np.arange(0, self.n_poses)
+
+         # let's also initialize the different dimension indexes
+         dims = np.arange(0, self.n_dims)
+
+         # let's initialize the indexes of the different positional vector values
+         circle_index = np.arange(0, self.n_dims / 2)
+
+         # let's create the possible combinations between a position and a dimension index
+         xv, yv = np.meshgrid(poses, circle_index)
+
+         # let's create a matrix which will contain all the different points
+         points = np.zeros((self.n_poses, self.n_dims))
+
+         # let's calculate the circle y axis coordinates
+         points[:, ::2] = np.sin(angle(xv.T, yv.T))
+
+         # let's calculate the circle x axis coordinates
+         points[:, 1::2] = np.cos(angle(xv.T, yv.T))
+
+         self.register_buffer("pe", torch.from_numpy(points).unsqueeze(0))
+
+     def forward(self, input_: torch.Tensor):
+
+         # let's scale the input
+         input_ = input_ * torch.sqrt(torch.tensor(self.n_dims))
+
+         # let's return the sum of the input and the positional encoding vectors
+         return input_ + self.pe[:, : input_.size(1), :].type_as(input_)
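A short illustrative sketch (the shapes are assumptions) of how PositionalEncoding is applied to a batch of embeddings. As implemented above, the input is first scaled by sqrt(d_model); even dimensions then receive sin(pos / 10000**(i / d_model)) and odd dimensions cos(pos / 10000**(i / d_model)), where i indexes the sin/cos pairs.

import torch
from wolof_translate.models.transformers.position import PositionalEncoding

pe = PositionalEncoding(n_poses_max=500, d_model=512)

embeddings = torch.randn(2, 10, 512)   # (batch, sequence length, d_model)
encoded = pe(embeddings)               # scaled embeddings + fixed positional vectors

assert encoded.shape == embeddings.shape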
wolof_translate/models/transformers/size.py
@@ -0,0 +1,44 @@
+ from torch import nn
+ import torch
+
+
+ class SizePredict(nn.Module):
+     def __init__(
+         self,
+         input_size: int,
+         target_size: int = 1,
+         n_features: int = 100,
+         n_layers: int = 1,
+         normalization: bool = True,
+         drop_out: float = 0.1,
+     ):
+         super(SizePredict, self).__init__()
+
+         self.layers = nn.ModuleList([])
+
+         for l in range(n_layers):
+
+             # add layer normalization and dropout if they are specified
+             self.layers.append(
+                 nn.Sequential(
+                     nn.Linear(input_size if l == 0 else n_features, n_features),
+                     nn.LayerNorm(n_features) if normalization else nn.Identity(),
+                     nn.ReLU(),
+                     nn.Dropout(drop_out),
+                 )
+             )
+
+         # initiate the last linear layer
+         self.output_layer = nn.Linear(n_features, target_size)
+
+     def forward(self, input_: torch.Tensor):
+
+         # pass the input through the different sequences of layers
+         out = input_
+
+         for layer in self.layers:
+
+             out = layer(out)
+
+         # return the final result (the caller should take the absolute value to make the predicted size positive)
+         return self.output_layer(out)
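A brief sketch of how the size-prediction head is used (the dimensions are assumptions): it maps each encoder state to a scalar, and the Transformer above averages those scalars over the sequence dimension to estimate the target length.

import torch
from wolof_translate.models.transformers.size import SizePredict

size_head = SizePredict(input_size=512, n_features=100, n_layers=2, drop_out=0.1)

states = torch.randn(2, 10, 512)           # encoder states: (batch, sequence length, d_model)
lengths = size_head(states).mean(axis=1)   # (batch, 1) estimated target lengths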
wolof_translate/pipe/__init__.py
@@ -0,0 +1 @@
+ # from wolof_translate.pipe.nlp_pipeline import NLPPipeline, TextPipeProcessing