wolof_translate-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
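The largest addition is shown below: the +925-line hunk corresponds to wolof_translate/trainers/transformer_trainer_ml.py (file 22 in the list above).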
@@ -0,0 +1,925 @@
"""New training class. It takes a model and hyperparameters as input.

We will create additional classes that will support this training class.
"""

from wolof_translate.utils.evaluation import TranslationEvaluation
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer
import torch.distributed as dist
from tqdm import tqdm, trange
from torch.nn import utils
from torch import optim
from typing import Union
from torch import nn
import pandas as pd
import numpy as np
import string
import torch
import time
import json
import copy
import os

# choose letters for random words
letters = string.ascii_lowercase


class PredictionError(Exception):
    def __init__(self, error: Union[str, None] = None):

        # pass the message to Exception so it is displayed when the error is raised
        if error is None:

            error = (
                "You cannot predict with this type of data! Provide a list of tensors, "
                "a list of numpy arrays, a numpy array or a torch tensor."
            )

        super().__init__(error)


class LossError(Exception):
    def __init__(self, error: Union[str, None] = None):

        # pass the message to Exception so it is displayed when the error is raised
        if error is None:

            error = "A list of losses is provided for multiple outputs."

        super().__init__(error)


class ModelRunner:
    def __init__(
        self,
        model: nn.Module,
        optimizer=optim.AdamW,
        seed: Union[int, None] = None,
        evaluation: Union[TranslationEvaluation, None] = None,
        version: int = 1,
    ):

        # Initialize the generator seed
        self.seed = seed

        # Initialize the version
        self.version = version

        # Recuperate the evaluation metric
        self.evaluation = evaluation

        # Initialize the generator
        if self.seed:
            torch.manual_seed(self.seed)

        # The model to use for the different trainings
        self.orig_model = model

        # The optimizer to use for the different updates of the model
        self.orig_optimizer = optimizer

        # Recuperate the device type
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.compilation = False
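
    # A minimal construction sketch (illustrative only, not part of the released
    # file; `MyTransformer` is a hypothetical nn.Module):
    #
    #   runner = ModelRunner(MyTransformer(), optimizer=optim.AdamW, seed=42)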

    # ------------------------------ Training and compilation ------------------------------

    def _average_gradients(
        self,
    ):  # link: https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/mnist.py
        # average the gradients across the distributed workers
        size = float(self.dist.get_world_size())
        for param in self.model.parameters():
            self.dist.all_reduce(param.grad.data, op=self.dist.ReduceOp.SUM)
            param.grad.data /= size
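
        # Note: all_reduce(SUM) followed by the division above leaves every
        # worker with grad = (1 / world_size) * sum_k grad_k, i.e. the gradient
        # averaged over the per-process mini-batches.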

    def batch_train(
        self,
        input_: torch.Tensor,
        input_mask: torch.Tensor,
        labels: torch.Tensor,
        labels_mask: torch.Tensor,
        pad_token_id: int = 3,
    ):
        if (
            self.hugging_face
        ):  # We use a text-to-text model from Hugging Face (but only for fine-tuning)

            # forward pass
            outputs = self.model(
                input_ids=input_, attention_mask=input_mask, labels=labels
            )

            # recuperate the predictions and the loss
            preds, loss = outputs.logits, outputs.loss

        else:

            # forward pass
            outputs = self.model(
                input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
            )

            # recuperate the predictions and the loss
            preds, loss = outputs["preds"], outputs["loss"]

        # backward pass
        loss.backward()

        # average the gradients if the training is distributed over multi-machine CPU
        if self.distributed and not self.device == torch.device("cuda"):

            self._average_gradients()

        # clip the gradient values to a given interval if necessary
        if self.clipping_value is not None:

            utils.clip_grad_value_(
                self.model.parameters(), clip_value=self.clipping_value
            )

        # update the parameters
        self.optimizer.step()

        # decay the learning rate at each iteration if necessary
        if self.lr_scheduling is not None:

            self.lr_scheduling.step()

        # reset the gradients
        self.optimizer.zero_grad()

        return preds, loss

    def batch_eval(
        self,
        input_: torch.Tensor,
        input_mask: torch.Tensor,
        labels: torch.Tensor,
        labels_mask: torch.Tensor,
        pad_token_id: int = 3,
    ):

        if (
            self.hugging_face
        ):  # We use a text-to-text model from Hugging Face (but only for fine-tuning)

            # forward pass
            outputs = self.model(
                input_ids=input_, attention_mask=input_mask, labels=labels
            )

            # recuperate the predictions and the loss
            preds, loss = outputs.logits, outputs.loss

        else:

            # forward pass
            outputs = self.model(
                input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
            )

            # recuperate the predictions and the loss
            preds, loss = outputs["preds"], outputs["loss"]

        return preds, loss
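
    # Shape sketch (descriptive note, not part of the released file): for a
    # batch of B sentence pairs padded to source length S and target length T,
    # batch_train / batch_eval expect
    #   input_:      (B, S) token ids of the source sentences
    #   input_mask:  (B, S) attention mask of the source (bool)
    #   labels:      (B, T) token ids of the target sentences
    #   labels_mask: (B, T) attention mask of the target (bool)
    # and both return the pair (preds, loss).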

    # We decided to add some parameters that were useful in the previous training classes
    def compile(
        self,
        train_dataset: Dataset,
        test_dataset: Union[Dataset, None] = None,
        tokenizer: Union[Tokenizer, None] = None,
        train_loader_kwargs: dict = {"batch_size": 16, "shuffle": True},
        test_loader_kwargs: dict = {"batch_size": 16, "shuffle": False},
        optimizer_kwargs: dict = {"lr": 1e-4, "weight_decay": 0.4},
        model_kwargs: dict = {
            "class_criterion": nn.CrossEntropyLoss(label_smoothing=0.1)
        },
        lr_scheduler_kwargs: dict = {"d_model": 512, "lr_warmup_step": 100},
        lr_scheduler=None,
        stopping_patience: Union[int, None] = None,
        gradient_clipping_value: Union[float, torch.Tensor, None] = None,
        predict_with_generate: bool = False,
        logging_dir: Union[str, None] = None,
        hugging_face: bool = False,
        is_distributed: bool = False,
        dist=None,
    ):

        if self.seed:
            torch.manual_seed(self.seed)

        # We have to 'spread' the keyword arguments because we do not know the model's parameters
        if isinstance(
            self.orig_model, nn.Module
        ):  # if it is a model instance, no parameters are required

            self.model = copy.deepcopy(self.orig_model).to(self.device)

        else:  # otherwise we provide the parameters

            self.model = copy.deepcopy(self.orig_model(**model_kwargs)).to(self.device)

        # add distribution if available
        if is_distributed and self.device == torch.device("cuda"):

            self.model = torch.nn.parallel.DistributedDataParallel(self.model)

        else:

            self.model = torch.nn.parallel.DataParallel(self.model)

        # Initialize the optimizer parameters
        self.optimizer = self.orig_optimizer(
            self.model.parameters(), **optimizer_kwargs
        )

        # Add a learning-rate scheduler if necessary
        self.lr_scheduling = None

        if lr_scheduler is not None and self.lr_scheduling is None:

            self.lr_scheduling = lr_scheduler(self.optimizer, **lr_scheduler_kwargs)

        # Initialize the datasets and the loaders
        self.train_set = train_dataset
        self.test_set = test_dataset

        # If the data is distributed over multiple GPUs we will parallelize it
        if is_distributed:

            # We verify if the train loader kwargs already contain a sampler and,
            # if it is the case, wrap it in the distributed sampler object
            sampler = None
            if "batch_sampler" in train_loader_kwargs:

                sampler = "batch_sampler"

            elif "sampler" in train_loader_kwargs:

                sampler = "sampler"

            if sampler is not None:

                distributed_sampler = DistributedSampler(train_loader_kwargs[sampler])

                train_loader_kwargs[sampler] = distributed_sampler

            else:

                distributed_sampler = DistributedSampler(train_dataset)

                train_loader_kwargs["sampler"] = distributed_sampler

        self.train_loader = DataLoader(
            train_dataset,
            **train_loader_kwargs,
        )

        if test_dataset:
            self.test_loader = DataLoader(
                test_dataset,
                **test_loader_kwargs,
            )

        else:
            self.test_loader = None

        # Initialize the clipping value used for gradient clipping
        self.clipping_value = gradient_clipping_value

        # Other parameters for step tracking and metrics
        self.compilation = True

        self.current_epoch = None

        self.best_score = None

        self.best_epoch = self.current_epoch

        # Recuperate some boolean attributes
        self.predict_with_generate = predict_with_generate

        # Recuperate the tokenizer
        self.tokenizer = tokenizer

        # Recuperate the logging directory
        self.logging_dir = logging_dir

        # Initialize the metrics
        self.metrics = {}

        # Initialize the attribute which indicates if the model is from Hugging Face
        self.hugging_face = hugging_face

        # Initialize the torch distributed module and the distribution option
        self.distributed = is_distributed
        self.dist = dist

        # initialize the early-stopping patience
        self.patience = stopping_patience

        # add early stopping
        self.epochs_since_improvement = 0
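
    # A minimal compilation sketch (illustrative only, not part of the released
    # file; `train_ds`, `valid_ds` and `tokenizer` are hypothetical objects):
    #
    #   runner.compile(
    #       train_dataset=train_ds,
    #       test_dataset=valid_ds,
    #       tokenizer=tokenizer,
    #       optimizer_kwargs={"lr": 1e-4, "weight_decay": 0.4},
    #       stopping_patience=5,
    #       logging_dir="logs",
    #   )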

    def train(
        self,
        epochs: int = 100,
        auto_save: bool = False,
        log_step: Union[int, None] = None,
        saving_directory: str = "data/checkpoints/last_checkpoints",
        file_name: str = "checkpoints",
        save_best: bool = True,
        metric_for_best_model: str = "test_loss",
        metric_objective: str = "minimize",
        add_bleu_only: bool = True,
    ):
        """Train the model.

        Args:
            epochs (int, optional): The number of iterations. Defaults to 100.
            auto_save (bool, optional): Auto-save the model. Defaults to False.
            log_step (int, optional): The number of iterations between two performance reports. Defaults to None.
            saving_directory (str, optional): The directory where the model is saved. Defaults to "data/checkpoints/last_checkpoints".
            file_name (str, optional): The name of the checkpoint file. Defaults to "checkpoints".
            save_best (bool): A boolean indicating whether we want to save the best model. Defaults to True.
            metric_for_best_model (str): The name of the metric used to choose the best model. Defaults to 'test_loss'.
            metric_objective (str): Indicates whether the metric must be maximized ('maximize') or minimized ('minimize'). Defaults to 'minimize'.
            add_bleu_only (bool): Indicates whether we only want to add the BLEU metric. If so, the evaluation is done at the
                end of the iterations (the corresponding aggregation is currently disabled). Defaults to True.

        Raises:
            Exception: training requires the parameters to have been initialized first
        """

        # the file name cannot be "best_checkpoints" (reserved for the best model)
        assert file_name != "best_checkpoints"

        ##################### Error handling ##################################################
        if not self.compilation:
            raise Exception(
                "You must initialize the datasets and parameters with the `compile` "
                "method. Make sure you do not forget any of them before training the model."
            )

        ##################### Initializations #################################################

        if metric_objective in ["maximize", "minimize"]:

            best_score = (
                float("-inf") if metric_objective == "maximize" else float("inf")
            )

        else:

            raise ValueError(
                "The metric objective can only be 'maximize' or 'minimize'!"
            )

        if self.best_score is not None:

            best_score = self.best_score

        start_epoch = self.current_epoch if self.current_epoch is not None else 0

        ##################### Training ########################################################

        modes = ["train", "test"]

        if self.test_loader is None:
            modes = ["train"]

        for epoch in tqdm(range(start_epoch, start_epoch + epochs)):

            # Print the current learning rate
            print(f"For epoch {epoch + 1}: ")

            if self.lr_scheduling:
                print(f"{{Learning rate: {self.lr_scheduling.get_lr()}}}")

            self.metrics = {}

            for mode in modes:

                # only evaluate on logging epochs (every epoch when log_step is None)
                if (
                    mode == "test"
                    and log_step is not None
                    and (epoch + 1) % log_step != 0
                ):

                    continue

                with torch.set_grad_enabled(mode == "train"):

                    # Initialize the loss of the current mode
                    self.metrics[f"{mode}_loss"] = 0

                    # Let us initialize the predictions
                    predictions_ = []

                    # Let us initialize the labels
                    labels_ = []

                    if mode == "train":

                        self.model.train()

                        loader = self.train_loader

                        dataset = self.train_set

                    else:

                        self.model.eval()

                        loader = self.test_loader

                        dataset = self.test_set

                    with trange(
                        len(loader), unit="batches", position=0, leave=True
                    ) as pbar:
                        for i, data in enumerate(loader, 1):

                            pbar.set_description(
                                f"{mode[0].upper() + mode[1:]} batch number {i}"
                            )

                            input_ = data[0].to(self.device)

                            input_mask = data[1].to(self.device, dtype=torch.bool)

                            labels = data[2].to(self.device)

                            # labels used by the loss: for Hugging Face models the
                            # padding tokens are masked with -100 so that the loss
                            # ignores them (a decoder-only variant, which
                            # concatenated the input with the labels, is kept
                            # disabled)
                            labels_for_loss = labels

                            if self.hugging_face:

                                labels_for_loss = labels.masked_fill(
                                    labels == self.tokenizer.pad_token_id, -100
                                )

                            labels_mask = data[3].to(self.device, dtype=torch.bool)

                            # Recuperate the padding token id (default = 3)
                            pad_token_id = (
                                3
                                if self.tokenizer is None
                                else self.tokenizer.pad_token_id
                            )

                            preds, loss = (
                                self.batch_train(
                                    input_,
                                    input_mask,
                                    labels_for_loss,
                                    labels_mask,
                                    pad_token_id,
                                )
                                if mode == "train"
                                else self.batch_eval(
                                    input_,
                                    input_mask,
                                    labels_for_loss,
                                    labels_mask,
                                    pad_token_id,
                                )
                            )

                            # let us calculate the weight of the batch
                            batch_weight = labels.shape[0] / len(dataset)

                            self.metrics[f"{mode}_loss"] += loss.item() * batch_weight

                            # compute the metrics on the current predictions and labels
                            if mode == "test":

                                if self.predict_with_generate:

                                    if self.hugging_face:

                                        preds = self.model.module.generate(
                                            input_,
                                            attention_mask=input_mask,
                                            max_length=labels.shape[1],
                                        )

                                    else:

                                        preds = self.model.module.generate(
                                            input_,
                                            input_mask,
                                            pad_token_id=pad_token_id,
                                            max_len=labels.shape[1],
                                        )

                                else:

                                    if self.hugging_face:

                                        preds = torch.argmax(preds, dim=-1)

                                if self.evaluation is not None:

                                    # calculate the metrics on the current predictions and labels
                                    metrics = self.evaluation.compute_metrics(
                                        (
                                            preds.cpu().detach().numpy(),
                                            labels.cpu().detach().numpy(),
                                        ),
                                        bleu=True,
                                        accuracy=not self.hugging_face,
                                    )

                                    for metric in metrics:

                                        if metric != f"{mode}_loss":

                                            self.metrics[metric] = (
                                                self.metrics[metric]
                                                + metrics[metric] * batch_weight
                                                if metric in self.metrics
                                                else metrics[metric] * batch_weight
                                            )

                            pbar.update()

            # (an earlier normalization of the metrics by the number of batches is
            # kept disabled; the batch-weighted sums above already average over
            # the datasets)

            # Display the metrics
            if log_step is not None and (epoch + 1) % log_step == 0:

                print(f"\nMetrics: {self.metrics}")

                if self.logging_dir is not None:

                    with SummaryWriter(
                        os.path.join(self.logging_dir, f"version_{self.version}")
                    ) as writer:

                        for metric in self.metrics:

                            writer.add_scalar(
                                metric, self.metrics[metric], global_step=epoch
                            )

                        writer.add_scalar("global_step", epoch)

            print("\n=============================\n")

            ##################### Model saving #########################################################

            # Save the model at the end of the current epoch
            if auto_save and log_step is not None and (epoch + 1) % log_step == 0:

                self.current_epoch = epoch + 1

                if save_best:

                    # verify whether the current score is the best one so far
                    if metric_objective == "maximize":

                        is_best = best_score < self.metrics[metric_for_best_model]

                    else:  # metric_objective == "minimize" (validated above)

                        is_best = best_score > self.metrics[metric_for_best_model]

                    # recuperate the best score
                    if is_best:

                        best_score = self.metrics[metric_for_best_model]

                        self.best_epoch = self.current_epoch

                        self.best_score = best_score

                        self.save(saving_directory, "best_checkpoints")

                        if self.patience is not None:

                            self.epochs_since_improvement = 0

                    else:

                        if self.patience is not None:

                            self.epochs_since_improvement += 1

                            if self.epochs_since_improvement >= self.patience:
                                print(
                                    f"Early stopping triggered. No improvement in validation {metric_for_best_model} for {self.patience} epochs!"
                                )
                                break

                self.save(saving_directory, file_name)
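
    # A training sketch (illustrative only, not part of the released file):
    #
    #   runner.train(
    #       epochs=50,
    #       auto_save=True,
    #       log_step=1,
    #       metric_for_best_model="test_loss",
    #       metric_objective="minimize",
    #   )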

    # For this method we took inspiration from the save method of the DDPG agent (RL) that we had created
    def save(
        self,
        directory: str = "data/checkpoints/last_checkpoints",
        file_name: str = "checkpoints",
    ):

        if not os.path.exists(directory):
            os.makedirs(directory)

        file_path = os.path.join(directory, f"{file_name}.pth")

        checkpoints = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "current_epoch": self.current_epoch,
            "metrics": self.metrics,
            "best_score": self.best_score,
            "best_epoch": self.best_epoch,
            "lr_scheduler_state_dict": self.lr_scheduling.state_dict()
            if self.lr_scheduling is not None
            else None,
            "epochs_since_improvement": self.epochs_since_improvement,
        }

        torch.save(checkpoints, file_path)

        # update the metrics and the best score dict
        self.metrics["current_epoch"] = (
            self.current_epoch + 1
            if self.current_epoch is not None
            else self.current_epoch
        )

        best_score_dict = {"best_score": self.best_score, "best_epoch": self.best_epoch}

        # save the metrics as a json file
        metrics = json.dumps(
            {"metrics": self.metrics, "best_performance": best_score_dict}, indent=4
        )

        with open(os.path.join(directory, f"{file_name}.json"), "w") as f:

            f.write(metrics)

    # The same goes for the load method
    def load(
        self,
        directory: str = "data/checkpoints/last_checkpoints",
        file_name: str = "checkpoints",
        load_best: bool = False,
    ):

        if load_best:
            file_name = "best_checkpoints"

        file_path = os.path.join(directory, f"{file_name}.pth")

        if os.path.exists(file_path):

            checkpoints = (
                torch.load(file_path)
                if self.device == torch.device("cuda")
                else torch.load(file_path, map_location="cpu")
            )

            self.model.load_state_dict(checkpoints["model_state_dict"])

            self.optimizer.load_state_dict(checkpoints["optimizer_state_dict"])

            self.current_epoch = checkpoints["current_epoch"]

            self.best_score = checkpoints["best_score"]

            self.best_epoch = checkpoints["best_epoch"]

            self.epochs_since_improvement = checkpoints["epochs_since_improvement"]

            if self.lr_scheduling is not None:

                self.lr_scheduling.load_state_dict(
                    checkpoints["lr_scheduler_state_dict"]
                )

        else:

            raise OSError(
                f"The file {file_path} cannot be found. Check that the provided path is correct!"
            )
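
    # A resuming sketch (illustrative only, not part of the released file):
    # after `compile`, `load(load_best=True)` restores the best checkpoint and
    # `train` continues from `self.current_epoch`:
    #
    #   runner.load("data/checkpoints/last_checkpoints", load_best=True)
    #   runner.train(epochs=10, auto_save=True, log_step=1)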

    def evaluate(
        self,
        test_dataset,
        loader_kwargs: dict = {},
        num_beams: int = 3,
        top_k: int = 10,
        top_p: float = 1.0,
        temperature: float = 1.0,
    ):

        self.model.eval()

        test_loader = DataLoader(
            test_dataset,
            **loader_kwargs,
        )

        # Let us initialize the predictions
        predictions_ = []

        # Let us initialize the labels
        labels_ = []

        metrics = {"test_loss": 0.0}

        results = {"original_sentences": [], "translations": [], "predictions": []}

        with torch.no_grad():

            with trange(
                len(test_loader), unit="batches", position=0, leave=True
            ) as pbar:
                for i, data in enumerate(test_loader, 1):

                    pbar.set_description(f"Evaluation batch number {i}")

                    input_ = data[0].long().to(self.device)

                    input_mask = data[1].to(self.device)

                    labels = data[2].long().to(self.device)

                    # labels used by the loss: for Hugging Face models the padding
                    # tokens are masked with -100 so that the loss ignores them
                    labels_for_loss = labels

                    if self.hugging_face:

                        labels_for_loss = labels.masked_fill(
                            labels == test_dataset.tokenizer.pad_token_id, -100
                        )

                    labels_mask = data[3].to(self.device)

                    preds, loss = self.batch_eval(
                        input_,
                        input_mask,
                        labels_for_loss,
                        labels_mask,
                        test_dataset.tokenizer.pad_token_id,
                    )

                    # let us calculate the weight of the batch
                    batch_weight = labels.shape[0] / len(test_dataset)

                    metrics["test_loss"] += loss.item() * batch_weight

                    if self.hugging_face:

                        preds = self.model.module.generate(
                            input_,
                            attention_mask=input_mask,
                            max_length=labels.shape[1],
                            num_beams=num_beams,
                            top_k=top_k,
                            top_p=top_p,
                            temperature=temperature,
                        )

                    else:

                        preds = self.model.module.generate(
                            input_,
                            input_mask,
                            pad_token_id=test_dataset.tokenizer.pad_token_id,
                            max_len=labels.shape[1],
                        )

                    if self.evaluation is not None:

                        # calculate the metrics on the current predictions and labels
                        mets = self.evaluation.compute_metrics(
                            (
                                preds.cpu().detach().numpy(),
                                labels.cpu().detach().numpy(),
                            ),
                            accuracy=not self.hugging_face,
                            bleu=True,
                        )

                        for metric in mets:

                            if metric != "test_loss":

                                metrics[metric] = (
                                    metrics[metric] + mets[metric] * batch_weight
                                    if metric in metrics
                                    else mets[metric] * batch_weight
                                )

                    # let us recuperate the original sentences
                    results["original_sentences"].extend(test_dataset.decode(input_))

                    results["translations"].extend(test_dataset.decode(labels))

                    results["predictions"].extend(test_dataset.decode(preds))

                    pbar.update()

        # (an earlier normalization of the metrics by the number of batches is kept
        # disabled; the batch-weighted sums above already average over the dataset)

        return metrics, pd.DataFrame(results)
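
    # An evaluation sketch (illustrative only, not part of the released file;
    # `test_ds` is a hypothetical dataset exposing `tokenizer` and `decode`):
    #
    #   metrics, results_df = runner.evaluate(
    #       test_ds, loader_kwargs={"batch_size": 16}, num_beams=3
    #   )
    #   print(metrics)        # {"test_loss": ..., "bleu": ..., ...}
    #   results_df.head()     # original_sentences / translations / predictions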