wolof-translate 0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
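
The bulk of the release is the trainer module, wolof_translate/trainers/transformer_trainer.py, whose full diff follows. Once the wheel is installed, its ModelRunner class can be imported directly; a minimal sketch (only the import path is taken from the file list above, the rest is illustrative):

    # assumes the wheel is published on PyPI under this name
    # pip install wolof-translate==0.0.1
    from wolof_translate.trainers.transformer_trainer import ModelRunner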
wolof_translate/trainers/transformer_trainer.py
@@ -0,0 +1,760 @@
+ """New training class. It takes a model and hyperparameters as input.
+ We will create additional classes that support this training class.
+ """
+
+ from wolof_translate.utils.evaluation import TranslationEvaluation
+ from torch.utils.tensorboard import SummaryWriter
+ from torch.utils.data import Dataset, DataLoader
+ from tokenizers import Tokenizer
+ from tqdm import tqdm, trange
+ from torch.nn import utils
+ from torch import optim
+ from typing import Union
+ from torch import nn
+ import pandas as pd
+ import numpy as np
+ import string
+ import torch
+ import json
+ import copy
+ import os
+
+ # choose letters for random words
+ letters = string.ascii_lowercase
+
+
+ class PredictionError(Exception):
+     def __init__(self, error: Union[str, None] = None):
+
+         # pass the message to Exception rather than printing it
+         if error is not None:
+
+             super().__init__(error)
+
+         else:
+
+             super().__init__(
+                 "You cannot predict with this type of data! Provide a list of tensors, a list of numpy arrays, a numpy array or a torch tensor."
+             )
+
+
+ class LossError(Exception):
+     def __init__(self, error: Union[str, None] = None):
+
+         if error is not None:
+
+             super().__init__(error)
+
+         else:
+
+             super().__init__("A list of losses is provided for multiple outputs.")
+
+
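# Editor's sketch (not part of the package): with the message passed to
# Exception above, these classes behave like ordinary exceptions, e.g.
#
#     raise PredictionError()          # shows the default explanation
#     raise LossError("custom text")   # shows "custom text"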
+ class ModelRunner:
+     def __init__(
+         self,
+         model: nn.Module,
+         optimizer=optim.AdamW,
+         seed: Union[int, None] = None,
+         evaluation: Union[TranslationEvaluation, None] = None,
+         version: int = 1,
+     ):
+
+         # Initialize the seed of the random generator
+         self.seed = seed
+
+         # Initialize the version
+         self.version = version
+
+         # Recuperate the evaluation metric
+         self.evaluation = evaluation
+
+         # Initialize the generator
+         if self.seed:
+             torch.manual_seed(self.seed)
+
+         # The model to use for the different training runs
+         self.orig_model = model
+
+         # The optimizer to use for the model updates
+         self.orig_optimizer = optimizer
+
+         # Recuperate the device type
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         self.compilation = False
+
+     # ------------------------------ Training (training and compilation part) --------------------------
+
+     def batch_train(
+         self,
+         input_: torch.Tensor,
+         input_mask: torch.Tensor,
+         labels: torch.Tensor,
+         labels_mask: torch.Tensor,
+         pad_token_id: int = 3,
+     ):
+         if (
+             self.hugging_face
+         ):  # we use a Hugging Face text-to-text model (but only for fine-tuning)
+
+             # let us make a forward pass
+             outputs = self.model(
+                 input_ids=input_, attention_mask=input_mask, labels=labels
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs.logits, outputs.loss
+
+         else:
+
+             # let us make a forward pass
+             outputs = self.model(
+                 input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs["preds"], outputs["loss"]
+
+         # let us make a backward pass
+         loss.backward()
+
+         # clip the gradient values to a given interval if necessary
+         if self.clipping_value is not None:
+
+             utils.clip_grad_value_(
+                 self.model.parameters(), clip_value=self.clipping_value
+             )
+
+         # update the parameters
+         self.optimizer.step()
+
+         # adjust the learning rate at each iteration if necessary
+         if self.lr_scheduling is not None:
+
+             self.lr_scheduling.step()
+
+         # reset the gradients
+         self.optimizer.zero_grad()
+
+         return preds, loss
+
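# Editor's note (illustrative, not in the package): batch_train applies the
# usual update order -- loss.backward(), optional clipping, optimizer.step(),
# optional scheduler step, optimizer.zero_grad(). clip_grad_value_ clamps each
# gradient element to [-clip_value, clip_value]; to clip by global norm
# instead, one would swap in
#
#     utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)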
+     def batch_eval(
+         self,
+         input_: torch.Tensor,
+         input_mask: torch.Tensor,
+         labels: torch.Tensor,
+         labels_mask: torch.Tensor,
+         pad_token_id: int = 3,
+     ):
+
+         if (
+             self.hugging_face
+         ):  # we use a Hugging Face text-to-text model (but only for fine-tuning)
+
+             # let us make a forward pass
+             outputs = self.model(
+                 input_ids=input_, attention_mask=input_mask, labels=labels
+             )
+             # recuperate the predictions and the loss
+             preds, loss = outputs.logits, outputs.loss
+
+         else:
+
+             # let us make a forward pass
+             outputs = self.model(
+                 input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs["preds"], outputs["loss"]
+
+         return preds, loss
+
+     # We decided to add some parameters that were useful in the previous training classes
+     def compile(
+         self,
+         train_dataset: Dataset,
+         test_dataset: Union[Dataset, None] = None,
+         tokenizer: Union[Tokenizer, None] = None,
+         train_loader_kwargs: dict = {"batch_size": 16},
+         test_loader_kwargs: dict = {"batch_size": 16},
+         optimizer_kwargs: dict = {"lr": 1e-4, "weight_decay": 0.4},
+         model_kwargs: dict = {
+             "class_criterion": nn.CrossEntropyLoss(label_smoothing=0.1)
+         },
+         lr_scheduler_kwargs: dict = {"d_model": 512, "lr_warmup_step": 100},
+         lr_scheduler=None,
+         gradient_clipping_value: Union[float, torch.Tensor, None] = None,
+         predict_with_generate: bool = False,
+         logging_dir: Union[str, None] = None,
+         hugging_face: bool = False,
+         decoder_only: bool = False,
+     ):
+
+         if self.seed:
+             torch.manual_seed(self.seed)
+
+         # We must spread the keyword arguments since we do not know the model's parameters
+         if isinstance(
+             self.orig_model, nn.Module
+         ):  # if it is a model instance then no parameters are required
+
+             self.model = copy.deepcopy(self.orig_model).to(self.device)
+
+         else:  # otherwise we provide the parameters
+
+             self.model = copy.deepcopy(self.orig_model(**model_kwargs)).to(self.device)
+
+         # Initialize the optimizer parameters
+         self.optimizer = self.orig_optimizer(
+             self.model.parameters(), **optimizer_kwargs
+         )
+
+         # Add a learning rate scheduler if necessary
+         self.lr_scheduling = None
+
+         if lr_scheduler is not None and self.lr_scheduling is None:
+
+             self.lr_scheduling = lr_scheduler(self.optimizer, **lr_scheduler_kwargs)
+
+         # initialize the datasets and the loaders
+         self.train_set = train_dataset
+         self.test_set = test_dataset
+
+         self.train_loader = DataLoader(
+             train_dataset,
+             shuffle=True,
+             **train_loader_kwargs,
+         )
+
+         if test_dataset:
+             self.test_loader = DataLoader(
+                 test_dataset,
+                 shuffle=False,
+                 **test_loader_kwargs,
+             )
+
+         else:
+             self.test_loader = None
+
+         # Let us initialize the clipping value used for gradient clipping
+         self.clipping_value = gradient_clipping_value
+
+         # Other parameters for step tracking and metrics
+         self.compilation = True
+
+         self.current_epoch = None
+
+         self.best_score = None
+
+         self.best_epoch = self.current_epoch
+
+         # Recuperate some boolean attributes
+         self.predict_with_generate = predict_with_generate
+
+         # Recuperate the tokenizer
+         self.tokenizer = tokenizer
+
+         # Recuperate the logging directory
+         self.logging_dir = logging_dir
+
+         # Initialize the metrics
+         self.metrics = {}
+
+         # Initialize the attribute which indicates if the model is from Hugging Face
+         self.hugging_face = hugging_face
+
+         # Initialize the Hugging Face model type
+         self.decoder_only = decoder_only
+
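# Editor's sketch (not part of the package): a typical compile() call. The
# runner, dataset and tokenizer objects are hypothetical placeholders; the
# keyword values mirror the defaults of the signature above.
#
#     runner = ModelRunner(model, optimizer=optim.AdamW, seed=0)
#     runner.compile(
#         train_dataset,
#         test_dataset,
#         tokenizer=tokenizer,
#         train_loader_kwargs={"batch_size": 16},
#         optimizer_kwargs={"lr": 1e-4, "weight_decay": 0.4},
#         gradient_clipping_value=0.5,
#         logging_dir="logs",
#     )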
+     def train(
+         self,
+         epochs: int = 100,
+         auto_save: bool = False,
+         log_step: Union[int, None] = None,
+         saving_directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+         save_best: bool = True,
+         metric_for_best_model: str = "test_loss",
+         metric_objective: str = "minimize",
+     ):
+         """Train the model.
+
+         Args:
+             epochs (int, optional): The number of iterations. Defaults to 100.
+             auto_save (bool, optional): Whether to auto-save the model. Defaults to False.
+             log_step (int, optional): The number of iterations between two performance reports. Defaults to None.
+             saving_directory (str, optional): The directory where the model is saved. Defaults to "data/checkpoints/last_checkpoints".
+             file_name (str, optional): The name of the checkpoint file. Defaults to "checkpoints".
+             save_best (bool): A boolean indicating whether to save the best model. Defaults to True.
+             metric_for_best_model (str): The name of the metric used to choose the best model. Defaults to 'test_loss'.
+             metric_objective (str): Whether the metric must be maximized ('maximize') or minimized ('minimize'). Defaults to 'minimize'.
+
+         Raises:
+             Exception: Training requires the parameters to have been initialized beforehand.
+         """
+
+         # the file name cannot be "best_checkpoints"
+         assert file_name != "best_checkpoints"
+
+         ##################### Error handling ##################################################
+         if not self.compilation:
+             raise Exception(
+                 "You must initialize the datasets and parameters with the `compile` "
+                 "method. Make sure you do not forget any of them before training the model."
+             )
+
+         ##################### Initializations #################################################
+
+         if metric_objective in ["maximize", "minimize"]:
+
+             best_score = (
+                 float("-inf") if metric_objective == "maximize" else float("inf")
+             )
+
+         else:
+
+             raise ValueError(
+                 "The metric objective can only be 'maximize' or 'minimize'!"
+             )
+
+         if self.best_score is not None:
+
+             best_score = self.best_score
+
+         start_epoch = self.current_epoch if self.current_epoch is not None else 0
+
+         ##################### Training ########################################################
+
+         modes = ["train", "test"]
+
+         if self.test_loader is None:
+             modes = ["train"]
+
+         for epoch in tqdm(range(start_epoch, start_epoch + epochs)):
+
+             # Print the current learning rate
+             print(f"For epoch {epoch + 1}: ")
+
+             if self.lr_scheduling:
+                 print(f"{{Learning rate: {self.lr_scheduling.get_lr()}}}")
+
+             self.metrics = {}
+
+             for mode in modes:
+
+                 with torch.set_grad_enabled(mode == "train"):
+
+                     # Initialize the loss of the current mode
+                     self.metrics[f"{mode}_loss"] = 0
+
+                     # Let us initialize the predictions
+                     predictions_ = []
+
+                     # Let us initialize the labels
+                     labels_ = []
+
+                     if mode == "train":
+
+                         self.model.train()
+
+                         loader = list(iter(self.train_loader))
+
+                     else:
+
+                         self.model.eval()
+
+                         loader = list(iter(self.test_loader))
+
+                     with trange(
+                         len(loader), unit="batches", position=0, leave=True
+                     ) as pbar:
+
+                         for i in pbar:
+
+                             pbar.set_description(
+                                 f"{mode[0].upper() + mode[1:]} batch number {i + 1}"
+                             )
+
+                             data = loader[i]
+
+                             input_ = data[0].long().to(self.device)
+
+                             input_mask = data[1].to(self.device)
+
+                             labels = data[2].long().to(self.device)
+
+                             if self.hugging_face:
+
+                                 # (an earlier, now disabled variant concatenated the input
+                                 # with the labels and both attention masks when using a
+                                 # decoder-only model)
+
+                                 # mark padding positions so the loss ignores them
+                                 labels[labels == self.tokenizer.pad_token_id] = -100
+
+                             labels_mask = data[3].to(self.device)
+
+                             # Recuperate the padding token id (3 by default)
+                             pad_token_id = (
+                                 3
+                                 if self.tokenizer is None
+                                 else self.tokenizer.pad_token_id
+                             )
+
+                             preds, loss = (
+                                 self.batch_train(
+                                     input_,
+                                     input_mask,
+                                     labels,
+                                     labels_mask,
+                                     pad_token_id,
+                                 )
+                                 if mode == "train"
+                                 else self.batch_eval(
+                                     input_,
+                                     input_mask,
+                                     labels,
+                                     labels_mask,
+                                     pad_token_id,
+                                 )
+                             )
+
+                             self.metrics[f"{mode}_loss"] += loss.item()
+
+                             # add the predictions and the labels to their lists once determined
+                             if mode == "test":
+
+                                 if self.predict_with_generate:
+
+                                     if self.hugging_face:
+
+                                         preds = self.model.generate(
+                                             input_,
+                                             attention_mask=input_mask,
+                                             max_length=self.train_set.max_len,
+                                         )
+
+                                     else:
+
+                                         preds = self.model.generate(
+                                             input_,
+                                             input_mask,
+                                             pad_token_id=pad_token_id,
+                                         )
+
+                                 else:
+
+                                     if self.hugging_face:
+
+                                         preds = torch.argmax(preds, dim=-1)
+
+                                 predictions_.extend(preds.detach().cpu().tolist())
+
+                                 labels_.extend(labels.detach().cpu().tolist())
+
+                     if self.evaluation is not None and mode == "test":
+
+                         self.metrics.update(
+                             self.evaluation.compute_metrics(
+                                 (np.array(predictions_), np.array(labels_))
+                             )
+                         )
+
+             self.metrics["train_loss"] = self.metrics["train_loss"] / len(
+                 self.train_loader
+             )
+
+             if self.test_loader is not None:
+
+                 self.metrics["test_loss"] = self.metrics["test_loss"] / len(
+                     self.test_loader
+                 )
+
+             # Display the metrics
+             if log_step is not None and (epoch + 1) % log_step == 0:
+
+                 print(f"\nMetrics: {self.metrics}")
+
+             if self.logging_dir is not None:
+
+                 with SummaryWriter(
+                     os.path.join(self.logging_dir, f"version_{self.version}")
+                 ) as writer:
+
+                     for metric in self.metrics:
+
+                         writer.add_scalar(
+                             metric, self.metrics[metric], global_step=epoch
+                         )
+
+                     writer.add_scalar("global_step", epoch)
+
+             print("\n=============================\n")
+
+             ##################### Model saving ####################################################
+
+             # Save the model at the end of the current epoch
+             if auto_save:
+
+                 self.current_epoch = epoch + 1
+
+                 if save_best:
+
+                     # verify whether the current score is the best one so far
+                     if metric_objective == "maximize":
+
+                         last_score = best_score < self.metrics[metric_for_best_model]
+
+                     elif metric_objective == "minimize":
+
+                         last_score = best_score > self.metrics[metric_for_best_model]
+
+                     else:
+
+                         raise ValueError(
+                             "The metric objective can only be in ['maximize', 'minimize']!"
+                         )
+
+                     # recuperate the best score
+                     if last_score:
+
+                         best_score = self.metrics[metric_for_best_model]
+
+                         # current_epoch is already epoch + 1
+                         self.best_epoch = self.current_epoch
+
+                         self.best_score = best_score
+
+                         self.save(saving_directory, "best_checkpoints")
+
+                 self.save(saving_directory, file_name)
+
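# Editor's sketch (not part of the package): a train() call with best-model
# tracking; by default the "test_loss" metric is minimized. The runner is a
# hypothetical, already compiled instance.
#
#     runner.train(
#         epochs=50,
#         auto_save=True,
#         log_step=1,
#         save_best=True,
#         metric_for_best_model="test_loss",
#         metric_objective="minimize",
#     )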
+     # For this method we take inspiration from the save method of the DDPG agent (RL) that we created earlier
+     def save(
+         self,
+         directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+     ):
+
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+         file_path = os.path.join(directory, f"{file_name}.pth")
+
+         checkpoints = {
+             "model_state_dict": self.model.state_dict(),
+             "optimizer_state_dict": self.optimizer.state_dict(),
+             "current_epoch": self.current_epoch,
+             "metrics": self.metrics,
+             "best_score": self.best_score,
+             "best_epoch": self.best_epoch,
+             "lr_scheduler_state_dict": self.lr_scheduling.state_dict()
+             if self.lr_scheduling is not None
+             else None,
+         }
+
+         torch.save(checkpoints, file_path)
+
+         # update the metrics and the best score dict
+         self.metrics["current_epoch"] = (
+             self.current_epoch + 1
+             if self.current_epoch is not None
+             else self.current_epoch
+         )
+
+         best_score_dict = {"best_score": self.best_score, "best_epoch": self.best_epoch}
+
+         # save the metrics as a json file
+         metrics = json.dumps(
+             {"metrics": self.metrics, "best_performance": best_score_dict}, indent=4
+         )
+
+         with open(os.path.join(directory, f"{file_name}.json"), "w") as f:
+
+             f.write(metrics)
+
+     # The same goes for the load method
+     def load(
+         self,
+         directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+         load_best: bool = False,
+     ):
+
+         if load_best:
+             file_name = "best_checkpoints"
+
+         file_path = os.path.join(directory, f"{file_name}.pth")
+
+         if os.path.exists(file_path):
+
+             # map the checkpoint to the CPU when CUDA is not available
+             checkpoints = (
+                 torch.load(file_path)
+                 if self.device == torch.device("cuda")
+                 else torch.load(file_path, map_location="cpu")
+             )
+
+             self.model.load_state_dict(checkpoints["model_state_dict"])
+
+             self.optimizer.load_state_dict(checkpoints["optimizer_state_dict"])
+
+             self.current_epoch = checkpoints["current_epoch"]
+
+             self.best_score = checkpoints["best_score"]
+
+             self.best_epoch = checkpoints["best_epoch"]
+
+             if self.lr_scheduling is not None:
+
+                 self.lr_scheduling.load_state_dict(
+                     checkpoints["lr_scheduler_state_dict"]
+                 )
+
+         else:
+
+             raise OSError(
+                 f"The file {file_path} cannot be found. Check that the provided path is correct!"
+             )
+
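# Editor's sketch (not part of the package): checkpoints round-trip through
# save() and load(); load(load_best=True) reads the "best_checkpoints.pth"
# written by the auto-save branch of train().
#
#     runner.save("data/checkpoints/last_checkpoints", "checkpoints")
#     runner.load("data/checkpoints/last_checkpoints", load_best=True)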
+     def evaluate(self, test_dataset, batch_size: int = 16, loader_kwargs: dict = {}):
+
+         self.model.eval()
+
+         test_loader = list(
+             iter(
+                 DataLoader(
+                     test_dataset,
+                     batch_size,
+                     shuffle=False,
+                     **loader_kwargs,
+                 )
+             )
+         )
+
+         # Let us initialize the predictions
+         predictions_ = []
+
+         # Let us initialize the labels
+         labels_ = []
+
+         metrics = {"test_loss": 0.0}
+
+         results = {"original_sentences": [], "translations": [], "predictions": []}
+
+         with torch.no_grad():
+
+             with trange(
+                 len(test_loader), unit="batches", position=0, leave=True
+             ) as pbar:
+
+                 for i in pbar:
+
+                     pbar.set_description(f"Evaluation batch number {i + 1}")
+
+                     data = test_loader[i]
+
+                     input_ = data[0].long().to(self.device)
+
+                     input_mask = data[1].to(self.device)
+
+                     labels = data[2].long().to(self.device)
+
+                     if self.hugging_face:
+
+                         # mark padding positions so the loss ignores them
+                         labels[labels == test_dataset.tokenizer.pad_token_id] = -100
+
+                     labels_mask = data[3].to(self.device)
+
+                     preds, loss = self.batch_eval(
+                         input_,
+                         input_mask,
+                         labels,
+                         labels_mask,
+                         test_dataset.tokenizer.pad_token_id,
+                     )
+
+                     metrics["test_loss"] += loss.item()
+
+                     if self.hugging_face:
+
+                         preds = self.model.generate(
+                             input_,
+                             attention_mask=input_mask,
+                             max_length=self.train_set.max_len,
+                         )
+
+                     else:
+
+                         preds = self.model.generate(
+                             input_,
+                             input_mask,
+                             pad_token_id=test_dataset.tokenizer.pad_token_id,
+                         )
+
+                     labels_.extend(labels.detach().cpu().tolist())
+
+                     predictions_.extend(preds.detach().cpu().tolist())
+
+                     # let us recuperate the original sentences
+                     results["original_sentences"].extend(
+                         test_dataset.tokenizer.batch_decode(
+                             input_, skip_special_tokens=True
+                         )
+                     )
+
+                     results["translations"].extend(
+                         test_dataset.tokenizer.batch_decode(
+                             labels, skip_special_tokens=True
+                         )
+                     )
+
+                     results["predictions"].extend(test_dataset.decode(preds))
+
+         if self.evaluation is not None:
+
+             metrics.update(
+                 self.evaluation.compute_metrics(
+                     (np.array(predictions_), np.array(labels_))
+                 )
+             )
+
+         metrics["test_loss"] = metrics["test_loss"] / len(test_loader)
+
+         return metrics, pd.DataFrame(results)
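
A sketch of how the evaluation entry point can be driven once a runner has been compiled and trained (an illustration; runner and test_dataset are hypothetical placeholders, not names from the package):

    metrics, results = runner.evaluate(test_dataset, batch_size=16)
    print(metrics["test_loss"])   # mean test loss over the batches
    print(results.head())         # columns: original_sentences, translations, predictions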