wolof-translate 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
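Note: the single 882-line hunk below matches the line count of wolof_translate/trainers/transformer_trainer_custom.py in the listing above, so it presumably shows that file's content. Assuming that mapping, the trainer class it defines would be imported roughly as follows (illustration only; the import path is inferred from the listing, not verified against the installed package):

    # pip install wolof-translate==0.0.1
    from wolof_translate.trainers.transformer_trainer_custom import ModelRunner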
@@ -0,0 +1,882 @@
+ """New training class. It takes a model and hyperparameters as input.
+ We will create additional classes that will support this training class.
+ """
+
+ from wolof_translate.utils.evaluation import TranslationEvaluation
+ from torch.utils.data.distributed import DistributedSampler
+ from torch.utils.tensorboard import SummaryWriter
+ from torch.utils.data import Dataset, DataLoader
+ from tokenizers import Tokenizer
+ import torch.distributed as dist
+ from tqdm import tqdm, trange
+ from torch.nn import utils
+ from torch import optim
+ from typing import *
+ from torch import nn
+ import pandas as pd
+ import numpy as np
+ import string
+ import torch
+ import json
+ import copy
+ import os
+
+ # choose letters for random words
+ letters = string.ascii_lowercase
+
+
+ class PredictionError(Exception):
+     def __init__(self, error: Union[str, None] = None):
+
+         if not error is None:
+
+             print(error)
+
+         else:
+
+             print(
+                 "You cannot predict with this type of data! Provide a list of tensors, a list of numpy arrays, a numpy array or a torch tensor."
+             )
+
+
+ class LossError(Exception):
+     def __init__(self, error: Union[str, None] = None):
+
+         if not error is None:
+
+             print(error)
+
+         else:
+
+             print("A list of losses is provided for multiple outputs.")
+
+
+ class ModelRunner:
+     def __init__(
+         self,
+         model: nn.Module,
+         optimizer=optim.AdamW,
+         seed: Union[int, None] = None,
+         evaluation: Union[TranslationEvaluation, None] = None,
+         version: int = 1,
+     ):
+
+         # Initialize the generator seed
+         self.seed = seed
+
+         # Initialize the version
+         self.version = version
+
+         # Recuperate the evaluation metric
+         self.evaluation = evaluation
+
+         # Initialize the generator
+         if self.seed:
+             torch.manual_seed(self.seed)
+
+         # The model to use for the different training runs
+         self.orig_model = model
+
+         # The optimizer to use for the model updates
+         self.orig_optimizer = optimizer
+
+         # Recuperate the device type
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         self.compilation = False
+
+     # ------------------------------ Training stuff (training and compilation part) --------------------------
+
+     def _average_gradients(
+         self,
+     ):  # link: https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/mnist.py
+         # average the gradients
+         size = float(self.dist.get_world_size())
+         for param in self.model.parameters():
+             self.dist.all_reduce(param.grad.data, op=self.dist.reduce_op.SUM)
+             param.grad.data /= size
+
+     def batch_train(
+         self,
+         input_: torch.Tensor,
+         input_mask: torch.Tensor,
+         labels: torch.Tensor,
+         labels_mask: torch.Tensor,
+         pad_token_id: int = 3,
+     ):
+         if (
+             self.hugging_face
+         ):  # we are using a Hugging Face text-to-text model (but only for fine-tuning)
+
+             # concatenate the input and the label
+
+             # forward pass
+             outputs = self.model(
+                 input_ids=input_, attention_mask=input_mask, labels=labels
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs.logits, outputs.loss
+
+         else:
+
+             # forward pass
+             outputs = self.model(
+                 input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs["preds"], outputs["loss"]
+
+         # backward pass
+         loss.backward()
+
+         # average the gradients if the training is distributed over multi machine cpu
+         if self.distributed and not self.device == torch.device("cuda"):
+
+             self._average_gradients()
+
+         # clip the gradient values to a given interval if necessary
+         if not self.clipping_value is None:
+
+             utils.clip_grad_value_(
+                 self.model.parameters(), clip_value=self.clipping_value
+             )
+
+         # update the parameters
+         self.optimizer.step()
+
+         # reduce the learning rate at each iteration if necessary
+         if not self.lr_scheduling is None:
+
+             self.lr_scheduling.step()
+
+         # reset the gradients
+         self.optimizer.zero_grad()
+
+         return preds, loss
+
+     def batch_eval(
+         self,
+         input_: torch.Tensor,
+         input_mask: torch.Tensor,
+         labels: torch.Tensor,
+         labels_mask: torch.Tensor,
+         pad_token_id: int = 3,
+     ):
+
+         if (
+             self.hugging_face
+         ):  # we are using a Hugging Face text-to-text model (but only for fine-tuning)
+
+             # forward pass
+             outputs = self.model(
+                 input_ids=input_, attention_mask=input_mask, labels=labels
+             )
+             # recuperate the predictions and the loss
+             preds, loss = outputs.logits, outputs.loss
+
+         else:
+
+             # forward pass
+             outputs = self.model(
+                 input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
+             )
+
+             # recuperate the predictions and the loss
+             preds, loss = outputs["preds"], outputs["loss"]
+
+         return preds, loss
+
+     # We decided to add some parameters that were useful in the previous training classes
+     def compile(
+         self,
+         train_dataset: Dataset,
+         test_dataset: Union[Dataset, None] = None,
+         tokenizer: Union[Tokenizer, None] = None,
+         train_loader_kwargs: dict = {"batch_size": 16, "shuffle": True},
+         test_loader_kwargs: dict = {"batch_size": 16, "shuffle": False},
+         optimizer_kwargs: dict = {"lr": 1e-4, "weight_decay": 0.4},
+         model_kwargs: dict = {
+             "class_criterion": nn.CrossEntropyLoss(label_smoothing=0.1)
+         },
+         lr_scheduler_kwargs: dict = {"d_model": 512, "lr_warmup_step": 100},
+         lr_scheduler=None,
+         gradient_clipping_value: Union[float, torch.Tensor, None] = None,
+         predict_with_generate: bool = False,
+         logging_dir: Union[str, None] = None,
+         hugging_face: bool = False,
+         is_distributed: bool = False,
+         dist=None,
+     ):
+
+         if self.seed:
+             torch.manual_seed(self.seed)
+
+         # We have to unpack keyword arguments ('spread') since we do not know the model's parameters
+         if isinstance(
+             self.orig_model, nn.Module
+         ):  # if it is a model instance then no parameters are required
+
+             self.model = copy.deepcopy(self.orig_model).to(self.device)
+
+         else:  # otherwise we provide the parameters
+
+             self.model = copy.deepcopy(self.orig_model(**model_kwargs)).to(self.device)
+
+         # add distribution if available
+         if is_distributed and self.device == torch.device("cuda"):
+
+             self.model = torch.nn.parallel.DistributedDataParallel(self.model)
+
+         else:
+
+             self.model = torch.nn.parallel.DataParallel(self.model)
+
+         # Initialize the optimizer parameters
+         self.optimizer = self.orig_optimizer(
+             self.model.parameters(), **optimizer_kwargs
+         )
+
+         # Add a learning rate scheduler if necessary
+         self.lr_scheduling = None
+
+         if not lr_scheduler is None and self.lr_scheduling is None:
+
+             self.lr_scheduling = lr_scheduler(self.optimizer, **lr_scheduler_kwargs)
+
+         # Initialize the datasets and the loaders
+         self.train_set = train_dataset
+         self.test_set = test_dataset
+
+         # If the data is distributed over multiple gpus we will parallelize it
+         if is_distributed:
+
+             # We verify if the train loader kwargs already contain a sampler and
+             # if it is the case add it to the parallel sampler object
+             sampler = None
+             if "batch_sampler" in train_loader_kwargs:
+
+                 sampler = "batch_sampler"
+
+             elif "sampler" in train_loader_kwargs:
+
+                 sampler = "sampler"
+
+             if not sampler is None:
+
+                 distributed_sampler = DistributedSampler(train_loader_kwargs[sampler])
+
+                 train_loader_kwargs[sampler] = distributed_sampler
+
+             else:
+
+                 distributed_sampler = DistributedSampler(train_dataset)
+
+                 train_loader_kwargs["sampler"] = distributed_sampler
+
+         self.train_loader = DataLoader(
+             train_dataset,
+             **train_loader_kwargs,
+         )
+
+         if test_dataset:
+             self.test_loader = DataLoader(
+                 test_dataset,
+                 **test_loader_kwargs,
+             )
+
+         else:
+             self.test_loader = None
+
+         # Let us initialize the clipping value to make gradient clipping
+         self.clipping_value = gradient_clipping_value
+
+         # Other parameters for step tracking and metrics
+         self.compilation = True
+
+         self.current_epoch = None
+
+         self.best_score = None
+
+         self.best_epoch = self.current_epoch
+
+         # Recuperate some boolean attributes
+         self.predict_with_generate = predict_with_generate
+
+         # Recuperate tokenizer
+         self.tokenizer = tokenizer
+
+         # Recuperate the logging directory
+         self.logging_dir = logging_dir
+
+         # Initialize the metrics
+         self.metrics = {}
+
+         # Initialize the attribute which indicates if the model is from huggingface
+         self.hugging_face = hugging_face
+
+         # Initialize the torch distributed module and distribution option
+         self.distributed = is_distributed
+         self.dist = dist
+
+     def train(
+         self,
+         epochs: int = 100,
+         auto_save: bool = False,
+         log_step: Union[int, None] = None,
+         saving_directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+         save_best: bool = True,
+         metric_for_best_model: str = "test_loss",
+         metric_objective: str = "minimize",
+         add_bleu_only: bool = True,
+     ):
+         """Train the model
+
+         Args:
+             epochs (int, optional): The number of epochs. Defaults to 100.
+             auto_save (bool, optional): Automatically save the model. Defaults to False.
+             log_step (int, optional): The number of epochs between two performance reports. Defaults to None.
+             saving_directory (str, optional): The directory where the model is saved. Defaults to "data/checkpoints/last_checkpoints".
+             file_name (str, optional): The name of the checkpoint file. Defaults to "checkpoints".
+             save_best (bool): A boolean indicating whether to save the best model. Defaults to True.
+             metric_for_best_model (str): The name of the metric used to select the best model. Defaults to 'test_loss'.
+             metric_objective (str): Indicates whether the metric must be maximized ('maximize') or minimized ('minimize'). Defaults to 'minimize'.
+             add_bleu_only (bool): Indicates whether to add only the BLEU metric. If so, the evaluation is performed at the end of the
+                 epochs. Defaults to True.
+
+         Raises:
+             Exception: Training requires the parameters to have been initialized beforehand
+         """
+
+         # the file name cannot be "best_checkpoints"
+         assert file_name != "best_checkpoints"
+
+         ##################### Error Handling ##################################################
+         if not self.compilation:
+             raise Exception(
+                 "You must initialize the datasets and parameters with the `compile` method. "
+                 "Make sure you don't forget any of them before training the model."
+             )
+
+         ##################### Initializations #################################################
+
+         if metric_objective in ["maximize", "minimize"]:
+
+             best_score = (
+                 float("-inf") if metric_objective == "maximize" else float("inf")
+             )
+
+         else:
+
+             raise ValueError(
+                 "The metric objective can only be 'maximize' or 'minimize'!"
+             )
+
+         if not self.best_score is None:
+
+             best_score = self.best_score
+
+         start_epoch = self.current_epoch if not self.current_epoch is None else 0
+
+         ##################### Training ########################################################
+
+         modes = ["train", "test"]
+
+         if self.test_loader is None:
+             modes = ["train"]
+
+         for epoch in tqdm(range(start_epoch, start_epoch + epochs)):
+
+             # Print the actual learning rate
+             print(f"For epoch {epoch + 1}: ")
+
+             if self.lr_scheduling:
+                 print(f"{{Learning rate: {self.lr_scheduling.get_lr()}}}")
+
+             self.metrics = {}
+
+             i = {}
+
+             for mode in modes:
+
+                 if mode == "test" and log_step is not None and (epoch + 1) % log_step != 0:
+
+                     continue
+
+                 with torch.set_grad_enabled(mode == "train"):
+
+                     # Initialize the loss of the current mode
+                     self.metrics[f"{mode}_loss"] = 0
+
+                     # Let us initialize the predictions
+                     predictions_ = []
+
+                     # Let us initialize the labels
+                     labels_ = []
+
+                     if mode == "train":
+
+                         self.model.train()
+
+                         # loader = list(iter(self.train_loader))
+                         loader = self.train_loader
+
+                         dataset = self.train_set
+
+                     else:
+
+                         self.model.eval()
+
+                         # loader = list(iter(self.test_loader))
+                         loader = self.test_loader
+
+                         dataset = self.test_set
+
+                     # progress_bar = trange(len(loader))
+
+                     with trange(
+                         len(loader), unit="batches", position=0, leave=True
+                     ) as pbar:
+                         # i[mode] = 0
+                         for i, data in enumerate(loader, 1):
+
+                             # i[mode] += 1
+                             pbar.set_description(
+                                 f"{mode[0].upper() + mode[1:]} batch number {i}"
+                             )
+
+                             # data = loader[i]
+
+                             input_ = data[0].to(self.device)
+
+                             # let us initialize a fake input
+                             # input__ = None
+
+                             input_mask = data[1].to(self.device, dtype=torch.bool)
+
+                             # let us initialize a fake input mask
+                             # input_mask_ = None
+
+                             labels = data[2].to(self.device)
+
+                             if self.hugging_face:
+
+                                 # concatenate the input with the labels and the two attention masks if we only use a decoder
+                                 # if self.decoder_only:
+
+                                 #     # let us modify the fake input to the first sentence
+                                 #     input__ = copy.deepcopy(input_)
+
+                                 #     input_ = torch.concat((input_, labels), dim=1)
+
+                                 #     # the new labels are equal to the inputs
+                                 #     labels = copy.deepcopy(input_)
+
+                                 #     # let us modify the fake input mask to mask of the first sentence
+                                 #     input_mask_ = copy.deepcopy(input_mask)
+
+                                 #     input_mask = torch.concat((input_mask, data[3].to(self.device)), dim=1)
+
+                                 labels[labels == self.tokenizer.pad_token_id] = -100
+
+                             labels_mask = data[3].to(self.device, dtype=torch.bool)
+
+                             # Recuperate the padding token id (default = 3)
+                             pad_token_id = (
+                                 3
+                                 if self.tokenizer is None
+                                 else self.tokenizer.pad_token_id
+                             )
+
+                             preds, loss = (
+                                 self.batch_train(
+                                     input_,
+                                     input_mask,
+                                     labels,
+                                     labels_mask,
+                                     pad_token_id,
+                                 )
+                                 if mode == "train"
+                                 else self.batch_eval(
+                                     input_,
+                                     input_mask,
+                                     labels,
+                                     labels_mask,
+                                     pad_token_id,
+                                 )
+                             )
+
+                             # let us calculate the weight of the batch
+                             batch_weight = labels.shape[0] / len(dataset)
+
+                             self.metrics[f"{mode}_loss"] += loss.item() * batch_weight
+
+                             # let us add the predictions and labels to the lists of predictions and labels once they are determined
+                             if mode == "test":
+
+                                 if self.predict_with_generate:
+
+                                     if self.hugging_face:
+
+                                         # preds = self.model.generate(input_ if not self.decoder_only else input__,
+                                         #     attention_mask = input_mask if not self.decoder_only else input_mask_,
+                                         #     max_new_tokens = self.train_set.max_len, pad_token_id = self.test_set.tokenizer.eos_token_id)
+                                         preds = self.model.module.generate(
+                                             input_,
+                                             attention_mask=input_mask,
+                                             max_length=labels.shape[1],
+                                         )
+
+                                         print(preds.shape[1])
+                                         print(preds)
+
+                                         print(labels.shape[1])
+
+                                         print(i)
+
+                                     else:
+
+                                         preds = self.model.module.generate(
+                                             input_,
+                                             input_mask,
+                                             pad_token_id=pad_token_id,
+                                             max_len=labels.shape[1],
+                                         )
+
+                                 else:
+
+                                     if self.hugging_face:
+
+                                         preds = torch.argmax(preds, dim=-1)
+
+                                 # if add_bleu_only:
+
+                                 predictions_.extend(preds.detach().cpu().tolist())
+
+                                 labels_.extend(labels.detach().cpu().tolist())
+
+                                 # else:
+
+                                 #     if not self.evaluation is None and mode == 'test':
+
+                                 #         # calculate the metrics on the current predictions and labels
+                                 #         metrics = self.evaluation.compute_metrics((preds.cpu().detach().numpy(), labels.cpu().detach().numpy()), bleu = True, accuracy = True)
+
+                                 #         for metric in metrics:
+
+                                 #             if metric != f'{mode}_loss':
+
+                                 #                 self.metrics[metric] = self.metrics[metric] + metrics[metric] * batch_weight\
+                                 #                     if metric in self.metrics else metrics[metric] * batch_weight
+
+                             pbar.update()
+
+                     if not self.evaluation is None and mode == "test":
+
+                         self.metrics.update(
+                             self.evaluation.compute_metrics(
+                                 (np.array(predictions_), np.array(labels_))
+                             )
+                         )
+
+                         self.metrics["test_loss"] = self.metrics["test_loss"] / i
+
+                     elif mode == "train":
+
+                         self.metrics["train_loss"] = self.metrics["train_loss"] / i
+
+                     # else:
+
+                     #     for metric in self.metrics:
+
+                     #         if metric != 'train_loss':
+
+                     #             self.metrics[metric] = self.metrics[metric] / i['test']
+                     #             self.metrics[metric] = self.metrics[metric] / len(loader)
+
+                     # elif not self.test_loader is None:
+
+                     #     self.metrics["test_loss"] = self.metrics["test_loss"] / i['test']
+                     #     self.metrics["test_loss"] = self.metrics["test_loss"] / len(loader)
+
+                     #     self.metrics["train_loss"] = self.metrics["train_loss"] / i['train']
+                     #     self.metrics["train_loss"] = self.metrics["train_loss"] / len(loader)
+
+                     #     for metric in self.metrics:
+
+                     #         if metric != 'train_loss':
+
+                     #             self.metrics[metric] = self.metrics[metric] / len(self.test_loader)
+
+             # Display the metrics
+             if not log_step is None and (epoch + 1) % log_step == 0:
+
+                 print(f"\nMetrics: {self.metrics}")
+
+                 if not self.logging_dir is None:
+
+                     with SummaryWriter(
+                         os.path.join(self.logging_dir, f"version_{self.version}")
+                     ) as writer:
+
+                         for metric in self.metrics:
+
+                             writer.add_scalar(
+                                 metric, self.metrics[metric], global_step=epoch
+                             )
+
+                         writer.add_scalar("global_step", epoch)
+
+             print("\n=============================\n")
+
+             ##################### Model saving #########################################################
+
+             # Save the model at the end of the current epoch
+             if auto_save and not log_step is None and (epoch + 1) % log_step == 0:
+
+                 self.current_epoch = epoch + 1
+
+                 if save_best:
+
+                     # verify if the current score is the best and recuperate it if so
+                     if metric_objective == "maximize":
+
+                         last_score = best_score < self.metrics[metric_for_best_model]
+
+                     elif metric_objective == "minimize":
+
+                         last_score = best_score > self.metrics[metric_for_best_model]
+
+                     else:
+
+                         raise ValueError(
+                             "The metric objective can only be in ['maximize', 'minimize']!"
+                         )
+
+                     # recuperate the best score
+                     if last_score:
+
+                         best_score = self.metrics[metric_for_best_model]
+
+                         self.best_epoch = self.current_epoch + 1
+
+                         self.best_score = best_score
+
+                         self.save(saving_directory, "best_checkpoints")
+
+                 self.save(saving_directory, file_name)
+
+     # For this method we take inspiration from the save method of the DDPG (RL) agent that we created earlier
+     def save(
+         self,
+         directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+     ):
+
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+         file_path = os.path.join(directory, f"{file_name}.pth")
+
+         checkpoints = {
+             "model_state_dict": self.model.state_dict(),
+             "optimizer_state_dict": self.optimizer.state_dict(),
+             "current_epoch": self.current_epoch,
+             "metrics": self.metrics,
+             "best_score": self.best_score,
+             "best_epoch": self.best_epoch,
+             "lr_scheduler_state_dict": self.lr_scheduling.state_dict()
+             if not self.lr_scheduling is None
+             else None,
+         }
+
+         torch.save(checkpoints, file_path)
+
+         # update metrics and the best score dict
+         self.metrics["current_epoch"] = (
+             self.current_epoch + 1
+             if not self.current_epoch is None
+             else self.current_epoch
+         )
+
+         best_score_dict = {"best_score": self.best_score, "best_epoch": self.best_epoch}
+
+         # save the metrics as a json file
+         metrics = json.dumps(
+             {"metrics": self.metrics, "best_performance": best_score_dict}, indent=4
+         )
+
+         with open(os.path.join(directory, f"{file_name}.json"), "w") as f:
+
+             f.write(metrics)
+
+     # The same goes for the load method
+     def load(
+         self,
+         directory: str = "data/checkpoints/last_checkpoints",
+         file_name: str = "checkpoints",
+         load_best: bool = False,
+     ):
+
+         if load_best:
+             file_name = "best_checkpoints"
+
+         file_path = os.path.join(directory, f"{file_name}.pth")
+
+         if os.path.exists(file_path):
+
+             checkpoints = (
+                 torch.load(file_path)
+                 if self.device == torch.device("cuda")
+                 else torch.load(file_path, map_location="cpu")
+             )
+
+             self.model.load_state_dict(checkpoints["model_state_dict"])
+
+             self.optimizer.load_state_dict(checkpoints["optimizer_state_dict"])
+
+             self.current_epoch = checkpoints["current_epoch"]
+
+             self.best_score = checkpoints["best_score"]
+
+             self.best_epoch = checkpoints["best_epoch"]
+
+             if not self.lr_scheduling is None:
+
+                 self.lr_scheduling.load_state_dict(
+                     checkpoints["lr_scheduler_state_dict"]
+                 )
+
+         else:
+
+             raise OSError(
+                 f"The file {file_path} cannot be found. Check that the provided path is correct!"
+             )
+
+     def evaluate(self, test_dataset, loader_kwargs: dict = {}):
+
+         self.model.eval()
+
+         test_loader = DataLoader(
+             test_dataset,
+             **loader_kwargs,
+         )
+
+         # Let us initialize the predictions
+         predictions_ = []
+
+         # Let us initialize the labels
+         labels_ = []
+
+         metrics = {"test_loss": 0.0}
+
+         results = {"original_sentences": [], "translations": [], "predictions": []}
+
+         # progress_bar = trange(len(test_loader))
+
+         with torch.no_grad():
+
+             # i = 0
+             # for data in test_loader:
+             with trange(
+                 len(test_loader), unit="batches", position=0, leave=True
+             ) as pbar:
+                 # for i in tqdm(range(len(test_loader))):
+                 for i, data in enumerate(test_loader, 1):
+                     # i += 1
+                     pbar.set_description(f"Evaluation batch number {i}")
+
+                     # data = test_loader[i]
+
+                     input_ = data[0].long().to(self.device)
+
+                     input_mask = data[1].to(self.device)
+
+                     labels = data[2].long().to(self.device)
+
+                     if self.hugging_face:
+
+                         # concatenate the input with the labels and the two attention masks if we only use a decoder
+                         # if self.decoder_only:
+
+                         #     labels = torch.concat((input_, labels))
+
+                         labels[labels == test_dataset.tokenizer.pad_token_id] = -100
+
+                     labels_mask = data[3].to(self.device)
+
+                     preds, loss = self.batch_eval(
+                         input_,
+                         input_mask,
+                         labels,
+                         labels_mask,
+                         test_dataset.tokenizer.pad_token_id,
+                     )
+
+                     # let us calculate the weight of the batch
+                     batch_weight = labels.shape[0] / len(test_dataset)
+
+                     metrics["test_loss"] += loss.item() * batch_weight
+
+                     if self.hugging_face:
+
+                         # preds = self.model.generate(input_, attention_mask = input_mask, max_new_tokens = self.train_set.max_len * 2, pad_token_id = test_dataset.tokenizer.eos_token_id)
+
+                         preds = self.model.module.generate(
+                             input_,
+                             attention_mask=input_mask,
+                             max_length=labels.shape[1],
+                         )
+
+                     else:
+
+                         preds = self.model.module.generate(
+                             input_,
+                             input_mask,
+                             pad_token_id=test_dataset.tokenizer.pad_token_id,
+                             max_len=labels.shape[1],
+                         )
+
+                     # if not self.evaluation is None:
+
+                     #     # calculate the metrics on the current predictions and labels
+                     #     mets = self.evaluation.compute_metrics((preds.cpu().detach().numpy(), labels.cpu().detach().numpy()),
+                     #         accuracy = not self.hugging_face, bleu = True)
+
+                     #     for metric in mets:
+
+                     #         if metric != 'test_loss':
+
+                     #             metrics[metric] = metrics[metric] + mets[metric] * batch_weight\
+                     #                 if metric in metrics else mets[metric] * batch_weight
+
+                     labels_.extend(labels.detach().cpu().tolist())
+
+                     predictions_.extend(preds.detach().cpu().tolist())
+
+                     # let us recuperate the original sentences
+                     results["original_sentences"].extend(test_dataset.decode(input_))
+
+                     results["translations"].extend(test_dataset.decode(labels))
+
+                     results["predictions"].extend(test_dataset.decode(preds))
+
+                     pbar.update()
+
+         if not self.evaluation is None:
+
+             # calculate the metrics on the current predictions and labels
+             metrics.update(
+                 self.evaluation.compute_metrics(
+                     (np.array(predictions_), np.array(labels_))
+                 )
+             )
+
+         metrics["test_loss"] = metrics["test_loss"] / len(test_loader)
+
+         return metrics, pd.DataFrame(results)
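
For orientation, here is a minimal usage sketch of the ModelRunner trainer defined above. It is not part of the package: the model, dataset and tokenizer objects are placeholders, and the sketch assumes the non-Hugging Face path, i.e. a custom model whose forward returns a dict with "preds" and "loss", datasets that yield (input_ids, input_mask, labels, labels_mask) batches, and, for evaluate(), a dataset that also exposes a tokenizer (with pad_token_id) and a decode() method, as the code above expects. The import path is likewise an assumption taken from the file listing.

    from torch import optim

    # assumed import path for the trainer shown above
    from wolof_translate.trainers.transformer_trainer_custom import ModelRunner

    # my_model, train_data, test_data and my_tokenizer are hypothetical objects
    # satisfying the assumptions stated in the text above
    runner = ModelRunner(my_model, optimizer=optim.AdamW, seed=42, version=1)

    # build the loaders, the optimizer and (optionally) a scheduler
    runner.compile(
        train_dataset=train_data,
        test_dataset=test_data,
        tokenizer=my_tokenizer,
        train_loader_kwargs={"batch_size": 16, "shuffle": True},
        test_loader_kwargs={"batch_size": 16, "shuffle": False},
        optimizer_kwargs={"lr": 1e-4, "weight_decay": 0.4},
        gradient_clipping_value=1.0,
        logging_dir="data/logs",
    )

    # train, evaluating and checkpointing every epoch
    runner.train(
        epochs=10,
        auto_save=True,
        log_step=1,
        saving_directory="data/checkpoints/last_checkpoints",
        metric_for_best_model="test_loss",
        metric_objective="minimize",
    )

    # reload the best checkpoint and evaluate on a held-out set
    runner.load("data/checkpoints/last_checkpoints", load_best=True)
    metrics, results_df = runner.evaluate(test_data, loader_kwargs={"batch_size": 16})

Note that train() only writes checkpoints when auto_save is True and log_step divides the current epoch, and that the best model goes to best_checkpoints.pth in the same directory.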