wolof-translate 0.0.1 (wolof_translate-0.0.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1042 @@
1
+ """Nouvelle classe d'entraînement. On la fournit un modèle et des hyperparamètres en entrée.
2
+ Nous allons créer des classes supplémentaire qui vont supporter la classe d'entraînement
3
+ """
4
+
5
+ # from wolof_translate.utils.evaluation import TranslationEvaluation
6
+ from wolof_translate.utils.evaluate_custom import TranslationEvaluation
7
+ from torch.utils.data.distributed import DistributedSampler
8
+ from torch.utils.tensorboard import SummaryWriter
9
+ from torch.utils.data import Dataset, DataLoader
10
+ from tokenizers import Tokenizer
11
+ import torch.distributed as dist
12
+ from tqdm import tqdm, trange
13
+ from torch.nn import utils
14
+ from torch import optim
15
+ from typing import *
16
+ from torch import nn
17
+ import pandas as pd
18
+ import numpy as np
19
+ import string
20
+ import torch
21
+ import time
22
+ import json
23
+ import copy
24
+ import os
25
+
26
+ # choose letters for random words
27
+ letters = string.ascii_lowercase
28
+
29
+
30
+ class PredictionError(Exception):
31
+ def __init__(self, error: Union[str, None] = None):
32
+
33
+ if not error is None:
34
+
35
+ print(error)
36
+
37
+ else:
38
+
39
+ print(
40
+ "You cannot with this type of data! Provide a list of tensors, a list of numpy arrays, a numpy array or a torch tensor."
41
+ )
42
+
43
+
44
+ class LossError(Exception):
45
+ def __init__(self, error: Union[str, None] = None):
46
+
47
+ if not error is None:
48
+
49
+ print(error)
50
+
51
+ else:
52
+
53
+ print("A list of losses is provided for multiple outputs.")
54
+
55
+
56
+ class ModelRunner:
57
+ def __init__(
58
+ self,
59
+ model: nn.Module,
60
+ optimizer=optim.AdamW,
61
+ seed: Union[int, None] = None,
62
+ evaluation: Union[TranslationEvaluation, None] = None,
63
+ version: int = 1,
64
+ ):
65
+
66
+ # Initialize the generator seed
67
+ self.seed = seed
68
+
69
+ # Initialize the version
70
+ self.version = version
71
+
72
+ # Recuperate the evaluation metric
73
+ self.evaluation = evaluation
74
+
75
+ # Initialize the generator
76
+ if self.seed:
77
+ torch.manual_seed(self.seed)
78
+
79
+ # The model to use for the different training runs
80
+ self.orig_model = model
81
+
82
+ # The optimizer to use for the model parameter updates
83
+ self.orig_optimizer = optimizer
84
+
85
+ # Recuperate the 'device' type
86
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
87
+
88
+ self.compilation = False
89
+
90
+ # ------------------------------ Training stuff (training and compilation part) --------------------------
91
+
92
+ def _average_gradients(
93
+ self,
94
+ ): # link: https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/mnist.py
95
+ # average the gradients
96
+ size = float(self.dist.get_world_size())
97
+ for param in self.model.parameters():
98
+ self.dist.all_reduce(param.grad.data, op=self.dist.reduce_op.SUM)
99
+ param.grad.data /= size
100
+
101
+ def batch_train(
102
+ self,
103
+ input_: Union[torch.Tensor, None] = None,
104
+ input_mask: Union[torch.Tensor, None] = None,
105
+ labels: Union[torch.Tensor, None] = None,
106
+ labels_mask: Union[torch.Tensor, None] = None,
107
+ pad_token_id: int = 3,
108
+ data: Union[dict, None] = None,
109
+ ):
110
+
111
+ assert (
112
+ not input_ is None and not input_mask is None and not labels is None
113
+ ) or (not data is None and self.hugging_face)
114
+
115
+ if (
116
+ self.hugging_face
117
+ ): # We will use a text-to-text model from Hugging Face (but only for fine-tuning)
118
+
119
+ # concatenate the input and the label
120
+
121
+ # perform a forward pass
122
+ if data is None:
123
+
124
+ outputs = self.model(
125
+ input_ids=input_, attention_mask=input_mask, labels=labels
126
+ )
127
+
128
+ else:
129
+
130
+ outputs = self.model(**data)
131
+
132
+ # recuperate the predictions and the loss
133
+ preds, loss = outputs.logits, outputs.loss
134
+
135
+ else:
136
+
137
+ # perform a forward pass
138
+ outputs = self.model(
139
+ input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
140
+ )
141
+
142
+ # recuperate the predictions and the loss
143
+ preds, loss = outputs["preds"], outputs["loss"]
144
+
145
+ # perform a backward pass
146
+ loss.mean().backward()
147
+
148
+ # average the gradients if the training is distributed over multi machine cpu
149
+ if self.distributed and not self.device == torch.device("cuda"):
150
+
151
+ self._average_gradients()
152
+
153
+ # clip the gradient values to a given interval if necessary
154
+ if not self.clipping_value is None:
155
+
156
+ utils.clip_grad_value_(
157
+ self.model.parameters(), clip_value=self.clipping_value
158
+ )
159
+
160
+ # update the parameters
161
+ self.optimizer.step()
162
+
163
+ # Reduce the learning rate at each iteration if necessary
164
+ if not self.lr_scheduling is None:
165
+
166
+ self.lr_scheduling.step()
167
+
168
+ # reset the gradients
169
+ self.optimizer.zero_grad()
170
+
171
+ return preds, loss.mean()
172
+
173
+ def batch_eval(
174
+ self,
175
+ input_: Union[torch.Tensor, None] = None,
176
+ input_mask: Union[torch.Tensor, None] = None,
177
+ labels: Union[torch.Tensor, None] = None,
178
+ labels_mask: Union[torch.Tensor, None] = None,
179
+ pad_token_id: int = 3,
180
+ data: Union[dict, None] = None,
181
+ ):
182
+
183
+ assert (
184
+ not input_ is None and not input_mask is None and not labels is None
185
+ ) or (not data is None and self.hugging_face)
186
+
187
+ if (
188
+ self.hugging_face
189
+ ): # We will use a text-to-text model from Hugging Face (but only for fine-tuning)
190
+
191
+ # perform a forward pass
192
+ if data is None:
193
+
194
+ outputs = self.model(
195
+ input_ids=input_, attention_mask=input_mask, labels=labels
196
+ )
197
+
198
+ else:
199
+
200
+ outputs = self.model(**data)
201
+
202
+ # recuperate the predictions and the loss
203
+ preds, loss = outputs.logits, outputs.loss
204
+
205
+ else:
206
+
207
+ # perform a forward pass
208
+ outputs = self.model(
209
+ input_, input_mask, labels, labels_mask, pad_token_id=pad_token_id
210
+ )
211
+
212
+ # recuperate the predictions and the loss
213
+ preds, loss = outputs["preds"], outputs["loss"]
214
+
215
+ return preds, loss.mean()
216
+
217
+ # We decided to add a few parameters that were useful in the previous training classes
218
+ def compile(
219
+ self,
220
+ train_dataset: Dataset,
221
+ test_dataset: Union[Dataset, None] = None,
222
+ tokenizer: Union[Tokenizer, None] = None,
223
+ train_loader_kwargs: dict = {"batch_size": 16, "shuffle": True},
224
+ test_loader_kwargs: dict = {"batch_size": 16, "shuffle": False},
225
+ optimizer_kwargs: dict = {"lr": 1e-4, "weight_decay": 0.4},
226
+ model_kwargs: dict = {
227
+ "class_criterion": nn.CrossEntropyLoss(label_smoothing=0.1)
228
+ },
229
+ lr_scheduler_kwargs: dict = {"d_model": 512, "lr_warmup_step": 100},
230
+ lr_scheduler=None,
231
+ stopping_patience: Union[int, None] = None,
232
+ gradient_clipping_value: Union[float, torch.Tensor, None] = None,
233
+ predict_with_generate: bool = False,
234
+ logging_dir: Union[str, None] = None,
235
+ hugging_face: bool = False,
236
+ is_distributed: bool = False,
237
+ dist=None,
238
+ loss_mask_value=-100,
239
+ ):
240
+
241
+ if self.seed:
242
+ torch.manual_seed(self.seed)
243
+
244
+ # We have to use keyword-argument unpacking since we do not know the model's parameters
245
+ if isinstance(
246
+ self.orig_model, nn.Module
247
+ ): # if it is already a model instance, no parameters are required
248
+
249
+ self.model = copy.deepcopy(self.orig_model).to(self.device)
250
+
251
+ else: # otherwise we provide the parameters
252
+
253
+ self.model = copy.deepcopy(self.orig_model(**model_kwargs)).to(self.device)
254
+
255
+ # add distribution if available
256
+ if is_distributed and self.device == torch.device("cuda"):
257
+
258
+ self.model = torch.nn.parallel.DistributedDataParallel(self.model)
259
+
260
+ else:
261
+
262
+ self.model = torch.nn.parallel.DataParallel(self.model)
263
+
264
+ # Initialize the optimizer parameters
265
+ self.optimizer = self.orig_optimizer(
266
+ self.model.parameters(), **optimizer_kwargs
267
+ )
268
+
269
+ # Add a learning rate scheduler if necessary
270
+ self.lr_scheduling = None
271
+
272
+ if not lr_scheduler is None and self.lr_scheduling is None:
273
+
274
+ self.lr_scheduling = lr_scheduler(self.optimizer, **lr_scheduler_kwargs)
275
+
276
+ # Initialize the datasets and the loaders
277
+ self.train_set = train_dataset
278
+ self.test_set = test_dataset
279
+
280
+ # If the data is distributed over multiple gpus we will parallelize it
281
+ if is_distributed:
282
+
283
+ # We verify if the train loader kwargs already contains a sampler and
284
+ # if it is the case add it to the parallel sampler object
285
+ sampler = None
286
+ if "batch_sampler" in train_loader_kwargs:
287
+
288
+ sampler = "batch_sampler"
289
+
290
+ elif "sampler" in train_loader_kwargs:
291
+
292
+ sampler = "sampler"
293
+
294
+ if not sampler is None:
295
+
296
+ sampler_ = DistributedSampler(train_loader_kwargs[sampler])
297
+
298
+ distributed_sampler = sampler_
299
+
300
+ train_loader_kwargs[sampler] = distributed_sampler
301
+
302
+ else:
303
+
304
+ distributed_sampler = DistributedSampler(train_dataset)
305
+
306
+ train_loader_kwargs["sampler"] = distributed_sampler
307
+
308
+ self.train_loader = DataLoader(
309
+ train_dataset,
310
+ **train_loader_kwargs,
311
+ )
312
+
313
+ if test_dataset:
314
+ self.test_loader = DataLoader(
315
+ test_dataset,
316
+ **test_loader_kwargs,
317
+ )
318
+
319
+ else:
320
+ self.test_loader = None
321
+
322
+ # Let us initialize the clipping value to make gradient clipping
323
+ self.clipping_value = gradient_clipping_value
324
+
325
+ # Other parameters for step tracking and metrics
326
+ self.compilation = True
327
+
328
+ self.current_epoch = None
329
+
330
+ self.best_score = None
331
+
332
+ self.best_epoch = self.current_epoch
333
+
334
+ # Recuperate some boolean attributes
335
+ self.predict_with_generate = predict_with_generate
336
+
337
+ # Recuperate tokenizer
338
+ self.tokenizer = tokenizer
339
+
340
+ # Recuperate the logging directory
341
+ self.logging_dir = logging_dir
342
+
343
+ # Initialize the metrics
344
+ self.metrics = {}
345
+
346
+ # Initialize the attribute which indicate if the model is from huggingface
347
+ self.hugging_face = hugging_face
348
+
349
+ # Initialize the torch distributed module and distribution option
350
+ self.distributed = is_distributed
351
+ self.dist = dist
352
+
353
+ # initialize the early stopping patience
354
+ self.patience = stopping_patience
355
+
356
+ # add early stopping
357
+ self.epochs_since_improvement = 0
358
+
359
+ # Initialize the mask value for loss
360
+ self.loss_mask_value = loss_mask_value
361
+
362
+ def train(
363
+ self,
364
+ epochs: int = 100,
365
+ auto_save: bool = False,
366
+ log_step: Union[int, None] = None,
367
+ saving_directory: str = "data/checkpoints/last_checkpoints",
368
+ file_name: str = "checkpoints",
369
+ save_best: bool = True,
370
+ metric_for_best_model: str = "test_loss",
371
+ metric_objective: str = "minimize",
372
+ ):
373
+ """Entraînement du modèle
374
+
375
+ Args:
376
+ epochs (int, optional): Le nombre d'itérations. Defaults to 100.
377
+ auto_save (bool, optional): Auto-sauvegarde du modèle. Defaults to False.
378
+ log_step (int, optional): Le nombre d'itération avant d'afficher les performances. Defaults to 1.
379
+ saving_directory (str, optional): Le dossier de sauvegarde du modèle. Defaults to "inception_package/storage".
380
+ file_name (str, optional): Le nom du fichier de sauvegarde. Defaults to "checkpoints".
381
+ save_best (bool): Une varible booléenne indiquant si l'on souhaite sauvegarder le meilleur modèle. Defaults to True.
382
+ metric_for_best_model (str): Le nom de la métrique qui permet de choisir le meilleur modèle. Defaults to 'eval_loss'.
383
+ metric_objective (str): Indique si la métrique doit être maximisée 'maximize' ou minimisée 'minimize'. Defaults to 'minimize'.
384
+
385
+ Raises:
386
+ Exception: L'entraînement implique d'avoir déja initialisé les paramètres
387
+ """
388
+
389
+ # the file name cannot be "best_checkpoints"
390
+ assert file_name != "best_checkpoints"
391
+
392
+ ##################### Error Handling ##################################################
393
+ if not self.compilation:
394
+ raise Exception(
395
+ "You must initialize datasets and\
396
+ parameters with `compile` method. Make sure you don't forget any of them before \n \
397
+ training the model"
398
+ )
399
+
400
+ ##################### Initializations #################################################
401
+
402
+ if metric_objective in ["maximize", "minimize"]:
403
+
404
+ best_score = (
405
+ float("-inf") if metric_objective == "maximize" else float("inf")
406
+ )
407
+
408
+ else:
409
+
410
+ raise ValueError(
411
+ "The metric objective can only between 'maximize' or minimize!"
412
+ )
413
+
414
+ if not self.best_score is None:
415
+
416
+ best_score = self.best_score
417
+
418
+ start_epoch = self.current_epoch if not self.current_epoch is None else 0
419
+
420
+ ##################### Training ########################################################
421
+
422
+ modes = ["train", "test"]
423
+
424
+ if self.test_loader is None:
425
+ modes = ["train"]
426
+
427
+ for epoch in tqdm(range(start_epoch, start_epoch + epochs)):
428
+
429
+ # Print the current learning rate
430
+ print(f"For epoch {epoch + 1}: ")
431
+
432
+ if self.lr_scheduling:
433
+ print(f"{{Learning rate: {self.lr_scheduling.get_lr()}}}")
434
+
435
+ self.metrics = {}
436
+
437
+ i = {}
438
+
439
+ for mode in modes:
440
+
441
+ if mode == "test" and (epoch + 1) % log_step != 0:
442
+
443
+ continue
444
+
445
+ with torch.set_grad_enabled(mode == "train"):
446
+
447
+ # Initialize the loss of the current mode
448
+ self.metrics[f"{mode}_loss"] = 0
449
+
450
+ # Let us initialize the predictions
451
+ # predictions_ = []
452
+
453
+ # # Let us initialize the labels
454
+ # labels_ = []
455
+
456
+ if mode == "train":
457
+
458
+ self.model.train()
459
+
460
+ # loader = list(iter(self.train_loader))
461
+ loader = self.train_loader
462
+
463
+ dataset = self.train_set
464
+
465
+ else:
466
+
467
+ self.model.eval()
468
+
469
+ # loader = list(iter(self.test_loader))
470
+ loader = self.test_loader
471
+
472
+ dataset = self.test_set
473
+
474
+ # progress_bar = trange(len(loader))
475
+
476
+ with trange(
477
+ len(loader), unit="batches", position=0, leave=True
478
+ ) as pbar:
479
+ # i[mode] = 0
480
+ for i, data in enumerate(loader, 1):
481
+
482
+ # i[mode] += 1
483
+ pbar.set_description(
484
+ f"{mode[0].upper() + mode[1:]} batch number {i + 1}"
485
+ )
486
+
487
+ # data = loader[i]
488
+
489
+ if isinstance(data, dict):
490
+
491
+ input_ = data["input_ids"].long().to(self.device)
492
+
493
+ input_mask = data["attention_mask"].to(
494
+ self.device, dtype=torch.bool
495
+ )
496
+
497
+ labels = data["labels"].to(self.device)
498
+
499
+ if self.hugging_face:
500
+
501
+ labels[
502
+ labels == self.tokenizer.pad_token_id
503
+ ] = self.loss_mask_value
504
+
505
+ # if i == 76:
506
+
507
+ # print(torch.max(data['input_ids']))
508
+ # print(data['input_ids'].shape)
509
+
510
+ # preds, loss = (
511
+ # self.batch_train(data = data)
512
+ # if mode == "train"
513
+ # else self.batch_eval(data = data)
514
+ # )
515
+ preds, loss = (
516
+ self.batch_train(
517
+ input_=input_,
518
+ input_mask=input_mask,
519
+ labels=labels,
520
+ )
521
+ if mode == "train"
522
+ else self.batch_eval(
523
+ input_=input_,
524
+ input_mask=input_mask,
525
+ labels=labels,
526
+ )
527
+ )
528
+
529
+ else:
530
+
531
+ input_ = data[0].long().to(self.device)
532
+
533
+ # let us initialize a fake input
534
+ # input__ = None
535
+
536
+ input_mask = data[1].to(self.device, dtype=torch.bool)
537
+
538
+ # let us initialize a fake input mask
539
+ # input_mask_ = None
540
+
541
+ labels = data[2].to(self.device)
542
+
543
+ if self.hugging_face:
544
+
545
+ # concatenate the input with the labels and the two attention masks if we only use a decoder
546
+ # if self.decoder_only:
547
+
548
+ # # let us modify the fake input to the first sentence
549
+ # input__ = copy.deepcopy(input_)
550
+
551
+ # input_ = torch.concat((input_, labels), dim=1)
552
+
553
+ # # the new labels are equal to the inputs
554
+ # labels = copy.deepcopy(input_)
555
+
556
+ # # let us modify the fake input mask to mask of the first sentence
557
+ # input_mask_ = copy.deepcopy(input_mask)
558
+
559
+ # input_mask = torch.concat((input_mask, data[3].to(self.device)), dim=1)
560
+
561
+ labels[
562
+ labels == self.tokenizer.pad_token_id
563
+ ] = self.loss_mask_value
564
+
565
+ labels_mask = data[3].to(self.device, dtype=torch.bool)
566
+
567
+ # Recuperate the padding token id (default = 3)
568
+ pad_token_id = (
569
+ 3
570
+ if self.tokenizer is None
571
+ else self.tokenizer.pad_token_id
572
+ )
573
+
574
+ preds, loss = (
575
+ self.batch_train(
576
+ input_,
577
+ input_mask,
578
+ labels,
579
+ labels_mask,
580
+ pad_token_id,
581
+ )
582
+ if mode == "train"
583
+ else self.batch_eval(
584
+ input_,
585
+ input_mask,
586
+ labels,
587
+ labels_mask,
588
+ pad_token_id,
589
+ )
590
+ )
591
+
592
+ # let us calculate the weight of the batch
593
+ batch_weight = labels.shape[0] / len(dataset)
594
+
595
+ self.metrics[f"{mode}_loss"] += loss.item() * batch_weight
596
+
597
+ # let us add the predictions and labels in the list of predictions and labels after their determinations
598
+ if mode == "test":
599
+
600
+ if self.predict_with_generate:
601
+
602
+ if self.hugging_face:
603
+
604
+ # preds = self.model.generate(input_ if not self.decoder_only else input__,
605
+ # attention_mask = input_mask if not self.decoder_only else input_mask_,
606
+ # max_new_tokens = self.train_set.max_len, pad_token_id = self.test_set.tokenizer.eos_token_id)
607
+ if isinstance(data, dict) and "gen" in list(
608
+ data.keys()
609
+ ):
610
+
611
+ input_ = (
612
+ data["input_ids_gen"]
613
+ .long()
614
+ .to(self.device)
615
+ )
616
+
617
+ input_mask = data["attention_mask_gen"].to(
618
+ self.device, dtype=torch.bool
619
+ )
620
+
621
+ max_new_tokens = labels.shape[1]
622
+
623
+ else:
624
+
625
+ max_new_tokens = labels.shape[1]
626
+
627
+ preds = self.model.module.generate(
628
+ input_,
629
+ attention_mask=input_mask,
630
+ max_new_tokens=max_new_tokens,
631
+ )
632
+
633
+ else:
634
+
635
+ preds = self.model.module.generate(
636
+ input_, input_mask, max_len=labels.shape[1]
637
+ )
638
+
639
+ else:
640
+
641
+ if self.hugging_face:
642
+
643
+ preds = torch.argmax(preds, dim=-1)
644
+
645
+ # if add_bleu_only:
646
+
647
+ # predictions_.extend(preds.detach().cpu().tolist())
648
+
649
+ # labels_.extend(labels.detach().cpu().tolist())
650
+
651
+ # else:
652
+
653
+ if not self.evaluation is None and mode == "test":
654
+
655
+ # calculate the metrics on the current predictions and labels
656
+ metrics = self.evaluation.compute_metrics(
657
+ (
658
+ preds.cpu().detach().numpy(),
659
+ labels.cpu().detach().numpy(),
660
+ ),
661
+ bleu=True,
662
+ accuracy=not self.hugging_face,
663
+ )
664
+
665
+ for metric in metrics:
666
+
667
+ if metric != f"{mode}_loss":
668
+
669
+ self.metrics[metric] = (
670
+ self.metrics[metric]
671
+ + metrics[metric] * batch_weight
672
+ if metric in self.metrics
673
+ else metrics[metric] * batch_weight
674
+ )
675
+
676
+ pbar.update()
677
+
678
+ # if not self.evaluation is None and not self.test_loader is None:
679
+
680
+ # # if add_bleu_only:
681
+
682
+ # self.metrics.update(self.evaluation.compute_metrics((np.array(predictions_, dtype = object), np.array(labels_, dtype = object))))
683
+
684
+ # self.metrics['test_loss'] = self.metrics['test_loss'] / i['test']
685
+
686
+ # else:
687
+
688
+ # for metric in self.metrics:
689
+
690
+ # if metric != 'train_loss':
691
+
692
+ # self.metrics[metric] = self.metrics[metric] / i['test']
693
+ # self.metrics[metric] = self.metrics[metric] / len(loader)
694
+
695
+ # elif not self.test_loader is None:
696
+
697
+ # self.metrics["test_loss"] = self.metrics["test_loss"] / i['test']
698
+ # self.metrics["test_loss"] = self.metrics["test_loss"] / len(loader)
699
+
700
+ # self.metrics["train_loss"] = self.metrics["train_loss"] / i['train']
701
+ # self.metrics["train_loss"] = self.metrics["train_loss"] / len(loader)
702
+
703
+ # for metric in self.metrics:
704
+
705
+ # if metric != 'train_loss':
706
+
707
+ # self.metrics[metric] = self.metrics[metric] / len(self.test_loader)
708
+
709
+ # Display the metrics
710
+ if not log_step is None and (epoch + 1) % log_step == 0:
711
+
712
+ print(f"\nMetrics: {self.metrics}")
713
+
714
+ if not self.logging_dir is None:
715
+
716
+ with SummaryWriter(
717
+ os.path.join(self.logging_dir, f"version_{self.version}")
718
+ ) as writer:
719
+
720
+ for metric in self.metrics:
721
+
722
+ writer.add_scalar(
723
+ metric, self.metrics[metric], global_step=epoch
724
+ )
725
+
726
+ writer.add_scalar("global_step", epoch)
727
+
728
+ print("\n=============================\n")
729
+
730
+ ##################### Model saving #########################################################
731
+
732
+ # Save the model at the end of the current epoch
733
+ if auto_save and not log_step is None and (epoch + 1) % log_step == 0:
734
+
735
+ self.current_epoch = epoch + 1
736
+
737
+ if save_best:
738
+
739
+ # verify if the current score is best and recuperate it if yes
740
+ if metric_objective == "maximize":
741
+
742
+ last_score = best_score < self.metrics[metric_for_best_model]
743
+
744
+ elif metric_objective == "minimize":
745
+
746
+ last_score = best_score > self.metrics[metric_for_best_model]
747
+
748
+ else:
749
+
750
+ raise ValueError(
751
+ "The metric objective can only be in ['maximize', 'minimize'] !"
752
+ )
753
+
754
+ # recuperate the best score
755
+ if last_score:
756
+
757
+ best_score = self.metrics[metric_for_best_model]
758
+
759
+ self.best_epoch = self.current_epoch + 1
760
+
761
+ self.best_score = best_score
762
+
763
+ self.save(saving_directory, "best_checkpoints")
764
+
765
+ if not self.patience is None:
766
+
767
+ self.epochs_since_improvement = 0
768
+
769
+ else:
770
+
771
+ if not self.patience is None:
772
+
773
+ self.epochs_since_improvement += 1
774
+
775
+ if self.epochs_since_improvement >= self.patience:
776
+ print(
777
+ f"Early stopping triggered. No improvement in validation {metric_for_best_model} for {self.patience} epochs !"
778
+ )
779
+ break
780
+
781
+ self.save(saving_directory, file_name)
782
+
783
+ # For this method we take inspiration from the save method of the DDPG agent (RL) that we created earlier
784
+ def save(
785
+ self,
786
+ directory: str = "data/checkpoints/last_checkpoints",
787
+ file_name: str = "checkpoints",
788
+ ):
789
+
790
+ if not os.path.exists(directory):
791
+ os.makedirs(directory)
792
+
793
+ file_path = os.path.join(directory, f"{file_name}.pth")
794
+
795
+ checkpoints = {
796
+ "model_state_dict": self.model.state_dict(),
797
+ "optimizer_state_dict": self.optimizer.state_dict(),
798
+ "current_epoch": self.current_epoch,
799
+ "metrics": self.metrics,
800
+ "best_score": self.best_score,
801
+ "best_epoch": self.best_epoch,
802
+ "lr_scheduler_state_dict": self.lr_scheduling.state_dict()
803
+ if not self.lr_scheduling is None
804
+ else None,
805
+ "epochs_since_improvement": self.epochs_since_improvement,
806
+ }
807
+
808
+ torch.save(checkpoints, file_path)
809
+
810
+ # update metrics and the best score dict
811
+ self.metrics["current_epoch"] = (
812
+ self.current_epoch + 1
813
+ if not self.current_epoch is None
814
+ else self.current_epoch
815
+ )
816
+
817
+ best_score_dict = {"best_score": self.best_score, "best_epoch": self.best_epoch}
818
+
819
+ # save the metrics as json file
820
+ metrics = json.dumps(
821
+ {"metrics": self.metrics, "best_performance": best_score_dict}, indent=4
822
+ )
823
+
824
+ with open(os.path.join(directory, f"{file_name}.json"), "w") as f:
825
+
826
+ f.write(metrics)
827
+
828
+ # The same goes for the load method
829
+ def load(
830
+ self,
831
+ directory: str = "data/checkpoints/last_checkpoints",
832
+ file_name: str = "checkpoints",
833
+ load_best: bool = False,
834
+ ):
835
+
836
+ if load_best:
837
+ file_name = "best_checkpoints"
838
+
839
+ file_path = os.path.join(directory, f"{file_name}.pth")
840
+
841
+ if os.path.exists(file_path):
842
+
843
+ checkpoints = (
844
+ torch.load(file_path)
845
+ if self.device == torch.device("cuda")
846
+ else torch.load(file_path, map_location="cpu")
847
+ )
848
+
849
+ self.model.load_state_dict(checkpoints["model_state_dict"])
850
+
851
+ self.optimizer.load_state_dict(checkpoints["optimizer_state_dict"])
852
+
853
+ self.current_epoch = checkpoints["current_epoch"]
854
+
855
+ self.best_score = checkpoints["best_score"]
856
+
857
+ self.best_epoch = checkpoints["best_epoch"]
858
+
859
+ self.epochs_since_improvement = checkpoints["epochs_since_improvement"]
860
+
861
+ if not self.lr_scheduling is None:
862
+
863
+ self.lr_scheduling.load_state_dict(
864
+ checkpoints["lr_scheduler_state_dict"]
865
+ )
866
+
867
+ else:
868
+
869
+ raise OSError(
870
+ f"Le fichier {file_path} est introuvable. Vérifiez si le chemin fourni est correct!"
871
+ )
872
+
873
+ def evaluate(
874
+ self,
875
+ test_dataset,
876
+ loader_kwargs: dict = {},
877
+ beam_size: int = 3,
878
+ beam_groups: int = 1,
879
+ diversity_penalty: float = 0.5,
880
+ top_k: int = 10,
881
+ top_p: float = 1.0,
882
+ temperature: float = 1.0,
883
+ max_length: int = 50,
884
+ ):
885
+
886
+ self.model.eval()
887
+
888
+ test_loader = DataLoader(
889
+ test_dataset,
890
+ **loader_kwargs,
891
+ )
892
+
893
+ # Let us initialize the predictions
894
+ predictions_ = []
895
+
896
+ # Let us initialize the labels
897
+ labels_ = []
898
+
899
+ metrics = {"test_loss": 0.0}
900
+
901
+ results = {
902
+ "Source References": [],
903
+ "Target Translations": [],
904
+ "Predictions": [],
905
+ }
906
+
907
+ # progress_bar = trange(len(test_loader))
908
+
909
+ with torch.no_grad():
910
+
911
+ # i = 0
912
+ # for data in test_loader:
913
+ with trange(
914
+ len(test_loader), unit="batches", position=0, leave=True
915
+ ) as pbar:
916
+ # for i in tqdm(range(len(test_loader))):
917
+ for i, data in enumerate(test_loader, 1):
918
+ # i += 1
919
+ pbar.set_description(f"Evaluation batch number {i + 1}")
920
+
921
+ # data = test_loader[i]
922
+
923
+ if isinstance(data, dict):
924
+
925
+ input_ = data["input_ids"].long().to(self.device)
926
+
927
+ input_mask = data["attention_mask"].to(
928
+ self.device, dtype=torch.bool
929
+ )
930
+
931
+ labels = data["labels"].to(self.device)
932
+
933
+ if self.hugging_face:
934
+
935
+ labels[
936
+ labels == self.tokenizer.pad_token_id
937
+ ] = self.loss_mask_value
938
+
939
+ preds, loss = self.batch_eval(data=data)
940
+
941
+ else:
942
+
943
+ input_ = data[0].long().to(self.device)
944
+
945
+ input_mask = data[1].to(self.device)
946
+
947
+ labels = data[2].long().to(self.device)
948
+
949
+ if self.hugging_face:
950
+
951
+ # concatenate the input with the labels and the two attention masks if we only use a decoder
952
+ # if self.decoder_only:
953
+
954
+ # labels = torch.concat((input_, labels))
955
+
956
+ labels[
957
+ labels == test_dataset.tokenizer.pad_token_id
958
+ ] = self.loss_mask_value
959
+
960
+ labels_mask = data[3].to(self.device)
961
+
962
+ preds, loss = self.batch_eval(
963
+ input_,
964
+ input_mask,
965
+ labels,
966
+ labels_mask,
967
+ test_dataset.tokenizer.pad_token_id,
968
+ )
969
+
970
+ # let us calculate the weight of the batch
971
+ batch_weight = labels.shape[0] / len(test_dataset)
972
+
973
+ metrics[f"test_loss"] += loss.item() * batch_weight
974
+
975
+ if self.hugging_face:
976
+
977
+ preds = self.model.module.generate(
978
+ input_,
979
+ attention_mask=input_mask,
980
+ max_length=max_length,
981
+ num_beams=beam_size,
982
+ num_beam_groups=beam_groups,
983
+ diversity_penalty=diversity_penalty,
984
+ temperature=temperature,
985
+ )
986
+
987
+ else:
988
+
989
+ preds = self.model.module.diverse_beam_generate(
990
+ input_,
991
+ input_mask,
992
+ max_len=labels.shape[1],
993
+ beam_size=beam_size,
994
+ beam_groups=beam_groups,
995
+ diversity_penalty=diversity_penalty,
996
+ temperature=temperature,
997
+ )
998
+
999
+ if not self.evaluation is None:
1000
+
1001
+ # calculate the metrics on the current predictions and labels
1002
+ mets = self.evaluation.compute_metrics(
1003
+ (
1004
+ preds.cpu().detach().numpy(),
1005
+ labels.cpu().detach().numpy(),
1006
+ ),
1007
+ accuracy=not self.hugging_face,
1008
+ bleu=True,
1009
+ )
1010
+
1011
+ for metric in mets:
1012
+
1013
+ if metric != "test_loss":
1014
+
1015
+ metrics[metric] = (
1016
+ metrics[metric] + mets[metric] * batch_weight
1017
+ if metric in metrics
1018
+ else mets[metric] * batch_weight
1019
+ )
1020
+
1021
+ # labels_.extend(labels.detach().cpu().tolist())
1022
+
1023
+ # predictions_.extend(preds.detach().cpu().tolist())
1024
+
1025
+ # let us recuperate the original sentences
1026
+ results["Source References"].extend(test_dataset.decode(input_))
1027
+
1028
+ results["Target Translations"].extend(test_dataset.decode(labels))
1029
+
1030
+ results["Predictions"].extend(test_dataset.decode(preds))
1031
+
1032
+ pbar.update()
1033
+
1034
+ # if not self.evaluation is None:
1035
+
1036
+ # metrics = {metric: value / len(test_loader) for metric, value in metrics.items()}
1037
+
1038
+ # else:
1039
+
1040
+ # metrics["test_loss"] = metrics["test_loss"] / len(test_loader)
1041
+
1042
+ return metrics, pd.DataFrame(results)
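
For orientation, the following is a minimal, hypothetical usage sketch of the ModelRunner class defined in wolof_translate/trainers/transformer_trainer_ml_.py. The objects my_model, train_data, test_data and my_tokenizer are placeholders that are not shipped with the package; the keyword arguments simply mirror the defaults shown in the diff above.

from torch import optim
from wolof_translate.trainers.transformer_trainer_ml_ import ModelRunner

# Wrap an existing nn.Module (or a model class plus model_kwargs) in a runner.
runner = ModelRunner(my_model, optimizer=optim.AdamW, seed=42, version=1)

# compile() builds the data loaders, the optimizer and the optional LR scheduler.
runner.compile(
    train_dataset=train_data,
    test_dataset=test_data,
    tokenizer=my_tokenizer,
    train_loader_kwargs={"batch_size": 16, "shuffle": True},
    test_loader_kwargs={"batch_size": 16, "shuffle": False},
    optimizer_kwargs={"lr": 1e-4, "weight_decay": 0.4},
)

# Train for 10 epochs, evaluating and checkpointing every 2 epochs and
# keeping the checkpoint with the lowest test loss.
runner.train(
    epochs=10,
    auto_save=True,
    log_step=2,
    saving_directory="data/checkpoints/last_checkpoints",
    metric_for_best_model="test_loss",
    metric_objective="minimize",
)

# Reload the best checkpoint and run a standalone evaluation pass.
runner.load("data/checkpoints/last_checkpoints", load_best=True)
metrics, results = runner.evaluate(test_data, loader_kwargs={"batch_size": 16})

Note that checkpoints are only written when auto_save is True and the epoch index is a multiple of log_step, so the sketch passes both together.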