translate-package 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ # -*- coding: utf-8 -*-
2
+ from tokenizers import (
3
+ decoders,
4
+ models,
5
+ normalizers,
6
+ pre_tokenizers,
7
+ processors,
8
+ trainers,
9
+ Tokenizer
10
+ )
11
+ from transformers import (
12
+ GenerationConfig,
13
+ TrainingArguments,
14
+ Trainer, AutoModelForSeq2SeqLM,
15
+ get_linear_schedule_with_warmup,
16
+ T5ForConditionalGeneration, Adafactor, BartForConditionalGeneration,
17
+ MT5ForConditionalGeneration, AdamWeightDecay
18
+ )
19
+ from wolof_translate.utils.bucket_iterator import SequenceLengthBatchSampler, BucketSampler
20
+ from wolof_translate.utils.sent_transformers import TransformerSequences
21
+ from wolof_translate.utils.sent_corrections import *
22
+ from peft import LoraConfig, get_peft_model, TaskType
23
+ from sklearn.model_selection import train_test_split
24
+ from torch.utils.data import DataLoader, Dataset
25
+ from nlpaug.augmenter import char as nac
26
+ import matplotlib.pyplot as plt
27
+ import pytorch_lightning as pl
28
+ from functools import partial
29
+ import sentencepiece as spm
30
+ from math import ceil
31
+ import pandas as pd
32
+ import numpy as np
33
+ import argparse
34
+ import evaluate
35
+ import string
36
+ import random
37
+ import shutil
38
+ import wandb
39
+ import torch
40
+ import time
41
+ import nltk
42
+ import os
43
+
44
+ # Disable tokenizer parallelism
45
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
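For orientation, a minimal sketch of the import convention this module establishes: third-party libraries are imported once here and re-exported, so the rest of the package imports them from translate_package itself (Union and Callable, pulled from this module by data_preparation.py, are presumably re-exported through the star import from wolof_translate.utils.sent_corrections). The snippet assumes the package and its dependencies are installed.

# Minimal illustration of the package-level re-export convention
# (assumes the package and its dependencies are installed).
from translate_package import pd, torch, os

print(pd.__version__)                           # pandas, re-exported above
print(torch.cuda.is_available())                # torch, re-exported above
print(os.environ["TOKENIZERS_PARALLELISM"])     # "false", set at import time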
File without changes
@@ -0,0 +1,424 @@
1
+ from translate_package import (
2
+ pd,
3
+ train_test_split,
4
+ Dataset,
5
+ DataLoader,
6
+ plt,
7
+ torch,
8
+ SequenceLengthBatchSampler,
9
+ BucketSampler,
10
+ partial,
11
+ Union,
12
+ Callable,
13
+ ceil,
14
+ np,
15
+ TransformerSequences,
16
+ nac,
17
+ remove_mark_space,
18
+ delete_guillemet_space
19
+ )
20
+
21
+ # sentence beginning with "Mooy li ko waral ci li ñu xamle waaye itam" is too long and must be removed or corrected
22
+
23
+ # python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact
24
+
25
+ def augment(examples, src_label, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):
26
+
27
+ examples[src_label] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[src_label])[0]
28
+
29
+ return examples
30
+
31
+ def augment_(examples, src_label, tgt_label):
32
+
33
+ examples[src_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[src_label])[0]
34
+
35
+ examples[tgt_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[tgt_label])[0]
36
+
37
+ return examples
38
+
39
+ def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
40
+
41
+ if model_generation in ["t5", "mt5", "nllb"]:
42
+
43
+ eos_token = ""
44
+ bos_token = ""
45
+
46
+ else:
47
+
48
+ eos_token = tokenizer.eos_token
49
+ bos_token = tokenizer.bos_token
50
+
51
+ examples[src_label] = bos_token + examples[src_label] + eos_token
52
+
53
+ examples[tgt_label] = bos_token + examples[tgt_label] + eos_token
54
+
55
+ examples.update({key: value[0] for key, value in tokenizer(examples[src_label], return_tensors = 'pt').items()})
56
+
57
+ examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[tgt_label], return_tensors = 'pt').items()})
58
+
59
+ examples['labels'] = examples['decoder_input_ids']
60
+
61
+ return examples
62
+
63
+ def apply_funcs(funcs, data):
64
+ # Logic to apply the functions
65
+ for func in funcs:
66
+ data = func(data)
67
+ return data
68
+
69
+ def sequences(examples, functions):
70
+
71
+ for function in functions:
72
+
73
+ examples = function(examples)
74
+
75
+ return examples
76
+
77
+ class SentenceDataset(Dataset):
78
+
79
+ def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):
80
+
81
+ assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()
82
+
83
+ self.source_sentences = dataframe[source_column].tolist()
84
+
85
+ self.target_sentences = dataframe[target_column].tolist()
86
+
87
+ self.transformers = transformers
88
+
89
+ self.source_column = source_column
90
+
91
+ self.target_column = target_column
92
+
93
+ def __getitem__(self, index):
94
+
95
+ source_sentence = self.source_sentences[index]
96
+
97
+ target_sentence = self.target_sentences[index]
98
+
99
+ sentences = {
100
+ self.source_column: source_sentence,
101
+ self.target_column: target_sentence
102
+ }
103
+
104
+ if self.transformers is not None:
105
+
106
+ sentences = self.transformers(sentences)
107
+
108
+ return sentences
109
+
110
+ def __len__(self):
111
+
112
+ return len(self.source_sentences)
113
+
114
+
115
+ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
116
+
117
+ # load the dataset with pandas
118
+ dataset_ = pd.read_csv(data_path)
119
+
120
+ # split dataset between train, validation, and test sets
121
+ if test_size == 1.0:
122
+
123
+ dataset = {
124
+ "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
125
+ }
126
+
127
+ else:
128
+
129
+ train, test = train_test_split(
130
+ dataset_, test_size=test_size + valid_size, random_state=seed
131
+ )
132
+
133
+ valid, test = train_test_split(
134
+ test, test_size=test_size / (valid_size + test_size), random_state=seed
135
+ )
136
+
137
+ dataset = {
138
+ "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
139
+ "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
140
+ "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
141
+ }
142
+
143
+ # The dataset now contains three different splits: train, validation, and test.
144
+
145
+ return dataset
146
+
147
+ def get_boundaries(dataset, sizes, min_count):
148
+
149
+ length = []
150
+
151
+ for i in range(len(dataset)):
152
+
153
+ length.append(max(len(dataset[i]["input_ids"]), len(dataset[i]["labels"])))
154
+
155
+ # Create histogram
156
+ hist, bins, _ = plt.hist(length, bins=10) # Adjust the number of bins as needed
157
+
158
+ # Analyze the histogram
159
+ # Identify peaks or gaps to determine the boundaries
160
+
161
+ # Choose the boundaries based on the analysis
162
+ boundaries = (
163
+ [ceil(bins[0])]
164
+ + [ceil(bin) for bin, count in zip(bins[1:], hist) if count > min_count]
165
+ + [np.inf]
166
+ )
167
+
168
+ boundaries = boundaries[:-1]
169
+
170
+ # define batch sizes and samplers
171
+ batch_sizes = [
172
+ sizes[i] if (i + 1) < len(sizes) else sizes[-1] for i in range(len(boundaries))
173
+ ]
174
+
175
+ return boundaries, batch_sizes
176
+
177
+ def collate_fn_trunc(batch, input_max_len, label_max_len, eos_token_id, pad_token_id, keys: list = ['input_ids', 'attention_mask', 'labels']):
178
+
179
+ from torch.nn.utils.rnn import pad_sequence
180
+
181
+ df_dict = {key: [] for key in keys}
182
+
183
+ for b in batch:
184
+
185
+ for key in df_dict:
186
+
187
+ df_dict[key].append(b[key])
188
+
189
+ padded_sequences = {}
190
+
191
+ for key in df_dict:
192
+
193
+ max_len = label_max_len if 'decoder' in key or 'label' in key else input_max_len
194
+
195
+ padding_value = 0 if 'mask' in key else pad_token_id # take care: attention masks must be padded with 0
196
+
197
+ # Pad the input sequences to have the same length
198
+ padded_sequences[key] = pad_sequence(df_dict[key], batch_first=True, padding_value = padding_value)[:,:max_len]
199
+
200
+ # ensure truncated sequences still end with an EOS (or pad) token
201
+ if 'mask' not in key:
202
+
203
+ padded_sequences[key][:, -1:][(padded_sequences[key][:, -1:] != eos_token_id) & (padded_sequences[key][:, -1:] != pad_token_id)] = eos_token_id
204
+
205
+ return padded_sequences
206
+
207
+ # define padding collate function
208
+ def pad_collate(batch, padding_value):
209
+
210
+ X = [b["input_ids"] for b in batch]
211
+ att = [b["attention_mask"] for b in batch]
212
+ y = [b["labels"] for b in batch]
213
+
214
+ X_ = torch.nn.utils.rnn.pad_sequence(
215
+ X, batch_first=True, padding_value=padding_value
216
+ )
217
+ att_ = torch.nn.utils.rnn.pad_sequence(att, batch_first=True, padding_value=0)
218
+ y_ = torch.nn.utils.rnn.pad_sequence(
219
+ y, batch_first=True, padding_value=padding_value
220
+ )
221
+
222
+ return {"input_ids": X_, "attention_mask": att_, "labels": y_}
223
+
224
+
225
+ def get_loaders(
226
+ tokenizer,
227
+ model_generation,
228
+ src_label,
229
+ tgt_label,
230
+ sizes,
231
+ data_path,
232
+ test_size,
233
+ valid_size,
234
+ seed,
235
+ p_word,
236
+ p_char,
237
+ max_words,
238
+ count,
239
+ src_max_len,
240
+ tgt_max_len,
241
+ num_workers,
242
+ device,
243
+ use_bucketing,
244
+ use_truncation,
245
+ batch_size,
246
+ ):
247
+
248
+ # get dataset
249
+ dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)
250
+
251
+ # analysis transformations
252
+
253
+ a_transformers = partial(sequences,
254
+ functions = [
255
+ partial(augment_, src_label = src_label, tgt_label = tgt_label),
256
+ partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
257
+ ])
258
+
259
+ # training transformations
260
+ t_transformers = partial(sequences,
261
+ functions = [
262
+ partial(augment, src_label = src_label, p_word = p_word, p_char = p_char, max_words = max_words),
263
+ partial(augment_, src_label = src_label, tgt_label = tgt_label),
264
+ partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
265
+ ])
266
+
267
+ if use_bucketing:
268
+
269
+ if use_truncation:
270
+
271
+ # initialize loaders
272
+ train_sampler = BucketSampler(
273
+ dataset["train"](transformers=a_transformers),
274
+ batch_size=batch_size,
275
+ input_key="input_ids",
276
+ label_key="labels",
277
+ )
278
+
279
+ valid_sampler = BucketSampler(
280
+ dataset["val"](transformers=a_transformers),
281
+ batch_size=batch_size,
282
+ input_key="input_ids",
283
+ label_key="labels",
284
+ )
285
+
286
+ test_sampler = BucketSampler(
287
+ dataset["test"](transformers=a_transformers),
288
+ batch_size=batch_size,
289
+ input_key="input_ids",
290
+ label_key="labels",
291
+ )
292
+
293
+ # add transformations
294
+ dataset = {s: dataset[s](transformers = t_transformers) if s == 'train' else dataset[s](transformers = a_transformers) for s in dataset}
295
+
296
+ # define data loaders
297
+ train_loader = DataLoader(
298
+ dataset["train"],
299
+ batch_sampler=train_sampler,
300
+ collate_fn = partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
301
+ eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
302
+ num_workers=num_workers,
303
+ pin_memory=True if device in ["cuda", "gpu"] else False,
304
+ )
305
+ valid_loader = DataLoader(
306
+ dataset["val"],
307
+ batch_sampler=valid_sampler,
308
+ collate_fn=partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
309
+ eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
310
+ num_workers=num_workers,
311
+ pin_memory=True if device in ["cuda", "gpu"] else False,
312
+ )
313
+ test_loader = DataLoader(
314
+ dataset["test"],
315
+ batch_sampler=test_sampler,
316
+ collate_fn=partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
317
+ eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
318
+ num_workers=num_workers,
319
+ pin_memory=True if device in ["cuda", "gpu"] else False,
320
+ )
321
+
322
+ else:
323
+
324
+ # get boundaries
325
+ boundaries, batch_sizes = get_boundaries(dataset['train'](transformers = a_transformers), sizes, count)
326
+
327
+ # initialize loaders
328
+ train_sampler = SequenceLengthBatchSampler(
329
+ dataset["train"](transformers=a_transformers),
330
+ boundaries=boundaries,
331
+ batch_sizes=batch_sizes,
332
+ input_key="input_ids",
333
+ label_key="labels",
334
+ )
335
+
336
+ valid_sampler = SequenceLengthBatchSampler(
337
+ dataset["val"](transformers=a_transformers),
338
+ boundaries=boundaries,
339
+ batch_sizes=batch_sizes,
340
+ input_key="input_ids",
341
+ label_key="labels",
342
+ )
343
+
344
+ test_sampler = SequenceLengthBatchSampler(
345
+ dataset["test"](transformers=a_transformers),
346
+ boundaries=boundaries,
347
+ batch_sizes=batch_sizes,
348
+ input_key="input_ids",
349
+ label_key="labels",
350
+ )
351
+
352
+ # add transformations
353
+ dataset = {s: dataset[s](transformers = t_transformers) for s in dataset}
354
+
355
+ # define data loaders
356
+ train_loader = DataLoader(
357
+ dataset["train"],
358
+ batch_sampler=train_sampler,
359
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
360
+ num_workers=num_workers,
361
+ pin_memory=True if device in ["cuda", "gpu"] else False,
362
+ )
363
+ valid_loader = DataLoader(
364
+ dataset["val"],
365
+ batch_sampler=valid_sampler,
366
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
367
+ num_workers=num_workers,
368
+ pin_memory=True if device in ["cuda", "gpu"] else False,
369
+ )
370
+ test_loader = DataLoader(
371
+ dataset["test"],
372
+ batch_sampler=test_sampler,
373
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
374
+ num_workers=num_workers,
375
+ pin_memory=True if device in ["cuda", "gpu"] else False,
376
+ )
377
+
378
+ else:
379
+
380
+ # add transformations
381
+ dataset = {s: dataset[s](transformers = t_transformers) for s in dataset}
382
+
383
+ if "train" in dataset:
384
+ # define data loaders
385
+ train_loader = DataLoader(
386
+ dataset["train"],
387
+ batch_size=batch_size,
388
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
389
+ num_workers=num_workers,
390
+ pin_memory=True if device in ["cuda", "gpu"] else False,
391
+ shuffle=True,
392
+ )
393
+
394
+ if "val" in dataset:
395
+
396
+ valid_loader = DataLoader(
397
+ dataset["val"],
398
+ batch_size=batch_size,
399
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
400
+ num_workers=num_workers,
401
+ pin_memory=True if device in ["cuda", "gpu"] else False,
402
+ )
403
+
404
+ if "test" in dataset:
405
+
406
+ test_loader = DataLoader(
407
+ dataset["test"],
408
+ batch_size=batch_size,
409
+ collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
410
+ num_workers=num_workers,
411
+ pin_memory=True if device in ["cuda", "gpu"] else False,
412
+ )
413
+
414
+ if "train" in dataset and "val" in dataset:
415
+
416
+ return {
417
+ "train_loader": train_loader,
418
+ "valid_loader": valid_loader,
419
+ "test_loader": test_loader,
420
+ }
421
+
422
+ else:
423
+
424
+ return {"test_loader": test_loader}
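For orientation, a hedged usage sketch of the pipeline above (load_data is called internally by get_loaders). The CSV path, column names, tokenizer file and hyperparameter values are illustrative placeholders, not values shipped with the package.

# Hypothetical usage of get_loaders; paths, column names and hyperparameters
# are placeholders chosen for illustration only.
from transformers import T5TokenizerFast
from translate_package.data.data_preparation import get_loaders

tokenizer = T5TokenizerFast(vocab_file="tokenizers/wf_v5.model")  # assumed SentencePiece model file

loaders = get_loaders(
    tokenizer=tokenizer,
    model_generation="t5",
    src_label="WOLOF",
    tgt_label="FRENCH",
    sizes=[64, 32, 16],        # per-bucket batch sizes (longer buckets get smaller batches)
    data_path="data/sentences.csv",
    test_size=0.1,
    valid_size=0.1,
    seed=42,
    p_word=0.12,               # word-level augmentation probability
    p_char=0.82,               # character-level augmentation probability
    max_words=21,
    count=10,                  # minimum histogram count for a bucket boundary
    src_max_len=128,
    tgt_max_len=128,
    num_workers=2,
    device="cuda",
    use_bucketing=True,
    use_truncation=False,
    batch_size=32,
)

batch = next(iter(loaders["train_loader"]))
print(batch["input_ids"].shape, batch["labels"].shape)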
@@ -0,0 +1,12 @@
1
+ class EmptyArtifactException(Exception):
2
+ def __init__(
3
+ self, error="No wandb's artifact path was provided!! Please provide one."
4
+ ):
5
+
6
+ super().__init__(error)
7
+
8
+ class TokenizerException(Exception):
9
+
10
+ def __init__(self, error: str):
11
+
12
+ super().__init__(error)
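A short sketch of how these exceptions surface in practice; the guard below mirrors the check in train_tokenizer.py, and the invalid name is illustrative.

# Illustrative only: triggering TokenizerException with an unsupported tokenizer name.
from translate_package.errors import TokenizerException

def check_tokenizer_name(name: str) -> None:
    # Mirrors the guard in train_tokenizer: only 'sp' and 'bpe' are supported.
    if name not in ("sp", "bpe"):
        raise TokenizerException(
            "You can only train a sentence piece (as 'sp') tokenizer, "
            "or a byte pair encoding (as 'bpe') tokenizer!"
        )

check_tokenizer_name("sp")            # passes silently
# check_tokenizer_name("wordpiece")   # would raise TokenizerException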
File without changes
@@ -0,0 +1,9 @@
1
+ from torch.nn import Module
2
+
3
+ def get_gradients_mean(model: Module):
4
+ ave_grads = []
5
+ for name, param in model.named_parameters():
6
+ if param.grad is not None:
7
+ ave_grads.append(param.grad.abs().mean().item())
8
+
9
+ return sum(ave_grads)/(len(ave_grads) + 1e-5)
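A self-contained sketch of get_gradients_mean on a toy model; the linear layer and random data are made up purely for illustration.

# Toy demonstration of get_gradients_mean; the model and data are arbitrary.
import torch
from translate_package.models.gradient_observation import get_gradients_mean

model = torch.nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()

# Average absolute gradient over all parameters that received a gradient.
print(get_gradients_mean(model))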
@@ -0,0 +1,97 @@
1
+ import torch
2
+
3
+ class LSTMOutput:
4
+
5
+ def __init__(self, logits, loss):
6
+
7
+ self.logits = logits
8
+
9
+ self.loss = loss
10
+
11
+ class LSTMSequenceToSequence(torch.nn.Module):
12
+
13
+ def __init__(self, tokenizer, embedding_size = 128, num_layers = 6, hidden_size = 128, dropout=0.1, bidirectional = True):
14
+
15
+ super().__init__()
16
+
17
+ self.tokenizer = tokenizer
18
+
19
+ self.vocab_size = self.tokenizer.vocab_size
20
+
21
+ self.embedding = torch.nn.Embedding(self.vocab_size, embedding_size)
22
+
23
+ self.encoder = torch.nn.LSTM(input_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, batch_first = True,
24
+ bidirectional = bidirectional, dropout=dropout)
25
+
26
+ self.decoder = torch.nn.LSTM(input_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, batch_first = True,
27
+ bidirectional = bidirectional, dropout=dropout)
28
+
29
+ copy = 2 if bidirectional else 1
30
+
31
+ self.decoder_output_layer = torch.nn.Linear(copy * hidden_size, self.vocab_size)
32
+
33
+ self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
34
+
35
+ def forward(self, input, output):
36
+
37
+ input_embed = self.embedding(input)
38
+
39
+ state, hidden = self.encoder(input_embed)
40
+
41
+ # shift the target sequence for teacher forcing (decoder input excludes the last token)
42
+ decoder_input = output[:, :-1]
43
+
44
+ decoder_input = self.embedding(decoder_input)
45
+
46
+ decoder_output, _ = self.decoder(decoder_input, hidden)
47
+
48
+ decoder_output = self.decoder_output_layer(decoder_output)
49
+
50
+ loss = self.loss_fn(decoder_output.reshape(-1, decoder_output.shape[-1]), output[:, 1:].reshape(-1))
51
+
52
+ return LSTMOutput(decoder_output, loss)
53
+
54
+ def generate(self, input, max_new_tokens: int = 100, temperature: float = 0.0, use_sampling = False, **kwargs):
55
+
56
+ input_embed = self.embedding(input)
57
+
58
+ _, hidden = self.encoder(input_embed)
59
+
60
+ # initialize predictions
61
+ predictions = torch.tensor([[self.tokenizer.bos_token_id]]*input_embed.shape[0], dtype=torch.long, device=input.device)
62
+
63
+ # variable identifying if the sequence is finished
64
+ finished_sequences = torch.zeros(input_embed.shape[0], dtype=torch.bool, device = input.device)
65
+
66
+ # generate predictions
67
+ for i in range(max_new_tokens):
68
+
69
+ decoder_input = self.embedding(predictions)
70
+
71
+ decoder_output, hidden = self.decoder(decoder_input, hidden)
72
+
73
+ decoder_output = self.decoder_output_layer(decoder_output)
74
+
75
+ if temperature > 0.0: decoder_output = (decoder_output / temperature)
76
+
77
+ # get probs and sample the next token from a multinomial distribution
78
+ probs = torch.softmax(decoder_output[:, -1], dim = -1)
79
+
80
+ if use_sampling: prediction = torch.multinomial(probs, num_samples = 1)
81
+ else: prediction = torch.argmax(probs, dim=-1, keepdim=True)
82
+
83
+ # add new prediction
84
+ predictions = torch.cat((predictions, prediction), dim = -1)
85
+
86
+ # get the next token ids
87
+ next_token_ids = prediction.squeeze(-1)
88
+
89
+ finished_sequences |= (next_token_ids == self.tokenizer.eos_token_id)
90
+
91
+ if finished_sequences.all():
92
+
93
+ break
94
+
95
+ # return predictions
96
+ return predictions[:, 1:]
97
+
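A hedged usage sketch of the LSTM baseline above. The tokenizer is an assumption: any tokenizer exposing vocab_size, pad_token_id, bos_token_id and eos_token_id should work; here a BartTokenizerFast built from a BPE file (as in load_tokenizer.py) is assumed, and the file path is hypothetical.

# Illustrative usage of LSTMSequenceToSequence; the tokenizer file path is hypothetical.
import torch
from transformers import BartTokenizerFast
from translate_package.models.lstm import LSTMSequenceToSequence

tokenizer = BartTokenizerFast(tokenizer_file="tokenizers/wf_bpe.json")  # assumed BPE tokenizer
model = LSTMSequenceToSequence(tokenizer, embedding_size=128, num_layers=2,
                               hidden_size=128, dropout=0.1, bidirectional=True)

src = tokenizer(["Salaam aleekum"], return_tensors="pt")["input_ids"]
tgt = tokenizer(["Bonjour"], return_tensors="pt")["input_ids"]

out = model(src, tgt)                               # teacher-forced forward pass
print(out.loss.item(), out.logits.shape)

generated = model.generate(src, max_new_tokens=20)  # greedy decoding by default
print(tokenizer.batch_decode(generated, skip_special_tokens=True))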
@@ -0,0 +1,308 @@
1
+ from translate_package import (
2
+ pl,
3
+ evaluate,
4
+ LoraConfig,
5
+ TaskType,
6
+ torch,
7
+ get_linear_schedule_with_warmup,
8
+ wandb,
9
+ get_peft_model,
10
+ T5ForConditionalGeneration,
11
+ MT5ForConditionalGeneration,
12
+ BartForConditionalGeneration,
13
+ AutoModelForSeq2SeqLM,
14
+ Adafactor,
15
+ AdamWeightDecay
16
+ )
17
+
18
+ from translate_package.models.gradient_observation import get_gradients_mean
19
+
20
+ from translate_package.models.lstm import LSTMSequenceToSequence
21
+
22
+
23
+ def print_number_of_trainable_model_parameters(model):
24
+ trainable_model_params = 0
25
+ all_model_params = 0
26
+ for _, param in model.named_parameters():
27
+ all_model_params += param.numel()
28
+ if param.requires_grad:
29
+ trainable_model_params += param.numel()
30
+ return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
31
+
32
+ class MachineTranslationTransformer(pl.LightningModule):
33
+
34
+ rouge = evaluate.load("rouge")
35
+ bleu = evaluate.load("sacrebleu")
36
+
37
+ def __init__(
38
+ self,
39
+ model_name,
40
+ tokenizer,
41
+ model_generation="t5",
42
+ model=None,
43
+ lr=1e-4,
44
+ weight_decay=1e-2,
45
+ num_warmup_steps=0,
46
+ num_training_steps=20000,
47
+ r=32,
48
+ lora_alpha=32,
49
+ lora_dropout=0.05,
50
+ bias="none",
51
+ max_new_tokens=200,
52
+ predict_with_generate=True,
53
+ num_beams=0,
54
+ use_peft=False,
55
+ embedding_size=128,
56
+ num_layers=6,
57
+ hidden_size=128,
58
+ dropout=0.1,
59
+ bidirectional=False
60
+ ):
61
+
62
+ super().__init__()
63
+
64
+ if model is None:
65
+ if model_generation in ["t5"]:
66
+
67
+ self.original_model = T5ForConditionalGeneration.from_pretrained(
68
+ model_name, torch_dtype=torch.float32
69
+ )
70
+
71
+ elif model_generation in ["mt5"]:
72
+
73
+ self.original_model = MT5ForConditionalGeneration.from_pretrained(
74
+ model_name, torch_dtype=torch.float32
75
+ )
76
+
77
+ elif model_generation in ["nllb"]:
78
+
79
+ self.original_model = AutoModelForSeq2SeqLM.from_pretrained(
80
+ model_name, torch_dtype=torch.float32
81
+ )
82
+
83
+ elif model_generation in ["bart"]:
84
+
85
+ self.original_model = BartForConditionalGeneration.from_pretrained(
86
+ model_name, torch_dtype=torch.float32
87
+ )
88
+
89
+ elif model_generation in ["lstm"]:
90
+
91
+ self.original_model = LSTMSequenceToSequence(tokenizer, embedding_size, num_layers, hidden_size, dropout, bidirectional)
92
+
93
+ # resize the token embeddings
94
+ if model_generation not in ["lstm"]: self.original_model.resize_token_embeddings(len(tokenizer))
95
+
96
+ if use_peft and model_generation not in ["lstm"]:
97
+
98
+ self.lora_config = LoraConfig(
99
+ r=r, # Rank
100
+ lora_alpha=lora_alpha,
101
+ lora_dropout=lora_dropout,
102
+ bias=bias,
103
+ task_type=TaskType.SEQ_2_SEQ_LM,
104
+ )
105
+
106
+ self.model = get_peft_model(self.original_model, self.lora_config)
107
+
108
+ else:
109
+
110
+ self.model = self.original_model
111
+
112
+ else:
113
+
114
+ self.model = model
115
+
116
+ print(print_number_of_trainable_model_parameters(self.model))
117
+
118
+ self.tokenizer = tokenizer
119
+
120
+ self.lr = lr
121
+
122
+ self.weight_decay = weight_decay
123
+
124
+ self.num_warmup_steps = num_warmup_steps
125
+
126
+ self.num_training_steps = num_training_steps
127
+
128
+ self.predict_with_generate = predict_with_generate
129
+
130
+ self.max_new_tokens = max_new_tokens
131
+
132
+ self.num_beams = num_beams
133
+
134
+ self.model_generation = model_generation
135
+
136
+ self.predictions = {
137
+ "Source references": [],
138
+ "Predictions": [],
139
+ "Target references": [],
140
+ }
141
+
142
+ def forward(self, input):
143
+
144
+ output = self.model(**input) if self.model_generation not in ["lstm"] else self.model(input['input_ids'], input['labels'])
145
+
146
+ return output.loss, output.logits
147
+
148
+ def configure_optimizers(self):
149
+
150
+ if self.model_generation in ["t5", "mt5"]:
151
+
152
+ optimizer = Adafactor(
153
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay, relative_step = False,
154
+ warmup_init = False
155
+ )
156
+
157
+ elif self.model_generation in ["bart", "mbart"]:
158
+
159
+ optimizer = torch.optim.AdamW(
160
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
161
+ )
162
+
163
+ elif self.model_generation in ["lstm"]:
164
+
165
+ optimizer = torch.optim.AdamW(
166
+ self.parameters(), lr=self.lr, weight_decay=self.weight_decay
167
+ )
168
+
169
+ if self.model_generation in ["t5", "lstm", "mt5"]:
170
+
171
+ return [optimizer]
172
+
173
+ elif self.model_generation in ["bart"]:
174
+
175
+ scheduler = get_linear_schedule_with_warmup(
176
+ optimizer,
177
+ num_warmup_steps=self.num_warmup_steps,
178
+ num_training_steps=self.num_training_steps,
179
+ )
180
+
181
+ return {'optimizer': optimizer, 'lr_scheduler': {"scheduler": scheduler}}
182
+
183
+ def training_step(self, batch, batch_idx=None):
184
+
185
+ loss, y_pred = self(batch)
186
+
187
+ self.log_dict(
188
+ {"train_loss": loss, "global_step": float(self.global_step)},
189
+ prog_bar=True,
190
+ on_step=False,
191
+ on_epoch=True,
192
+ sync_dist=True,
193
+ )
194
+
195
+ mean_grad = get_gradients_mean(self.original_model)
196
+
197
+ wandb.log({"train_loss": loss, "trainer/global_step": self.global_step, "mean_gradient": mean_grad})
198
+
199
+ return loss
200
+
201
+ def validation_step(self, batch, batch_idx=None):
202
+
203
+ loss, y_pred = self(batch)
204
+
205
+ metrics = {}
206
+
207
+ if self.predict_with_generate:
208
+
209
+ # generate predictions
210
+ predictions = self.model.generate(
211
+ input_ids=batch["input_ids"],
212
+ attention_mask=batch["attention_mask"],
213
+ max_new_tokens=self.max_new_tokens,
214
+ ) if self.model_generation not in ["lstm"] else self.model.generate(
215
+ input=batch["input_ids"],
216
+ max_new_tokens=self.max_new_tokens,
217
+ )
218
+
219
+ # decode the labels
220
+ predictions = self.tokenizer.batch_decode(
221
+ predictions, skip_special_tokens=True
222
+ )
223
+ labels = self.tokenizer.batch_decode(
224
+ batch["labels"], skip_special_tokens=True
225
+ )
226
+
227
+ # get bleu metric
228
+ bleu = self.bleu.compute(
229
+ predictions=predictions,
230
+ references=[[label.strip()] for label in labels],
231
+ )
232
+
233
+ metrics["bleu"] = bleu["score"]
234
+
235
+ # get rouge metrics
236
+ rouge = self.rouge.compute(
237
+ predictions=predictions, references=[label.strip() for label in labels]
238
+ )
239
+
240
+ metrics.update({k: v for k, v in rouge.items() if "rouge" in k})
241
+
242
+ # get the loss
243
+ metrics.update(
244
+ {"eval_loss": loss.item(), "global_step": float(self.global_step)}
245
+ )
246
+
247
+ self.log_dict(
248
+ metrics, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True
249
+ )
250
+
251
+ metrics.update({"trainer/global_step": self.global_step})
252
+
253
+ wandb.log(metrics)
254
+
255
+ return loss
256
+
257
+ def test_step(self, batch, batch_idx):
258
+
259
+ loss, y_pred = self(batch)
260
+
261
+ references = self.tokenizer.batch_decode(
262
+ batch["input_ids"], skip_special_tokens=True
263
+ )
264
+
265
+ # generate predictions
266
+ predictions = self.model.generate(
267
+ input_ids=batch["input_ids"],
268
+ attention_mask=batch["attention_mask"],
269
+ max_new_tokens=self.max_new_tokens,
270
+ do_sample=self.num_beams > 0,
271
+ num_beams=self.num_beams
272
+ ) if self.model_generation not in ["lstm"] else self.model.generate(
273
+ input=batch["input_ids"],
274
+ max_new_tokens=self.max_new_tokens,
275
+ use_sampling=True
276
+ )
277
+
278
+ # decode the labels
279
+ predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
280
+ labels = self.tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
281
+
282
+ self.predictions["Source references"].extend(references)
283
+ self.predictions["Predictions"].extend(predictions)
284
+ self.predictions["Target references"].extend(labels)
285
+
286
+ # get bleu metric
287
+ bleu = self.bleu.compute(
288
+ predictions=predictions, references=[[label.strip()] for label in labels]
289
+ )
290
+
291
+ metrics = {}
292
+
293
+ metrics["bleu"] = bleu["score"]
294
+
295
+ # get rouge metrics
296
+ rouge = self.rouge.compute(predictions=predictions, references=labels)
297
+
298
+ metrics.update({k: v for k, v in rouge.items() if "rouge" in k})
299
+
300
+ # get the loss
301
+ metrics.update(
302
+ {"test_loss": loss.item(), "global_step": float(self.global_step)}
303
+ )
304
+
305
+ # log metrics
306
+ self.log_dict(
307
+ metrics, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True
308
+ )
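A hedged sketch of wiring this LightningModule into a pytorch_lightning Trainer. The model name, data path, tokenizer file and wandb setup are placeholders; wandb is initialised in offline mode here only because training_step and validation_step log through wandb.log directly.

# Illustrative training wiring; model name, paths and hyperparameters are placeholders.
import pytorch_lightning as pl
import wandb
from transformers import T5TokenizerFast
from translate_package.data.data_preparation import get_loaders
from translate_package.models.machine_translation import MachineTranslationTransformer

wandb.init(project="wolof-translate", mode="offline")  # offline run; no API key needed

tokenizer = T5TokenizerFast(vocab_file="tokenizers/wf_v5.model")  # assumed SentencePiece file

module = MachineTranslationTransformer(
    model_name="google-t5/t5-small",
    tokenizer=tokenizer,
    model_generation="t5",
    lr=1e-4,
    max_new_tokens=100,
)

# Plain (non-bucketed) loaders; same placeholder arguments as in the earlier data sketch.
loaders = get_loaders(tokenizer, "t5", "WOLOF", "FRENCH", [32], "data/sentences.csv",
                      0.1, 0.1, 42, 0.12, 0.82, 21, 10, 128, 128, 2, "cuda",
                      use_bucketing=False, use_truncation=False, batch_size=32)

trainer = pl.Trainer(max_epochs=1, accelerator="auto")
trainer.fit(module, loaders["train_loader"], loaders["valid_loader"])
trainer.test(module, loaders["test_loader"])
wandb.finish()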
File without changes
@@ -0,0 +1,40 @@
1
+ from transformers import BartTokenizerFast
2
+ from transformers import T5TokenizerFast
3
+ from transformers import AutoTokenizer
4
+ import os
5
+
6
+ def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None):
7
+
8
+ if model == "nllb":
9
+
10
+ if model_name is not None:
11
+
12
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
13
+
14
+ print(f"The {model}'s tokenizer was successfully loaded")
15
+
16
+ else:
17
+
18
+ raise ValueError("For the nllb model you must specify the path to the model!")
19
+
20
+ if tokenizer_name == "bpe":
21
+
22
+ tokenizer_path = os.path.join(dir_path, f"{file_name}.json")
23
+
24
+ if model in ["bart", "lstm"]:
25
+
26
+ tokenizer = BartTokenizerFast(tokenizer_file=tokenizer_path)
27
+
28
+ print(f"The Byte Pair Encoding tokenizer was successfully uploaded from {tokenizer_path}")
29
+
30
+ elif tokenizer_name == "sp":
31
+
32
+ tokenizer_path = os.path.join(dir_path, f"{file_name}.model")
33
+
34
+ if model in ['t5', 'mt5']:
35
+
36
+ tokenizer = T5TokenizerFast(vocab_file=tokenizer_path)
37
+
38
+ print(f"The Sentence Piece tokenizer was successfully uploaded from {tokenizer_path}")
39
+
40
+ return tokenizer
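A hedged example of calling load_tokenizer; the directory and file name are placeholders that follow the naming scheme the function expects (a .json file for 'bpe', a .model file for 'sp').

# Illustrative calls; 'tokenizers/' and 'wf_v5' are placeholder paths.
from translate_package.tokenization.load_tokenizer import load_tokenizer

# SentencePiece tokenizer for a T5-family model -> reads tokenizers/wf_v5.model
sp_tokenizer = load_tokenizer(tokenizer_name="sp", model="t5",
                              dir_path="tokenizers", file_name="wf_v5")

# Byte Pair Encoding tokenizer for BART or the LSTM baseline -> reads tokenizers/wf_v5.json
bpe_tokenizer = load_tokenizer(tokenizer_name="bpe", model="bart",
                               dir_path="tokenizers", file_name="wf_v5")

# NLLB reuses its pretrained tokenizer; model_name is mandatory in that case.
nllb_tokenizer = load_tokenizer(tokenizer_name="", model="nllb",
                                dir_path="", file_name="",
                                model_name="facebook/nllb-200-distilled-600M")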
@@ -0,0 +1,83 @@
1
+
2
+ from translate_package import argparse, spm, pd, os, Tokenizer, models, pre_tokenizers, trainers, processors, decoders
3
+ from translate_package.errors import TokenizerException
4
+
5
+ def train_tokenizer(arguments):
6
+
7
+ # load the dataset
8
+ dataset = pd.read_csv(arguments.dataset_file).astype(str)
9
+
10
+ if arguments.name == 'bpe':
11
+
12
+ # initialize tokenizer
13
+ tokenizer = Tokenizer(models.BPE())
14
+
15
+ # initialize the pre-tokenizer
16
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space = False)
17
+
18
+ # initialize trainer
19
+ trainer = trainers.BpeTrainer(vocab_size = arguments.vocab_size, special_tokens = ['<s>', '<pad>', '</s>', '<unk>'])
20
+
21
+ # iterate over the dataset and return the sentences
22
+ def get_training_corpus():
23
+
24
+ sentences = dataset[arguments.src_label].tolist() + dataset[arguments.tgt_label].tolist()
25
+
26
+ for i in range(0, len(sentences), arguments.batch_size):
27
+
28
+ yield sentences[i: i + arguments.batch_size]
29
+
30
+ # train the tokenizer
31
+ tokenizer.train_from_iterator(get_training_corpus(), trainer = trainer)
32
+
33
+ tokenizer.post_processor = processors.ByteLevel(trim_offsets = False)
34
+
35
+ tokenizer.decoder = decoders.ByteLevel()
36
+
37
+ # get path
38
+ tk_path = os.path.join(arguments.save_path, f'{arguments.file_name}.json')
39
+
40
+ tokenizer.save(tk_path)
41
+
42
+ print(f"The Byte Pair Encoding tokenizer was saved as {tk_path}!")
43
+
44
+ elif arguments.name == 'sp':
45
+
46
+ # print sentences into a file
47
+ with open('sents.txt', 'w', encoding = 'utf-8') as f:
48
+
49
+ sentences = dataset[arguments.src_label].tolist() + dataset[arguments.tgt_label].tolist()
50
+
51
+ for i in range(0, len(sentences), arguments.batch_size):
52
+
53
+ sents = sentences[i: i + arguments.batch_size]
54
+
55
+ f.write("\n".join(sents)+'\n')
56
+
57
+ # get path
58
+ tk_path = os.path.join(arguments.save_path, arguments.file_name)
59
+
60
+ # initialize sentence piece trainer
61
+ spm.SentencePieceTrainer.Train(input = 'sents.txt',
62
+ model_prefix=os.path.join(arguments.save_path, arguments.file_name),
63
+ vocab_size=arguments.vocab_size,
64
+ character_coverage=1.0,
65
+ pad_id=0,
66
+ eos_id=1,
67
+ unk_id=2,
68
+ bos_id=3,
69
+ pad_piece='<pad>',
70
+ eos_piece='</s>',
71
+ unk_piece='<unk>',
72
+ bos_piece='<s>',
73
+ )
74
+
75
+ # remove file
76
+ os.remove('sents.txt')
77
+
78
+ print(f"The Sentence Piece tokenizer was saved as {tk_path}(.model / for model)!")
79
+
80
+ else:
81
+
82
+ raise TokenizerException("You can only train a sentence piece (as 'sp') tokenizer, or a byte pair encoding (as 'bpe') tokenizer!")
83
+
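train_tokenizer reads its settings from an argparse-style namespace; below is a hedged sketch whose attribute names match the ones accessed above, while the dataset path, column names and sizes are placeholders.

# Illustrative invocation with an argparse-style namespace; all values are placeholders.
import os
from argparse import Namespace
from translate_package.tokenization.train_tokenizer import train_tokenizer

os.makedirs("tokenizers", exist_ok=True)

arguments = Namespace(
    name="sp",                        # 'sp' (SentencePiece) or 'bpe'
    dataset_file="data/sentences.csv",
    src_label="WOLOF",
    tgt_label="FRENCH",
    vocab_size=16000,
    batch_size=1000,
    save_path="tokenizers",
    file_name="wf_v5",
)

train_tokenizer(arguments)            # writes tokenizers/wf_v5.model and tokenizers/wf_v5.vocab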
File without changes
@@ -0,0 +1,16 @@
1
+ from translate_package import wandb
2
+
3
+
4
+ def download_checkpoint(project, artifact_path, key):
5
+
6
+ wandb.login(key=key)
7
+
8
+ run = wandb.init(project=project)
9
+
10
+ artifact = run.use_artifact(artifact_path, type="dataset")
11
+
12
+ artifact_dir = artifact.download()
13
+
14
+ wandb.finish()
15
+
16
+ return artifact_dir
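A hedged example of download_checkpoint; the project, artifact path and API key are placeholders.

# Illustrative only: the project, artifact path and API key are placeholders.
from translate_package.utils.checkpoint import download_checkpoint

artifact_dir = download_checkpoint(
    project="wolof-translate",
    artifact_path="my-team/wolof-translate/checkpoints:latest",  # hypothetical artifact
    key="YOUR_WANDB_API_KEY",
)
print(artifact_dir)   # local directory containing the downloaded artifact files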
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.1
2
+ Name: translate-package
3
+ Version: 0.0.1
4
+ Summary: Contains functions and classes to efficiently train a sequence-to-sequence model to translate between two languages.
5
+ Author: Oumar Kane
6
+ Author-email: oumar.kane@univ-thies.sn
7
+ Requires-Dist: accelerate
8
+ Requires-Dist: torch
9
+ Requires-Dist: spacy
10
+ Requires-Dist: nltk
11
+ Requires-Dist: gensim
12
+ Requires-Dist: furo
13
+ Requires-Dist: streamlit
14
+ Requires-Dist: tokenizers
15
+ Requires-Dist: tensorboard
16
+ Requires-Dist: evaluate
17
+ Requires-Dist: transformers
18
+ Requires-Dist: pandas
19
+ Requires-Dist: numpy
20
+ Requires-Dist: scikit-learn
21
+ Requires-Dist: matplotlib
22
+ Requires-Dist: plotly
23
+ Requires-Dist: sacrebleu
24
+ Requires-Dist: nlpaug
25
+ Requires-Dist: wandb
26
+ Requires-Dist: pytorch-lightning
27
+ Requires-Dist: selenium
28
+ Requires-Dist: sentencepiece
29
+ Requires-Dist: peft
30
+ Requires-Dist: rouge-score
31
+
@@ -0,0 +1,17 @@
1
+ translate_package/__init__.py,sha256=Nckjm15LBEfKSU5-EBjfRkHJQhWELzwow0BD3rKtmkw,1297
2
+ translate_package/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ translate_package/data/data_preparation.py,sha256=sDMKL9LWcwXp6Iy2eS-1n-NWgkZhvhRyFN7k7uID1_w,14804
4
+ translate_package/errors/__init__.py,sha256=gu6XjAIghG4lLkYo8x_7_yyLRtK2FIvmC-WcfJaeOlg,299
5
+ translate_package/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ translate_package/models/gradient_observation.py,sha256=P91UA5i-RdkK46TqpPOJ54DsUYgTI9cRohgPS1Ch0Lc,294
7
+ translate_package/models/lstm.py,sha256=OPkvvceowz5JqdGGH4cfPhH23kbP11z-29zIJn5d8ig,3273
8
+ translate_package/models/machine_translation.py,sha256=5QQpjs_HR9mnPryMyfYpcMgU5tHAAj-eVrv3oGmjR5Y,9963
9
+ translate_package/tokenization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ translate_package/tokenization/load_tokenizer.py,sha256=vzCHS0ZDSJyr0y08zNvupMtD2jP8A16EBN-ob0LJHG0,1344
11
+ translate_package/tokenization/train_tokenizer.py,sha256=RkdT5DUx201OBNaswM6m54iqcrmCThd3ITLguQb_zVM,3347
12
+ translate_package/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ translate_package/utils/checkpoint.py,sha256=GqymRvF8_QZgrQq9m79Ppj6Qr7NQm78kDARm3p_chC0,322
14
+ translate_package-0.0.1.dist-info/METADATA,sha256=eHD4CGRhoQHOl-EXUPYPTQ9kGQCw5q3Ju0xkmB3PGTM,819
15
+ translate_package-0.0.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
16
+ translate_package-0.0.1.dist-info/top_level.txt,sha256=8e2HIrGAMzoSukqu2q929dOJMV1zGYKI_BAFwl-P7XU,18
17
+ translate_package-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.37.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ translate_package