translate_package-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- translate_package/__init__.py +45 -0
- translate_package/data/__init__.py +0 -0
- translate_package/data/data_preparation.py +424 -0
- translate_package/errors/__init__.py +12 -0
- translate_package/models/__init__.py +0 -0
- translate_package/models/gradient_observation.py +9 -0
- translate_package/models/lstm.py +97 -0
- translate_package/models/machine_translation.py +308 -0
- translate_package/tokenization/__init__.py +0 -0
- translate_package/tokenization/load_tokenizer.py +40 -0
- translate_package/tokenization/train_tokenizer.py +83 -0
- translate_package/utils/__init__.py +0 -0
- translate_package/utils/checkpoint.py +16 -0
- translate_package-0.0.1.dist-info/METADATA +31 -0
- translate_package-0.0.1.dist-info/RECORD +17 -0
- translate_package-0.0.1.dist-info/WHEEL +5 -0
- translate_package-0.0.1.dist-info/top_level.txt +1 -0
translate_package/__init__.py

@@ -0,0 +1,45 @@

# -*- coding: utf-8 -*-
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)
from transformers import (
    GenerationConfig,
    TrainingArguments,
    Trainer, AutoModelForSeq2SeqLM,
    get_linear_schedule_with_warmup,
    T5ForConditionalGeneration, Adafactor, BartForConditionalGeneration,
    MT5ForConditionalGeneration, AdamWeightDecay
)
from wolof_translate.utils.bucket_iterator import SequenceLengthBatchSampler, BucketSampler
from wolof_translate.utils.sent_transformers import TransformerSequences
from wolof_translate.utils.sent_corrections import *
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from nlpaug.augmenter import char as nac
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from functools import partial
import sentencepiece as spm
from math import ceil
import pandas as pd
import numpy as np
import argparse
import evaluate
import string
import random
import shutil
import wandb
import torch
import time
import nltk
import os

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"
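The top-level `__init__.py` re-exports the third-party dependencies, so the other modules import them from `translate_package` instead of from the libraries directly. A minimal sketch of that shared namespace in use (the CSV path is a placeholder, not shipped with the package):

# Hypothetical downstream usage of the re-exported dependencies.
from translate_package import pd, torch, train_test_split

corpus = pd.read_csv("data/corpus.csv")  # placeholder path
train_df, test_df = train_test_split(corpus, test_size=0.1, random_state=0)
print(torch.cuda.is_available(), len(train_df), len(test_df))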
translate_package/data/__init__.py

File without changes
translate_package/data/data_preparation.py

@@ -0,0 +1,424 @@

from translate_package import (
    pd,
    train_test_split,
    Dataset,
    DataLoader,
    plt,
    torch,
    SequenceLengthBatchSampler,
    BucketSampler,
    partial,
    Union,
    Callable,
    ceil,
    np,
    TransformerSequences,
    nac,
    remove_mark_space,
    delete_guillemet_space
)

# The sentence beginning with "Mooy li ko waral ci li ñu xamle waaye itam" is too long and must be removed or corrected

# python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact

def augment(examples, src_label, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):

    examples[src_label] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[src_label])[0]

    return examples

def augment_(examples, src_label, tgt_label):

    examples[src_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[src_label])[0]

    examples[tgt_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[tgt_label])[0]

    return examples

def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):

    if model_generation in ["t5", "mt5", "nllb"]:

        eos_token = ""
        bos_token = ""

    else:

        eos_token = tokenizer.eos_token
        bos_token = tokenizer.bos_token

    examples[src_label] = bos_token + examples[src_label] + eos_token

    examples[tgt_label] = bos_token + examples[tgt_label] + eos_token

    examples.update({key: value[0] for key, value in tokenizer(examples[src_label], return_tensors = 'pt').items()})

    examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[tgt_label], return_tensors = 'pt').items()})

    examples['labels'] = examples['decoder_input_ids']

    return examples

def apply_funcs(funcs, data):
    # Apply each function to the data in order
    for func in funcs:
        data = func(data)
    return data

def sequences(examples, functions):

    for function in functions:

        examples = function(examples)

    return examples

class SentenceDataset(Dataset):

    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):

        assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()

        self.source_sentences = dataframe[source_column].tolist()

        self.target_sentences = dataframe[target_column].tolist()

        self.transformers = transformers

        self.source_column = source_column

        self.target_column = target_column

    def __getitem__(self, index):

        source_sentence = self.source_sentences[index]

        target_sentence = self.target_sentences[index]

        sentences = {
            self.source_column: source_sentence,
            self.target_column: target_sentence
        }

        if not self.transformers is None:

            sentences = self.transformers(sentences)

        return sentences

    def __len__(self):

        return len(self.source_sentences)


def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):

    # load the dataset with pandas
    dataset_ = pd.read_csv(data_path)

    # split the dataset between train, validation, and test sets
    if test_size == 1.0:

        dataset = {
            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
        }

    else:

        train, test = train_test_split(
            dataset_, test_size=test_size + valid_size, random_state=seed
        )

        valid, test = train_test_split(
            test, test_size=test_size / (valid_size + test_size), random_state=seed
        )

        dataset = {
            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
        }

    # The dataset dictionary contains 3 different splits: train, validation, and test.

    return dataset

def get_boundaries(dataset, sizes, min_count):

    length = []

    for i in range(len(dataset)):

        length.append(max(len(dataset[i]["input_ids"]), len(dataset[i]["labels"])))

    # Create histogram
    hist, bins, _ = plt.hist(length, bins=10)  # Adjust the number of bins as needed

    # Analyze the histogram
    # Identify peaks or gaps to determine the boundaries

    # Choose the boundaries based on the analysis
    boundaries = (
        [ceil(bins[0])]
        + [ceil(bin) for bin, count in zip(bins[1:], hist) if count > min_count]
        + [np.inf]
    )

    boundaries = boundaries[:-1]

    # define batch sizes and samplers
    batch_sizes = [
        sizes[i] if (i + 1) < len(sizes) else sizes[-1] for i in range(len(boundaries))
    ]

    return boundaries, batch_sizes

def collate_fn_trunc(batch, input_max_len, label_max_len, eos_token_id, pad_token_id, keys: list = ['input_ids', 'attention_mask', 'labels']):

    from torch.nn.utils.rnn import pad_sequence

    df_dict = {key: [] for key in keys}

    for b in batch:

        for key in df_dict:

            df_dict[key].append(b[key])

    padded_sequences = {}

    for key in df_dict:

        max_len = label_max_len if 'decoder' in key or 'label' in key else input_max_len

        padding_value = 0 if 'mask' in key else pad_token_id  # attention masks must be padded with 0

        # Pad the input sequences to have the same length
        padded_sequences[key] = pad_sequence(df_dict[key], batch_first=True, padding_value = padding_value)[:,:max_len]

        # set the eos token if it is not already there
        if not 'mask' in key:

            padded_sequences[key][:, -1:][(padded_sequences[key][:, -1:] != eos_token_id) & (padded_sequences[key][:, -1:] != pad_token_id)] = eos_token_id

    return padded_sequences

# define padding collate function
def pad_collate(batch, padding_value):

    X = [b["input_ids"] for b in batch]
    att = [b["attention_mask"] for b in batch]
    y = [b["labels"] for b in batch]

    X_ = torch.nn.utils.rnn.pad_sequence(
        X, batch_first=True, padding_value=padding_value
    )
    att_ = torch.nn.utils.rnn.pad_sequence(att, batch_first=True, padding_value=0)
    y_ = torch.nn.utils.rnn.pad_sequence(
        y, batch_first=True, padding_value=padding_value
    )

    return {"input_ids": X_, "attention_mask": att_, "labels": y_}


def get_loaders(
    tokenizer,
    model_generation,
    src_label,
    tgt_label,
    sizes,
    data_path,
    test_size,
    valid_size,
    seed,
    p_word,
    p_char,
    max_words,
    count,
    src_max_len,
    tgt_max_len,
    num_workers,
    device,
    use_bucketing,
    use_truncation,
    batch_size,
):

    # get the dataset
    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)

    # analysis transformations
    a_transformers = partial(sequences,
                             functions = [
                                 partial(augment_, src_label = src_label, tgt_label = tgt_label),
                                 partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
                             ])

    # training transformations
    t_transformers = partial(sequences,
                             functions = [
                                 partial(augment, src_label = src_label, p_word = p_word, p_char = p_char, max_words = max_words),
                                 partial(augment_, src_label = src_label, tgt_label = tgt_label),
                                 partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
                             ])

    if use_bucketing:

        if use_truncation:

            # initialize the samplers
            train_sampler = BucketSampler(
                dataset["train"](transformers=a_transformers),
                batch_size=batch_size,
                input_key="input_ids",
                label_key="labels",
            )

            valid_sampler = BucketSampler(
                dataset["val"](transformers=a_transformers),
                batch_size=batch_size,
                input_key="input_ids",
                label_key="labels",
            )

            test_sampler = BucketSampler(
                dataset["test"](transformers=a_transformers),
                batch_size=batch_size,
                input_key="input_ids",
                label_key="labels",
            )

            # add transformations
            dataset = {s: dataset[s](transformers = t_transformers) if s == 'train' else dataset[s](transformers = a_transformers) for s in dataset}

            # define data loaders
            train_loader = DataLoader(
                dataset["train"],
                batch_sampler=train_sampler,
                collate_fn = partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
                                     eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )
            valid_loader = DataLoader(
                dataset["val"],
                batch_sampler=valid_sampler,
                collate_fn=partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
                                   eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )
            test_loader = DataLoader(
                dataset["test"],
                batch_sampler=test_sampler,
                collate_fn=partial(collate_fn_trunc, input_max_len = src_max_len, label_max_len = tgt_max_len,
                                   eos_token_id = tokenizer.eos_token_id, pad_token_id = tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )

        else:

            # get the boundaries
            boundaries, batch_sizes = get_boundaries(dataset['train'](transformers = a_transformers), sizes, count)

            # initialize the samplers
            train_sampler = SequenceLengthBatchSampler(
                dataset["train"](transformers=a_transformers),
                boundaries=boundaries,
                batch_sizes=batch_sizes,
                input_key="input_ids",
                label_key="labels",
            )

            valid_sampler = SequenceLengthBatchSampler(
                dataset["val"](transformers=a_transformers),
                boundaries=boundaries,
                batch_sizes=batch_sizes,
                input_key="input_ids",
                label_key="labels",
            )

            test_sampler = SequenceLengthBatchSampler(
                dataset["test"](transformers=a_transformers),
                boundaries=boundaries,
                batch_sizes=batch_sizes,
                input_key="input_ids",
                label_key="labels",
            )

            # add transformations
            dataset = {s: dataset[s](transformers = t_transformers) for s in dataset}

            # define data loaders
            train_loader = DataLoader(
                dataset["train"],
                batch_sampler=train_sampler,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )
            valid_loader = DataLoader(
                dataset["val"],
                batch_sampler=valid_sampler,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )
            test_loader = DataLoader(
                dataset["test"],
                batch_sampler=test_sampler,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )

    else:

        # add transformations
        dataset = {s: dataset[s](transformers = t_transformers) for s in dataset}

        if "train" in dataset:
            # define data loaders
            train_loader = DataLoader(
                dataset["train"],
                batch_size=batch_size,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
                shuffle=True,
            )

        if "val" in dataset:

            valid_loader = DataLoader(
                dataset["val"],
                batch_size=batch_size,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )

        if "test" in dataset:

            test_loader = DataLoader(
                dataset["test"],
                batch_size=batch_size,
                collate_fn=partial(pad_collate, padding_value=tokenizer.pad_token_id),
                num_workers=num_workers,
                pin_memory=True if device in ["cuda", "gpu"] else False,
            )

    if "train" in dataset and "val" in dataset:

        return {
            "train_loader": train_loader,
            "valid_loader": valid_loader,
            "test_loader": test_loader,
        }

    else:

        return {"test_loader": test_loader}
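For orientation, a minimal usage sketch of `get_loaders` (not part of the wheel). The tokenizer directory, CSV path, column names, and numeric values below are placeholder assumptions, not values shipped with the package:

# Hypothetical call; assumes a trained tokenizer and a CSV with WOLOF/FRENCH columns.
from translate_package.data.data_preparation import get_loaders
from translate_package.tokenization.load_tokenizer import load_tokenizer

tokenizer = load_tokenizer("sp", "t5", "tokenizers", "sp_tokenizer")  # placeholder dir/file names

loaders = get_loaders(
    tokenizer=tokenizer,
    model_generation="t5",
    src_label="WOLOF",
    tgt_label="FRENCH",
    sizes=[32, 16, 8],            # per-bucket batch sizes
    data_path="data/corpus.csv",  # placeholder path
    test_size=0.1,
    valid_size=0.1,
    seed=0,
    p_word=0.1,
    p_char=0.8,
    max_words=21,
    count=10,
    src_max_len=128,
    tgt_max_len=128,
    num_workers=2,
    device="cuda",
    use_bucketing=True,
    use_truncation=True,
    batch_size=16,
)
train_loader = loaders["train_loader"]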
translate_package/models/__init__.py

File without changes
translate_package/models/gradient_observation.py

@@ -0,0 +1,9 @@

from torch.nn import Module

def get_gradients_mean(model: Module):
    ave_grads = []
    for name, param in model.named_parameters():
        if param.grad is not None:
            ave_grads.append(param.grad.abs().mean().item())

    return sum(ave_grads)/(len(ave_grads) + 1e-5)
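A quick sketch of what this helper reports, using a throwaway linear model (purely illustrative, not part of the package):

import torch

from translate_package.models.gradient_observation import get_gradients_mean

model = torch.nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).sum()
loss.backward()

# Mean absolute gradient across all parameters that received a gradient.
print(get_gradients_mean(model))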
translate_package/models/lstm.py

@@ -0,0 +1,97 @@

import torch

class LSTMOutput:

    def __init__(self, logits, loss):

        self.logits = logits

        self.loss = loss

class LSTMSequenceToSequence(torch.nn.Module):

    def __init__(self, tokenizer, embedding_size = 128, num_layers = 6, hidden_size = 128, dropout=0.1, bidirectional = True):

        super().__init__()

        self.tokenizer = tokenizer

        self.vocab_size = self.tokenizer.vocab_size

        self.embedding = torch.nn.Embedding(self.vocab_size, embedding_size)

        self.encoder = torch.nn.LSTM(input_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, batch_first = True,
                                     bidirectional = bidirectional, dropout=dropout)

        self.decoder = torch.nn.LSTM(input_size = embedding_size, hidden_size = hidden_size, num_layers = num_layers, batch_first = True,
                                     bidirectional = bidirectional, dropout=dropout)

        copy = 2 if bidirectional else 1

        self.decoder_output_layer = torch.nn.Linear(copy * hidden_size, self.vocab_size)

        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    def forward(self, input, output):

        input_embed = self.embedding(input)

        state, hidden = self.encoder(input_embed)

        # shift the output to build the decoder input
        decoder_input = output[:, :-1]

        decoder_input = self.embedding(decoder_input)

        decoder_output, _ = self.decoder(decoder_input, hidden)

        decoder_output = self.decoder_output_layer(decoder_output)

        loss = self.loss_fn(decoder_output.reshape(-1, decoder_output.shape[-1]), output[:, 1:].reshape(-1))

        return LSTMOutput(decoder_output, loss)

    def generate(self, input, max_new_tokens: int = 100, temperature: float = 0.0, use_sampling = False, **kwargs):

        input_embed = self.embedding(input)

        _, hidden = self.encoder(input_embed)

        # initialize predictions
        predictions = torch.tensor([[self.tokenizer.bos_token_id]]*input_embed.shape[0], dtype=torch.long, device=input.device)

        # variable identifying if the sequence is finished
        finished_sequences = torch.zeros(input_embed.shape[0], dtype=torch.bool, device = input.device)

        # generate predictions
        for i in range(max_new_tokens):

            decoder_input = self.embedding(predictions)

            decoder_output, hidden = self.decoder(decoder_input, hidden)

            decoder_output = self.decoder_output_layer(decoder_output)

            if temperature > 0.0: decoder_output = (decoder_output / temperature)

            # get probs and sample the next token from a multinomial distribution
            probs = torch.softmax(decoder_output[:, -1], dim = -1)

            if use_sampling: prediction = torch.multinomial(probs, num_samples = 1)
            else: prediction = torch.argmax(probs, dim=-1, keepdim=True)

            # add the new prediction
            predictions = torch.cat((predictions, prediction), dim = -1)

            # recuperate the next ids
            next_token_ids = prediction.squeeze(-1)

            finished_sequences |= (next_token_ids == self.tokenizer.eos_token_id)

            if finished_sequences.all():

                break

        # return predictions
        return predictions[:, 1:]
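A minimal sketch of the LSTM seq2seq interface (not part of the wheel). The tokenizer here is a stand-in exposing only the attributes the class reads (`vocab_size`, `pad_token_id`, `bos_token_id`, `eos_token_id`), not one of the tokenizers produced by the package:

import torch
from types import SimpleNamespace

from translate_package.models.lstm import LSTMSequenceToSequence

# Stand-in tokenizer with placeholder ids.
tok = SimpleNamespace(vocab_size=1000, pad_token_id=0, bos_token_id=2, eos_token_id=1)

model = LSTMSequenceToSequence(tok, embedding_size=32, num_layers=2, hidden_size=32)

src = torch.randint(0, 1000, (4, 12))  # batch of source token ids
tgt = torch.randint(0, 1000, (4, 10))  # batch of target token ids

out = model(src, tgt)                   # LSTMOutput with .logits and .loss
generated = model.generate(src, max_new_tokens=20)
print(out.loss.item(), generated.shape)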
translate_package/models/machine_translation.py

@@ -0,0 +1,308 @@

from translate_package import (
    pl,
    evaluate,
    LoraConfig,
    TaskType,
    torch,
    get_linear_schedule_with_warmup,
    wandb,
    get_peft_model,
    T5ForConditionalGeneration,
    MT5ForConditionalGeneration,
    BartForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    Adafactor,
    AdamWeightDecay
)

from translate_package.models.gradient_observation import get_gradients_mean

from translate_package.models.lstm import LSTMSequenceToSequence


def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

class MachineTranslationTransformer(pl.LightningModule):

    rouge = evaluate.load("rouge")
    bleu = evaluate.load("sacrebleu")

    def __init__(
        self,
        model_name,
        tokenizer,
        model_generation="t5",
        model=None,
        lr=1e-4,
        weight_decay=1e-2,
        num_warmup_steps=0,
        num_training_steps=20000,
        r=32,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        max_new_tokens=200,
        predict_with_generate=True,
        num_beams=0,
        use_peft=False,
        embedding_size=128,
        num_layers=6,
        hidden_size=128,
        dropout=0.1,
        bidirectional=False
    ):

        super().__init__()

        if model is None:
            if model_generation in ["t5"]:

                self.original_model = T5ForConditionalGeneration.from_pretrained(
                    model_name, torch_dtype=torch.float32
                )

            elif model_generation in ["mt5"]:

                self.original_model = MT5ForConditionalGeneration.from_pretrained(
                    model_name, torch_dtype=torch.float32
                )

            elif model_generation in ["nllb"]:

                self.original_model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_name, torch_dtype=torch.float32
                )

            elif model_generation in ["bart"]:

                self.original_model = BartForConditionalGeneration.from_pretrained(
                    model_name, torch_dtype=torch.float32
                )

            elif model_generation in ["lstm"]:

                self.original_model = LSTMSequenceToSequence(tokenizer, embedding_size, num_layers, hidden_size, dropout, bidirectional)

            # resize the token embeddings
            if not model_generation in ["lstm"]: self.original_model.resize_token_embeddings(len(tokenizer))

            if use_peft and not model_generation in ["lstm"]:

                self.lora_config = LoraConfig(
                    r=r,  # Rank
                    lora_alpha=lora_alpha,
                    lora_dropout=lora_dropout,
                    bias=bias,
                    task_type=TaskType.SEQ_2_SEQ_LM,
                )

                self.model = get_peft_model(self.original_model, self.lora_config)

            else:

                self.model = self.original_model

        else:

            self.model = model

        print(print_number_of_trainable_model_parameters(self.model))

        self.tokenizer = tokenizer

        self.lr = lr

        self.weight_decay = weight_decay

        self.num_warmup_steps = num_warmup_steps

        self.num_training_steps = num_training_steps

        self.predict_with_generate = predict_with_generate

        self.max_new_tokens = max_new_tokens

        self.num_beams = num_beams

        self.model_generation = model_generation

        self.predictions = {
            "Source references": [],
            "Predictions": [],
            "Target references": [],
        }

    def forward(self, input):

        output = self.model(**input) if not self.model_generation in ["lstm"] else self.model(input['input_ids'], input['labels'])

        return output.loss, output.logits

    def configure_optimizers(self):

        if self.model_generation in ["t5", "mt5"]:

            optimizer = Adafactor(
                self.parameters(), lr=self.lr, weight_decay=self.weight_decay, relative_step = False,
                warmup_init = False
            )

        elif self.model_generation in ["bart", "mbart"]:

            optimizer = torch.optim.AdamW(
                self.parameters(), lr=self.lr, weight_decay=self.weight_decay
            )

        elif self.model_generation in ["lstm"]:

            optimizer = torch.optim.AdamW(
                self.parameters(), lr=self.lr, weight_decay=self.weight_decay
            )

        if self.model_generation in ["t5", "lstm", "mt5"]:

            return [optimizer]

        elif self.model_generation in ["bart"]:

            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.num_warmup_steps,
                num_training_steps=self.num_training_steps,
            )

            return {'optimizer': optimizer, 'lr_scheduler': {"scheduler": scheduler}}

    def training_step(self, batch, batch_idx=None):

        loss, y_pred = self(batch)

        self.log_dict(
            {"train_loss": loss, "global_step": float(self.global_step)},
            prog_bar=True,
            on_step=False,
            on_epoch=True,
            sync_dist=True,
        )

        mean_grad = get_gradients_mean(self.original_model)

        wandb.log({"train_loss": loss, "trainer/global_step": self.global_step, "mean_gradient": mean_grad})

        return loss

    def validation_step(self, batch, batch_idx=None):

        loss, y_pred = self(batch)

        metrics = {}

        if self.predict_with_generate:

            # generate predictions
            predictions = self.model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_new_tokens=self.max_new_tokens,
            ) if not self.model_generation in ["lstm"] else self.model.generate(
                input=batch["input_ids"],
                max_new_tokens=self.max_new_tokens,
            )

            # decode the predictions and the labels
            predictions = self.tokenizer.batch_decode(
                predictions, skip_special_tokens=True
            )
            labels = self.tokenizer.batch_decode(
                batch["labels"], skip_special_tokens=True
            )

            # get the bleu metric
            bleu = self.bleu.compute(
                predictions=predictions,
                references=[[label.strip()] for label in labels],
            )

            metrics["bleu"] = bleu["score"]

            # get the rouge metrics
            rouge = self.rouge.compute(
                predictions=predictions, references=[label.strip() for label in labels]
            )

            metrics.update({k: v for k, v in rouge.items() if "rouge" in k})

        # get the loss
        metrics.update(
            {"eval_loss": loss.item(), "global_step": float(self.global_step)}
        )

        self.log_dict(
            metrics, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True
        )

        metrics.update({"trainer/global_step": self.global_step})

        wandb.log(metrics)

        return loss

    def test_step(self, batch, batch_idx):

        loss, y_pred = self(batch)

        references = self.tokenizer.batch_decode(
            batch["input_ids"], skip_special_tokens=True
        )

        # generate predictions
        predictions = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_new_tokens=self.max_new_tokens,
            do_sample=self.num_beams > 0,
            num_beams=self.num_beams
        ) if not self.model_generation in ["lstm"] else self.model.generate(
            input=batch["input_ids"],
            max_new_tokens=self.max_new_tokens,
            use_sampling=True
        )

        # decode the predictions and the labels
        predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = self.tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

        self.predictions["Source references"].extend(references)
        self.predictions["Predictions"].extend(predictions)
        self.predictions["Target references"].extend(labels)

        # get the bleu metric
        bleu = self.bleu.compute(
            predictions=predictions, references=[[label.strip()] for label in labels]
        )

        metrics = {}

        metrics["bleu"] = bleu["score"]

        # get the rouge metrics
        rouge = self.rouge.compute(predictions=predictions, references=labels)

        metrics.update({k: v for k, v in rouge.items() if "rouge" in k})

        # get the loss
        metrics.update(
            {"test_loss": loss.item(), "global_step": float(self.global_step)}
        )

        # log metrics
        self.log_dict(
            metrics, prog_bar=True, on_step=False, on_epoch=True, sync_dist=True
        )
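A minimal training sketch around this LightningModule (not part of the wheel). The `tokenizer` and loaders are assumed from the earlier sketches, the project name is a placeholder, and `wandb` must be initialised because the module logs to it directly inside `training_step`; the model id is the one named in the command comment in data_preparation.py:

import pytorch_lightning as pl
import wandb

from translate_package.models.machine_translation import MachineTranslationTransformer

# Assumes `tokenizer`, `train_loader`, and `valid_loader` were built as sketched above.
wandb.init(project="wolof-translation")  # placeholder project name

module = MachineTranslationTransformer(
    model_name="google-t5/t5-small",
    tokenizer=tokenizer,
    model_generation="t5",
)

trainer = pl.Trainer(max_epochs=1, accelerator="auto")
trainer.fit(module, train_dataloaders=train_loader, val_dataloaders=valid_loader)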
translate_package/tokenization/__init__.py

File without changes
translate_package/tokenization/load_tokenizer.py

@@ -0,0 +1,40 @@

from transformers import BartTokenizerFast
from transformers import T5TokenizerFast
from transformers import AutoTokenizer
import os

def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None):

    if model == "nllb":

        if not model_name is None:

            tokenizer = AutoTokenizer.from_pretrained(model_name)

            print(f"The {model}'s tokenizer was successfully loaded")

        else:

            raise ValueError("For the nllb model you must specify the path to the model!")

    if tokenizer_name == "bpe":

        tokenizer_path = os.path.join(dir_path, f"{file_name}.json")

        if model in ["bart", "lstm"]:

            tokenizer = BartTokenizerFast(tokenizer_file=tokenizer_path)

            print(f"The Byte Pair Encoding tokenizer was successfully loaded from {tokenizer_path}")

    elif tokenizer_name == "sp":

        tokenizer_path = os.path.join(dir_path, f"{file_name}.model")

        if model in ['t5', 'mt5']:

            tokenizer = T5TokenizerFast(vocab_file=tokenizer_path)

            print(f"The Sentence Piece tokenizer was successfully loaded from {tokenizer_path}")

    return tokenizer
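Usage sketch (not part of the wheel). The directory and file names are placeholders; the function expects the tokenizer files produced by train_tokenizer.py below:

from translate_package.tokenization.load_tokenizer import load_tokenizer

# SentencePiece tokenizer for a T5-style model; expects tokenizers/sp_tokenizer.model.
sp_tokenizer = load_tokenizer("sp", "t5", "tokenizers", "sp_tokenizer")

# Byte Pair Encoding tokenizer for a BART or LSTM model; expects tokenizers/bpe_tokenizer.json.
bpe_tokenizer = load_tokenizer("bpe", "bart", "tokenizers", "bpe_tokenizer")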
translate_package/tokenization/train_tokenizer.py

@@ -0,0 +1,83 @@

from translate_package import argparse, spm, pd, os, Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from translate_package.errors import TokenizerException

def train_tokenizer(arguments):

    # recuperate the dataset
    dataset = pd.read_csv(arguments.dataset_file).astype(str)

    if arguments.name == 'bpe':

        # initialize the tokenizer
        tokenizer = Tokenizer(models.BPE())

        # initialize the pre-tokenizer
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space = False)

        # initialize the trainer
        trainer = trainers.BpeTrainer(vocab_size = arguments.vocab_size, special_tokens = ['<s>', '<pad>', '</s>', '<unk>'])

        # iterate over the dataset and return the sentences
        def get_training_corpus():

            sentences = dataset[arguments.src_label].tolist() + dataset[arguments.tgt_label].tolist()

            for i in range(0, len(sentences), arguments.batch_size):

                yield sentences[i: i + arguments.batch_size]

        # train the tokenizer
        tokenizer.train_from_iterator(get_training_corpus(), trainer = trainer)

        tokenizer.post_processor = processors.ByteLevel(trim_offsets = False)

        tokenizer.decoder = decoders.ByteLevel()

        # get the path
        tk_path = os.path.join(arguments.save_path, f'{arguments.file_name}.json')

        tokenizer.save(tk_path)

        print(f"The Byte Pair Encoding tokenizer was saved as {tk_path}!")

    elif arguments.name == 'sp':

        # write the sentences into a temporary file
        with open('sents.txt', 'w', encoding = 'utf-8') as f:

            sentences = dataset[arguments.src_label].tolist() + dataset[arguments.tgt_label].tolist()

            for i in range(0, len(sentences), arguments.batch_size):

                sents = sentences[i: i + arguments.batch_size]

                f.write("\n".join(sents)+'\n')

        # get the path
        tk_path = os.path.join(arguments.save_path, arguments.file_name)

        # initialize the sentence piece trainer
        spm.SentencePieceTrainer.Train(input = 'sents.txt',
                                       model_prefix=os.path.join(arguments.save_path, arguments.file_name),
                                       vocab_size=arguments.vocab_size,
                                       character_coverage=1.0,
                                       pad_id=0,
                                       eos_id=1,
                                       unk_id=2,
                                       bos_id=3,
                                       pad_piece='<pad>',
                                       eos_piece='</s>',
                                       unk_piece='<unk>',
                                       bos_piece='<s>',
                                       )

        # remove the temporary file
        os.remove('sents.txt')

        print(f"The Sentence Piece tokenizer was saved as {tk_path}.model!")

    else:

        raise TokenizerException("You can only train a sentence piece (as 'sp') tokenizer, or a byte pair encoding (as 'bpe') tokenizer!")
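The function takes an argparse-style namespace; a sketch with placeholder paths, labels, and sizes (not part of the wheel):

from argparse import Namespace

from translate_package.tokenization.train_tokenizer import train_tokenizer

# Placeholder arguments; the attribute names mirror what train_tokenizer reads.
arguments = Namespace(
    dataset_file="data/corpus.csv",  # placeholder CSV with both language columns
    name="sp",                       # "sp" or "bpe"
    src_label="WOLOF",
    tgt_label="FRENCH",
    vocab_size=8000,
    batch_size=1000,
    save_path="tokenizers",
    file_name="sp_tokenizer",
)

train_tokenizer(arguments)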
translate_package/utils/__init__.py

File without changes
translate_package/utils/checkpoint.py

@@ -0,0 +1,16 @@

from translate_package import wandb


def download_checkpoint(project, artifact_path, key):

    wandb.login(key=key)

    run = wandb.init(project=project)

    artifact = run.use_artifact(artifact_path, type="dataset")

    artifact_dir = artifact.download()

    wandb.finish()

    return artifact_dir
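Usage sketch (not part of the wheel); the project name, artifact reference, and API key are placeholders:

from translate_package.utils.checkpoint import download_checkpoint

# Downloads a W&B artifact and returns the local directory it was saved to.
artifact_dir = download_checkpoint(
    project="wolof-translation",                     # placeholder project
    artifact_path="user/wolof-translation/ckpt:v0",  # placeholder artifact reference
    key="YOUR_WANDB_API_KEY",                        # placeholder API key
)
print(artifact_dir)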
translate_package-0.0.1.dist-info/METADATA

@@ -0,0 +1,31 @@

Metadata-Version: 2.1
Name: translate-package
Version: 0.0.1
Summary: Contains functions and classes to efficiently train a sequence-to-sequence model to translate between two languages.
Author: Oumar Kane
Author-email: oumar.kane@univ-thies.sn
Requires-Dist: accelerate
Requires-Dist: torch
Requires-Dist: spacy
Requires-Dist: nltk
Requires-Dist: gensim
Requires-Dist: furo
Requires-Dist: streamlit
Requires-Dist: tokenizers
Requires-Dist: tensorboard
Requires-Dist: evaluate
Requires-Dist: transformers
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: scikit-learn
Requires-Dist: matplotlib
Requires-Dist: plotly
Requires-Dist: sacrebleu
Requires-Dist: nlpaug
Requires-Dist: wandb
Requires-Dist: pytorch-lightning
Requires-Dist: selenium
Requires-Dist: sentencepiece
Requires-Dist: peft
Requires-Dist: rouge-score
translate_package-0.0.1.dist-info/RECORD

@@ -0,0 +1,17 @@

translate_package/__init__.py,sha256=Nckjm15LBEfKSU5-EBjfRkHJQhWELzwow0BD3rKtmkw,1297
translate_package/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
translate_package/data/data_preparation.py,sha256=sDMKL9LWcwXp6Iy2eS-1n-NWgkZhvhRyFN7k7uID1_w,14804
translate_package/errors/__init__.py,sha256=gu6XjAIghG4lLkYo8x_7_yyLRtK2FIvmC-WcfJaeOlg,299
translate_package/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
translate_package/models/gradient_observation.py,sha256=P91UA5i-RdkK46TqpPOJ54DsUYgTI9cRohgPS1Ch0Lc,294
translate_package/models/lstm.py,sha256=OPkvvceowz5JqdGGH4cfPhH23kbP11z-29zIJn5d8ig,3273
translate_package/models/machine_translation.py,sha256=5QQpjs_HR9mnPryMyfYpcMgU5tHAAj-eVrv3oGmjR5Y,9963
translate_package/tokenization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
translate_package/tokenization/load_tokenizer.py,sha256=vzCHS0ZDSJyr0y08zNvupMtD2jP8A16EBN-ob0LJHG0,1344
translate_package/tokenization/train_tokenizer.py,sha256=RkdT5DUx201OBNaswM6m54iqcrmCThd3ITLguQb_zVM,3347
translate_package/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
translate_package/utils/checkpoint.py,sha256=GqymRvF8_QZgrQq9m79Ppj6Qr7NQm78kDARm3p_chC0,322
translate_package-0.0.1.dist-info/METADATA,sha256=eHD4CGRhoQHOl-EXUPYPTQ9kGQCw5q3Ju0xkmB3PGTM,819
translate_package-0.0.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
translate_package-0.0.1.dist-info/top_level.txt,sha256=8e2HIrGAMzoSukqu2q929dOJMV1zGYKI_BAFwl-P7XU,18
translate_package-0.0.1.dist-info/RECORD,,
translate_package-0.0.1.dist-info/top_level.txt

@@ -0,0 +1 @@

translate_package