translate-package 0.1.8__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- translate_package-0.2.0/PKG-INFO +37 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/setup.py +1 -1
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/data_preparation.py +59 -26
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/load_tokenizer.py +11 -2
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/upload_to_hughub.py +3 -2
- translate_package-0.2.0/translate_package.egg-info/PKG-INFO +37 -0
- translate_package-0.1.8/PKG-INFO +0 -6
- translate_package-0.1.8/translate_package.egg-info/PKG-INFO +0 -6
- {translate_package-0.1.8 → translate_package-0.2.0}/setup.cfg +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__pycache__/data_preparation.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/errors/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/errors/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/code_generation.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/machine_translation.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/gradient_observation.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/lstm.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/machine_translation.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/load_tokenizer.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/train_tokenizer.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/train_tokenizer.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__init__.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/checkpoint.py +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/SOURCES.txt +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/dependency_links.txt +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/requires.txt +0 -0
- {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/top_level.txt +0 -0
translate_package-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,37 @@
+Metadata-Version: 2.4
+Name: translate_package
+Version: 0.2.0
+Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires-Dist: accelerate
+Requires-Dist: torch==2.7.0
+Requires-Dist: torchvision
+Requires-Dist: spacy
+Requires-Dist: nltk
+Requires-Dist: gensim
+Requires-Dist: furo
+Requires-Dist: streamlit
+Requires-Dist: tokenizers
+Requires-Dist: tensorboard
+Requires-Dist: evaluate
+Requires-Dist: transformers
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: sacrebleu
+Requires-Dist: nlpaug
+Requires-Dist: wandb
+Requires-Dist: pytorch-lightning
+Requires-Dist: selenium
+Requires-Dist: sentencepiece
+Requires-Dist: peft
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: wolof-translate
+Dynamic: author
+Dynamic: author-email
+Dynamic: requires-dist
+Dynamic: summary
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/data_preparation.py
RENAMED
@@ -22,21 +22,23 @@ from translate_package (
 
 # python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact
 
-def augment(examples,
+def augment(examples, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):
 
-    examples[
+    examples[examples["source"]] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[examples["source"]])[0]
 
     return examples
 
-def augment_(examples
+def augment_(examples):
 
-    examples[
+    examples[examples["source"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["source"]])[0]
 
-    examples[
+    examples[examples["target"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["target"]])[0]
 
     return examples
 
-def tokenize(examples, tokenizer,
+def tokenize(examples, tokenizer, model_generation, bidirectional = False):
+
+    direction = f"{examples['source']}_{examples['target']}"
 
     if model_generation in ["t5", "mt5", "nllb"]:
 
@@ -48,13 +50,21 @@ def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
         eos_token = tokenizer.eos_token
         bos_token = tokenizer.bos_token
 
-        examples[
+        examples[examples["source"]] = bos_token + examples[examples["source"]] + eos_token
+
+        examples[examples["target"]] = bos_token + examples[examples["target"]] + eos_token
 
-
+    if bidirectional:
+
+        examples.update({key: value[0] for key, value in tokenizer[direction](examples[examples["source"]], return_tensors = 'pt').items()})
 
-
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer[direction](examples[examples["target"]], return_tensors = 'pt').items()})
+
+    else:
+
+        examples.update({key: value[0] for key, value in tokenizer(examples[examples["source"]], return_tensors = 'pt').items()})
 
-
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[examples["target"]], return_tensors = 'pt').items()})
 
     examples['labels'] = examples['decoder_input_ids']
 
@@ -76,7 +86,7 @@ def sequences(examples, functions):
 
 class SentenceDataset(Dataset):
 
-    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):
+    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH', bidirectional: bool = False):
 
         assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()
 
@@ -89,6 +99,8 @@ class SentenceDataset(Dataset):
         self.source_column = source_column
 
         self.target_column = target_column
+
+        self.bidirectional = bidirectional
 
     def __getitem__(self, index):
 
@@ -96,10 +108,26 @@ class SentenceDataset(Dataset):
 
         target_sentence = self.target_sentences[index]
 
-
-
-        self.
-
+        if index > len(source_sentence):
+
+            source_sentence = self.target_sentences[index]
+
+            target_sentence = self.source_sentences[index]
+
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.source_column,
+                "target": self.target_column
+            }
+
+        else:
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.target_column,
+                "target": self.source_column
+            }
 
         if not self.transformers is None:
 
@@ -109,10 +137,14 @@ class SentenceDataset(Dataset):
 
     def __len__(self):
 
+        if self.bidirectional:
+
+            return len(self.source_sentences) * 2
+
         return len(self.source_sentences)
 
 
-def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
+def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional):
 
     # load the dataset with pandas
     dataset_ = pd.read_csv(data_path)
@@ -121,7 +153,7 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
     if test_size == 1.0:
 
         dataset = {
-            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
+            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }
 
     else:
@@ -135,9 +167,9 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
         )
 
         dataset = {
-            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
-            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
-            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
+            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }
 
         # The dataset actually contains 3 diff splits: train, validation, test.
@@ -243,25 +275,26 @@ def get_loaders(
     use_bucketing,
     use_truncation,
     batch_size,
+    bidirectional
 ):
 
     # get dataset
-    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)
+    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional)
 
     # analysis transformations
 
     a_transformers = partial(sequences,
        functions = [
-            partial(augment_
-            partial(tokenize, tokenizer = tokenizer,
+            partial(augment_),
+            partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
        ])
 
     # training transformations
     t_transformers = partial(sequences,
        functions = [
-            partial(augment,
-            partial(augment_
-            partial(tokenize, tokenizer = tokenizer,
+            partial(augment, p_word = p_word, p_char = p_char, max_words = max_words),
+            partial(augment_),
+            partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
        ])
 
     if use_bucketing:
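The main change in data_preparation.py is the new bidirectional flag threaded from get_loaders down through load_data, SentenceDataset, and tokenize. The following is a minimal usage sketch, assuming only the signatures visible in the diff above; the CSV path and column names are placeholders, not values shipped with the package:

    # Hypothetical sketch based on the signatures in the diff above; the file
    # path and column names are placeholders, not values from the package.
    import pandas as pd
    from translate_package.data.data_preparation import SentenceDataset

    df = pd.read_csv("french_wolof.csv")  # assumed to contain 'FRENCH' and 'WOLOF' columns

    # With bidirectional=True, __len__ reports twice the number of rows; the
    # extra indices appear intended to serve the reversed (target -> source) pairs.
    dataset = SentenceDataset(df, source_column="FRENCH", target_column="WOLOF", bidirectional=True)
    print(len(dataset))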
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/load_tokenizer.py
RENAMED
@@ -8,14 +8,23 @@ BCP_47_languages = {
     'wolof': 'wol_Latn',
 }
 
-def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof"):
+def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof", bidirectional: bool = False):
 
     if model == "nllb":
 
         if not model_name is None:
 
-
+            if not bidirectional:
+
+                tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])
 
+            else:
+
+                tokenizer = {
+                    f"{src_lang}_{tgt_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang]),
+                    f"{tgt_lang}_{src_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[tgt_lang], tgt_lang = BCP_47_languages[src_lang])
+                }
+
             print(f"The {model}'s tokenizer was successfully loaded")
 
         else:
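load_tokenizer gains the same bidirectional switch: in the NLLB branch it now returns either a single tokenizer or a dict keyed by translation direction. A hedged call sketch follows; every argument value, including the checkpoint name, dir_path, and file_name, is an illustrative placeholder and only the keyword names come from the diff:

    # Hypothetical sketch of the new bidirectional branch; all values are
    # placeholders, only the parameter names are taken from the diff above.
    from translate_package.tokenization.load_tokenizer import load_tokenizer

    tokenizers = load_tokenizer(
        tokenizer_name="nllb",
        model="nllb",
        dir_path="tokenizers/",
        file_name="tokenizer.json",
        model_name="facebook/nllb-200-distilled-600M",
        src_lang="french",
        tgt_lang="wolof",
        bidirectional=True,
    )
    # Per the diff, the result in this branch is a dict with one entry per
    # direction: tokenizers["french_wolof"] and tokenizers["wolof_french"].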
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/upload_to_hughub.py
RENAMED
@@ -7,9 +7,10 @@ def upload_model(hub_token, directory = "my_model", username = "", repo_name = "
 
     login(token=hub_token)
 
-    create_repo(repo_id)
+    create_repo(repo_id, token = hub_token)
 
-    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message)
+    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message, token=hub_token)
 
     print(f"Model was successfully upload to {repo_id}.")
+
 
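The upload helper now forwards the Hugging Face token to both create_repo and upload_folder (both accept a token keyword in huggingface_hub), so the upload no longer relies on a cached login alone. A short usage sketch with placeholder values:

    # Hypothetical usage sketch; the token, username, and repo name are placeholders.
    from translate_package.utils.upload_to_hughub import upload_model

    upload_model(
        hub_token="hf_xxxxxxxxxxxxxxxx",    # Hugging Face access token
        directory="my_model",               # local folder containing the saved model
        username="your-username",
        repo_name="french-wolof-translation",
    )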
translate_package-0.2.0/translate_package.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,37 @@
+Metadata-Version: 2.4
+Name: translate_package
+Version: 0.2.0
+Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires-Dist: accelerate
+Requires-Dist: torch==2.7.0
+Requires-Dist: torchvision
+Requires-Dist: spacy
+Requires-Dist: nltk
+Requires-Dist: gensim
+Requires-Dist: furo
+Requires-Dist: streamlit
+Requires-Dist: tokenizers
+Requires-Dist: tensorboard
+Requires-Dist: evaluate
+Requires-Dist: transformers
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: sacrebleu
+Requires-Dist: nlpaug
+Requires-Dist: wandb
+Requires-Dist: pytorch-lightning
+Requires-Dist: selenium
+Requires-Dist: sentencepiece
+Requires-Dist: peft
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: wolof-translate
+Dynamic: author
+Dynamic: author-email
+Dynamic: requires-dist
+Dynamic: summary
translate_package-0.1.8/PKG-INFO
DELETED
translate_package-0.1.8/translate_package.egg-info/PKG-INFO
DELETED
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/gradient_observation.py
RENAMED
File without changes
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/machine_translation.py
RENAMED
File without changes
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__init__.py
RENAMED
File without changes
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/dependency_links.txt
RENAMED
File without changes
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/top_level.txt
RENAMED
File without changes
All other files renamed from translate_package-0.1.8 to translate_package-0.2.0 (listed above with +0 -0) carry no content changes.