translate-package 0.1.9__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- translate_package-0.2.1/PKG-INFO +37 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/setup.py +1 -1
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/data_preparation.py +60 -26
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/load_tokenizer.py +11 -2
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/upload_to_hughub.py +3 -2
- translate_package-0.2.1/translate_package.egg-info/PKG-INFO +37 -0
- translate_package-0.1.9/PKG-INFO +0 -6
- translate_package-0.1.9/translate_package.egg-info/PKG-INFO +0 -6
- {translate_package-0.1.9 → translate_package-0.2.1}/setup.cfg +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__pycache__/data_preparation.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/errors/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/errors/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/code_generation.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/machine_translation.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/gradient_observation.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/lstm.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/machine_translation.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/load_tokenizer.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/train_tokenizer.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/train_tokenizer.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__init__.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/checkpoint.py +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/SOURCES.txt +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/dependency_links.txt +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/requires.txt +0 -0
- {translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/top_level.txt +0 -0
translate_package-0.2.1/PKG-INFO
ADDED
@@ -0,0 +1,37 @@
+Metadata-Version: 2.4
+Name: translate_package
+Version: 0.2.1
+Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires-Dist: accelerate
+Requires-Dist: torch==2.7.0
+Requires-Dist: torchvision
+Requires-Dist: spacy
+Requires-Dist: nltk
+Requires-Dist: gensim
+Requires-Dist: furo
+Requires-Dist: streamlit
+Requires-Dist: tokenizers
+Requires-Dist: tensorboard
+Requires-Dist: evaluate
+Requires-Dist: transformers
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: sacrebleu
+Requires-Dist: nlpaug
+Requires-Dist: wandb
+Requires-Dist: pytorch-lightning
+Requires-Dist: selenium
+Requires-Dist: sentencepiece
+Requires-Dist: peft
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: wolof-translate
+Dynamic: author
+Dynamic: author-email
+Dynamic: requires-dist
+Dynamic: summary
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/data_preparation.py
RENAMED
@@ -22,21 +22,24 @@ from translate_package import (

 # python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact

-def augment(examples,
+def augment(examples, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):

-    examples[
+    examples[examples["source"]] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[examples["source"]])[0]

     return examples

-def augment_(examples
+def augment_(examples):

-    examples[
+    examples[examples["source"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["source"]])[0]

-    examples[
+    examples[examples["target"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["target"]])[0]

     return examples

-def tokenize(examples, tokenizer,
+def tokenize(examples, tokenizer, model_generation, bidirectional = False):
+
+    direction = f"{examples['source']}_{examples['target']}"
+    rev_direction = f"{examples['target']}_{examples['source']}"

     if model_generation in ["t5", "mt5", "nllb"]:

@@ -48,13 +51,21 @@ def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
         eos_token = tokenizer.eos_token
         bos_token = tokenizer.bos_token

-        examples[
+        examples[examples["source"]] = bos_token + examples[examples["source"]] + eos_token
+
+        examples[examples["target"]] = bos_token + examples[examples["target"]] + eos_token

-
+    if bidirectional:
+
+        examples.update({key: value[0] for key, value in tokenizer[direction](examples[examples["source"]], return_tensors = 'pt').items()})

-
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer[rev_direction](examples[examples["target"]], return_tensors = 'pt').items()})
+
+    else:
+
+        examples.update({key: value[0] for key, value in tokenizer(examples[examples["source"]], return_tensors = 'pt').items()})

-
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[examples["target"]], return_tensors = 'pt').items()})

     examples['labels'] = examples['decoder_input_ids']

@@ -76,7 +87,7 @@ def sequences(examples, functions):

 class SentenceDataset(Dataset):

-    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):
+    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH', bidirectional: bool = False):

         assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()

@@ -89,6 +100,8 @@ class SentenceDataset(Dataset):
         self.source_column = source_column

         self.target_column = target_column
+
+        self.bidirectional = bidirectional

     def __getitem__(self, index):

@@ -96,10 +109,26 @@ class SentenceDataset(Dataset):

         target_sentence = self.target_sentences[index]

-
-
-        self.
-
+        if index > len(source_sentence):
+
+            source_sentence = self.target_sentences[index]
+
+            target_sentence = self.source_sentences[index]
+
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.source_column,
+                "target": self.target_column
+            }
+
+        else:
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.target_column,
+                "target": self.source_column
+            }

         if not self.transformers is None:

@@ -109,10 +138,14 @@ class SentenceDataset(Dataset):

     def __len__(self):

+        if self.bidirectional:
+
+            return len(self.source_sentences) * 2
+
         return len(self.source_sentences)


-def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
+def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional):

     # load the dataset with pandas
     dataset_ = pd.read_csv(data_path)

@@ -121,7 +154,7 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
     if test_size == 1.0:

         dataset = {
-            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
+            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }

     else:

@@ -135,9 +168,9 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
         )

         dataset = {
-            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
-            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
-            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
+            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }

     # The dataset actually contains 3 diff splits: train, validation, test.

@@ -243,25 +276,26 @@ def get_loaders(
     use_bucketing,
     use_truncation,
     batch_size,
+    bidirectional
 ):

     # get dataset
-    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)
+    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional)

     # analysis transformations

     a_transformers = partial(sequences,
                              functions = [
-                                 partial(augment_
-                                 partial(tokenize, tokenizer = tokenizer,
+                                 partial(augment_),
+                                 partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
                              ])

     # training transformations
     t_transformers = partial(sequences,
                              functions = [
-                                 partial(augment,
-                                 partial(augment_
-                                 partial(tokenize, tokenizer = tokenizer,
+                                 partial(augment, p_word = p_word, p_char = p_char, max_words = max_words),
+                                 partial(augment_),
+                                 partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
                              ])

     if use_bucketing:
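Taken together, these changes let a single SentenceDataset serve both translation directions: __len__ doubles when bidirectional=True, __getitem__ tags each item with its current "source"/"target" columns, and tokenize() uses those tags to pick the matching direction and build the decoder inputs. A minimal usage sketch follows; the dataframe, sentences, and import path are illustrative assumptions and are not part of the diff.

import pandas as pd
from translate_package.data.data_preparation import SentenceDataset  # assumed import path

# Hypothetical two-column parallel corpus.
df = pd.DataFrame({
    "FRENCH": ["bonjour", "merci"],
    "WOLOF": ["salaam aleekum", "jërëjëf"],
})

dataset = SentenceDataset(df, source_column="FRENCH", target_column="WOLOF", bidirectional=True)

# Per the new __len__, a bidirectional dataset reports twice as many examples
# (each sentence pair is meant to be seen once per direction).
print(len(dataset))  # 4 instead of 2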
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/load_tokenizer.py
RENAMED
@@ -8,14 +8,23 @@ BCP_47_languages = {
     'wolof': 'wol_Latn',
 }

-def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof"):
+def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof", bidirectional: bool = False):

     if model == "nllb":

         if not model_name is None:

-
+            if not bidirectional:
+
+                tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])

+            else:
+
+                tokenizer = {
+                    f"{src_lang}_{tgt_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang]),
+                    f"{tgt_lang}_{src_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[tgt_lang], tgt_lang = BCP_47_languages[src_lang])
+                }
+
             print(f"The {model}'s tokenizer was successfully loaded")

     else:
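With bidirectional=True, the NLLB branch now returns a dictionary holding one direction-specific tokenizer per key ("{src}_{tgt}" and "{tgt}_{src}") instead of a single tokenizer. Below is a sketch of the resulting structure; the checkpoint name and the French BCP-47 code are assumptions, and only the keying scheme and the Wolof code come from the diff.

from transformers import AutoTokenizer

BCP_47_languages = {"french": "fra_Latn", "wolof": "wol_Latn"}  # 'fra_Latn' assumed for French

src_lang, tgt_lang = "french", "wolof"
model_name = "facebook/nllb-200-distilled-600M"  # illustrative NLLB checkpoint

# Mirrors the bidirectional branch added to load_tokenizer: one tokenizer per direction.
tokenizer = {
    f"{src_lang}_{tgt_lang}": AutoTokenizer.from_pretrained(
        model_name,
        src_lang=BCP_47_languages[src_lang],
        tgt_lang=BCP_47_languages[tgt_lang],
    ),
    f"{tgt_lang}_{src_lang}": AutoTokenizer.from_pretrained(
        model_name,
        src_lang=BCP_47_languages[tgt_lang],
        tgt_lang=BCP_47_languages[src_lang],
    ),
}

# tokenize() in data_preparation.py then indexes this dict by the example's direction:
batch = tokenizer["french_wolof"]("bonjour", return_tensors="pt")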
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/upload_to_hughub.py
RENAMED
@@ -7,9 +7,10 @@ def upload_model(hub_token, directory = "my_model", username = "", repo_name = "

     login(token=hub_token)

-    create_repo(repo_id)
+    create_repo(repo_id, token = hub_token)

-    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message)
+    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message, token=hub_token)

     print(f"Model was successfully upload to {repo_id}.")
+
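Both Hugging Face Hub calls now receive the authentication token explicitly instead of relying only on the cached login. A minimal standalone equivalent using huggingface_hub is sketched below; the token, repository id, and folder path are placeholders, and exist_ok is an addition not present in the diff.

from huggingface_hub import login, create_repo, upload_folder

hub_token = "hf_xxx"            # placeholder access token
repo_id = "username/my-model"   # placeholder repository id

login(token=hub_token)

# The token is passed explicitly to both calls, as in the updated upload_model().
create_repo(repo_id, token=hub_token, exist_ok=True)  # exist_ok avoids failing if the repo already exists
upload_folder(repo_id=repo_id, folder_path="my_model", commit_message="Upload model", token=hub_token)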
translate_package-0.2.1/translate_package.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,37 @@
+Metadata-Version: 2.4
+Name: translate_package
+Version: 0.2.1
+Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires-Dist: accelerate
+Requires-Dist: torch==2.7.0
+Requires-Dist: torchvision
+Requires-Dist: spacy
+Requires-Dist: nltk
+Requires-Dist: gensim
+Requires-Dist: furo
+Requires-Dist: streamlit
+Requires-Dist: tokenizers
+Requires-Dist: tensorboard
+Requires-Dist: evaluate
+Requires-Dist: transformers
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: sacrebleu
+Requires-Dist: nlpaug
+Requires-Dist: wandb
+Requires-Dist: pytorch-lightning
+Requires-Dist: selenium
+Requires-Dist: sentencepiece
+Requires-Dist: peft
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: wolof-translate
+Dynamic: author
+Dynamic: author-email
+Dynamic: requires-dist
+Dynamic: summary
translate_package-0.1.9/PKG-INFO
DELETED
translate_package-0.1.9/translate_package.egg-info/PKG-INFO
DELETED
{translate_package-0.1.9 → translate_package-0.2.1}/setup.cfg
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__pycache__/__init__.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/data/__pycache__/data_preparation.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/errors/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/errors/__pycache__/__init__.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/__init__.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/code_generation.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/__pycache__/machine_translation.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/gradient_observation.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/lstm.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/models/machine_translation.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/__init__.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/load_tokenizer.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/__pycache__/train_tokenizer.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/tokenization/train_tokenizer.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__init__.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__pycache__/__init__.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/__pycache__/checkpoint.cpython-310.pyc
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package/utils/checkpoint.py
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/SOURCES.txt
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/dependency_links.txt
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/requires.txt
RENAMED
File without changes
{translate_package-0.1.9 → translate_package-0.2.1}/translate_package.egg-info/top_level.txt
RENAMED
File without changes