translate-package 0.1.8__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. translate_package-0.2.0/PKG-INFO +37 -0
  2. {translate_package-0.1.8 → translate_package-0.2.0}/setup.py +1 -1
  3. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/data_preparation.py +59 -26
  4. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/load_tokenizer.py +11 -2
  5. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/upload_to_hughub.py +3 -2
  6. translate_package-0.2.0/translate_package.egg-info/PKG-INFO +37 -0
  7. translate_package-0.1.8/PKG-INFO +0 -6
  8. translate_package-0.1.8/translate_package.egg-info/PKG-INFO +0 -6
  9. {translate_package-0.1.8 → translate_package-0.2.0}/setup.cfg +0 -0
  10. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/__init__.py +0 -0
  11. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__init__.py +0 -0
  12. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__pycache__/__init__.cpython-310.pyc +0 -0
  13. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/__pycache__/data_preparation.cpython-310.pyc +0 -0
  14. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/errors/__init__.py +0 -0
  15. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/errors/__pycache__/__init__.cpython-310.pyc +0 -0
  16. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__init__.py +0 -0
  17. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/__init__.cpython-310.pyc +0 -0
  18. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/code_generation.cpython-310.pyc +0 -0
  19. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/__pycache__/machine_translation.cpython-310.pyc +0 -0
  20. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/gradient_observation.py +0 -0
  21. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/lstm.py +0 -0
  22. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/models/machine_translation.py +0 -0
  23. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__init__.py +0 -0
  24. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/__init__.cpython-310.pyc +0 -0
  25. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/load_tokenizer.cpython-310.pyc +0 -0
  26. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/__pycache__/train_tokenizer.cpython-310.pyc +0 -0
  27. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/train_tokenizer.py +0 -0
  28. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__init__.py +0 -0
  29. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  30. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/__pycache__/checkpoint.cpython-310.pyc +0 -0
  31. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/checkpoint.py +0 -0
  32. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/SOURCES.txt +0 -0
  33. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/dependency_links.txt +0 -0
  34. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/requires.txt +0 -0
  35. {translate_package-0.1.8 → translate_package-0.2.0}/translate_package.egg-info/top_level.txt +0 -0
translate_package-0.2.0/PKG-INFO
@@ -0,0 +1,37 @@
+ Metadata-Version: 2.4
+ Name: translate_package
+ Version: 0.2.0
+ Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+ Author: Oumar Kane
+ Author-email: oumar.kane@univ-thies.sn
+ Requires-Dist: accelerate
+ Requires-Dist: torch==2.7.0
+ Requires-Dist: torchvision
+ Requires-Dist: spacy
+ Requires-Dist: nltk
+ Requires-Dist: gensim
+ Requires-Dist: furo
+ Requires-Dist: streamlit
+ Requires-Dist: tokenizers
+ Requires-Dist: tensorboard
+ Requires-Dist: evaluate
+ Requires-Dist: transformers
+ Requires-Dist: pandas
+ Requires-Dist: numpy
+ Requires-Dist: scikit-learn
+ Requires-Dist: matplotlib
+ Requires-Dist: plotly
+ Requires-Dist: sacrebleu
+ Requires-Dist: nlpaug
+ Requires-Dist: wandb
+ Requires-Dist: pytorch-lightning
+ Requires-Dist: selenium
+ Requires-Dist: sentencepiece
+ Requires-Dist: peft
+ Requires-Dist: rouge-score
+ Requires-Dist: sacrebleu
+ Requires-Dist: wolof-translate
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: requires-dist
+ Dynamic: summary
{translate_package-0.1.8 → translate_package-0.2.0}/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

 setup(
 name="translate_package",
- version="0.1.8",
+ version="0.2.0",
 author="Oumar Kane",
 packages=find_packages(),
 author_email="oumar.kane@univ-thies.sn",
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/data/data_preparation.py
@@ -22,21 +22,23 @@ from translate_package import (

 # python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact

- def augment(examples, src_label, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):
+ def augment(examples, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):

- examples[src_label] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[src_label])[0]
+ examples[examples["source"]] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[examples["source"]])[0]

 return examples

- def augment_(examples, src_label, tgt_label):
+ def augment_(examples):

- examples[src_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[src_label])[0]
+ examples[examples["source"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["source"]])[0]

- examples[tgt_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[tgt_label])[0]
+ examples[examples["target"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["target"]])[0]

 return examples

- def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
+ def tokenize(examples, tokenizer, model_generation, bidirectional = False):
+
+ direction = f"{examples['source']}_{examples['target']}"

 if model_generation in ["t5", "mt5", "nllb"]:

@@ -48,13 +50,21 @@ def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
 eos_token = tokenizer.eos_token
 bos_token = tokenizer.bos_token

- examples[src_label] = bos_token + examples[src_label] + eos_token
+ examples[examples["source"]] = bos_token + examples[examples["source"]] + eos_token
+
+ examples[examples["target"]] = bos_token + examples[examples["target"]] + eos_token

- examples[tgt_label] = bos_token + examples[tgt_label] + eos_token
+ if bidirectional:
+
+ examples.update({key: value[0] for key, value in tokenizer[direction](examples[examples["source"]], return_tensors = 'pt').items()})

- examples.update({key: value[0] for key, value in tokenizer(examples[src_label], return_tensors = 'pt').items()})
+ examples.update({f'decoder_{key}': value[0] for key, value in tokenizer[direction](examples[examples["target"]], return_tensors = 'pt').items()})
+
+ else:
+
+ examples.update({key: value[0] for key, value in tokenizer(examples[examples["source"]], return_tensors = 'pt').items()})

- examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[tgt_label], return_tensors = 'pt').items()})
+ examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[examples["target"]], return_tensors = 'pt').items()})

 examples['labels'] = examples['decoder_input_ids']

@@ -76,7 +86,7 @@ def sequences(examples, functions):

 class SentenceDataset(Dataset):

- def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):
+ def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH', bidirectional: bool = False):

 assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()

@@ -89,6 +99,8 @@ class SentenceDataset(Dataset):
 self.source_column = source_column

 self.target_column = target_column
+
+ self.bidirectional = bidirectional

 def __getitem__(self, index):

@@ -96,10 +108,26 @@

 target_sentence = self.target_sentences[index]

- sentences = {
- self.source_column: source_sentence,
- self.target_column: target_sentence
- }
+ if index > len(source_sentence):
+
+ source_sentence = self.target_sentences[index]
+
+ target_sentence = self.source_sentences[index]
+
+ sentences = {
+ self.source_column: source_sentence,
+ self.target_column: target_sentence,
+ "source": self.source_column,
+ "target": self.target_column
+ }
+
+ else:
+ sentences = {
+ self.source_column: source_sentence,
+ self.target_column: target_sentence,
+ "source": self.target_column,
+ "target": self.source_column
+ }

 if not self.transformers is None:

@@ -109,10 +137,14 @@

 def __len__(self):

+ if self.bidirectional:
+
+ return len(self.source_sentences) * 2
+
 return len(self.source_sentences)


- def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
+ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional):

 # load the dataset with pandas
 dataset_ = pd.read_csv(data_path)
@@ -121,7 +153,7 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
 if test_size == 1.0:

 dataset = {
- "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
+ "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
 }

 else:
@@ -135,9 +167,9 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
 )

 dataset = {
- "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
- "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
- "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
+ "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+ "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+ "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
 }

 # The dataset actually contains 3 diff splits: train, validation, test.
@@ -243,25 +275,26 @@ def get_loaders(
 use_bucketing,
 use_truncation,
 batch_size,
+ bidirectional
 ):

 # get dataset
- dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)
+ dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional)

 # analysis transformations

 a_transformers = partial(sequences,
 functions = [
- partial(augment_, src_label = src_label, tgt_label = tgt_label),
- partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
+ partial(augment_),
+ partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
 ])

 # training transformations
 t_transformers = partial(sequences,
 functions = [
- partial(augment, src_label = src_label, p_word = p_word, p_char = p_char, max_words = max_words),
- partial(augment_, src_label = src_label, tgt_label = tgt_label),
- partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
+ partial(augment, p_word = p_word, p_char = p_char, max_words = max_words),
+ partial(augment_),
+ partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
 ])

 if use_bucketing:
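
A minimal, runnable sketch of the direction-keyed pattern the reworked tokenize relies on: each example carries its own "source"/"target" column names, and in bidirectional mode the tokenizer is a dict keyed by "<source>_<target>". The helper and the stub tokenizer below are hypothetical and only illustrate the lookup, they are not the package's code.

# Illustrative sketch only; hypothetical names, not the package's implementation.
def tokenize_example(example, tokenizer, bidirectional=False):
    # The example stores the names of its source/target columns under "source"/"target".
    direction = f"{example['source']}_{example['target']}"
    tok = tokenizer[direction] if bidirectional else tokenizer
    encoded = tok(example[example["source"]])
    decoded = tok(example[example["target"]])
    example.update({key: value[0] for key, value in encoded.items()})
    example.update({f"decoder_{key}": value[0] for key, value in decoded.items()})
    example["labels"] = example["decoder_input_ids"]
    return example

# Stand-in for a Hugging Face tokenizer call returning batched ids (illustration only).
def stub_tokenizer(text):
    ids = [[len(word) for word in text.split()]]
    return {"input_ids": ids, "attention_mask": [[1] * len(ids[0])]}

example = {"FRENCH": "bonjour le monde", "WOLOF": "salaam aleekum",
           "source": "FRENCH", "target": "WOLOF"}
tokenizers = {"FRENCH_WOLOF": stub_tokenizer, "WOLOF_FRENCH": stub_tokenizer}
print(tokenize_example(example, tokenizers, bidirectional=True)["labels"])
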
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/tokenization/load_tokenizer.py
@@ -8,14 +8,23 @@ BCP_47_languages = {
 'wolof': 'wol_Latn',
 }

- def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof"):
+ def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof", bidirectional: bool = False):

 if model == "nllb":

 if not model_name is None:

- tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])
+ if not bidirectional:
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])

+ else:
+
+ tokenizer = {
+ f"{src_lang}_{tgt_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang]),
+ f"{tgt_lang}_{src_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[tgt_lang], tgt_lang = BCP_47_languages[src_lang])
+ }
+
 print(f"The {model}'s tokenizer was successfully loaded")

 else:
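
With bidirectional=True, load_tokenizer builds one NLLB tokenizer per translation direction. A short sketch of constructing and indexing such a pair with the transformers API; the checkpoint name is an assumed example, not taken from the diff, and "fra_Latn" is the standard NLLB code for French.

# Illustrative sketch: a direction-keyed pair of NLLB tokenizers, as returned in bidirectional mode.
from transformers import AutoTokenizer

model_name = "facebook/nllb-200-distilled-600M"  # assumed checkpoint, for illustration only
tokenizer = {
    "french_wolof": AutoTokenizer.from_pretrained(model_name, src_lang="fra_Latn", tgt_lang="wol_Latn"),
    "wolof_french": AutoTokenizer.from_pretrained(model_name, src_lang="wol_Latn", tgt_lang="fra_Latn"),
}

# Encode a French sentence for the French -> Wolof direction.
batch = tokenizer["french_wolof"]("Bonjour tout le monde", return_tensors="pt")
print(batch["input_ids"].shape)
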
{translate_package-0.1.8 → translate_package-0.2.0}/translate_package/utils/upload_to_hughub.py
@@ -7,9 +7,10 @@ def upload_model(hub_token, directory = "my_model", username = "", repo_name = "

 login(token=hub_token)

- create_repo(repo_id)
+ create_repo(repo_id, token = hub_token)

- upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message)
+ upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message, token=hub_token)

 print(f"Model was successfully upload to {repo_id}.")
+

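Both Hub calls in the upload helper now receive the token explicitly. A minimal sketch of the equivalent huggingface_hub calls; the repo id, folder, and token below are placeholders.

# Illustrative sketch of passing an explicit token to the Hub calls; all values are placeholders.
from huggingface_hub import login, create_repo, upload_folder

hub_token = "hf_xxx"              # placeholder access token
repo_id = "username/my_model"     # placeholder repository id

login(token=hub_token)
create_repo(repo_id, token=hub_token, exist_ok=True)  # exist_ok avoids failing if the repo already exists
upload_folder(repo_id=repo_id, folder_path="my_model",
              commit_message="Upload model", token=hub_token)
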
translate_package-0.2.0/translate_package.egg-info/PKG-INFO
@@ -0,0 +1,37 @@
+ Metadata-Version: 2.4
+ Name: translate_package
+ Version: 0.2.0
+ Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+ Author: Oumar Kane
+ Author-email: oumar.kane@univ-thies.sn
+ Requires-Dist: accelerate
+ Requires-Dist: torch==2.7.0
+ Requires-Dist: torchvision
+ Requires-Dist: spacy
+ Requires-Dist: nltk
+ Requires-Dist: gensim
+ Requires-Dist: furo
+ Requires-Dist: streamlit
+ Requires-Dist: tokenizers
+ Requires-Dist: tensorboard
+ Requires-Dist: evaluate
+ Requires-Dist: transformers
+ Requires-Dist: pandas
+ Requires-Dist: numpy
+ Requires-Dist: scikit-learn
+ Requires-Dist: matplotlib
+ Requires-Dist: plotly
+ Requires-Dist: sacrebleu
+ Requires-Dist: nlpaug
+ Requires-Dist: wandb
+ Requires-Dist: pytorch-lightning
+ Requires-Dist: selenium
+ Requires-Dist: sentencepiece
+ Requires-Dist: peft
+ Requires-Dist: rouge-score
+ Requires-Dist: sacrebleu
+ Requires-Dist: wolof-translate
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: requires-dist
+ Dynamic: summary
translate_package-0.1.8/PKG-INFO
@@ -1,6 +0,0 @@
- Metadata-Version: 2.1
- Name: translate_package
- Version: 0.1.8
- Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
- Author: Oumar Kane
- Author-email: oumar.kane@univ-thies.sn
translate_package-0.1.8/translate_package.egg-info/PKG-INFO
@@ -1,6 +0,0 @@
- Metadata-Version: 2.1
- Name: translate-package
- Version: 0.1.8
- Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
- Author: Oumar Kane
- Author-email: oumar.kane@univ-thies.sn