translate-package 0.1.9-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, exactly as they appear in their public registry, and is provided for informational purposes only.
--- a/translate_package/data/data_preparation.py
+++ b/translate_package/data/data_preparation.py
@@ -22,21 +22,24 @@ from translate_package import (
 
 # python translate_hyperparameter_tuning.py --model_generation "t5" --model_name "google-t5/t5-small" --tokenizer_name "sp" --use_bucketing --save_artifact
 
-def augment(examples, src_label, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):
+def augment(examples, p_word = 0.12554160436087158, p_char = 0.8269672653838092, max_words = 21):
 
-    examples[src_label] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(str(examples[src_label]))[0]
+    examples[examples["source"]] = TransformerSequences(nac.RandomCharAug(action = 'swap', aug_word_p = p_word, aug_char_p = p_char, aug_word_max = max_words))(examples[examples["source"]])[0]
 
     return examples
 
-def augment_(examples, src_label, tgt_label):
+def augment_(examples):
 
-    examples[src_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(str(examples[src_label]))[0]
+    examples[examples["source"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["source"]])[0]
 
-    examples[tgt_label] = TransformerSequences(remove_mark_space, delete_guillemet_space)(str(examples[tgt_label]))[0]
+    examples[examples["target"]] = TransformerSequences(remove_mark_space, delete_guillemet_space)(examples[examples["target"]])[0]
 
     return examples
 
-def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
+def tokenize(examples, tokenizer, model_generation, bidirectional = False):
+
+    direction = f"{examples['source']}_{examples['target']}"
+    rev_direction = f"{examples['target']}_{examples['source']}"
 
     if model_generation in ["t5", "mt5", "nllb"]:
 
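The augmentation helpers above no longer take explicit `src_label`/`tgt_label` arguments: each example now carries its own column mapping under the `"source"` and `"target"` keys, which `SentenceDataset.__getitem__` injects later in this diff. A minimal sketch of the new calling convention, using hypothetical column names and calling nlpaug directly instead of the package's `TransformerSequences` wrapper:

```python
import nlpaug.augmenter.char as nac

# Hypothetical example item: the column mapping travels with the example
# instead of being passed around as src_label/tgt_label.
example = {
    "WOLOF": "lorem ipsum dolor sit amet",
    "FRENCH": "bonjour tout le monde",
    "source": "WOLOF",   # name of the column holding the sentence to augment
    "target": "FRENCH",
}

aug = nac.RandomCharAug(action="swap", aug_word_p=0.125, aug_char_p=0.827, aug_word_max=21)
# Recent nlpaug releases return a list of augmented strings, hence the [0].
example[example["source"]] = aug.augment(example[example["source"]])[0]
```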
@@ -48,13 +51,21 @@ def tokenize(examples, tokenizer, src_label, tgt_label, model_generation):
     eos_token = tokenizer.eos_token
     bos_token = tokenizer.bos_token
 
-    examples[src_label] = bos_token + examples[src_label] + eos_token
+    examples[examples["source"]] = bos_token + examples[examples["source"]] + eos_token
+
+    examples[examples["target"]] = bos_token + examples[examples["target"]] + eos_token
 
-    examples[tgt_label] = bos_token + examples[tgt_label] + eos_token
+    if bidirectional:
+
+        examples.update({key: value[0] for key, value in tokenizer[direction](examples[examples["source"]], return_tensors = 'pt').items()})
 
-    examples.update({key: value[0] for key, value in tokenizer(examples[src_label], return_tensors = 'pt').items()})
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer[rev_direction](examples[examples["target"]], return_tensors = 'pt').items()})
+
+    else:
+
+        examples.update({key: value[0] for key, value in tokenizer(examples[examples["source"]], return_tensors = 'pt').items()})
 
-    examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[tgt_label], return_tensors = 'pt').items()})
+        examples.update({f'decoder_{key}': value[0] for key, value in tokenizer(examples[examples["target"]], return_tensors = 'pt').items()})
 
     examples['labels'] = examples['decoder_input_ids']
 
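In bidirectional mode, `tokenizer` is expected to be a dict keyed by `"{source}_{target}"` direction strings (built by `load_tokenizer` further down in this diff), so each direction is encoded with the language pair oriented the right way. A rough sketch of how one example picks its tokenizers; the checkpoint name and the assumption that the dataframe columns are literally named `"french"` and `"wolof"` (matching the keys `load_tokenizer` produces) are illustrative, not taken from the package:

```python
from transformers import AutoTokenizer

# Stand-in for the dict built by load_tokenizer(..., bidirectional=True);
# the checkpoint and BCP-47 codes here are assumptions for illustration.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = {
    "french_wolof": AutoTokenizer.from_pretrained(model_name, src_lang="fra_Latn", tgt_lang="wol_Latn"),
    "wolof_french": AutoTokenizer.from_pretrained(model_name, src_lang="wol_Latn", tgt_lang="fra_Latn"),
}

example = {"french": "Bonjour", "wolof": "Salaam aleekum", "source": "french", "target": "wolof"}

direction = f"{example['source']}_{example['target']}"       # "french_wolof"
rev_direction = f"{example['target']}_{example['source']}"   # "wolof_french"

enc = tokenizer[direction](example[example["source"]], return_tensors="pt")
dec = tokenizer[rev_direction](example[example["target"]], return_tensors="pt")

example.update({key: value[0] for key, value in enc.items()})               # input_ids, attention_mask
example.update({f"decoder_{key}": value[0] for key, value in dec.items()})  # decoder_input_ids, ...
example["labels"] = example["decoder_input_ids"]
```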
@@ -76,7 +87,7 @@ def sequences(examples, functions):
 
 class SentenceDataset(Dataset):
 
-    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH'):
+    def __init__(self, dataframe, transformers: Union[Callable, None] = None, source_column: str = 'WOLOF', target_column: str = 'FRENCH', bidirectional: bool = False):
 
         assert source_column in dataframe.columns.tolist() and target_column in dataframe.columns.tolist()
 
@@ -89,6 +100,8 @@ class SentenceDataset(Dataset):
         self.source_column = source_column
 
         self.target_column = target_column
+
+        self.bidirectional = bidirectional
 
     def __getitem__(self, index):
 
@@ -96,10 +109,26 @@ class SentenceDataset(Dataset):
 
         target_sentence = self.target_sentences[index]
 
-        sentences = {
-            self.source_column: source_sentence,
-            self.target_column: target_sentence
-        }
+        if index > len(source_sentence):
+
+            source_sentence = self.target_sentences[index]
+
+            target_sentence = self.source_sentences[index]
+
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.source_column,
+                "target": self.target_column
+            }
+
+        else:
+            sentences = {
+                self.source_column: source_sentence,
+                self.target_column: target_sentence,
+                "source": self.target_column,
+                "target": self.source_column
+            }
 
         if not self.transformers is None:
 
@@ -109,10 +138,14 @@ class SentenceDataset(Dataset):
 
     def __len__(self):
 
+        if self.bidirectional:
+
+            return len(self.source_sentences) * 2
+
         return len(self.source_sentences)
 
 
-def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
+def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional):
 
     # load the dataset with pandas
     dataset_ = pd.read_csv(data_path)
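With `bidirectional=True` the dataset reports twice as many items, and `__getitem__` is meant to serve the reverse direction for part of that range (note that the released code keys the flip on `len(source_sentence)`, the character length of one sentence, so the actual switch point depends on the data). A simplified, hypothetical sketch of the intended indexing, not the literal code above:

```python
# Hypothetical sketch: fold the doubled index range back onto the sentence lists
# and flip which column is labelled as the source for the second half.
def get_item_sketch(ds, index):
    flip = ds.bidirectional and index >= len(ds.source_sentences)
    i = index % len(ds.source_sentences)
    return {
        ds.source_column: ds.source_sentences[i],
        ds.target_column: ds.target_sentences[i],
        "source": ds.target_column if flip else ds.source_column,  # column the model reads from
        "target": ds.source_column if flip else ds.target_column,  # column holding the reference
    }
```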
@@ -121,7 +154,7 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
     if test_size == 1.0:
 
         dataset = {
-            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label),
+            "test": partial(SentenceDataset, dataframe = dataset_, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }
 
     else:
@@ -135,9 +168,9 @@ def load_data(src_label, tgt_label, data_path, test_size, valid_size, seed):
         )
 
         dataset = {
-            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label),
-            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label),
-            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label),
+            "train": partial(SentenceDataset, dataframe = train, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "val": partial(SentenceDataset, dataframe = valid, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
+            "test": partial(SentenceDataset, dataframe = test, source_column = src_label, target_column = tgt_label, bidirectional=bidirectional),
         }
 
     # The dataset actually contains 3 diff splits: train, validation, test.
@@ -243,25 +276,26 @@ def get_loaders(
     use_bucketing,
     use_truncation,
     batch_size,
+    bidirectional
 ):
 
     # get dataset
-    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed)
+    dataset = load_data(src_label, tgt_label, data_path, test_size, valid_size, seed, bidirectional)
 
     # analysis transformations
 
     a_transformers = partial(sequences,
         functions = [
-            partial(augment_, src_label = src_label, tgt_label = tgt_label),
-            partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
+            partial(augment_),
+            partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
         ])
 
     # training transformations
     t_transformers = partial(sequences,
         functions = [
-            partial(augment, src_label = src_label, p_word = p_word, p_char = p_char, max_words = max_words),
-            partial(augment_, src_label = src_label, tgt_label = tgt_label),
-            partial(tokenize, tokenizer = tokenizer, src_label = src_label, tgt_label = tgt_label, model_generation = model_generation)
+            partial(augment, p_word = p_word, p_char = p_char, max_words = max_words),
+            partial(augment_),
+            partial(tokenize, tokenizer = tokenizer, model_generation = model_generation, bidirectional=bidirectional)
         ])
 
     if use_bucketing:
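Because `augment`, `augment_`, and `tokenize` now read everything they need from the example itself, the pipelines above only forward the tokenizer, the model generation, and the bidirectional flag. A sketch of how one dataset item would flow through the training pipeline, assuming `sequences` applies each function in order (as its `(examples, functions)` signature suggests) and with placeholder hyperparameter values:

```python
from functools import partial

# Hypothetical wiring; `tokenizer` is the (possibly per-direction) object returned by
# load_tokenizer, and `item` is one dict produced by SentenceDataset.__getitem__.
train_transform = partial(sequences, functions=[
    partial(augment, p_word=0.125, p_char=0.827, max_words=21),  # character-level noise on the source side
    partial(augment_),                                           # punctuation / spacing clean-up on both sides
    partial(tokenize, tokenizer=tokenizer, model_generation="nllb", bidirectional=True),
])

model_ready_item = train_transform(item)  # carries input_ids, decoder_input_ids, labels, ...
```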
--- a/translate_package/tokenization/load_tokenizer.py
+++ b/translate_package/tokenization/load_tokenizer.py
@@ -8,14 +8,23 @@ BCP_47_languages = {
     'wolof': 'wol_Latn',
 }
 
-def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof"):
+def load_tokenizer(tokenizer_name, model, dir_path, file_name, model_name = None, src_lang = "french", tgt_lang = "wolof", bidirectional: bool = False):
 
     if model == "nllb":
 
         if not model_name is None:
 
-            tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])
+            if not bidirectional:
+
+                tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang])
 
+            else:
+
+                tokenizer = {
+                    f"{src_lang}_{tgt_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[src_lang], tgt_lang = BCP_47_languages[tgt_lang]),
+                    f"{tgt_lang}_{src_lang}": AutoTokenizer.from_pretrained(model_name, src_lang = BCP_47_languages[tgt_lang], tgt_lang = BCP_47_languages[src_lang])
+                }
+
             print(f"The {model}'s tokenizer was successfully loaded")
 
         else:
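Callers can therefore request both directions at once. A hypothetical call of the new branch (the checkpoint name and the `dir_path`/`file_name` values are placeholders, and this assumes the function returns the tokenizer object it builds):

```python
tokenizers = load_tokenizer(
    tokenizer_name="nllb",
    model="nllb",
    dir_path="artifacts",
    file_name="tokenizer",
    model_name="facebook/nllb-200-distilled-600M",  # placeholder checkpoint
    src_lang="french",
    tgt_lang="wolof",
    bidirectional=True,
)

# Two direction-specific NLLB tokenizers, keyed the same way tokenize() builds its lookups.
assert set(tokenizers) == {"french_wolof", "wolof_french"}
```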
--- a/translate_package/utils/upload_to_hughub.py
+++ b/translate_package/utils/upload_to_hughub.py
@@ -7,9 +7,10 @@ def upload_model(hub_token, directory = "my_model", username = "", repo_name = "
 
     login(token=hub_token)
 
-    create_repo(repo_id)
+    create_repo(repo_id, token = hub_token)
 
-    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message)
+    upload_folder(repo_id = repo_id, folder_path = directory, commit_message= commit_message, token=hub_token)
 
     print(f"Model was successfully upload to {repo_id}.")
+
 
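The upload path now forwards the Hub token to `create_repo` and `upload_folder` explicitly instead of relying only on the cached `login`. For reference, the equivalent standalone calls against `huggingface_hub` (token and repo values are placeholders; `exist_ok=True` is an extra convenience the package does not use):

```python
from huggingface_hub import create_repo, login, upload_folder

hub_token = "hf_xxx"              # placeholder token
repo_id = "username/my_model"     # placeholder repo id

login(token=hub_token)
create_repo(repo_id, token=hub_token, exist_ok=True)  # exist_ok: don't fail if the repo already exists
upload_folder(repo_id=repo_id, folder_path="my_model", commit_message="new release", token=hub_token)
```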
--- a/translate_package-0.1.9.dist-info/METADATA
+++ b/translate_package-0.2.1.dist-info/METADATA
@@ -1,33 +1,37 @@
-Metadata-Version: 2.1
-Name: translate-package
-Version: 0.1.9
-Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
-Author: Oumar Kane
-Author-email: oumar.kane@univ-thies.sn
-Requires-Dist: accelerate
-Requires-Dist: torch (==2.7.0)
-Requires-Dist: torchvision
-Requires-Dist: spacy
-Requires-Dist: nltk
-Requires-Dist: gensim
-Requires-Dist: furo
-Requires-Dist: streamlit
-Requires-Dist: tokenizers
-Requires-Dist: tensorboard
-Requires-Dist: evaluate
-Requires-Dist: transformers
-Requires-Dist: pandas
-Requires-Dist: numpy
-Requires-Dist: scikit-learn
-Requires-Dist: matplotlib
-Requires-Dist: plotly
-Requires-Dist: sacrebleu
-Requires-Dist: nlpaug
-Requires-Dist: wandb
-Requires-Dist: pytorch-lightning
-Requires-Dist: selenium
-Requires-Dist: sentencepiece
-Requires-Dist: peft
-Requires-Dist: rouge-score
-Requires-Dist: wolof-translate
-
+Metadata-Version: 2.4
+Name: translate_package
+Version: 0.2.1
+Summary: Contain functions and classes to efficiently train a sequence to sequence to translate between two languages.
+Author: Oumar Kane
+Author-email: oumar.kane@univ-thies.sn
+Requires-Dist: accelerate
+Requires-Dist: torch==2.7.0
+Requires-Dist: torchvision
+Requires-Dist: spacy
+Requires-Dist: nltk
+Requires-Dist: gensim
+Requires-Dist: furo
+Requires-Dist: streamlit
+Requires-Dist: tokenizers
+Requires-Dist: tensorboard
+Requires-Dist: evaluate
+Requires-Dist: transformers
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: matplotlib
+Requires-Dist: plotly
+Requires-Dist: sacrebleu
+Requires-Dist: nlpaug
+Requires-Dist: wandb
+Requires-Dist: pytorch-lightning
+Requires-Dist: selenium
+Requires-Dist: sentencepiece
+Requires-Dist: peft
+Requires-Dist: rouge-score
+Requires-Dist: sacrebleu
+Requires-Dist: wolof-translate
+Dynamic: author
+Dynamic: author-email
+Dynamic: requires-dist
+Dynamic: summary
--- a/translate_package-0.1.9.dist-info/RECORD
+++ b/translate_package-0.2.1.dist-info/RECORD
@@ -1,18 +1,18 @@
 translate_package/__init__.py,sha256=miie3aAeUYHsVk2O-kd4T86fFksuCiY70Eo6RNeY1Oo,1312
 translate_package/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-translate_package/data/data_preparation.py,sha256=odTSpud2biQaxI8lW65FtS-hMuXbglD4mQCTYx2kNZs,14897
+translate_package/data/data_preparation.py,sha256=_fK2joza3FlzvrGPSk-KywhNj6R8IpK7TBBC0p6X940,16243
 translate_package/errors/__init__.py,sha256=gu6XjAIghG4lLkYo8x_7_yyLRtK2FIvmC-WcfJaeOlg,299
 translate_package/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 translate_package/models/gradient_observation.py,sha256=P91UA5i-RdkK46TqpPOJ54DsUYgTI9cRohgPS1Ch0Lc,294
 translate_package/models/lstm.py,sha256=OPkvvceowz5JqdGGH4cfPhH23kbP11z-29zIJn5d8ig,3273
 translate_package/models/machine_translation.py,sha256=1ot9Me6U1O7UHJMuJGvatx3DxoKY9TghzzHNzxdZa5g,11170
 translate_package/tokenization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-translate_package/tokenization/load_tokenizer.py,sha256=g8j5pDmimFhwjpeYNkWot0hXMzAqqURbtedcQK-1xYE,1543
+translate_package/tokenization/load_tokenizer.py,sha256=Q7ZFMCefs3vPe2CE9iWKkgGz3Wk4C9rbUTGmyjTioJQ,2069
 translate_package/tokenization/train_tokenizer.py,sha256=RkdT5DUx201OBNaswM6m54iqcrmCThd3ITLguQb_zVM,3347
 translate_package/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 translate_package/utils/checkpoint.py,sha256=GqymRvF8_QZgrQq9m79Ppj6Qr7NQm78kDARm3p_chC0,322
-translate_package/utils/upload_to_hughub.py,sha256=0qihZIAAUuJXfOZ23Njz0aWpDpe8twQNDGPplgrIfzA,480
-translate_package-0.1.9.dist-info/METADATA,sha256=Cm1FrRayiCWGmiL6-IOe57f2DQzghEpkgWrMK46Buzs,887
-translate_package-0.1.9.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-translate_package-0.1.9.dist-info/top_level.txt,sha256=8e2HIrGAMzoSukqu2q929dOJMV1zGYKI_BAFwl-P7XU,18
-translate_package-0.1.9.dist-info/RECORD,,
+translate_package/utils/upload_to_hughub.py,sha256=zWXJQfv1ZndFIeyGWO0JMLSCP7rj-B4RuiiY-TFkBnw,522
+translate_package-0.2.1.dist-info/METADATA,sha256=gf6SydqZxLYimXZsVsm8mHqYx3fz4kqgm68oYBQlf9A,1023
+translate_package-0.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+translate_package-0.2.1.dist-info/top_level.txt,sha256=8e2HIrGAMzoSukqu2q929dOJMV1zGYKI_BAFwl-P7XU,18
+translate_package-0.2.1.dist-info/RECORD,,
--- a/translate_package-0.1.9.dist-info/WHEEL
+++ b/translate_package-0.2.1.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.37.1)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 