wolof-translate 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/utils/recuperate_datasets.py
@@ -0,0 +1,94 @@
+ from wolof_translate import *
+
+
+ def recuperate_datasets(
+     char_p: float,
+     word_p: float,
+     max_len: int,
+     end_mark: int,
+     tokenizer: T5TokenizerFast,
+     corpus_1: str = "french",
+     corpus_2: str = "wolof",
+     train_file: str = "data/extractions/new_data/train_set.csv",
+     test_file: str = "data/extractions/new_data/test_file.csv",
+ ):
+
+     # Let us recuperate the end_mark adding option
+     if end_mark == 1:
+         # Create augmentation to add on French sentences
+         fr_augmentation_1 = TransformerSequences(
+             nac.RandomCharAug(
+                 action="swap",
+                 aug_char_p=char_p,
+                 aug_word_p=word_p,
+                 aug_word_max=max_len,
+             ),
+             remove_mark_space,
+             delete_guillemet_space,
+             add_mark_space,
+         )
+
+         fr_augmentation_2 = TransformerSequences(
+             remove_mark_space, delete_guillemet_space, add_mark_space
+         )
+
+     else:
+
+         if end_mark == 2:
+
+             end_mark_fn = partial(add_end_mark, end_mark_to_remove="!", replace=True)
+
+         elif end_mark == 3:
+
+             end_mark_fn = partial(add_end_mark)
+
+         elif end_mark == 4:
+
+             end_mark_fn = partial(add_end_mark, end_mark_to_remove="!")
+
+         else:
+
+             raise ValueError(f"No end mark number {end_mark}")
+
+         # Create augmentation to add on French sentences
+         fr_augmentation_1 = TransformerSequences(
+             nac.RandomCharAug(
+                 action="swap",
+                 aug_char_p=char_p,
+                 aug_word_p=word_p,
+                 aug_word_max=max_len,
+             ),
+             remove_mark_space,
+             delete_guillemet_space,
+             add_mark_space,
+             end_mark_fn,
+         )
+
+         fr_augmentation_2 = TransformerSequences(
+             remove_mark_space, delete_guillemet_space, add_mark_space, end_mark_fn
+         )
+
+     # Recuperate the train dataset
+     train_dataset_aug = SentenceDataset(
+         train_file,
+         tokenizer,
+         truncation=False,
+         cp1_transformer=fr_augmentation_1,
+         cp2_transformer=fr_augmentation_2,
+         corpus_1=corpus_1,
+         corpus_2=corpus_2,
+     )
+
+     # Recuperate the valid dataset
+     valid_dataset = SentenceDataset(
+         test_file,
+         tokenizer,
+         cp1_transformer=fr_augmentation_2,
+         cp2_transformer=fr_augmentation_2,
+         corpus_1=corpus_1,
+         corpus_2=corpus_2,
+         truncation=False,
+     )
+
+     # Return the datasets
+     return train_dataset_aug, valid_dataset
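A rough usage sketch for the function above, assuming the default CSV files exist on disk; the tokenizer checkpoint name and the numeric values are only illustrative:

from transformers import T5TokenizerFast
from wolof_translate.utils.recuperate_datasets import recuperate_datasets

# Illustrative checkpoint; substitute the tokenizer actually used for training.
tokenizer = T5TokenizerFast.from_pretrained("t5-small")

train_dataset, valid_dataset = recuperate_datasets(
    char_p=0.1,          # character-level augmentation probability
    word_p=0.1,          # word-level augmentation probability
    max_len=10,          # maximum number of augmented words
    end_mark=1,          # 1 = skip the add_end_mark step
    tokenizer=tokenizer,
)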
wolof_translate/utils/recuperate_datasets_trunc.py
@@ -0,0 +1,85 @@
+ from wolof_translate import *
+
+
+ def recuperate_datasets(
+     char_p: float,
+     word_p: float,
+     max_len: int,
+     end_mark: int,
+     tokenizer: T5TokenizerFast,
+     corpus_1: str = "french",
+     corpus_2: str = "wolof",
+     train_file: str = "data/extractions/new_data/train_set.csv",
+     test_file: str = "data/extractions/new_data/test_file.csv",
+     augmenter=partial(nac.RandomCharAug, action="swap"),
+ ):
+
+     # Let us recuperate the end_mark adding option
+     if end_mark == 1:
+         # Create augmentation to add on French sentences
+         fr_augmentation_1 = TransformerSequences(
+             augmenter(aug_char_p=char_p, aug_word_p=word_p, aug_word_max=max_len),
+             remove_mark_space,
+             delete_guillemet_space,
+             add_mark_space,
+         )
+
+         fr_augmentation_2 = TransformerSequences(
+             remove_mark_space, delete_guillemet_space, add_mark_space
+         )
+
+     else:
+
+         if end_mark == 2:
+
+             end_mark_fn = partial(add_end_mark, end_mark_to_remove="!", replace=True)
+
+         elif end_mark == 3:
+
+             end_mark_fn = partial(add_end_mark)
+
+         elif end_mark == 4:
+
+             end_mark_fn = partial(add_end_mark, end_mark_to_remove="!")
+
+         else:
+
+             raise ValueError(f"No end mark number {end_mark}")
+
+         # Create augmentation to add on French sentences
+         fr_augmentation_1 = TransformerSequences(
+             augmenter(aug_char_p=char_p, aug_word_p=word_p, aug_word_max=max_len),
+             remove_mark_space,
+             delete_guillemet_space,
+             add_mark_space,
+             end_mark_fn,
+         )
+
+         fr_augmentation_2 = TransformerSequences(
+             remove_mark_space, delete_guillemet_space, add_mark_space, end_mark_fn
+         )
+
+     # Recuperate the train dataset
+     train_dataset_aug = SentenceDataset(
+         train_file,
+         tokenizer,
+         truncation=False,
+         cp1_transformer=fr_augmentation_1,
+         cp2_transformer=fr_augmentation_2,
+         corpus_1=corpus_1,
+         corpus_2=corpus_2,
+     )
+
+     # Recuperate the valid dataset
+     valid_dataset = SentenceDataset(
+         test_file,
+         tokenizer,
+         cp1_transformer=fr_augmentation_2,
+         cp2_transformer=fr_augmentation_2,
+         corpus_1=corpus_1,
+         corpus_2=corpus_2,
+         truncation=False,
+     )
+
+     # Return the datasets
+     return train_dataset_aug, valid_dataset
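This module differs from the previous one mainly by the injectable augmenter parameter. A sketch of overriding it, using nlpaug's KeyboardAug purely as an illustration (the checkpoint and numeric values are again placeholders):

import nlpaug.augmenter.char as nac
from functools import partial
from transformers import T5TokenizerFast
from wolof_translate.utils.recuperate_datasets_trunc import recuperate_datasets

tokenizer = T5TokenizerFast.from_pretrained("t5-small")  # illustrative checkpoint

train_dataset, valid_dataset = recuperate_datasets(
    char_p=0.1,
    word_p=0.1,
    max_len=10,
    end_mark=3,                          # use add_end_mark with its defaults
    tokenizer=tokenizer,
    augmenter=partial(nac.KeyboardAug),  # swaps out the default RandomCharAug(action="swap")
)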
wolof_translate/utils/send_model.py
@@ -0,0 +1,26 @@
+ import wandb
+
+
+ def add_directory(
+     directory: str,
+     artifact_name: str,
+     project: str = "fw_artifacts",
+     entity: str = "oumar-kane-team",
+ ):
+     """Initialize a project and add checkpoints as an artifact to wandb
+
+     Args:
+         directory (str): The directory where the checkpoints are stored
+         artifact_name (str): The name of the artifact
+         project (str, optional): The project name. Defaults to 'fw_artifacts'.
+         entity (str, optional): The entity name. Defaults to 'oumar-kane-team'.
+     """
+
+     run = wandb.init(project=project, entity=entity)
+
+     # add a directory as an artifact to wandb
+     artifact = wandb.Artifact(artifact_name, type="dataset")
+     artifact.add_dir(directory)
+     run.log_artifact(artifact)
+
+     wandb.finish()
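A minimal usage sketch, assuming wandb is authenticated and the default project and entity are accessible; the directory path and artifact name below are illustrative:

from wolof_translate.utils.send_model import add_directory

# Logs every file under the directory as a single wandb artifact of type "dataset".
add_directory(
    directory="checkpoints/run_1",
    artifact_name="fw-checkpoints",
)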
wolof_translate/utils/sent_corrections.py
@@ -0,0 +1,169 @@
+ from typing import *
+
+
+ def add_guillemet_space(sentences: Union[list, str]):
+     """Add a space between a letter and a guillemet in each sentence
+
+     Args:
+         sentences (Union[list, str]): The sentence(s) to modify
+
+     Returns:
+         list: The modified sentences
+     """
+
+     if type(sentences) is str:
+         sentences = [sentences]
+
+     for s in range(len(sentences)):
+
+         sentence = sentences[s]
+
+         if "«" in sentence:
+
+             sentence = sentence.split()
+
+             for i in range(len(sentence)):
+
+                 word = sentence[i]
+
+                 if "«" in word and word != "«":
+
+                     word = word.split("«")
+
+                     word = "« ".join(word)
+
+                 if "»" in word and word != "»":
+
+                     word = word.split("»")
+
+                     word = " »".join(word)
+
+                 sentence[i] = word
+
+             sentence = " ".join(sentence)
+
+         sentences[s] = sentence
+
+     return sentences
+
+
+ def delete_guillemet_space(sentences: Union[list, str]):
+
+     if type(sentences) is str:
+         sentences = [sentences]
+
+     for s in range(len(sentences)):
+
+         sentence = sentences[s]
+
+         letters = [sentence[0]]
+
+         for i in range(1, len(sentence)):
+
+             if sentence[i] == "”":
+
+                 j = i - 1
+
+                 while letters[j] == " ":
+
+                     letters[j] = ""
+
+                     j -= 1
+
+                 letters.append(sentence[i])
+
+             elif letters[-1] == "“" and sentence[i] == " ":
+
+                 letters.append("")
+
+             else:
+
+                 letters.append(sentence[i])
+
+         sentences[s] = "".join(letters)
+
+     return sentences
+
+
+ def add_mark_space(
+     sentences: Union[list, str], marks: list = ["?", "!", "–", ":", ";"]
+ ):
+
+     if type(sentences) is str:
+         sentences = [sentences]
+
+     for s in range(len(sentences)):
+
+         sentence = sentences[s]
+
+         letters = [sentence[0]]
+
+         for i in range(1, len(sentence)):
+
+             if sentence[i] in marks and letters[-1] != " ":
+
+                 letters[-1] = letters[-1] + " "
+
+                 letters.append(sentence[i])
+
+             elif letters[-1] in marks and sentence[i] != " ":
+
+                 letters.append(" " + sentence[i])
+
+             else:
+
+                 letters.append(sentence[i])
+
+         sentences[s] = "".join(letters)
+
+     return sentences
+
+
+ def remove_mark_space(sentences: Union[list, str], marks: list = ["'", "-"]):
+
+     if type(sentences) is str:
+         sentences = [sentences]
+
+     for s in range(len(sentences)):
+
+         sentence = sentences[s]
+
+         letters = [sentence[0]]
+
+         for i in range(1, len(sentence)):
+
+             if sentence[i] in marks:
+
+                 j = i - 1
+
+                 while letters[j] == " ":
+
+                     letters[j] = ""
+
+                     j -= 1
+
+                 letters.append(sentence[i])
+
+             elif letters[-1] in marks and sentence[i] == " ":
+
+                 letters.append("")
+
+             else:
+
+                 letters.append(sentence[i])
+
+         sentences[s] = "".join(letters)
+
+     return sentences
+
+
+ def delete_much_space(sentences: Union[list, str]):
+
+     if type(sentences) is str:
+         sentences = [sentences]
+
+     for i in range(len(sentences)):
+
+         sentences[i] = " ".join(sentences[i].split())
+
+     return sentences
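For reference, a small sketch of what three of these helpers do on French strings; the example sentences are arbitrary and the expected outputs in the comments follow directly from the code above:

from wolof_translate.utils.sent_corrections import (
    add_guillemet_space,
    remove_mark_space,
    delete_much_space,
)

print(add_guillemet_space("Il a dit «bonjour» hier."))
# ['Il a dit « bonjour » hier.']

print(remove_mark_space("l ' ami est arrivé"))
# ["l'ami est arrivé"]

print(delete_much_space("Trop   d'espaces   ici"))
# ["Trop d'espaces ici"]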
wolof_translate/utils/sent_transformers.py
@@ -0,0 +1,27 @@
+ from typing import *
+
+
+ class TransformerSequences:
+     def __init__(self, *args, **kwargs):
+
+         self.transformers = []
+
+         self.transformers.extend(list(args))
+
+         self.transformers.extend(list(kwargs.values()))
+
+     def __call__(self, sentences: Union[List, str]):
+
+         output = sentences
+
+         for transformer in self.transformers:
+
+             if hasattr(transformer, "augment"):
+
+                 output = transformer.augment(output)
+
+             else:
+
+                 output = transformer(output)
+
+         return output
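A sketch of how this wrapper chains an nlpaug augmenter (anything exposing an .augment method) with the plain functions from sent_corrections; the augmenter settings and the input string are arbitrary:

import nlpaug.augmenter.char as nac
from wolof_translate.utils.sent_transformers import TransformerSequences
from wolof_translate.utils.sent_corrections import remove_mark_space, delete_much_space

pipeline = TransformerSequences(
    nac.RandomCharAug(action="swap", aug_char_p=0.1, aug_word_p=0.1),  # called via .augment()
    remove_mark_space,   # plain callables are invoked directly
    delete_much_space,
)

augmented = pipeline("Une phrase d ' exemple à transformer")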
wolof_translate/utils/sent_unification.py
@@ -0,0 +1,97 @@
+ from nlp_project import *
+
+
+ def unify_correction(
+     sentences: list,
+     marks: List[Tuple] = [("«", "»", True), ("(", ")", False)],
+     unified_sentences_between_pos: List[Tuple] = [(925, 930)],
+ ):
+
+     corrected_sentences = []
+
+     only_end_mark = []
+
+     only_begin_mark = []
+
+     i = 0
+
+     while i < len(sentences):
+
+         for u in unified_sentences_between_pos:
+
+             if i >= u[0] - 1 and i < u[1]:
+
+                 range_ = u[1] - u[0]
+
+                 unification = sentences[u[0] - 1]
+
+                 for j in range(u[0], u[0] + range_):
+
+                     unification += " " + sentences[j]
+
+                 i += range_ + 1
+
+                 corrected_sentences.append(unification)
+
+         unify_next = False
+
+         space = " "
+
+         if i != 0:
+
+             for mark in marks:
+
+                 begin_mark = False
+
+                 end_mark = False
+
+                 for letter in corrected_sentences[-1]:
+
+                     if letter == mark[1]:
+
+                         begin_mark = False
+
+                     elif letter == mark[0]:
+
+                         begin_mark = True
+
+                 for letter in sentences[i]:
+
+                     if letter == mark[1]:
+
+                         end_mark = True
+
+                         break
+
+                     else:
+
+                         break
+
+                 if end_mark and not begin_mark:
+
+                     only_end_mark.append(sentences[i])
+
+                 elif begin_mark and not end_mark:
+
+                     only_begin_mark.append(corrected_sentences[-1])
+
+                 if end_mark and begin_mark:
+
+                     unify_next = True
+
+                     space = " " if mark[2] else ""
+
+         if unify_next:
+
+             corrected_sentences[-1] = corrected_sentences[-1] + space + sentences[i]
+
+         else:
+
+             corrected_sentences.append(sentences[i])
+
+         i += 1
+
+     return corrected_sentences, {
+         "begin_mark_only": only_begin_mark,
+         "end_mark_only": only_end_mark,
+     }
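A sketch of the intended use, merging a fragment that opens a quotation with the fragment that closes it; the sentences are made up, and the module's own wildcard import is assumed to resolve in your environment:

from wolof_translate.utils.sent_unification import unify_correction

fragments = [
    "Il a dit : « je viendrai",
    "» et il est parti.",
    "Une phrase déjà complète.",
]

unified, report = unify_correction(fragments, unified_sentences_between_pos=[])
# unified -> ['Il a dit : « je viendrai » et il est parti.', 'Une phrase déjà complète.']
# report  -> {'begin_mark_only': [], 'end_mark_only': []}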
wolof_translate/utils/split_with_valid.py
@@ -0,0 +1,72 @@
+ """This module contains functions that split the data into train, validation and test sets.
+ """
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+ import os
+
+
+ def split_data(
+     random_state: int = 50,
+     data_directory: str = "data/extractions/new_data",
+     csv_file: str = "sentences.csv",
+ ):
+     """Split the data into train, validation and test sets
+
+     Args:
+         random_state (int): the seed of the splitting generator. Defaults to 50
+     """
+     # load the corpora and split into train and test sets
+     corpora = pd.read_csv(os.path.join(data_directory, csv_file))
+
+     train_set, test_valid_set = train_test_split(
+         corpora, test_size=0.2, random_state=random_state
+     )
+
+     # let us save the final training set
+
+     test_set, valid_set = train_test_split(
+         test_valid_set, test_size=0.5, random_state=random_state
+     )
+
+     train_set.to_csv(os.path.join(data_directory, "final_train_set.csv"), index=False)
+
+     # let us save the sets
+     train_set.to_csv(os.path.join(data_directory, "train_set.csv"), index=False)
+
+     valid_set.to_csv(os.path.join(data_directory, "valid_set.csv"), index=False)
+
+     test_set.to_csv(os.path.join(data_directory, "test_set.csv"), index=False)
+
+
+ def split_data_kaggle(
+     random_state: int,
+     data_directory: str,
+     split_directory: str = "kaggle/working/splits",
+     csv_file: str = "sentences.csv",
+ ):
+     """Split the data into train, validation and test sets (Kaggle variant)
+
+     Args:
+         random_state (int): the seed of the splitting generator
+     """
+     # load the corpora and split into train and test sets
+     corpora = pd.read_csv(os.path.join(data_directory, csv_file))
+
+     train_set, test_valid_set = train_test_split(
+         corpora, test_size=0.2, random_state=random_state
+     )
+
+     # let us save the final training set
+
+     test_set, valid_set = train_test_split(
+         test_valid_set, test_size=0.5, random_state=random_state
+     )
+
+     train_set.to_csv(os.path.join(split_directory, "final_train_set.csv"), index=False)
+
+     # let us save the sets
+     train_set.to_csv(os.path.join(split_directory, "train_set.csv"), index=False)
+
+     valid_set.to_csv(os.path.join(split_directory, "valid_set.csv"), index=False)
+
+     test_set.to_csv(os.path.join(split_directory, "test_set.csv"), index=False)
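A usage sketch, assuming a sentences.csv file sits in the default data directory:

from wolof_translate.utils.split_with_valid import split_data

# Writes train_set.csv, valid_set.csv and test_set.csv (an 80/10/10 split),
# plus final_train_set.csv, next to the source file.
split_data(random_state=50)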
wolof_translate/utils/tokenize_text.py
@@ -0,0 +1,46 @@
+ import spacy
+ from typing import *
+
+
+ def tokenization(
+     nlp=spacy.load("fr_core_news_lg"),
+     corpus: Union[List[str], Tuple[str]] = [],
+     rm_spaces: bool = True,
+ ):
+     """Tokenize the text (keep each unique token of both the French and the Wolof corpora)
+
+     Args:
+         nlp (Language, optional): A spaCy model. Defaults to spacy.load("fr_core_news_lg").
+         corpus (Union[List[str], Tuple[str]], optional): The list of documents. Defaults to [].
+         rm_spaces (bool, optional): Whether whitespace tokens should be removed. Defaults to True.
+
+     Returns:
+         List[List[str]]: The list of token lists
+     """
+
+     # Create an inner function to tokenize a given document
+     def transformation(doc):
+
+         tokens = []
+
+         for token in doc:
+
+             if not (rm_spaces and token.is_space):
+
+                 tokens.append(token.text)
+
+         return tokens
+
+     # Let's create a pipeline with the nlp object
+     docs = nlp.pipe(corpus)
+
+     # Initialize the list of tokenized documents
+     tokens = []
+
+     for doc in docs:
+
+         tokens_ = transformation(doc)
+
+         tokens.append(tokens_)
+
+     return tokens
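A sketch of calling the tokenizer on two French sentences; it assumes the fr_core_news_lg spaCy model is installed, since it is loaded as the default argument when the module is imported:

from wolof_translate.utils.tokenize_text import tokenization

tokens = tokenization(corpus=["Bonjour tout le monde !", "Ça va bien."])
# e.g. [['Bonjour', 'tout', 'le', 'monde', '!'], ['Ça', 'va', 'bien', '.']]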