wolof-translate 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wolof_translate/__init__.py +73 -0
- wolof_translate/data/__init__.py +0 -0
- wolof_translate/data/dataset_v1.py +151 -0
- wolof_translate/data/dataset_v2.py +187 -0
- wolof_translate/data/dataset_v3.py +187 -0
- wolof_translate/data/dataset_v3_2.py +187 -0
- wolof_translate/data/dataset_v4.py +202 -0
- wolof_translate/data/dataset_v5.py +65 -0
- wolof_translate/models/__init__.py +0 -0
- wolof_translate/models/transformers/__init__.py +0 -0
- wolof_translate/models/transformers/main.py +865 -0
- wolof_translate/models/transformers/main_2.py +362 -0
- wolof_translate/models/transformers/optimization.py +41 -0
- wolof_translate/models/transformers/position.py +46 -0
- wolof_translate/models/transformers/size.py +44 -0
- wolof_translate/pipe/__init__.py +1 -0
- wolof_translate/pipe/nlp_pipeline.py +512 -0
- wolof_translate/tokenizers/__init__.py +0 -0
- wolof_translate/trainers/__init__.py +0 -0
- wolof_translate/trainers/transformer_trainer.py +760 -0
- wolof_translate/trainers/transformer_trainer_custom.py +882 -0
- wolof_translate/trainers/transformer_trainer_ml.py +925 -0
- wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
- wolof_translate/utils/__init__.py +1 -0
- wolof_translate/utils/bucket_iterator.py +143 -0
- wolof_translate/utils/database_manager.py +116 -0
- wolof_translate/utils/display_predictions.py +162 -0
- wolof_translate/utils/download_model.py +40 -0
- wolof_translate/utils/evaluate_custom.py +147 -0
- wolof_translate/utils/evaluation.py +74 -0
- wolof_translate/utils/extract_new_sentences.py +810 -0
- wolof_translate/utils/extract_poems.py +60 -0
- wolof_translate/utils/extract_sentences.py +562 -0
- wolof_translate/utils/improvements/__init__.py +0 -0
- wolof_translate/utils/improvements/end_marks.py +45 -0
- wolof_translate/utils/recuperate_datasets.py +94 -0
- wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
- wolof_translate/utils/send_model.py +26 -0
- wolof_translate/utils/sent_corrections.py +169 -0
- wolof_translate/utils/sent_transformers.py +27 -0
- wolof_translate/utils/sent_unification.py +97 -0
- wolof_translate/utils/split_with_valid.py +72 -0
- wolof_translate/utils/tokenize_text.py +46 -0
- wolof_translate/utils/training.py +213 -0
- wolof_translate/utils/trunc_hg_training.py +196 -0
- wolof_translate-0.0.1.dist-info/METADATA +31 -0
- wolof_translate-0.0.1.dist-info/RECORD +49 -0
- wolof_translate-0.0.1.dist-info/WHEEL +5 -0
- wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/utils/recuperate_datasets.py
@@ -0,0 +1,94 @@
+from wolof_translate import *
+
+
+def recuperate_datasets(
+    char_p: float,
+    word_p: float,
+    max_len: int,
+    end_mark: int,
+    tokenizer: T5TokenizerFast,
+    corpus_1: str = "french",
+    corpus_2: str = "wolof",
+    train_file: str = "data/extractions/new_data/train_set.csv",
+    test_file: str = "data/extractions/new_data/test_file.csv",
+):
+
+    # Let us recuperate the end_mark adding option
+    if end_mark == 1:
+        # Create augmentation to add on French sentences
+        fr_augmentation_1 = TransformerSequences(
+            nac.RandomCharAug(
+                action="swap",
+                aug_char_p=char_p,
+                aug_word_p=word_p,
+                aug_word_max=max_len,
+            ),
+            remove_mark_space,
+            delete_guillemet_space,
+            add_mark_space,
+        )
+
+        fr_augmentation_2 = TransformerSequences(
+            remove_mark_space, delete_guillemet_space, add_mark_space
+        )
+
+    else:
+
+        if end_mark == 2:
+
+            end_mark_fn = partial(add_end_mark, end_mark_to_remove="!", replace=True)
+
+        elif end_mark == 3:
+
+            end_mark_fn = partial(add_end_mark)
+
+        elif end_mark == 4:
+
+            end_mark_fn = partial(add_end_mark, end_mark_to_remove="!")
+
+        else:
+
+            raise ValueError(f"No end mark number {end_mark}")
+
+        # Create augmentation to add on French sentences
+        fr_augmentation_1 = TransformerSequences(
+            nac.RandomCharAug(
+                action="swap",
+                aug_char_p=char_p,
+                aug_word_p=word_p,
+                aug_word_max=max_len,
+            ),
+            remove_mark_space,
+            delete_guillemet_space,
+            add_mark_space,
+            end_mark_fn,
+        )
+
+        fr_augmentation_2 = TransformerSequences(
+            remove_mark_space, delete_guillemet_space, add_mark_space, end_mark_fn
+        )
+
+    # Recuperate the train dataset
+    train_dataset_aug = SentenceDataset(
+        train_file,
+        tokenizer,
+        truncation=False,
+        cp1_transformer=fr_augmentation_1,
+        cp2_transformer=fr_augmentation_2,
+        corpus_1=corpus_1,
+        corpus_2=corpus_2,
+    )
+
+    # Recuperate the valid dataset
+    valid_dataset = SentenceDataset(
+        test_file,
+        tokenizer,
+        cp1_transformer=fr_augmentation_2,
+        cp2_transformer=fr_augmentation_2,
+        corpus_1=corpus_1,
+        corpus_2=corpus_2,
+        truncation=False,
+    )
+
+    # Return the datasets
+    return train_dataset_aug, valid_dataset
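
A minimal usage sketch of recuperate_datasets (not part of the package diff). It assumes the wolof_translate star import exposes the helpers the module relies on (nac, SentenceDataset, partial, add_end_mark), that a T5 tokenizer checkpoint such as "t5-small" is reachable, and that the default train/test CSV files exist under data/extractions/new_data/.

from transformers import T5TokenizerFast
from wolof_translate.utils.recuperate_datasets import recuperate_datasets

# Any T5-style fast tokenizer works here; "t5-small" is just an example checkpoint.
tokenizer = T5TokenizerFast.from_pretrained("t5-small")

train_ds, valid_ds = recuperate_datasets(
    char_p=0.1,        # character-level augmentation probability
    word_p=0.1,        # word-level augmentation probability
    max_len=20,        # maximum number of augmented words per sentence
    end_mark=1,        # 1 skips the end-mark transform; 2-4 select an add_end_mark variant
    tokenizer=tokenizer,
)
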
wolof_translate/utils/recuperate_datasets_trunc.py
@@ -0,0 +1,85 @@
+from wolof_translate import *
+
+
+def recuperate_datasets(
+    char_p: float,
+    word_p: float,
+    max_len: int,
+    end_mark: int,
+    tokenizer: T5TokenizerFast,
+    corpus_1: str = "french",
+    corpus_2: str = "wolof",
+    train_file: str = "data/extractions/new_data/train_set.csv",
+    test_file: str = "data/extractions/new_data/test_file.csv",
+    augmenter=partial(nac.RandomCharAug, action="swap"),
+):
+
+    # Let us recuperate the end_mark adding option
+    if end_mark == 1:
+        # Create augmentation to add on French sentences
+        fr_augmentation_1 = TransformerSequences(
+            augmenter(aug_char_p=char_p, aug_word_p=word_p, aug_word_max=max_len),
+            remove_mark_space,
+            delete_guillemet_space,
+            add_mark_space,
+        )
+
+        fr_augmentation_2 = TransformerSequences(
+            remove_mark_space, delete_guillemet_space, add_mark_space
+        )
+
+    else:
+
+        if end_mark == 2:
+
+            end_mark_fn = partial(add_end_mark, end_mark_to_remove="!", replace=True)
+
+        elif end_mark == 3:
+
+            end_mark_fn = partial(add_end_mark)
+
+        elif end_mark == 4:
+
+            end_mark_fn = partial(add_end_mark, end_mark_to_remove="!")
+
+        else:
+
+            raise ValueError(f"No end mark number {end_mark}")
+
+        # Create augmentation to add on French sentences
+        fr_augmentation_1 = TransformerSequences(
+            augmenter(aug_char_p=char_p, aug_word_p=word_p, aug_word_max=max_len),
+            remove_mark_space,
+            delete_guillemet_space,
+            add_mark_space,
+            end_mark_fn,
+        )
+
+        fr_augmentation_2 = TransformerSequences(
+            remove_mark_space, delete_guillemet_space, add_mark_space, end_mark_fn
+        )
+
+    # Recuperate the train dataset
+    train_dataset_aug = SentenceDataset(
+        train_file,
+        tokenizer,
+        truncation=False,
+        cp1_transformer=fr_augmentation_1,
+        cp2_transformer=fr_augmentation_2,
+        corpus_1=corpus_1,
+        corpus_2=corpus_2,
+    )
+
+    # Recuperate the valid dataset
+    valid_dataset = SentenceDataset(
+        test_file,
+        tokenizer,
+        cp1_transformer=fr_augmentation_2,
+        cp2_transformer=fr_augmentation_2,
+        corpus_1=corpus_1,
+        corpus_2=corpus_2,
+        truncation=False,
+    )
+
+    # Return the datasets
+    return train_dataset_aug, valid_dataset
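
This variant only differs by the injectable augmenter factory. A hypothetical call swapping in another nlpaug character augmenter might look like the following, under the same assumptions as the sketch above.

from functools import partial

import nlpaug.augmenter.char as nac
from transformers import T5TokenizerFast
from wolof_translate.utils.recuperate_datasets_trunc import recuperate_datasets

tokenizer = T5TokenizerFast.from_pretrained("t5-small")

train_ds, valid_ds = recuperate_datasets(
    char_p=0.1,
    word_p=0.1,
    max_len=20,
    end_mark=1,
    tokenizer=tokenizer,
    # Any factory accepting aug_char_p / aug_word_p / aug_word_max works as augmenter.
    augmenter=partial(nac.RandomCharAug, action="insert"),
)
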
wolof_translate/utils/send_model.py
@@ -0,0 +1,26 @@
+import wandb
+
+
+def add_directory(
+    directory: str,
+    artifact_name: str,
+    project: str = "fw_artifacts",
+    entity: str = "oumar-kane-team",
+):
+    """Initialize a project and add checkpoints as an artifact to wandb
+
+    Args:
+        directory (str): The directory where the checkpoints are stored
+        artifact_name (str): The name of the artifact
+        project (str, optional): The project name. Defaults to 'fw_artifacts'.
+        entity (str, optional): The entity name. Defaults to 'oumar-kane-team'.
+    """
+
+    run = wandb.init(project=project, entity=entity)
+
+    # add a directory as artifact to wandb
+    artifact = wandb.Artifact(artifact_name, type="dataset")
+    artifact.add_dir(directory)
+    run.log_artifact(artifact)
+
+    wandb.finish()
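
A hedged usage sketch of add_directory: it assumes you are logged in to Weights & Biases and that the checkpoint folder (the path below is hypothetical) exists; by default it uploads to the author's fw_artifacts project and oumar-kane-team entity.

from wolof_translate.utils.send_model import add_directory

# Uploads every file under the directory as a wandb artifact, then closes the run.
add_directory(
    directory="checkpoints/t5_runs",        # hypothetical local checkpoint folder
    artifact_name="t5_runs_checkpoints",    # hypothetical artifact name
    project="my_project",                   # override the default project/entity as needed
    entity="my_team",
)
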
wolof_translate/utils/sent_corrections.py
@@ -0,0 +1,169 @@
+from typing import *
+
+
+def add_guillemet_space(sentences: Union[list, str]):
+    """Add a space between a letter and a guillemet in a sentence
+
+    Args:
+        sentences (Union[list, str]): The sentence(s) to modify
+
+    Returns:
+        list: The modified sentences
+    """
+
+    if type(sentences) is str:
+        sentences = [sentences]
+
+    for s in range(len(sentences)):
+
+        sentence = sentences[s]
+
+        if "«" in sentence:
+
+            sentence = sentence.split()
+
+            for i in range(len(sentence)):
+
+                word = sentence[i]
+
+                if "«" in word and word != "«":
+
+                    word = word.split("«")
+
+                    word = "« ".join(word)
+
+                if "»" in word and word != "»":
+
+                    word = word.split("»")
+
+                    word = " »".join(word)
+
+                sentence[i] = word
+
+            sentence = " ".join(sentence)
+
+        sentences[s] = sentence
+
+    return sentences
+
+
+def delete_guillemet_space(sentences: Union[list, str]):
+
+    if type(sentences) is str:
+        sentences = [sentences]
+
+    for s in range(len(sentences)):
+
+        sentence = sentences[s]
+
+        letters = [sentence[0]]
+
+        for i in range(1, len(sentence)):
+
+            if sentence[i] == "”":
+
+                j = i - 1
+
+                while letters[j] == " ":
+
+                    letters[j] = ""
+
+                    j -= 1
+
+                letters.append(sentence[i])
+
+            elif letters[-1] == "“" and sentence[i] == " ":
+
+                letters.append("")
+
+            else:
+
+                letters.append(sentence[i])
+
+        sentences[s] = "".join(letters)
+
+    return sentences
+
+
+def add_mark_space(
+    sentences: Union[list, str], marks: list = ["?", "!", "–", ":", ";"]
+):
+
+    if type(sentences) is str:
+        sentences = [sentences]
+
+    for s in range(len(sentences)):
+
+        sentence = sentences[s]
+
+        letters = [sentence[0]]
+
+        for i in range(1, len(sentence)):
+
+            if sentence[i] in marks and letters[-1] != " ":
+
+                letters[-1] = letters[-1] + " "
+
+                letters.append(sentence[i])
+
+            elif letters[-1] in marks and sentence[i] != " ":
+
+                letters.append(sentence[i] + " ")
+
+            else:
+
+                letters.append(sentence[i])
+
+        sentences[s] = "".join(letters)
+
+    return sentences
+
+
+def remove_mark_space(sentences: Union[list, str], marks: list = ["'", "-"]):
+
+    if type(sentences) is str:
+        sentences = [sentences]
+
+    for s in range(len(sentences)):
+
+        sentence = sentences[s]
+
+        letters = [sentence[0]]
+
+        for i in range(1, len(sentence)):
+
+            if sentence[i] in marks:
+
+                j = i - 1
+
+                while letters[j] == " ":
+
+                    letters[j] = ""
+
+                    j -= 1
+
+                letters.append(sentence[i])
+
+            elif letters[-1] in marks and sentence[i] == " ":
+
+                letters.append("")
+
+            else:
+
+                letters.append(sentence[i])
+
+        sentences[s] = "".join(letters)
+
+    return sentences
+
+
+def delete_much_space(sentences: Union[list, str]):
+
+    if type(sentences) is str:
+        sentences = [sentences]
+
+    for i in range(len(sentences)):
+
+        sentences[i] = " ".join(sentences[i].split())
+
+    return sentences
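
A small sketch chaining the cleaning helpers above; the input sentence is hypothetical, and each helper returns a list even when given a single string.

from wolof_translate.utils.sent_corrections import (
    add_mark_space,
    delete_guillemet_space,
    delete_much_space,
    remove_mark_space,
)

sentence = "Il a dit “ bonjour ” , n' est -ce pas?"
sentence = delete_much_space(sentence)       # collapse runs of whitespace
sentence = delete_guillemet_space(sentence)  # drop spaces stuck to curly quotes
sentence = remove_mark_space(sentence)       # glue apostrophes and hyphens to their words
sentence = add_mark_space(sentence)          # insert a space before ?, !, :, ; and –
print(sentence[0])
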
wolof_translate/utils/sent_transformers.py
@@ -0,0 +1,27 @@
+from typing import *
+
+
+class TransformerSequences:
+    def __init__(self, *args, **kwargs):
+
+        self.transformers = []
+
+        self.transformers.extend(list(args))
+
+        self.transformers.extend(list(kwargs.values()))
+
+    def __call__(self, sentences: Union[List, str]):
+
+        output = sentences
+
+        for transformer in self.transformers:
+
+            if hasattr(transformer, "augment"):
+
+                output = transformer.augment(output)
+
+            else:
+
+                output = transformer(output)
+
+        return output
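
TransformerSequences applies its transformers in order, calling .augment() when available (as nlpaug augmenters provide) and plain __call__ otherwise. A small sketch, assuming nlpaug is installed:

import nlpaug.augmenter.char as nac

from wolof_translate.utils.sent_corrections import add_mark_space, remove_mark_space
from wolof_translate.utils.sent_transformers import TransformerSequences

pipeline = TransformerSequences(
    nac.RandomCharAug(action="swap", aug_char_p=0.1, aug_word_p=0.1),  # exposes .augment()
    remove_mark_space,                                                 # plain callables
    add_mark_space,
)

augmented = pipeline("Bonjour, comment allez-vous?")  # a list of transformed strings
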
wolof_translate/utils/sent_unification.py
@@ -0,0 +1,97 @@
+from nlp_project import *
+
+
+def unify_correction(
+    sentences: list,
+    marks: List[Tuple] = [("«", "»", True), ("(", ")", False)],
+    unified_sentences_between_pos: List[Tuple] = [(925, 930)],
+):
+
+    corrected_sentences = []
+
+    only_end_mark = []
+
+    only_begin_mark = []
+
+    i = 0
+
+    while i < len(sentences):
+
+        for u in unified_sentences_between_pos:
+
+            if i >= u[0] - 1 and i < u[1]:
+
+                range_ = u[1] - u[0]
+
+                unification = sentences[u[0] - 1]
+
+                for j in range(u[0], u[0] + range_):
+
+                    unification += " " + sentences[j]
+
+                i += range_ + 1
+
+                corrected_sentences.append(unification)
+
+        unify_next = False
+
+        space = " "
+
+        if i != 0:
+
+            for mark in marks:
+
+                begin_mark = False
+
+                end_mark = False
+
+                for letter in corrected_sentences[-1]:
+
+                    if letter == mark[1]:
+
+                        begin_mark = False
+
+                    elif letter == mark[0]:
+
+                        begin_mark = True
+
+                for letter in sentences[i]:
+
+                    if letter == mark[1]:
+
+                        end_mark = True
+
+                        break
+
+                    else:
+
+                        break
+
+                if end_mark and not begin_mark:
+
+                    only_end_mark.append(sentences[i])
+
+                elif begin_mark and not end_mark:
+
+                    only_begin_mark.append(corrected_sentences[-1])
+
+                if end_mark and begin_mark:
+
+                    unify_next = True
+
+                    space = " " if mark[2] else ""
+
+        if unify_next:
+
+            corrected_sentences[-1] = corrected_sentences[-1] + space + sentences[i]
+
+        else:
+
+            corrected_sentences.append(sentences[i])
+
+        i += 1
+
+    return corrected_sentences, {
+        "begin_mark_only": only_begin_mark,
+        "end_mark_only": only_end_mark,
+    }
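
A hedged sketch of unify_correction on a hypothetical list of extracted fragments. Note that the module star-imports nlp_project, which must be importable in your environment, and that the default unified_sentences_between_pos targets positions 925-930 of the author's corpus, so it is overridden here.

from wolof_translate.utils.sent_unification import unify_correction

fragments = [
    "Il a dit « bonjour tout le monde",
    "» avant de partir.",
    "Fin.",
]

unified, leftovers = unify_correction(fragments, unified_sentences_between_pos=[])

# A fragment that opens a mark is merged with the next fragment when that fragment
# starts with the matching closing mark; unmatched pieces are collected in
# leftovers["begin_mark_only"] and leftovers["end_mark_only"].
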
wolof_translate/utils/split_with_valid.py
@@ -0,0 +1,72 @@
+"""This module contains functions that split the data, adding a validation set to the usual train/test split.
+"""
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import os
+
+
+def split_data(
+    random_state: int = 50,
+    data_directory: str = "data/extractions/new_data",
+    csv_file: str = "sentences.csv",
+):
+    """Split data between train, validation and test sets
+
+    Args:
+        random_state (int): the seed of the splitting generator. Defaults to 50
+    """
+    # load the corpora and split into train and test sets
+    corpora = pd.read_csv(os.path.join(data_directory, csv_file))
+
+    train_set, test_valid_set = train_test_split(
+        corpora, test_size=0.2, random_state=random_state
+    )
+
+    # let us save the final training set used when performing the final training
+
+    test_set, valid_set = train_test_split(
+        test_valid_set, test_size=0.5, random_state=random_state
+    )
+
+    train_set.to_csv(os.path.join(data_directory, "final_train_set.csv"), index=False)
+
+    # let us save the sets
+    train_set.to_csv(os.path.join(data_directory, "train_set.csv"), index=False)
+
+    valid_set.to_csv(os.path.join(data_directory, "valid_set.csv"), index=False)
+
+    test_set.to_csv(os.path.join(data_directory, "test_set.csv"), index=False)
+
+
+def split_data_kaggle(
+    random_state: int,
+    data_directory: str,
+    split_directory: str = "kaggle/working/splits",
+    csv_file: str = "sentences.csv",
+):
+    """Split data between train, validation and test sets
+
+    Args:
+        random_state (int): the seed of the splitting generator
+    """
+    # load the corpora and split into train and test sets
+    corpora = pd.read_csv(os.path.join(data_directory, csv_file))
+
+    train_set, test_valid_set = train_test_split(
+        corpora, test_size=0.2, random_state=random_state
+    )
+
+    # let us save the final training set used when performing the final training
+
+    test_set, valid_set = train_test_split(
+        test_valid_set, test_size=0.5, random_state=random_state
+    )
+
+    train_set.to_csv(os.path.join(split_directory, "final_train_set.csv"), index=False)
+
+    # let us save the sets
+    train_set.to_csv(os.path.join(split_directory, "train_set.csv"), index=False)
+
+    valid_set.to_csv(os.path.join(split_directory, "valid_set.csv"), index=False)
+
+    test_set.to_csv(os.path.join(split_directory, "test_set.csv"), index=False)
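
A usage sketch of split_data, assuming the default sentences.csv exists under data/extractions/new_data; it writes the train/valid/test CSVs (an 80/10/10 split) back into that directory.

from wolof_translate.utils.split_with_valid import split_data

# Reads data/extractions/new_data/sentences.csv and writes train_set.csv,
# valid_set.csv, test_set.csv and final_train_set.csv alongside it.
split_data(random_state=50)
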
wolof_translate/utils/tokenize_text.py
@@ -0,0 +1,46 @@
+import spacy
+from typing import *
+
+
+def tokenization(
+    nlp=spacy.load("fr_core_news_lg"),
+    corpus: Union[List[str], Tuple[str]] = [],
+    rm_spaces: bool = True,
+):
+    """Tokenize the text (keep each unique token in both the French and the Wolof corpora)
+
+    Args:
+        nlp (Language, optional): A spacy model. Defaults to spacy.load("fr_core_news_lg").
+        corpus (Union[List[str], Tuple[str]], optional): The list of documents. Defaults to [].
+        rm_spaces (bool, optional): Whether whitespace tokens are removed. Defaults to True.
+
+    Returns:
+        List[List[str]]: The list of lists of tokens
+    """
+
+    # Create an inner function to tokenize a given document
+    def transformation(doc):
+
+        tokens = []
+
+        for token in doc:
+
+            if not (rm_spaces and token.is_space):
+
+                tokens.append(token.text)
+
+        return tokens
+
+    # Let's create a pipeline with the nlp object
+    docs = nlp.pipe(corpus)
+
+    # Initialize the list of tokenized documents
+    tokens = []
+
+    for doc in docs:
+
+        tokens_ = transformation(doc)
+
+        tokens.append(tokens_)
+
+    return tokens
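
A usage sketch of tokenization. It assumes the fr_core_news_lg spaCy model is installed (the module loads it at import time for the default argument), and the example sentences are hypothetical.

import spacy

from wolof_translate.utils.tokenize_text import tokenization

nlp = spacy.load("fr_core_news_lg")  # install first: python -m spacy download fr_core_news_lg

tokens = tokenization(nlp, corpus=["Bonjour tout le monde.", "Ça va ?"])
# tokens is a list with one token list per document, whitespace tokens removed.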