wolof-translate 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wolof_translate/__init__.py +73 -0
- wolof_translate/data/__init__.py +0 -0
- wolof_translate/data/dataset_v1.py +151 -0
- wolof_translate/data/dataset_v2.py +187 -0
- wolof_translate/data/dataset_v3.py +187 -0
- wolof_translate/data/dataset_v3_2.py +187 -0
- wolof_translate/data/dataset_v4.py +202 -0
- wolof_translate/data/dataset_v5.py +65 -0
- wolof_translate/models/__init__.py +0 -0
- wolof_translate/models/transformers/__init__.py +0 -0
- wolof_translate/models/transformers/main.py +865 -0
- wolof_translate/models/transformers/main_2.py +362 -0
- wolof_translate/models/transformers/optimization.py +41 -0
- wolof_translate/models/transformers/position.py +46 -0
- wolof_translate/models/transformers/size.py +44 -0
- wolof_translate/pipe/__init__.py +1 -0
- wolof_translate/pipe/nlp_pipeline.py +512 -0
- wolof_translate/tokenizers/__init__.py +0 -0
- wolof_translate/trainers/__init__.py +0 -0
- wolof_translate/trainers/transformer_trainer.py +760 -0
- wolof_translate/trainers/transformer_trainer_custom.py +882 -0
- wolof_translate/trainers/transformer_trainer_ml.py +925 -0
- wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
- wolof_translate/utils/__init__.py +1 -0
- wolof_translate/utils/bucket_iterator.py +143 -0
- wolof_translate/utils/database_manager.py +116 -0
- wolof_translate/utils/display_predictions.py +162 -0
- wolof_translate/utils/download_model.py +40 -0
- wolof_translate/utils/evaluate_custom.py +147 -0
- wolof_translate/utils/evaluation.py +74 -0
- wolof_translate/utils/extract_new_sentences.py +810 -0
- wolof_translate/utils/extract_poems.py +60 -0
- wolof_translate/utils/extract_sentences.py +562 -0
- wolof_translate/utils/improvements/__init__.py +0 -0
- wolof_translate/utils/improvements/end_marks.py +45 -0
- wolof_translate/utils/recuperate_datasets.py +94 -0
- wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
- wolof_translate/utils/send_model.py +26 -0
- wolof_translate/utils/sent_corrections.py +169 -0
- wolof_translate/utils/sent_transformers.py +27 -0
- wolof_translate/utils/sent_unification.py +97 -0
- wolof_translate/utils/split_with_valid.py +72 -0
- wolof_translate/utils/tokenize_text.py +46 -0
- wolof_translate/utils/training.py +213 -0
- wolof_translate/utils/trunc_hg_training.py +196 -0
- wolof_translate-0.0.1.dist-info/METADATA +31 -0
- wolof_translate-0.0.1.dist-info/RECORD +49 -0
- wolof_translate-0.0.1.dist-info/WHEEL +5 -0
- wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/__init__.py
@@ -0,0 +1,73 @@

"""Script containing importation
================================
"""

# let us import all necessary libraries
from transformers import (
    T5Model,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    T5TokenizerFast,
    set_seed,
    AdamWeightDecay,
    get_linear_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_constant_schedule_with_warmup,
    Adafactor,
)
from wolof_translate.utils.sent_transformers import TransformerSequences
from wolof_translate.utils.improvements.end_marks import add_end_mark  # added
from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
from torch.utils.data import Dataset, DataLoader, random_split
from wolof_translate.data.dataset_v4 import SentenceDataset  # v2 -> v3 -> v4
from wolof_translate.utils.sent_corrections import *
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import _LRScheduler
from torch.nn.utils.rnn import pad_sequence
from plotly.subplots import make_subplots
from nlpaug.augmenter import char as nac
from torch.utils.data import DataLoader
from torch.nn import functional as F
import plotly.graph_objects as go
from tokenizers import Tokenizer
import torch.distributed as dist
import matplotlib.pyplot as plt
import pytorch_lightning as lt
from tqdm import tqdm, trange
from functools import partial
from torch.nn import utils
from copy import deepcopy
from torch import optim
from typing import *
from torch import nn
import pandas as pd
import numpy as np
import itertools
import evaluate
import random
import string
import shutil
import wandb
import torch
import json
import copy
import os

###-----------------------------------------------
# Libraries imported from wolof translate
from wolof_translate.utils.bucket_iterator import (
    SequenceLengthBatchSampler,
    BucketSampler,
    collate_fn,
    collate_fn_trunc,
)
from wolof_translate.trainers.transformer_trainer_custom import (
    ModelRunner as CustomModelRunner,
)
from wolof_translate.models.transformers.optimization import TransformerScheduler
from wolof_translate.utils.recuperate_datasets import recuperate_datasets
from wolof_translate.trainers.transformer_trainer_ml_ import ModelRunner
from wolof_translate.utils.evaluate_custom import TranslationEvaluation
from wolof_translate.models.transformers.main import Transformer
from wolof_translate.utils.split_with_valid import split_data

wolof_translate/data/__init__.py
File without changes
wolof_translate/data/dataset_v1.py
@@ -0,0 +1,151 @@

from wolof_translate.utils.sent_transformers import TransformerSequences
from transformers import PreTrainedTokenizerFast
from torch.utils.data import Dataset
from tokenizers import Tokenizer
from typing import *
import pandas as pd
import torch
import re


class SentenceDataset(Dataset):
    def __init__(
        self,
        file_path: str,
        corpus_1: str = "french_corpus",
        corpus_2: str = "wolof_corpus",
        tokenizer_path: str = "wolof-translate/wolof_translate/tokenizers/tokenizer_v1.json",
        max_len: int = 379,
        truncation: bool = False,
        file_sep: str = ",",
        cls_token: str = "<|endoftext|>",
        sep_token: str = "<|translateto|>",
        pad_token: str = "<|pad|>",
        cp1_transformer: Union[TransformerSequences, None] = None,
        cp2_transformer: Union[TransformerSequences, None] = None,
        **kwargs,
    ):

        # let us recuperate the data frame
        self.__sentences = pd.read_csv(file_path, sep=file_sep, **kwargs)

        # let us recuperate the tokenizer
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=tokenizer_path,
            bos_token=cls_token,
            eos_token=cls_token,
            pad_token=pad_token,
        )

        # recuperate the first corpus' sentences
        self.__sentences_1 = self.__sentences[corpus_1].to_list()

        # recuperate the second corpus' sentences
        self.__sentences_2 = self.__sentences[corpus_2].to_list()

        # recuperate the special tokens
        self.cls_token = cls_token

        self.sep_token = sep_token

        self.pad_token = pad_token

        # recuperate the length
        self.__length = len(self.__sentences_1)

        # recuperate the max id
        self.max_id = len(self.tokenizer) - 1

        # let us recuperate the max len
        self.max_len = max_len

        # let us recuperate the truncate argument
        self.truncation = truncation

        # let us initialize the transformer
        self.cp1_transformer = cp1_transformer

        self.cp2_transformer = cp2_transformer

    def __getitem__(self, index):

        sentence_1 = self.__sentences_1[index]

        sentence_2 = self.__sentences_2[index]

        # apply transformers if necessary
        if not self.cp1_transformer is None:

            sentence_1 = self.cp1_transformer(sentence_1)

        if not self.cp2_transformer is None:

            sentence_2 = self.cp2_transformer(sentence_2)

        # let us create the sentence with special tokens
        sentence = (
            f"{self.cls_token}{sentence_1}{self.sep_token}{sentence_2}{self.cls_token}"
        )

        # let us encode the sentence
        encoding = self.tokenizer(
            sentence,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        return encoding.input_ids.squeeze(0), encoding.attention_mask.squeeze(0)

    def __len__(self):

        return self.__length

    def decode(self, ids: torch.Tensor, for_prediction: bool = False):

        if ids.ndim < 2:

            ids = ids.unsqueeze(0)

        ids = ids.tolist()

        for id in ids:

            sentence = self.tokenizer.decode(id)

            if not for_prediction:

                sentence = sentence.split(f"{self.sep_token}")

            else:

                try:

                    while self.sep_token in sentence:

                        sentence = re.findall(f"{self.sep_token}(.*)", sentence)[-1]

                except:

                    sentence = "None"

            if for_prediction:

                yield sentence.replace(f"{self.cls_token}", "").replace(
                    f"{self.pad_token}", ""
                )

            else:

                sents = []

                for sent in sentence:

                    sents.append(
                        sent.replace(f"{self.cls_token}", "").replace(
                            f"{self.pad_token}", ""
                        )
                    )

                yield sents
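`dataset_v1.SentenceDataset` is a decoder-style dataset: each pair is packed into one string of the form `<|endoftext|>french<|translateto|>wolof<|endoftext|>`, tokenized with a local tokenizer file, and returned as `(input_ids, attention_mask)` padded to `max_len`. A usage sketch follows; the CSV and tokenizer paths are hypothetical, and the CSV must provide the `french_corpus` and `wolof_corpus` columns expected by the defaults.

from torch.utils.data import DataLoader

from wolof_translate.data.dataset_v1 import SentenceDataset

# Hypothetical paths: a parallel-corpus CSV and a trained tokenizer JSON.
dataset = SentenceDataset(
    "corpora/train.csv",
    tokenizer_path="tokenizers/tokenizer_v1.json",
    max_len=379,
)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

input_ids, attention_mask = next(iter(loader))  # both of shape (8, 379)

# decode() is a generator: without for_prediction it yields the decoded
# text split on <|translateto|>; with for_prediction=True it yields only
# the text found after the last separator.
source_and_target = next(dataset.decode(input_ids[0]))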
wolof_translate/data/dataset_v2.py
@@ -0,0 +1,187 @@

from wolof_translate.utils.sent_transformers import TransformerSequences
from transformers import PreTrainedTokenizerFast
from torch.utils.data import Dataset
from typing import *
import pandas as pd
import torch
import re


class T5SentenceDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        tokenizer: PreTrainedTokenizerFast,
        corpus_1: str = "french",
        corpus_2: str = "wolof",
        max_len: int = 51,
        truncation: bool = False,
        file_sep: str = ",",
        cp1_transformer: Union[TransformerSequences, None] = None,
        cp2_transformer: Union[TransformerSequences, None] = None,
        **kwargs
    ):

        # let us recuperate the data frame
        self.__sentences = pd.read_csv(data_path, sep=file_sep, **kwargs)

        # let us recuperate the tokenizer
        self.tokenizer = tokenizer

        # recuperate the first corpus' sentences
        self.sentences_1 = self.__sentences[corpus_1].to_list()

        # recuperate the second corpus' sentences
        self.sentences_2 = self.__sentences[corpus_2].to_list()

        # recuperate the length
        self.length = len(self.sentences_1)

        # let us recuperate the max len
        self.max_len = max_len

        # let us recuperate the truncation argument
        self.truncation = truncation

        # let us initialize the transformer
        self.cp1_transformer = cp1_transformer

        self.cp2_transformer = cp2_transformer

    def __getitem__(self, index):
        """Recuperate ids and attention masks of sentences at index

        Args:
            index (int): The index of the sentences to recuperate

        Returns:
            tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
            `the labels' ids`
        """
        sentence_1 = self.sentences_1[index]

        sentence_2 = self.sentences_2[index]

        # apply transformers if necessary
        if not self.cp1_transformer is None:

            sentence_1 = self.cp1_transformer(sentence_1)[0]

        if not self.cp2_transformer is None:

            sentence_2 = self.cp2_transformer(sentence_2)[0]

        sentence_1 = sentence_1 + self.tokenizer.eos_token

        sentence_2 = sentence_2 + self.tokenizer.eos_token

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        data = self.tokenizer(
            sentence_1,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
            text_target=sentence_2,
        )

        return (
            data.input_ids.squeeze(0),
            data.attention_mask.squeeze(0),
            data.labels.squeeze(0),
        )

    def __len__(self):

        return self.length

    def decode(self, labels: torch.Tensor):

        if labels.ndim < 2:

            labels = labels.unsqueeze(0)

        sentences = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        return sentences


class SentenceDataset(T5SentenceDataset):
    def __init__(
        self,
        data_path: str,
        tokenizer: PreTrainedTokenizerFast,
        corpus_1: str = "french",
        corpus_2: str = "wolof",
        max_len: int = 51,
        truncation: bool = False,
        file_sep: str = ",",
        cp1_transformer: Union[TransformerSequences, None] = None,
        cp2_transformer: Union[TransformerSequences, None] = None,
        **kwargs
    ):

        super().__init__(
            data_path,
            tokenizer,
            corpus_1,
            corpus_2,
            max_len,
            truncation,
            file_sep,
            cp1_transformer,
            cp2_transformer,
            **kwargs
        )

    def __getitem__(self, index):
        """Recuperate ids and attention masks of sentences at index

        Args:
            index (int): The index of the sentences to recuperate

        Returns:
            tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
            `the labels' ids`
        """
        sentence_1 = self.sentences_1[index]

        sentence_2 = self.sentences_2[index]

        # apply transformers if necessary
        if not self.cp1_transformer is None:

            sentence_1 = self.cp1_transformer(sentence_1)[0]

        if not self.cp2_transformer is None:

            sentence_2 = self.cp2_transformer(sentence_2)[0]

        sentence_1 = sentence_1 + self.tokenizer.eos_token

        sentence_2 = sentence_2 + self.tokenizer.eos_token

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        data = self.tokenizer(
            sentence_1,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        labels = self.tokenizer(
            sentence_2,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        return (
            data.input_ids.squeeze(0),
            data.attention_mask.squeeze(0),
            labels.input_ids.squeeze(0),
            labels.attention_mask.squeeze(0),
        )
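`dataset_v2` switches to an encoder-decoder format: `T5SentenceDataset` lets the tokenizer build the labels via `text_target` and returns `(input_ids, attention_mask, labels)`, while the derived `SentenceDataset` tokenizes the target separately and also returns its attention mask. Below is a sketch of how such a batch could feed a T5-style model; the checkpoint name, the CSV path, and the `-100` label masking are assumptions for illustration, not something this release documents.

from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast

from wolof_translate.data.dataset_v2 import T5SentenceDataset

# Assumed checkpoint and CSV (columns "french" and "wolof" per the defaults).
tokenizer = T5TokenizerFast.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

dataset = T5SentenceDataset("corpora/train.csv", tokenizer, max_len=51, truncation=True)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

input_ids, attention_mask, labels = next(iter(loader))

# The dataset pads the labels with the regular pad id; replacing pad
# positions with -100 (the usual convention) keeps them out of the loss.
labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
loss.backward()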
wolof_translate/data/dataset_v3.py
@@ -0,0 +1,187 @@

from wolof_translate.utils.sent_transformers import TransformerSequences
from transformers import PreTrainedTokenizerFast
from torch.utils.data import Dataset
from typing import *
import pandas as pd
import torch
import re


class T5SentenceDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        tokenizer: PreTrainedTokenizerFast,
        corpus_1: str = "french",
        corpus_2: str = "wolof",
        max_len: int = 60,
        truncation: bool = False,
        file_sep: str = ",",
        cp1_transformer: Union[TransformerSequences, None] = None,
        cp2_transformer: Union[TransformerSequences, None] = None,
        **kwargs
    ):

        # let us recuperate the data frame
        self.__sentences = pd.read_csv(data_path, sep=file_sep, **kwargs)

        # let us recuperate the tokenizer
        self.tokenizer = tokenizer

        # recuperate the first corpus' sentences
        self.sentences_1 = self.__sentences[corpus_1].to_list()

        # recuperate the second corpus' sentences
        self.sentences_2 = self.__sentences[corpus_2].to_list()

        # recuperate the length
        self.length = len(self.sentences_1)

        # let us recuperate the max len
        self.max_len = max_len + max_len // 6

        # let us recuperate the truncation argument
        self.truncation = truncation

        # let us initialize the transformer
        self.cp1_transformer = cp1_transformer

        self.cp2_transformer = cp2_transformer

    def __getitem__(self, index):
        """Recuperate ids and attention masks of sentences at index

        Args:
            index (int): The index of the sentences to recuperate

        Returns:
            tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
            `the labels' ids`
        """
        sentence_1 = self.sentences_1[index]

        sentence_2 = self.sentences_2[index]

        # apply transformers if necessary
        if not self.cp1_transformer is None:

            sentence_1 = self.cp1_transformer(sentence_1)[0]

        if not self.cp2_transformer is None:

            sentence_2 = self.cp2_transformer(sentence_2)[0]

        sentence_1 = sentence_1 + self.tokenizer.eos_token

        sentence_2 = sentence_2 + self.tokenizer.eos_token

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        data = self.tokenizer(
            sentence_1,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
            text_target=sentence_2,
        )

        return (
            data.input_ids.squeeze(0),
            data.attention_mask.squeeze(0),
            data.labels.squeeze(0),
        )

    def __len__(self):

        return self.length

    def decode(self, labels: torch.Tensor):

        if labels.ndim < 2:

            labels = labels.unsqueeze(0)

        sentences = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        return sentences


class SentenceDataset(T5SentenceDataset):
    def __init__(
        self,
        data_path: str,
        tokenizer: PreTrainedTokenizerFast,
        corpus_1: str = "french",
        corpus_2: str = "wolof",
        max_len: int = 42,
        truncation: bool = False,
        file_sep: str = ",",
        cp1_transformer: Union[TransformerSequences, None] = None,
        cp2_transformer: Union[TransformerSequences, None] = None,
        **kwargs
    ):

        super().__init__(
            data_path,
            tokenizer,
            corpus_1,
            corpus_2,
            max_len,
            truncation,
            file_sep,
            cp1_transformer,
            cp2_transformer,
            **kwargs
        )

    def __getitem__(self, index):
        """Recuperate ids and attention masks of sentences at index

        Args:
            index (int): The index of the sentences to recuperate

        Returns:
            tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
            `the labels' ids`
        """
        sentence_1 = self.sentences_1[index]

        sentence_2 = self.sentences_2[index]

        # apply transformers if necessary
        if not self.cp1_transformer is None:

            sentence_1 = self.cp1_transformer(sentence_1)[0]

        if not self.cp2_transformer is None:

            sentence_2 = self.cp2_transformer(sentence_2)[0]

        sentence_1 = sentence_1 + self.tokenizer.eos_token

        sentence_2 = sentence_2 + self.tokenizer.eos_token

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        data = self.tokenizer(
            sentence_1,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
        labels = self.tokenizer(
            sentence_2,
            truncation=self.truncation,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        return (
            data.input_ids.squeeze(0),
            data.attention_mask.squeeze(0),
            labels.input_ids.squeeze(0),
            labels.attention_mask.squeeze(0),
        )
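Compared with `dataset_v2.py`, this third version changes only the defaults (`max_len=60` for `T5SentenceDataset`, 42 for the `SentenceDataset` subclass) and the stored length, which now adds a one-sixth margin on top of the requested value. A quick check of that arithmetic:

# self.max_len = max_len + max_len // 6  (dataset_v3 only)
for requested in (42, 51, 60):
    print(requested, "->", requested + requested // 6)  # 42 -> 49, 51 -> 59, 60 -> 70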