wolof_translate-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wolof_translate/__init__.py +73 -0
- wolof_translate/data/__init__.py +0 -0
- wolof_translate/data/dataset_v1.py +151 -0
- wolof_translate/data/dataset_v2.py +187 -0
- wolof_translate/data/dataset_v3.py +187 -0
- wolof_translate/data/dataset_v3_2.py +187 -0
- wolof_translate/data/dataset_v4.py +202 -0
- wolof_translate/data/dataset_v5.py +65 -0
- wolof_translate/models/__init__.py +0 -0
- wolof_translate/models/transformers/__init__.py +0 -0
- wolof_translate/models/transformers/main.py +865 -0
- wolof_translate/models/transformers/main_2.py +362 -0
- wolof_translate/models/transformers/optimization.py +41 -0
- wolof_translate/models/transformers/position.py +46 -0
- wolof_translate/models/transformers/size.py +44 -0
- wolof_translate/pipe/__init__.py +1 -0
- wolof_translate/pipe/nlp_pipeline.py +512 -0
- wolof_translate/tokenizers/__init__.py +0 -0
- wolof_translate/trainers/__init__.py +0 -0
- wolof_translate/trainers/transformer_trainer.py +760 -0
- wolof_translate/trainers/transformer_trainer_custom.py +882 -0
- wolof_translate/trainers/transformer_trainer_ml.py +925 -0
- wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
- wolof_translate/utils/__init__.py +1 -0
- wolof_translate/utils/bucket_iterator.py +143 -0
- wolof_translate/utils/database_manager.py +116 -0
- wolof_translate/utils/display_predictions.py +162 -0
- wolof_translate/utils/download_model.py +40 -0
- wolof_translate/utils/evaluate_custom.py +147 -0
- wolof_translate/utils/evaluation.py +74 -0
- wolof_translate/utils/extract_new_sentences.py +810 -0
- wolof_translate/utils/extract_poems.py +60 -0
- wolof_translate/utils/extract_sentences.py +562 -0
- wolof_translate/utils/improvements/__init__.py +0 -0
- wolof_translate/utils/improvements/end_marks.py +45 -0
- wolof_translate/utils/recuperate_datasets.py +94 -0
- wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
- wolof_translate/utils/send_model.py +26 -0
- wolof_translate/utils/sent_corrections.py +169 -0
- wolof_translate/utils/sent_transformers.py +27 -0
- wolof_translate/utils/sent_unification.py +97 -0
- wolof_translate/utils/split_with_valid.py +72 -0
- wolof_translate/utils/tokenize_text.py +46 -0
- wolof_translate/utils/training.py +213 -0
- wolof_translate/utils/trunc_hg_training.py +196 -0
- wolof_translate-0.0.1.dist-info/METADATA +31 -0
- wolof_translate-0.0.1.dist-info/RECORD +49 -0
- wolof_translate-0.0.1.dist-info/WHEEL +5 -0
- wolof_translate-0.0.1.dist-info/top_level.txt +1 -0

wolof_translate/utils/training.py
@@ -0,0 +1,213 @@
from wolof_translate import *
import warnings


def train(config: dict):

    # ---------------------------------------
    # add distribution if necessary (https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/mnist.py)

    logger = config["logger"]

    is_distributed = len(config["hosts"]) > 1 and config["backend"] is not None

    use_cuda = config["num_gpus"] > 0

    config.update({"num_workers": 1, "pin_memory": True} if use_cuda else {})

    if not logger is None:

        logger.debug("Distributed training - {}".format(is_distributed))

        logger.debug("Number of gpus available - {}".format(config["num_gpus"]))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(config["hosts"])

        os.environ["WORLD_SIZE"] = str(world_size)

        host_rank = config["hosts"].index(config["current_host"])

        os.environ["RANK"] = str(host_rank)

        dist.init_process_group(
            backend=config["backend"], rank=host_rank, world_size=world_size
        )

        if not logger is None:
            logger.info(
                "Initialized the distributed environment: '{}' backend on {} nodes. ".format(
                    config["backend"], dist.get_world_size()
                )
                + "Current host rank is {}. Number of gpus: {}".format(
                    dist.get_rank(), config["num_gpus"]
                )
            )
    # ---------------------------------------

    # split the data
    if config["include_split"]:
        split_data(
            config["random_state"], config["data_directory"], config["data_file"]
        )

    # recuperate the tokenizer
    tokenizer = T5TokenizerFast(config["tokenizer_path"])

    # recuperate train and test set
    train_dataset, test_dataset = recuperate_datasets(
        config["char_p"],
        config["word_p"],
        config["max_len"],
        config["end_mark"],
        tokenizer,
        config["corpus_1"],
        config["corpus_2"],
        config["train_file"],
        config["test_file"],
    )

    # initialize the evaluation object
    evaluation = TranslationEvaluation(tokenizer, train_dataset.decode)

    # let us initialize the trainer
    trainer = ModelRunner(
        model=Transformer,
        version=config["version"],
        seed=0,
        evaluation=evaluation,
        optimizer=Adafactor,
    )

    # initialize the encoder and the decoder layers
    encoder_layer = nn.TransformerEncoderLayer(
        config["d_model"],
        config["n_head"],
        config["dim_ff"],
        config["drop_out_rate"],
        batch_first=True,
    )

    decoder_layer = nn.TransformerDecoderLayer(
        config["d_model"],
        config["n_head"],
        config["dim_ff"],
        config["drop_out_rate"],
        batch_first=True,
    )

    # let us initialize the encoder and the decoder
    encoder = nn.TransformerEncoder(encoder_layer, config["n_encoders"])

    decoder = nn.TransformerDecoder(decoder_layer, config["n_decoders"])

    # -------------------------------------
    # in the case when the linear learning rate scheduler with warmup is used

    # let us calculate the appropriate warmup steps (let us take a max epoch of 100)
    # length = len(train_dataset)

    # n_steps = length // config['batch_size']

    # num_steps = config['max_epoch'] * n_steps

    # warmup_steps = (config['max_epoch'] * n_steps) * config['warmup_ratio']

    # Initialize the scheduler parameters
    # scheduler_args = {'num_warmup_steps': warmup_steps, 'num_training_steps': num_steps}
    # -------------------------------------

    # Initialize the transformer parameters
    model_args = {
        "vocab_size": len(tokenizer),
        "encoder": encoder,
        "decoder": decoder,
        "class_criterion": nn.CrossEntropyLoss(
            label_smoothing=config["label_smoothing"]
        ),
        "max_len": config["max_len"],
    }

    # Initialize the optimizer parameters
    optimizer_args = {
        "lr": config["learning_rate"],
        "weight_decay": config["weight_decay"],
        # 'betas': (0.9, 0.98),
        "warmup_init": config["warmup_init"],
        "relative_step": config["relative_step"],
    }

    # ----------------------------
    # initialize the bucket samplers for distributed environment
    boundaries = config["boundaries"]
    batch_sizes = config["batch_sizes"]

    train_sampler = SequenceLengthBatchSampler(
        train_dataset, boundaries=boundaries, batch_sizes=batch_sizes
    )

    test_sampler = SequenceLengthBatchSampler(
        test_dataset, boundaries=boundaries, batch_sizes=batch_sizes
    )

    # ------------------------------
    # initialize a bucket sampler with fixed batch size in the case of single machine
    # with parallelization on multiple gpus
    # train_sampler = BucketSampler(train_dataset, config['batch_size'])

    # test_sampler = BucketSampler(test_dataset, config['batch_size'])

    # ------------------------------

    # Initialize the loaders parameters
    train_loader_args = {
        "batch_sampler": train_sampler,
        "collate_fn": collate_fn,
        "num_workers": config["num_workers"],
        "pin_memory": config["pin_memory"],
    }

    test_loader_args = {
        "batch_sampler": test_sampler,
        "collate_fn": collate_fn,
        "num_workers": config["num_workers"],
        "pin_memory": config["pin_memory"],
    }

    # Add the datasets and hyperparameters to trainer
    trainer.compile(
        train_dataset,
        test_dataset,
        tokenizer,
        train_loader_args,
        test_loader_args,
        optimizer_kwargs=optimizer_args,
        model_kwargs=model_args,
        # lr_scheduler=get_linear_schedule_with_warmup,
        # lr_scheduler_kwargs=scheduler_args,
        predict_with_generate=True,
        is_distributed=is_distributed,
        logging_dir=config["logging_dir"],
        dist=dist,
    )

    # load the model
    trainer.load(config["model_dir"], load_best=not config["continue"])

    # Train the model
    trainer.train(
        config["epochs"] - trainer.current_epoch,
        auto_save=True,
        log_step=config["log_step"],
        saving_directory=config["new_model_dir"],
        save_best=config["save_best"],
        metric_for_best_model=config["metric_for_best_model"],
        metric_objective=config["metric_objective"],
    )

    if config["return_trainer"]:

        return trainer

    return None
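
The +213-line hunk above corresponds to wolof_translate/utils/training.py in the file listing. A minimal, hypothetical driver for this train() function is sketched below: the config keys come from the script itself, while the import path, file paths, and all values are illustrative assumptions rather than defaults shipped with the package. Note that when num_gpus > 0 the script injects num_workers and pin_memory into the config, which the loader arguments then read back.

# Hypothetical driver for the custom-Transformer variant of train() above.
# Only the config keys are taken from the script; every value is a placeholder.
from wolof_translate.utils.training import train  # assumed import path

config = {
    # logging / distribution (single host, so no process group is initialized)
    "logger": None,
    "hosts": ["algo-1"],
    "current_host": "algo-1",
    "backend": None,
    "num_gpus": 1,  # > 0, so the script sets num_workers/pin_memory itself
    # data splitting and dataset recuperation
    "include_split": True,
    "random_state": 0,
    "data_directory": "data",                            # placeholder path
    "data_file": "sentences.csv",                        # placeholder file
    "tokenizer_path": "tokenizers/tokenizer_v1.model",   # placeholder
    "char_p": 0.0,
    "word_p": 0.0,
    "max_len": 100,
    "end_mark": 1,          # placeholder; accepted values depend on recuperate_datasets
    "corpus_1": "french_corpus.txt",                     # placeholder
    "corpus_2": "wolof_corpus.txt",                      # placeholder
    "train_file": "train.csv",
    "test_file": "test.csv",
    # custom nn.Transformer architecture
    "d_model": 512,
    "n_head": 8,
    "dim_ff": 2048,
    "drop_out_rate": 0.1,
    "n_encoders": 6,
    "n_decoders": 6,
    "label_smoothing": 0.1,
    # Adafactor optimizer
    "learning_rate": 1e-3,
    "weight_decay": 0.0,
    "warmup_init": False,
    "relative_step": False,
    # length-bucketed batching
    "boundaries": [20, 40, 60, 80],
    "batch_sizes": [64, 32, 16, 8, 4],
    # checkpointing / training loop
    "version": 1,
    "logging_dir": "logs",
    "model_dir": "checkpoints",
    "new_model_dir": "checkpoints",
    "continue": False,
    "epochs": 100,
    "log_step": 10,
    "save_best": True,
    "metric_for_best_model": "bleu",   # placeholder metric name
    "metric_objective": "maximize",    # placeholder objective
    "return_trainer": True,
}

trainer = train(config)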

wolof_translate/utils/trunc_hg_training.py
@@ -0,0 +1,196 @@
from wolof_translate import *
import warnings


def train(config: dict):

    # ---------------------------------------
    # add distribution if necessary (https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/pytorch_mnist/mnist.py)

    logger = config["logger"]

    is_distributed = len(config["hosts"]) > 1 and config["backend"] is not None

    use_cuda = config["num_gpus"] > 0

    config.update({"num_workers": 1, "pin_memory": True} if use_cuda else {})

    if not logger is None:

        logger.debug("Distributed training - {}".format(is_distributed))

        logger.debug("Number of gpus available - {}".format(config["num_gpus"]))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(config["hosts"])

        os.environ["WORLD_SIZE"] = str(world_size)

        host_rank = config["hosts"].index(config["current_host"])

        os.environ["RANK"] = str(host_rank)

        dist.init_process_group(
            backend=config["backend"], rank=host_rank, world_size=world_size
        )

        if not logger is None:
            logger.info(
                "Initialized the distributed environment: '{}' backend on {} nodes. ".format(
                    config["backend"], dist.get_world_size()
                )
                + "Current host rank is {}. Number of gpus: {}".format(
                    dist.get_rank(), config["num_gpus"]
                )
            )
    # ---------------------------------------

    # split the data
    if config["include_split"]:
        split_data(
            config["random_state"], config["data_directory"], config["data_file"]
        )

    # recuperate the tokenizer
    tokenizer = T5TokenizerFast(config["tokenizer_path"])

    # Initialize the model name
    model_name = "t5-small"

    # import the model with its pre-trained weights
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # resize the token embeddings
    model.resize_token_embeddings(len(tokenizer))

    # recuperate train and test set
    train_dataset, test_dataset = recuperate_datasets(
        config["char_p"],
        config["word_p"],
        50,
        config["end_mark"],
        tokenizer,
        config["corpus_1"],
        config["corpus_2"],
        config["train_file"],
        config["test_file"],
    )

    # initialize the evaluation object
    evaluation = TranslationEvaluation(tokenizer, train_dataset.decode)

    # let us initialize the trainer
    trainer = ModelRunner(
        model=model,
        version=config["version"],
        seed=0,
        evaluation=evaluation,
        optimizer=Adafactor,
    )

    # -------------------------------------
    # in the case when the linear learning rate scheduler with warmup is used

    # let us calculate the appropriate warmup steps (let us take a max epoch of 100)
    # length = len(train_dataset)

    # n_steps = length // config['batch_size']

    # num_steps = config['max_epoch'] * n_steps

    # warmup_steps = (config['max_epoch'] * n_steps) * config['warmup_ratio']

    # Initialize the scheduler parameters
    # scheduler_args = {'num_warmup_steps': warmup_steps, 'num_training_steps': num_steps}
    # -------------------------------------

    # Initialize the optimizer parameters
    optimizer_args = {
        "lr": config["learning_rate"],
        "weight_decay": config["weight_decay"],
        # 'betas': (0.9, 0.98),
        "warmup_init": config["warmup_init"],
        "relative_step": config["relative_step"],
    }

    # ----------------------------
    # initialize the bucket samplers for distributed environment
    # boundaries = config['boundaries']
    # batch_sizes = config['batch_sizes']

    # train_sampler = SequenceLengthBatchSampler(train_dataset,
    # boundaries = boundaries,
    # batch_sizes = batch_sizes)

    # test_sampler = SequenceLengthBatchSampler(test_dataset,
    # boundaries = boundaries,
    # batch_sizes = batch_sizes)

    # ------------------------------
    # initialize a bucket sampler with fixed batch size in the case of single machine
    # with parallelization on multiple gpus
    train_sampler = BucketSampler(train_dataset, config["batch_size"])

    test_sampler = BucketSampler(test_dataset, config["batch_size"])

    # ------------------------------

    # Initialize the loaders parameters
    train_loader_args = {
        "batch_sampler": train_sampler,
        "collate_fn": partial(
            collate_fn_trunc,
            max_len=train_dataset.max_len,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        ),
    }

    test_loader_args = {
        "batch_sampler": test_sampler,
        "collate_fn": partial(
            collate_fn_trunc,
            max_len=train_dataset.max_len,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        ),
    }

    # Add the datasets and hyperparameters to trainer
    trainer.compile(
        train_dataset,
        test_dataset,
        tokenizer,
        train_loader_args,
        test_loader_args,
        optimizer_kwargs=optimizer_args,
        # lr_scheduler=get_linear_schedule_with_warmup,
        # lr_scheduler_kwargs=scheduler_args,
        predict_with_generate=True,
        stopping_patience=config["stopping_patience"],
        hugging_face=True,
        is_distributed=is_distributed,
        logging_dir=config["logging_dir"],
        dist=dist,
    )

    # load the model
    trainer.load(config["model_dir"], load_best=not config["continue"])

    # Train the model
    trainer.train(
        config["epochs"] - trainer.current_epoch,
        auto_save=True,
        log_step=config["log_step"],
        saving_directory=config["new_model_dir"],
        save_best=config["save_best"],
        metric_for_best_model=config["metric_for_best_model"],
        metric_objective=config["metric_objective"],
    )

    if config["return_trainer"]:

        return trainer

    return None
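
The +196-line hunk above corresponds to wolof_translate/utils/trunc_hg_training.py in the listing. Here is a hypothetical config sketch for this t5-small fine-tuning variant, again with the import path and all values as assumptions. Compared with the first script, the architecture keys disappear (the model is the pretrained t5-small), the max_len passed to recuperate_datasets is hard-coded to 50, BucketSampler takes a single batch_size instead of boundaries/batch_sizes, and stopping_patience is forwarded to trainer.compile.

# Hypothetical driver for the t5-small fine-tuning variant of train() above.
# Keys mirror what the script reads; values are placeholders.
from wolof_translate.utils.trunc_hg_training import train  # assumed import path

config = {
    "logger": None,
    "hosts": ["algo-1"],
    "current_host": "algo-1",
    "backend": None,
    "num_gpus": 1,
    "include_split": True,
    "random_state": 0,
    "data_directory": "data",                            # placeholder path
    "data_file": "sentences.csv",                        # placeholder file
    "tokenizer_path": "tokenizers/tokenizer_v1.model",   # placeholder
    "char_p": 0.0,
    "word_p": 0.0,
    "end_mark": 1,          # placeholder; accepted values depend on recuperate_datasets
    "corpus_1": "french_corpus.txt",                     # placeholder
    "corpus_2": "wolof_corpus.txt",                      # placeholder
    "train_file": "train.csv",
    "test_file": "test.csv",
    "version": 1,
    # Adafactor optimizer
    "learning_rate": 1e-3,
    "weight_decay": 0.0,
    "warmup_init": False,
    "relative_step": False,
    # fixed batch size for BucketSampler (the script hard-codes max_len=50)
    "batch_size": 16,
    "stopping_patience": 5,
    # checkpointing / training loop
    "logging_dir": "logs",
    "model_dir": "checkpoints",
    "new_model_dir": "checkpoints",
    "continue": False,
    "epochs": 100,
    "log_step": 10,
    "save_best": True,
    "metric_for_best_model": "bleu",   # placeholder metric name
    "metric_objective": "maximize",    # placeholder objective
    "return_trainer": True,
}

trainer = train(config)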

wolof_translate-0.0.1.dist-info/METADATA
@@ -0,0 +1,31 @@
Metadata-Version: 2.1
Name: wolof-translate
Version: 0.0.1
Summary: Contain function and classes to process corpora for making translation between wolof text and other languages.
Author: Oumar Kane
Author-email: oumar.kane@univ-thies.sn
Requires-Dist: accelerate
Requires-Dist: torch
Requires-Dist: spacy
Requires-Dist: nltk
Requires-Dist: gensim
Requires-Dist: furo
Requires-Dist: streamlit
Requires-Dist: tokenizers
Requires-Dist: tensorboard
Requires-Dist: evaluate
Requires-Dist: transformers
Requires-Dist: pandas
Requires-Dist: numpy
Requires-Dist: scikit-learn
Requires-Dist: matplotlib
Requires-Dist: plotly
Requires-Dist: sacrebleu
Requires-Dist: nlpaug
Requires-Dist: wandb
Requires-Dist: pytorch-lightning
Requires-Dist: selenium
Requires-Dist: sentencepiece
Requires-Dist: peft
Requires-Dist: rouge-score

wolof_translate-0.0.1.dist-info/RECORD
@@ -0,0 +1,49 @@
wolof_translate/__init__.py,sha256=qHFFSR2P3SpUh62FuZghk3KWNCeo2At_SJnRb3wRRpU,2509
wolof_translate/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/data/dataset_v1.py,sha256=7BWW4jv8qR1Wau_-GT007hrZpnHxbPwnvDca8C0_eHU,4264
wolof_translate/data/dataset_v2.py,sha256=nTYW88yP6mcky-T43mh-1CeYU30-Xlq4DxOpd3FZ0OQ,5613
wolof_translate/data/dataset_v3.py,sha256=SD0VGr2oMl_4TUafD_5ZKqiZAnbwuFD44sajI2gGl2Y,5628
wolof_translate/data/dataset_v3_2.py,sha256=-gq31O7dt41Zi196m6uHKV6PHtODAeZq95ICc9NJXRA,5628
wolof_translate/data/dataset_v4.py,sha256=0c97Pkjb7TBC5G91-1bFvVzafLQpruX3lj_PDOzmaAE,6271
wolof_translate/data/dataset_v5.py,sha256=JZMacAn4jdfCOJ3PH2tWnQ4qDN_OzYYquKHBTOe_ZdM,2050
wolof_translate/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/models/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/models/transformers/main.py,sha256=Ua8AG0CRhkJ0At7Ma8nt7lcl9226DGg-PgrR8MOTgPo,30276
wolof_translate/models/transformers/main_2.py,sha256=ApUMR5LxSYHfF48wOkAgfEgnUm-4fYD9JFrq_llx_0U,12015
wolof_translate/models/transformers/optimization.py,sha256=2YJ66NUBJBM86CNNx8mCng1ryFAVbcS08fUK7f3OYOk,1401
wolof_translate/models/transformers/position.py,sha256=44Z-qSfB-NGhqXOXKbaztC8Bpc17BHcOX69zMssjbeg,1672
wolof_translate/models/transformers/size.py,sha256=UI4I30cwJMZ5RDs31X63HWW60_Zgy542FMC9sguQHqc,1317
wolof_translate/pipe/__init__.py,sha256=n2k4pXK1y-xxVnX13E9H1-hkYtpHbCAJVs-LMARWlvI,81
wolof_translate/pipe/nlp_pipeline.py,sha256=jmC5xXb1pAZ9uWXFepGCcYjIIzMk3Hpqu1cBMaMN2KE,15610
wolof_translate/tokenizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/trainers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/trainers/transformer_trainer.py,sha256=yK6GtQUXRdcG9XxR5QeEOgId8T6Oh3WJtFVmiyFmI_U,26691
wolof_translate/trainers/transformer_trainer_custom.py,sha256=hHUBcU4YK6wuRUMiwX5xG0oTaoDLt9bVSZbzIFRuces,31926
wolof_translate/trainers/transformer_trainer_ml.py,sha256=WgggaugkVHSJlwIAZT-QwI90Fl-_zT8Clhb-7M0m8gM,33561
wolof_translate/trainers/transformer_trainer_ml_.py,sha256=QaN9DB5pqhBxV4WlFmJCmUyfwlX-UyAzKRwL6rVEr4Q,38199
wolof_translate/utils/__init__.py,sha256=Nl3300H-Xd3uTHDR8y-rYa-UUR9FqbqZPwUKJUpQOb4,64
wolof_translate/utils/bucket_iterator.py,sha256=Hglii1Hj6H_K51JunTjUAxuLd4ehPb6LeeMVhsmhNxQ,6248
wolof_translate/utils/database_manager.py,sha256=7yhgBN1LvVFNEQikxCjSCva82h5nX44Nx2zh8cpFWyA,3543
wolof_translate/utils/display_predictions.py,sha256=y5H5lfgIODl6E5Zfb1YIwiAxIlHUxRBoChfQR5kjh24,5145
wolof_translate/utils/download_model.py,sha256=x92KpfVPvNK8Suen1qnOcPtZOlB4kXTfqWgoVuuMUEM,1241
wolof_translate/utils/evaluate_custom.py,sha256=cmcGfRAjhTuP9ekeJ0cioNoE1cQ7fQ7mZTh6_1IAaXM,3587
wolof_translate/utils/evaluation.py,sha256=Taxv4UAgg5q3WxC73pp84srr_wX6Kw9Ub9MloYrUmLs,1838
wolof_translate/utils/extract_new_sentences.py,sha256=li9UDgLa4nI6DSdB5oH0_m8xek3EEoVBL3CrVKTxGrc,22861
wolof_translate/utils/extract_poems.py,sha256=9Pf1PluUq257vcS2iinGPi6azGjgmHU7Q57uwQkHfAs,1314
wolof_translate/utils/extract_sentences.py,sha256=-PDBmceKUqiTdV9ieezSIITfADAnv_7OsNY8zdJi0To,15713
wolof_translate/utils/recuperate_datasets.py,sha256=4yTNXPOIfTokon0Bke50SdB8MT_Ojmu1aTmYv_K_w64,2644
wolof_translate/utils/recuperate_datasets_trunc.py,sha256=82T7mHbxruYJUw0L0ZUUoPHxO2Yr65rApakmIhe034M,2500
wolof_translate/utils/send_model.py,sha256=v_dQJDDpk3ak_DutbhwSqKF8-Q_-Gx9zezZsTot6Onk,797
wolof_translate/utils/sent_corrections.py,sha256=5iqdS4j78ayag0GxnCEl_dBUs4zbBAWAOac2h0ECv4c,3534
wolof_translate/utils/sent_transformers.py,sha256=kbbc5H-zPkxSM1uOghGeZa9fCAcm2GwTSuiRHM0asgI,574
wolof_translate/utils/sent_unification.py,sha256=UD9uZ--NREj5Z462n5hs-UjMPNhUN8Nr_6ZmR2w-B6Y,2104
wolof_translate/utils/split_with_valid.py,sha256=7-e6EfvPbLpTYrZOXJVYYqm_nV7n6yUYOaWkn8hsJJw,2424
wolof_translate/utils/tokenize_text.py,sha256=LZNsYmpchlkNsul00yb3HQToC-L7XSYuPHGCRCfbz9Y,1226
wolof_translate/utils/training.py,sha256=5vPVuqHL6_gqLkh4PTxXqW4UvAJBWNWVDDXC9Fk7IQI,6732
wolof_translate/utils/trunc_hg_training.py,sha256=mMGrU7Mjr9vYd7eLc8nbFRhRXwSWMKyg35lGf0L6RtQ,6418
wolof_translate/utils/improvements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
wolof_translate/utils/improvements/end_marks.py,sha256=scmhMMYguZmrZTPozx1ZovizKrrPfPpMLXbU2-IOdGs,1194
wolof_translate-0.0.1.dist-info/METADATA,sha256=itQMCA-zGM3gSDiKco5dMtY5qfpPHvKNJbw9KtffxzI,818
wolof_translate-0.0.1.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
wolof_translate-0.0.1.dist-info/top_level.txt,sha256=YG-kBnOwUZyQ7SofNvMxNYjzCreH2PVcW2UaEg1-Reg,16
wolof_translate-0.0.1.dist-info/RECORD,,

wolof_translate-0.0.1.dist-info/top_level.txt
@@ -0,0 +1 @@
wolof_translate