wolof-translate 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/utils/extract_poems.py
@@ -0,0 +1,60 @@
+ from typing import List
+
+
+ def extract_sentences(poems: List[List[str]], remove: list = ["\n"], sep: str = "_"):
+
+     new_poems = {1: []}
+
+     for poem in poems:
+
+         new_poem = {1: []}
+
+         i = 1
+         j = 0
+
+         for line in poem:
+
+             for mark in remove:
+                 line = line.strip(mark).strip()
+
+             if line == sep:
+                 # a separator line starts a new section of the poem
+                 i += 1
+                 j = 0
+                 new_poem[i] = []
+
+             if line != "" and line != sep:
+
+                 if i > 1:
+                     # match the casing of the corresponding line
+                     # of the previous section
+                     try:
+                         line = (
+                             line[0].upper() + line[1:]
+                             if new_poem[i - 1][j][0].isupper()
+                             else line[0].lower() + line[1:]
+                         )
+                     except IndexError:
+                         raise IndexError(
+                             "The number of lines in the different corpora is not the same!"
+                         )
+
+                 new_poem[i].append(line.strip())
+                 j += 1
+
+         # make sure every section key exists in the accumulator
+         for key in new_poem.keys():
+             if key not in new_poems:
+                 new_poems[key] = []
+
+         # merge without dropping the sections collected from previous poems
+         new_poems = {k: new_poems[k] + new_poem.get(k, []) for k in new_poems}
+
+     return new_poems
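
A minimal usage sketch for `extract_sentences` (the import path follows the file list above; the sample poem is hypothetical, with the `_` marker separating the parallel sections as the function expects):

from wolof_translate.utils.extract_poems import extract_sentences

# a single poem whose two sections are separated by the "_" marker
poems = [
    [
        "Bonjour le monde\n",
        "_\n",
        "Salaam aleekum àddina\n",
    ]
]

sections = extract_sentences(poems)
# {1: ['Bonjour le monde'], 2: ['Salaam aleekum àddina']}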
wolof_translate/utils/extract_sentences.py
@@ -0,0 +1,562 @@
+ from nlp_project import *  # expected to provide `pd` (pandas) and the typing names used below
+ import pickle
+
+
+ class LengthError(Exception):
+     def __init__(self, *args: object) -> None:
+         super().__init__(*args)
+
+
+ class CommandError(Exception):
+     def __init__(self, *args: object) -> None:
+         super().__init__(*args)
+
+
+ class ExtractRelatedSentences:
+
+     # class-level checkpoint state shared across instances
+     nb_process = 1
+     indices = {}
+
+     def __init__(
+         self,
+         corpora: pd.DataFrame,
+         corpus_1: str = "french_corpus",
+         corpus_2: str = "wolof_corpus",
+     ):
+         self.corpora = corpora
+         self.corpus_1 = corpus_1
+         self.corpus_2 = corpus_2
+         self.length = corpora.shape[0]
+         self.sentences = {}
+         self.passed = {}
+
+     @classmethod
+     def reload(cls, number: int = 1):
+         cls.nb_process = number
+
+     @classmethod
+     def store_indices(cls, nb_paragraph, i: int, j: int):
+         cls.indices[nb_paragraph] = {"i": i, "j": j}
+
+     def increment(self, number: int = 1):
+         ExtractRelatedSentences.nb_process += number
+         if ExtractRelatedSentences.nb_process > self.length:
+             pass
+             # raise ValueError("The last paragraph is reached!")
+
+     def decrement(self, number: int = 1):
+         ExtractRelatedSentences.nb_process -= number
+         if ExtractRelatedSentences.nb_process < 1:
+             ExtractRelatedSentences.nb_process = 1
+
+     def add_sentences(self, nb_paragraph: int, sentences: dict):
+         self.sentences[nb_paragraph] = sentences
+         if len(sentences["1"]) != len(sentences["2"]):
+             raise LengthError(
+                 "The number of sentences in the two corpora must be equal!"
+             )
+
+     def add_passed(self, nb_paragraph: int, sentences: dict):
+         # store the skipped sentences separately from the validated ones
+         self.passed[nb_paragraph] = sentences
+
+     def clear_sentences(self, nb_paragraph: int):
+         sentences = self.sentences[nb_paragraph]
+         clear = input(
+             f"Are you sure you want to remove the following sentences?\
+             \n\nOn {self.corpus_1}:\n{sentences['1']}\n\nOn {self.corpus_2}:\n{sentences['2']}\nYes(y), No(n): "
+         )
+         if clear == "y":
+             del self.sentences[nb_paragraph]
+             print(f"Sentences at {nb_paragraph} were cleared!")
+         elif clear == "n":
+             print("Process aborted!")
+         else:
+             raise CommandError(f"You cannot take the command {clear}!")
+
+     def get_sentences(self, nb_paragraph: int):
+         return self.sentences[nb_paragraph]
+
+     def unify(self, sentences: list, sentence: str, unification_marks: list):
+         # decide whether `sentence` must be glued to the last collected one:
+         # an unclosed quote or bracket in the previous sentence, a closing
+         # mark at the very start of this one, or a lowercase first letter
+         for mark in unification_marks:
+             begin_mark = False
+             end_mark = False
+             # is an opening mark left unclosed in the previous sentence?
+             for letter in sentences[-1]:
+                 if letter == mark[1]:
+                     begin_mark = False
+                 elif letter == mark[0]:
+                     begin_mark = True
+             # only the first character of the new sentence is inspected
+             for letter in sentence:
+                 if letter == mark[1]:
+                     end_mark = True
+                     break
+                 else:
+                     break
+             if sentence != "" and sentence[0].islower():
+                 return True, " "
+             if end_mark or begin_mark:
+                 space = " " if mark[2] else ""
+                 return True, space
+         return False, " "
+
+     def split(self, paragraph: str, ending_marks: list, unification_marks: list):
+         prob_sentences = paragraph.strip()
+
+         for mark in ending_marks:
+             if isinstance(prob_sentences, list):
+                 new_sentences = prob_sentences.copy()
+                 counter = 0
+                 for s in prob_sentences:
+                     if mark in s:
+                         splits = new_sentences[counter].split(mark)
+                         sentences = [
+                             sentence.strip() + mark for sentence in splits[:-1]
+                         ] + splits[-1:]
+                         new_sentences = (
+                             new_sentences[:counter]
+                             + sentences
+                             + new_sentences[counter + 1 :]
+                         )
+                         # skip past every item that was just inserted
+                         counter += len(sentences)
+                     else:
+                         counter += 1
+                 prob_sentences = new_sentences
+             else:
+                 if mark in prob_sentences:
+                     splits = prob_sentences.split(mark)
+                     prob_sentences = [
+                         sentence.strip() + mark for sentence in splits[:-1]
+                     ] + splits[-1:]
+
+         # if no ending mark was found, the paragraph is still a string
+         if isinstance(prob_sentences, str):
+             prob_sentences = [prob_sentences]
+
+         # merge the fragments that do not form complete sentences
+         new_sentences = []
+         counter = 0
+         for s in prob_sentences:
+             unify, space = False, ""
+             if counter != 0:
+                 unify, space = self.unify(new_sentences, s, unification_marks)
+             if s != "":
+                 if not unify:
+                     new_sentences.append(s)
+                     counter += 1
+                 else:
+                     new_sentences[-1] = new_sentences[-1] + space + s
+         prob_sentences = new_sentences
+
+         return prob_sentences
+
+     def __save(self, storage: str = "data/extractions/new_data/sent_extraction.txt"):
+         with open(storage, "wb") as f:
+             checkpoints = {
+                 "indices": ExtractRelatedSentences.indices,
+                 "nb_process": ExtractRelatedSentences.nb_process,
+                 "sentences": self.sentences,
+                 "passed": self.passed,
+             }
+             pickler = pickle.Pickler(f)
+             pickler.dump(checkpoints)
+
+     def save_data_frame(
+         self,
+         storage: str = "data/extractions/new_data/sent_extraction.txt",
+         csv_file_path: str = "data/extractions/new_data/sent_extraction.csv",
+         **kwargs,
+     ):
+         self.load(storage)
+         data_frame = pd.DataFrame.from_dict(self.sentences[1], orient="columns")
+         for i in range(2, self.length + 1):
+             data_frame = pd.concat(
+                 (
+                     data_frame,
+                     pd.DataFrame.from_dict(self.sentences[i], orient="columns"),
+                 )
+             )
+         data_frame.rename(
+             columns={"1": self.corpus_1, "2": self.corpus_2}, inplace=True
+         )
+         data_frame.to_csv(csv_file_path, index=False, **kwargs)
+
+     def load(self, storage: str = "data/extractions/new_data/sent_extraction.txt"):
+         with open(storage, "rb") as f:
+             depickler = pickle.Unpickler(f)
+             checkpoints = depickler.load()
+         ExtractRelatedSentences.indices = checkpoints["indices"]
+         ExtractRelatedSentences.nb_process = checkpoints["nb_process"]
+         self.sentences = checkpoints["sentences"]
+         self.passed = checkpoints["passed"]
+
+     def preprocess(
+         self,
+         number: Union[int, None] = None,
+         ending_marks: list = [".", " ?", " !"],
+         unification_marks: List[Tuple] = [
+             ("«", "»", True),
+             ("(", ")", True),
+             ("“", "”", True),
+         ],
+         cr: str = "r",
+         cm1: str = "f",
+         cm2: str = "j",
+         cm3: str = "l",
+         cmp1: str = "y",
+         cmp2: str = "i",
+         cmp3: str = "p",
+         q: str = "q",
+         start_at_last_indices: bool = False,
+         i: int = 0,
+         j: int = 0,
+         auto_save: bool = True,
+         storage: str = "data/extractions/new_data/sent_extraction.txt",
+     ):
+         line = number if number is not None else self.nb_process
+         process_again = ""
+
+         try:
+             self.load(storage=storage)
+         except FileNotFoundError:
+             # no checkpoint to resume from yet
+             pass
+
+         if line in set(self.sentences):
+             process_again = input(
+                 f"You have already processed the paragraph at line {line}.\nDo you want to modify the processed sentences? Yes(y), No(n): "
+             )
+
+         print(f"Preprocessing of paragraph at line {line}")
+
+         if process_again == "n" or process_again == "":
+             paragraph1 = str(self.corpora.loc[line - 1, self.corpus_1])
+             paragraph2 = str(self.corpora.loc[line - 1, self.corpus_2])
+
+             # split the paragraphs on the ending marks
+             prob_sentences1 = self.split(paragraph1, ending_marks, unification_marks)
+             prob_sentences2 = self.split(paragraph2, ending_marks, unification_marks)
+
+         elif process_again == "y":
+             prob_sentences1 = self.sentences[line]["1"]
+             prob_sentences2 = self.sentences[line]["2"]
+
+         else:
+             raise CommandError(f"You cannot take the command {process_again}!")
+
+         print("\n-----------\n-----------\n")
+         print("Do you want to process the following sentences:\n")
+         print(f"On {self.corpus_1}: ")
+         for k, sentence in enumerate(prob_sentences1):
+             print(f"{k}: {sentence}")
+         print(f"\nOn {self.corpus_2}: ")
+         for k, sentence in enumerate(prob_sentences2):
+             print(f"{k}: {sentence}")
+         print("\n-----------")
+
+         process = input("Yes(y), Accept all (a) or No(n): ")
+         cm = ""
+
+         if process == "y":
+             sentences = {"1": [], "2": []}
+             passed = {"1": [], "2": []}
+             last_sentences = {"1": "", "2": ""}
+
+             if start_at_last_indices and line in set(self.indices):
+                 i = self.indices[line]["i"]
+                 j = self.indices[line]["j"]
+
+             while i < len(prob_sentences1) and j < len(prob_sentences2):
+                 self.store_indices(line, i, j)
+                 sentence1 = sentence1_ = prob_sentences1[i]
+                 sentence2 = sentence2_ = prob_sentences2[j]
+
+                 # prepend any pending (incomplete) sentences
+                 if last_sentences["1"] != "":
+                     sentence1_ = last_sentences["1"].strip() + " " + sentence1
+                 if last_sentences["2"] != "":
+                     sentence2_ = last_sentences["2"] + " " + sentence2
+
+                 print(
+                     f"\nThe current sentences are:\n{self.corpus_1} (index = {i}): {sentence1_}\n{self.corpus_2} (index = {j}): {sentence2_}"
+                 )
+                 cm = input(
+                     f"Are they related?\n{self.corpus_1} (index = {i}): {sentence1_}\n{self.corpus_2} (index = {j}): {sentence2_}\nQuit ({q}), Related ({cr}), Sentence 1 is incomplete ({cm1}), Sentence 2 is incomplete ({cm2}), The two sentences are incomplete ({cm3}),\n Pass sentence 1 ({cmp1}), Pass sentence 2 ({cmp2}), Pass the two sentences ({cmp3}): "
+                 )
+
+                 if cm == cr:
+                     # clear the pending sentences
+                     last_sentences = {"1": "", "2": ""}
+                     # add the pair to the related sentences
+                     sentences["1"].append(sentence1_.strip())
+                     sentences["2"].append(sentence2_.strip())
+                     # pass to the next sentences
+                     i += 1
+                     j += 1
+                 elif cm == cm1:
+                     # sentence 1 is incomplete: keep it pending
+                     last_sentences["1"] += " " + sentence1
+                     # pass to the next sentence of corpus 1
+                     i += 1
+                 elif cm == cm2:
+                     # sentence 2 is incomplete: keep it pending
+                     last_sentences["2"] += " " + sentence2
+                     # pass to the next sentence of corpus 2
+                     j += 1
+                 elif cm == cm3:
+                     # both sentences are incomplete: keep them pending
+                     last_sentences["1"] += " " + sentence1
+                     last_sentences["2"] += " " + sentence2
+                     # pass to the next sentences
+                     i += 1
+                     j += 1
+                 elif cm == cmp1:
+                     # clear the pending sentence of corpus 1
+                     last_sentences["1"] = ""
+                     # add sentence 1 to the passed sentences of corpus 1
+                     passed["1"].append(sentence1_)
+                     # pass to the next sentence of corpus 1
+                     i += 1
+                 elif cm == cmp2:
+                     # clear the pending sentence of corpus 2
+                     last_sentences["2"] = ""
+                     # add sentence 2 to the passed sentences of corpus 2
+                     passed["2"].append(sentence2_)
+                     # pass to the next sentence of corpus 2
+                     j += 1
+                 elif cm == cmp3:
+                     # clear the pending sentences
+                     last_sentences = {"1": "", "2": ""}
+                     # add both sentences to the passed sentences
+                     passed["1"].append(sentence1_)
+                     passed["2"].append(sentence2_)
+                     # pass to the next sentences
+                     i += 1
+                     j += 1
+                 elif cm == q:
+                     break
+                 else:
+                     print(f"You cannot take the command {cm}! Please try again!")
+                     print("\n-----------\n")
+                     continue
+
+                 print("\nYou have stored the following sentences for the moment:\n")
+                 print(f"On {self.corpus_1} (length = {len(sentences['1'])}): ")
+                 print("\n".join(sentences["1"]))
+                 print(f"\nOn {self.corpus_2} (length = {len(sentences['2'])}): ")
+                 print("\n".join(sentences["2"]))
+                 print("\n-----------\n")
+                 print("You have passed the following sentences:\n")
+                 print(f"On {self.corpus_1} (length = {len(passed['1'])}): ")
+                 print("\n".join(passed["1"]))
+                 print(f"\nOn {self.corpus_2} (length = {len(passed['2'])}): ")
+                 print("\n".join(passed["2"]))
+                 print("\n------------------------------------")
+
+             # add the related sentences
+             self.add_sentences(line, sentences)
+             # add the passed sentences
+             self.add_passed(line, passed)
+
+             if auto_save:
+                 # save the checkpoints
+                 self.__save(storage)
+
+             print("\nFinished!")
+
+             # increment the number of processed paragraphs
+             self.increment()
+
+         elif process == "a":
+             sentences = {"1": prob_sentences1, "2": prob_sentences2}
+             print("\nFinished!")
+
+             # increment the number of processed paragraphs
+             self.increment()
+
+             # add the sentences
+             self.add_sentences(line, sentences)
+
+             if auto_save:
+                 # save the checkpoints
+                 self.__save(storage)
+
+         elif process == "n":
+             print("Process aborted!")
+
+         else:
+             raise CommandError(f"You cannot take the command {process}!")
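
A sketch of the interactive alignment workflow (the DataFrame content is hypothetical; the column names follow the constructor defaults, and the storage paths are illustrative):

import pandas as pd
from wolof_translate.utils.extract_sentences import ExtractRelatedSentences

# one row per pair of parallel paragraphs
corpora = pd.DataFrame(
    {
        "french_corpus": ["Bonjour. Comment vas-tu ?"],
        "wolof_corpus": ["Salaam aleekum. Na nga def ?"],
    }
)

extractor = ExtractRelatedSentences(corpora)

# interactive session: answer the prompts to align the sentence pairs;
# with auto_save=True the state is pickled to `storage` as you go
extractor.preprocess(number=1, storage="sent_extraction.txt")

# once every paragraph has been processed, export the aligned pairs
extractor.save_data_frame(
    storage="sent_extraction.txt",
    csv_file_path="sent_extraction.csv",
)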
wolof_translate/utils/improvements/__init__.py
File without changes
wolof_translate/utils/improvements/end_marks.py
@@ -0,0 +1,45 @@
+ import pandas as pd
+ from typing import Union
+
+
+ def add_end_mark(
+     sentences: Union[list, str],
+     end_mark: str = ".",
+     end_mark_to_remove: Union[str, None] = None,
+     replace: bool = False,
+     poss_end_marks: list = ["!", "?", ".", "..."],
+ ):
+
+     if isinstance(sentences, str):
+         sentences = [sentences]
+
+     if replace and end_mark_to_remove is None:
+         raise ValueError(
+             "You must provide an end mark to remove if you want to make a replacement!"
+         )
+
+     new_sentences = []
+
+     # if `replace` is chosen, the end mark to remove is replaced by `end_mark`;
+     # otherwise the end mark to remove is stripped and `end_mark` is appended
+     # to every sentence that does not already end with a possible end mark
+     for sentence in sentences:
+
+         if replace:
+             if sentence[-1] == end_mark_to_remove:
+                 sentence = sentence[:-1].strip() + end_mark
+
+         else:
+             if end_mark_to_remove is not None and sentence[-1] == end_mark_to_remove:
+                 sentence = sentence[:-1]
+
+             if sentence[-1] not in poss_end_marks:
+                 sentence = sentence.strip() + end_mark
+
+         new_sentences.append(sentence)
+
+     return new_sentences
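
A quick sketch of `add_end_mark` (the import path follows the file list above; the sample sentences are hypothetical):

from wolof_translate.utils.improvements.end_marks import add_end_mark

# append the default end mark to sentences that lack one
add_end_mark(["Na nga def", "Maa ngi fi rekk."])
# ['Na nga def.', 'Maa ngi fi rekk.']

# replace a trailing ";" with the end mark instead
add_end_mark("Jërëjëf;", end_mark_to_remove=";", replace=True)
# ['Jërëjëf.']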