wolof-translate 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,810 @@
1
+ from typing import *
2
+ import pandas as pd
3
+ import pickle
4
+ import re
5
+ import os
6
+
7
+
8
class NewSentenceExtraction:
    """Interactive extraction of aligned sentence pairs from a bilingual text.

    The text is expected to contain, around each occurrence of ``sent_sep``,
    a fragment of ``corpus_1`` on the left and its ``corpus_2`` counterpart on
    the right.  Extracted groups and the extraction state are checkpointed
    under ``save_directory``.
    """

    def __init__(
        self,
        text: Union[str, None] = None,
        sent_sep: str = ":",
        corpus_1: str = "wolof",
        corpus_2: str = "french",
        save_directory: str = "data/additional_documents/diagne_sentences/",
        checkpoint_name: str = "new_sentences",
    ):
        """Initialize the extractor.

        Args:
            text: raw bilingual text to mine for sentence pairs.
            sent_sep: separator between the two sides of a pair.
            corpus_1: name of the left-hand corpus (also a key of the results).
            corpus_2: name of the right-hand corpus (also a key of the results).
            save_directory: directory where checkpoint files are written.
            checkpoint_name: file name of the pickled checkpoint.
        """
        self.text = text

        self.corpus_1 = corpus_1

        self.corpus_2 = corpus_2

        self.sep = sent_sep

        # groups of "left <sep> right" strings collected so far
        self.groups = []

        # index of the last processed group (restored by `load`)
        self.index = 0

        self.save_directory = save_directory

        self.checkpoint = checkpoint_name

        # extracted sentences, keyed by corpus name
        self.extractions = {corpus_1: [], corpus_2: []}
+ def __save(self):
38
+
39
+ checkpoints = {
40
+ # 'extractions': self.extractions,
41
+ "index": self.index,
42
+ # 'groups': self.groups
43
+ }
44
+
45
+ pd.DataFrame({"groups": self.groups}).to_csv(
46
+ os.path.join(self.save_directory, "groups.csv"), index=False
47
+ )
48
+
49
+ pd.DataFrame(self.extractions).to_csv(
50
+ os.path.join(self.save_directory, "extractions.csv"), index=False
51
+ )
52
+
53
+ with open(os.path.join(self.save_directory, self.checkpoint), "wb") as f:
54
+
55
+ pickler = pickle.Pickler(f)
56
+
57
+ pickler.dump(checkpoints)
58
+
59
+ def sep_with_mark(self, group: str, mark: Union[str, None] = None):
60
+
61
+ raise NotImplementedError
62
+
63
+ def load(self):
64
+
65
+ with open(os.path.join(self.save_directory, self.checkpoint), "rb") as f:
66
+
67
+ depickler = pickle.Unpickler(f)
68
+
69
+ checkpoints = depickler.load()
70
+
71
+ try:
72
+
73
+ self.extractions = pd.read_csv(
74
+ os.path.join(self.save_directory, "extractions.csv")
75
+ ).to_dict("list")
76
+
77
+ except Exception:
78
+
79
+ pass
80
+
81
+ self.groups = pd.read_csv(os.path.join(self.save_directory, "groups.csv"))[
82
+ "groups"
83
+ ].to_list()
84
+
85
+ self.index = checkpoints["index"]
86
+
87
+ def add_groups(self, new_groups: list):
88
+
89
+ self.groups += new_groups
90
+
91
+ self.__save()
92
+
93
    def get_groups(self, stop_criterions: list = [" ", "\n"], comparisons: list = []):
        """Scan ``self.text`` for occurrences of the separator and collect, for
        each one, the fragments directly around it as a "left <sep> right"
        group string appended to ``self.groups``.

        Characters are accumulated on each side of the separator until one of
        the ``stop_criterions`` strings is met.  Groups already contained in
        one of the ``comparisons`` texts or in an existing group are skipped.
        The state is checkpointed after every separator occurrence.

        Args:
            stop_criterions: strings that delimit a group on either side.
            comparisons: already (manually) recuperated texts used to skip
                duplicated groups.

        NOTE(review): both defaults are mutable lists (shared across calls if
        ever mutated) — they are only read here, but confirm before reusing.
        NOTE(review): the leftward scan has no guard against running past the
        start of the text; a negative index would wrap to the end of the
        string — presumably the stop criterions always fire first; verify.
        """
        # the scan needs a source text
        assert not self.text is None

        i = 0

        a = 0

        g = 1  # 1-based number of the group being extracted (for display)

        while i < len(self.text):

            letter = self.text[i]

            if letter == self.sep:

                print(f"Extraction of group number {g}\n")

                b = i - 1  # index of letters before the current letter

                a = i + 1  # index of letters after the current letter

                corpus_1_s = []  # letters of the left sentence

                corpus_2_s = []  # letters of the right sentence

                # check whether a stop criterion ends exactly at position b
                stop = False

                for stop_cr in stop_criterions:

                    if self.text[b - len(stop_cr) + 1 : b + 1] == stop_cr:

                        stop = True

                # walk left, collecting characters until a stop criterion
                while not stop:

                    corpus_1_s.append(self.text[b])

                    b -= 1

                    stop = False

                    for stop_cr in stop_criterions:

                        if self.text[b - len(stop_cr) + 1 : b + 1] == stop_cr:

                            stop = True

                # same scan to the right of the separator
                stop = False

                for stop_cr in stop_criterions:

                    if self.text[a : a + len(stop_cr)] == stop_cr:

                        stop = True

                while not stop:

                    corpus_2_s.append(self.text[a])

                    a += 1

                    stop = False

                    for stop_cr in stop_criterions:

                        if self.text[a : a + len(stop_cr)] == stop_cr:

                            stop = True

                # reverse first sentence (it was collected right-to-left)
                corpus_1_s.reverse()

                # add the sentences
                current_sentence = (
                    "".join(corpus_1_s).strip()
                    + f" {self.sep} "
                    + "".join(corpus_2_s).strip()
                )

                # keep only groups with a non-empty text on both sides
                if "".join(corpus_1_s).strip() != "" and "".join(corpus_2_s) != "":

                    # verify if it is not already manually got
                    not_recuperated = True

                    for comparison in comparisons:

                        if current_sentence in comparison:

                            not_recuperated = False

                    # verify if it is not already in the extracted groups
                    for group in self.groups:

                        if current_sentence in group:

                            not_recuperated = False

                    if not_recuperated:

                        self.groups.append(current_sentence.strip())

                g += 1

                print("Successfully extracted !!\n")

                print("-----------------\n")

                # resume the scan just after the right fragment
                i = a - 1

                # checkpoint after every processed separator
                self.__save()

            i += 1
+ def replace_groups(
211
+ self,
212
+ re_match: str,
213
+ delete_re: Union[str, None] = None,
214
+ n_replace_max: int = 1,
215
+ load: bool = True,
216
+ save: bool = False,
217
+ manual_replace: bool = False,
218
+ csv_file: str = "founded.csv",
219
+ force_replace: bool = False,
220
+ ):
221
+
222
+ # we load the data
223
+ if load:
224
+
225
+ self.load()
226
+
227
+ # find the groups matching the match regex
228
+ founded = [
229
+ (i, self.groups[i])
230
+ for i in range(len(self.groups))
231
+ if re.match(re_match, self.groups[i])
232
+ ]
233
+
234
+ print(
235
+ f"Found groups matching the regular expression {re_match} are the followings:\n"
236
+ )
237
+
238
+ [print(f"- {f[1]}") for f in founded]
239
+
240
+ print("\n----------------------\n")
241
+
242
+ # if regex for deletion are provided we replace those that will be found with a max number of replace
243
+ not_replaced = set()
244
+
245
+ replaced = set()
246
+
247
+ result = {}
248
+
249
+ delete_re_ = input(
250
+ "Do you want to change the deletion' regex expression -> provide one if yes or give empty string ('') if not : "
251
+ )
252
+
253
+ if delete_re_ != "":
254
+
255
+ delete_re = delete_re_
256
+
257
+ if not delete_re is None or manual_replace:
258
+
259
+ for i in range(len(founded)):
260
+
261
+ f = founded[i][1]
262
+
263
+ index = founded[i][0]
264
+
265
+ m_replace = "n"
266
+
267
+ if not force_replace and manual_replace:
268
+
269
+ print(f"You will modify the following group:\n {f}")
270
+
271
+ m_replace = input(
272
+ f"\nDo you want to make a manual replacement of the group {f} -> Yes(y) or No(n). If you want to quit, press q!"
273
+ )
274
+
275
+ if m_replace == "q":
276
+
277
+ break
278
+
279
+ while not m_replace in ["y", "n"]:
280
+
281
+ replace_r = input(
282
+ f"You must provide a response between Yes(y), No(n)!"
283
+ )
284
+
285
+ if m_replace != "n":
286
+
287
+ print(
288
+ f"The manual modification of the group\n {f}\n is done in the following file: {csv_file}\n!If you want to provide multiple new groups please make them in different lines"
289
+ )
290
+
291
+ finish = "n"
292
+
293
+ pd.DataFrame({"to_modify": [f]}).to_csv(csv_file, index=False)
294
+
295
+ while finish == "n":
296
+
297
+ finish = input(
298
+ "Did you finish to replace -> No(n) if you didn't finish yet, click any another key if Yes(y) : "
299
+ )
300
+
301
+ f = pd.read_csv(csv_file)["to_modify"].to_list()
302
+
303
+ print("\n--------\n")
304
+
305
+ if not delete_re is None and m_replace in ["n", ""]:
306
+
307
+ to_replace = set(re.findall(delete_re, f))
308
+
309
+ replace_r = None
310
+
311
+ for r in to_replace:
312
+
313
+ if force_replace:
314
+
315
+ f = f.replace(r, "", n_replace_max)
316
+
317
+ replaced.add(f)
318
+
319
+ else:
320
+
321
+ replace_r = input(
322
+ f"Do you want to replace the {r} string in the group:\n {f} ? Yes(y) or No(n). If you want to quit, press q!"
323
+ )
324
+
325
+ if m_replace == "q":
326
+
327
+ break
328
+
329
+ while not replace_r in ["y", "n"]:
330
+
331
+ replace_r = input(
332
+ f"You must provide a response between Yes(y) and No(n)!"
333
+ )
334
+
335
+ if replace_r == "y":
336
+
337
+ f = f.replace(r, "", n_replace_max)
338
+
339
+ replaced.add(f)
340
+
341
+ else:
342
+
343
+ not_replaced.add(f)
344
+
345
+ if not replace_r is None and replace_r == "q":
346
+
347
+ break
348
+
349
+ if isinstance(f, str):
350
+
351
+ f = [f.strip()]
352
+
353
+ else:
354
+
355
+ f = [f_.strip() for f_ in f]
356
+
357
+ try:
358
+
359
+ self.groups = self.groups[:index] + f + self.groups[index + 1 :]
360
+
361
+ except IndexError:
362
+
363
+ self.groups = self.groups[:index] + f
364
+
365
+ if len(f) > 1 and i != len(founded) - 1:
366
+
367
+ for j in range(i + 1, len(founded)):
368
+
369
+ founded[j] = (founded[j][0] + len(f) - 1, founded[j][1])
370
+
371
+ result[index] = f
372
+
373
+ if save:
374
+
375
+ print("Final result:")
376
+
377
+ [print(v) for r, v in result.items()]
378
+
379
+ save_result = input("Do you want to save the result ? Yes(y) or No(n)")
380
+
381
+ while not save_result in ["y", "n"]:
382
+
383
+ replace_r = input(
384
+ f"You must provide a response between Yes(y) or No(n) !"
385
+ )
386
+
387
+ if save_result == "y":
388
+
389
+ self.__save()
390
+
391
+ return {
392
+ "founded": founded,
393
+ "result": result,
394
+ "replaced": replaced,
395
+ "not_replaced": not_replaced,
396
+ }
397
+
398
+ def extraction_commands(
399
+ self,
400
+ add_end_mark_cmd: str = "a",
401
+ pass_cmd: str = "p",
402
+ add_end_mark_on_all: str = "l",
403
+ add_upper_cmd: str = "u",
404
+ add_upper_on_all: str = "o",
405
+ sep_cmd: str = "_",
406
+ quit_cmd: str = "q",
407
+ ):
408
+
409
+ # recuperate the current command
410
+ cm = input(
411
+ f"Choose one of the following commands: \n- {add_end_mark_cmd}+group_nb1,group_nb2:mark|group_nb3,group_nb4:mark|...(or group_nb1-group_nbn:mark) : To add end mark on specific groups\
412
+ \n- {add_end_mark_on_all}+mark : To add end mark of all groups, \n- {add_upper_cmd}+group_nb1,group_nb2,group_nb3,group_nb4,...(or group_nb1-group_nbn) : To uppercase the first letter of specific groups\
413
+ \n- {add_upper_on_all} : To uppercase the first letter of all the groups\
414
+ \n- {pass_cmd} : To accept all of the groups\
415
+ \n- {quit_cmd} : To stop the process\
416
+ \n- You can combine all two commands by underscore {sep_cmd} excepted for the two last commands !"
417
+ )
418
+
419
+ cms = cm.split(sep_cmd)
420
+
421
+ error = False
422
+
423
+ if len(cms) == 2:
424
+
425
+ p_cm = [cms[0].split("+")[0], cms[1].split("+")[0]]
426
+
427
+ if pass_cmd in p_cm or quit_cmd in p_cm or sep_cmd in p_cm:
428
+
429
+ print(
430
+ f"You cannot provide {pass_cmd}, {quit_cmd} or {sep_cmd} in combined commands !"
431
+ )
432
+
433
+ error = True
434
+
435
+ elif (
436
+ p_cm[0] in [add_end_mark_cmd, add_end_mark_on_all]
437
+ and p_cm[1] in [add_upper_cmd, add_upper_on_all]
438
+ ) or (
439
+ p_cm[0] in [add_upper_cmd, add_upper_on_all]
440
+ and p_cm[1] in [add_upper_cmd, add_upper_on_all]
441
+ ):
442
+
443
+ print(
444
+ "You cannot combine the same type of command: Type of commands are 'end mark' and 'upper'"
445
+ )
446
+
447
+ elif len(cms) == 1:
448
+
449
+ if not cms[0].split("+")[0] in [
450
+ add_end_mark_cmd,
451
+ add_end_mark_on_all,
452
+ add_upper_cmd,
453
+ add_upper_on_all,
454
+ pass_cmd,
455
+ quit_cmd,
456
+ ]:
457
+
458
+ print("You didn't provide a right command ! Please retry")
459
+
460
+ error = True
461
+
462
+ else:
463
+
464
+ print("You cannot provide more than 2 or 0 commands !")
465
+
466
+ return cms, error
467
+
468
+ def split_group(self, group: Union[list, str]):
469
+ # we base on the colon critter to split the groups
470
+
471
+ if isinstance(group, str):
472
+
473
+ group = [group]
474
+
475
+ sents = {self.corpus_1: [], self.corpus_2: []}
476
+
477
+ for g in group:
478
+
479
+ splits = g.split(":")
480
+
481
+ middle = len(splits) // 2
482
+
483
+ cp1_corpus = "".join(splits[:middle])
484
+
485
+ cp2_corpus = "".join(splits[middle:])
486
+
487
+ sents[self.corpus_1].append(cp1_corpus.strip())
488
+
489
+ sents[self.corpus_2].append(cp2_corpus.strip())
490
+
491
+ return sents
492
+
493
+ def add_end_mark(self, batch: dict, command: str):
494
+
495
+ cm = command
496
+
497
+ # recuperate the marks with groups and apply the transformations
498
+ tfs = cm.split("|")
499
+
500
+ for tf in tfs:
501
+
502
+ if "-" in tf:
503
+
504
+ groups = tf.split(":")[0].split("-")
505
+
506
+ groups = list(range(int(groups[0]), int(groups[1]) + 1))
507
+
508
+ else:
509
+
510
+ groups = [int(nb) for nb in tf.split(":")[0].split(",")]
511
+
512
+ mark = tf.split(":")[1]
513
+
514
+ for nb in groups:
515
+
516
+ batch[self.corpus_1][nb - 1] += mark
517
+
518
+ batch[self.corpus_2][nb - 1] += mark
519
+
520
+ return batch
521
+
522
+ def add_upper(self, batch: dict, command: str):
523
+
524
+ cm = command
525
+
526
+ # recuperate the marks with groups and apply the transformations
527
+ tfs = cm.split("|")
528
+
529
+ for tf in tfs:
530
+
531
+ # recuperate the marks with groups and apply the transformations
532
+ if "-" in tf:
533
+
534
+ groups = tf.split("-")
535
+
536
+ groups = list(range(int(groups[0]), int(groups[1]) + 1))
537
+
538
+ else:
539
+
540
+ groups = [int(nb) for nb in tf.split(",")]
541
+
542
+ for nb in groups:
543
+
544
+ batch[self.corpus_1][nb - 1] = (
545
+ batch[self.corpus_1][nb - 1][0].upper()
546
+ + batch[self.corpus_1][nb - 1][1:]
547
+ )
548
+
549
+ batch[self.corpus_2][nb - 1] = (
550
+ batch[self.corpus_2][nb - 1][0].upper()
551
+ + batch[self.corpus_2][nb - 1][1:]
552
+ )
553
+
554
+ return batch
555
+
556
+ def inner_command(self, batch: dict):
557
+
558
+ cp1_sents = batch[self.corpus_1]
559
+
560
+ cp2_sents = batch[self.corpus_2]
561
+
562
+ for i in range(0, len(batch[self.corpus_1])):
563
+
564
+ cp1_sent = cp1_sents[i]
565
+
566
+ cp2_sent = cp2_sents[i]
567
+
568
+ if re.match(".*Mark\[.*\].*", cp2_sent):
569
+
570
+ mark = re.findall("Mark\[.*\]", cp2_sent)[0]
571
+
572
+ mark = mark.replace("Mark[", "").replace("]", "")
573
+
574
+ cp1_sent = cp1_sent + mark
575
+
576
+ cp2_sent = re.sub("Mark\[.*\]", "", cp2_sent, 1) + mark
577
+
578
+ if re.match(".*Upper", cp2_sent):
579
+
580
+ cp1_sent = cp1_sent[0].upper() + cp1_sent[1:]
581
+
582
+ cp2_sent = cp2_sent[0].upper() + re.sub("Upper", "", cp2_sent, 1)[1:]
583
+
584
+ cp1_sents[i] = cp1_sent
585
+
586
+ cp2_sents[i] = cp2_sent
587
+
588
+ batch[self.corpus_1] = cp1_sents
589
+
590
+ batch[self.corpus_2] = cp2_sents
591
+
592
+ return batch
593
+
594
    def extract_sentences(
        self,
        group_range: Union[tuple, None] = None,
        add_end_mark_cmd: str = "a",
        pass_cmd: str = "p",
        add_end_mark_on_all: str = "l",
        add_upper_cmd: str = "u",
        add_upper_on_all: str = "o",
        sep_cmd: str = "_",
        quit_cmd: str = "q",
        batch_size: int = 30,
        load: bool = True,
        save: bool = False,
        csv_file: str = "batch.csv",
        last_checkpoint: bool = True,
    ):
        """Interactively validate batches of groups and extract sentence pairs.

        Each batch of ``batch_size`` groups is written to ``csv_file``, where
        the user can edit it and annotate sentences with ``Mark[...]`` /
        ``Upper`` directives; a command obtained from ``extraction_commands``
        is then applied to the batch before it is collected (and optionally
        saved into ``self.extractions``).

        Args:
            group_range: (start, end) indices of the groups to process; when
                None the range resumes from the checkpointed index.
            add_end_mark_cmd, pass_cmd, add_end_mark_on_all, add_upper_cmd,
                add_upper_on_all, sep_cmd, quit_cmd: one-letter command names
                forwarded to ``extraction_commands``.
            batch_size: number of groups shown per batch.
            load: reload the checkpointed state before starting.
            save: ask whether to persist each validated batch.
            csv_file: scratch CSV file used to edit the current batch.
            last_checkpoint: resume from the last saved index.

        Raises:
            ValueError: when ``last_checkpoint`` is False.

        NOTE(review): the ValueError is raised whenever ``last_checkpoint`` is
        False, even if ``group_range`` was provided — confirm this is the
        intended behavior (the message suggests the opposite).
        """
        # we load the data
        if load:

            self.load()

        # the group range is equal to a tuple containing the last saved index and the index of the last element in the list of groups
        # indices if nothing is given
        if last_checkpoint:

            if group_range is None:
                group_range = (self.index, len(self.groups) - 1)

        else:

            raise ValueError(
                "You must provide a group range if last checkpoint is to False !"
            )

        # change the number of displayed lines
        pd.options.display.max_rows = batch_size

        groups = self.groups[group_range[0] : group_range[1] + 1]

        # initialize the sub corpora
        sub_corpora = {self.corpus_1: [], self.corpus_2: []}

        i = 0  # 0-based batch counter (for display only)

        # for each batch we will add the groups in a csv file and take a command
        for b in range(0, len(groups), batch_size):

            # recuperate a batch
            batch_ = groups[b : b + batch_size]

            # recuperate the index
            self.index += len(batch_)

            # split each group into two sentences and transform the obtained dictionary to a DataFrame
            batch = self.split_group(batch_)

            pd.DataFrame(batch).to_csv(csv_file, index=False)

            print(
                f"Which of the groups of batch number {i+1} do you consider to be complete sentences (see the file {csv_file}) ?"
            )

            error = False

            cms = []

            # ask for a command, re-prompting while the input is invalid
            try:

                cms, error = self.extraction_commands(
                    add_end_mark_cmd,
                    pass_cmd,
                    add_end_mark_on_all,
                    add_upper_cmd,
                    add_upper_on_all,
                    sep_cmd,
                    quit_cmd,
                )

            except Exception:

                print("You didn't provide a right group number !")

                error = True

            while error:

                error = False

                try:

                    cms, error = self.extraction_commands(
                        add_end_mark_cmd,
                        pass_cmd,
                        add_end_mark_on_all,
                        add_upper_cmd,
                        add_upper_on_all,
                        sep_cmd,
                        quit_cmd,
                    )

                except IndexError:

                    print("You didn't provide a right group number !")

                    error = True

            # recuperate the batch (possibly edited by the user in the csv)
            batch = pd.read_csv(csv_file).to_dict("list")

            # add corrections (inline Mark[...] / Upper directives)
            batch = self.inner_command(batch)

            cm_type = ""

            quit_ = "n"

            for cm in cms:

                cm_type = cm.split("+")[0]

                if cm_type == add_end_mark_cmd:

                    batch = self.add_end_mark(batch, cm.split("+")[1])

                elif cm_type == add_end_mark_on_all:

                    mark = cm.split("+")[1]

                    # apply the mark to every group of the batch
                    batch = self.add_end_mark(
                        batch,
                        ",".join(
                            [str(nb) for nb in range(1, len(batch[self.corpus_1]) + 1)]
                        )
                        + f":{mark}",
                    )

                elif cm_type == add_upper_cmd:

                    batch = self.add_upper(batch, cm.split("+")[1])

                elif cm_type == add_upper_on_all:

                    # uppercase every group of the batch
                    batch = self.add_upper(
                        batch,
                        ",".join(
                            [str(nb) for nb in range(1, len(batch[self.corpus_1]) + 1)]
                        ),
                    )

                elif cm_type == quit_cmd:

                    quit_ = input("Are you sure you want to quit: Yes(y) or No(n)")

                    while not quit_ in ["y", "n"]:

                        quit_ = input("Are you sure you want to quit: Yes(y) or No(n)")

                    if quit_ == "y":

                        break

            print("\nBatch result")

            print(pd.DataFrame(batch).head(batch_size))

            print("\n--------------------\n\n")

            # add the batch to the sub corpora
            sub_corpora[self.corpus_1].extend(batch[self.corpus_1])

            sub_corpora[self.corpus_2].extend(batch[self.corpus_2])

            if cm_type == quit_cmd and quit_ == "y":

                # confirmed quit: leave without saving this batch
                break

            else:

                if save:

                    save_ = input("Do you want to save the result ? Yes(y) or No(n)")

                    while not save_ in ["y", "n"]:

                        save_ = input(
                            "Do you want to save the result ? Yes(y) or No(n)"
                        )

                    if save_ == "y":

                        self.extractions[self.corpus_1].extend(batch[self.corpus_1])

                        self.extractions[self.corpus_2].extend(batch[self.corpus_2])

                        self.__save()

            i += 1

        print("Finished !")
+ def remove_duplicated_sentences(self, save: bool = False):
796
+
797
+ # we load the data
798
+ self.load()
799
+
800
+ # use pandas to delete the duplicated rows
801
+ extractions = pd.DataFrame(self.extractions)
802
+
803
+ extractions.drop_duplicates(inplace=True)
804
+
805
+ self.extractions = extractions.to_dict("list")
806
+
807
+ # save the sentences
808
+ if save:
809
+
810
+ self.__save()