wolof_translate-0.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. wolof_translate-0.0.1/PKG-INFO +6 -0
  2. wolof_translate-0.0.1/setup.cfg +4 -0
  3. wolof_translate-0.0.1/setup.py +36 -0
  4. wolof_translate-0.0.1/wolof_translate/__init__.py +73 -0
  5. wolof_translate-0.0.1/wolof_translate/__pycache__/__init__.cpython-310.pyc +0 -0
  6. wolof_translate-0.0.1/wolof_translate/__pycache__/__init__.cpython-311.pyc +0 -0
  7. wolof_translate-0.0.1/wolof_translate/__pycache__/dataset_v1.cpython-310.pyc +0 -0
  8. wolof_translate-0.0.1/wolof_translate/__pycache__/sent_transformers.cpython-310.pyc +0 -0
  9. wolof_translate-0.0.1/wolof_translate/data/__init__.py +0 -0
  10. wolof_translate-0.0.1/wolof_translate/data/__pycache__/__init__.cpython-310.pyc +0 -0
  11. wolof_translate-0.0.1/wolof_translate/data/__pycache__/__init__.cpython-311.pyc +0 -0
  12. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v1.cpython-310.pyc +0 -0
  13. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v2.cpython-310.pyc +0 -0
  14. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v3.cpython-310.pyc +0 -0
  15. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v3.cpython-311.pyc +0 -0
  16. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v4.cpython-310.pyc +0 -0
  17. wolof_translate-0.0.1/wolof_translate/data/__pycache__/dataset_v4.cpython-311.pyc +0 -0
  18. wolof_translate-0.0.1/wolof_translate/data/dataset_v1.py +151 -0
  19. wolof_translate-0.0.1/wolof_translate/data/dataset_v2.py +187 -0
  20. wolof_translate-0.0.1/wolof_translate/data/dataset_v3.py +187 -0
  21. wolof_translate-0.0.1/wolof_translate/data/dataset_v3_2.py +187 -0
  22. wolof_translate-0.0.1/wolof_translate/data/dataset_v4.py +202 -0
  23. wolof_translate-0.0.1/wolof_translate/data/dataset_v5.py +65 -0
  24. wolof_translate-0.0.1/wolof_translate/models/__init__.py +0 -0
  25. wolof_translate-0.0.1/wolof_translate/models/__pycache__/__init__.cpython-310.pyc +0 -0
  26. wolof_translate-0.0.1/wolof_translate/models/__pycache__/__init__.cpython-311.pyc +0 -0
  27. wolof_translate-0.0.1/wolof_translate/models/french/tfidfaug_w2idf.txt +6235 -0
  28. wolof_translate-0.0.1/wolof_translate/models/french/tfidfaug_w2tfidf.txt +6235 -0
  29. wolof_translate-0.0.1/wolof_translate/models/transformers/__init__.py +0 -0
  30. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/__init__.cpython-310.pyc +0 -0
  31. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/__init__.cpython-311.pyc +0 -0
  32. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/main.cpython-310.pyc +0 -0
  33. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/main.cpython-311.pyc +0 -0
  34. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/optimization.cpython-310.pyc +0 -0
  35. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/optimization.cpython-311.pyc +0 -0
  36. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/position.cpython-310.pyc +0 -0
  37. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/position.cpython-311.pyc +0 -0
  38. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/size.cpython-310.pyc +0 -0
  39. wolof_translate-0.0.1/wolof_translate/models/transformers/__pycache__/size.cpython-311.pyc +0 -0
  40. wolof_translate-0.0.1/wolof_translate/models/transformers/main.py +865 -0
  41. wolof_translate-0.0.1/wolof_translate/models/transformers/main_2.py +362 -0
  42. wolof_translate-0.0.1/wolof_translate/models/transformers/optimization.py +41 -0
  43. wolof_translate-0.0.1/wolof_translate/models/transformers/position.py +46 -0
  44. wolof_translate-0.0.1/wolof_translate/models/transformers/size.py +44 -0
  45. wolof_translate-0.0.1/wolof_translate/models/wolof/tfidfaug_w2idf.txt +5911 -0
  46. wolof_translate-0.0.1/wolof_translate/models/wolof/tfidfaug_w2tfidf.txt +5911 -0
  47. wolof_translate-0.0.1/wolof_translate/pipe/__init__.py +1 -0
  48. wolof_translate-0.0.1/wolof_translate/pipe/__pycache__/__init__.cpython-310.pyc +0 -0
  49. wolof_translate-0.0.1/wolof_translate/pipe/__pycache__/nlp_pipeline.cpython-310.pyc +0 -0
  50. wolof_translate-0.0.1/wolof_translate/pipe/nlp_pipeline.py +512 -0
  51. wolof_translate-0.0.1/wolof_translate/tokenizers/__init__.py +0 -0
  52. wolof_translate-0.0.1/wolof_translate/tokenizers/__pycache__/__init__.cpython-310.pyc +0 -0
  53. wolof_translate-0.0.1/wolof_translate/tokenizers/adverse_tokenizer.json +15185 -0
  54. wolof_translate-0.0.1/wolof_translate/tokenizers/bart_tokenizers/tokenizer_v3.json +7816 -0
  55. wolof_translate-0.0.1/wolof_translate/tokenizers/bart_tokenizers/tokenizer_v3_2.json +14856 -0
  56. wolof_translate-0.0.1/wolof_translate/tokenizers/bart_tokenizers/tokenizer_v5.json +38008 -0
  57. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.model +0 -0
  58. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v3.vocab +1500 -0
  59. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v4.model +0 -0
  60. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v4.vocab +3000 -0
  61. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v5.model +0 -0
  62. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v5.vocab +8000 -0
  63. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v6.model +0 -0
  64. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v6.vocab +10000 -0
  65. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v7.model +0 -0
  66. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v7.vocab +12000 -0
  67. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v8.model +0 -0
  68. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v8.vocab +15000 -0
  69. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v9.model +0 -0
  70. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v9.vocab +17000 -0
  71. wolof_translate-0.0.1/wolof_translate/tokenizers/t5_tokenizers.zip +0 -0
  72. wolof_translate-0.0.1/wolof_translate/tokenizers/tokenizer_v1.json +31336 -0
  73. wolof_translate-0.0.1/wolof_translate/tokenizers/tokenizer_v3_2.json +14870 -0
  74. wolof_translate-0.0.1/wolof_translate/tokenizers/trax/sentencepiece_tokenizer_v4.subwords +5311 -0
  75. wolof_translate-0.0.1/wolof_translate/tokenizers/trax/sentencepiece_tokenizer_v5.subwords +7715 -0
  76. wolof_translate-0.0.1/wolof_translate/tokenizers/trax/sentencepiece_tokenizer_v6.subwords +7583 -0
  77. wolof_translate-0.0.1/wolof_translate/tokenizers/trax/sentencepiece_tokenizer_v7.subwords +7218 -0
  78. wolof_translate-0.0.1/wolof_translate/trainers/__init__.py +0 -0
  79. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/__init__.cpython-310.pyc +0 -0
  80. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/__init__.cpython-311.pyc +0 -0
  81. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/transformer_trainer.cpython-310.pyc +0 -0
  82. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/transformer_trainer.cpython-311.pyc +0 -0
  83. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/transformer_trainer_custom.cpython-310.pyc +0 -0
  84. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/transformer_trainer_ml.cpython-310.pyc +0 -0
  85. wolof_translate-0.0.1/wolof_translate/trainers/__pycache__/transformer_trainer_ml_.cpython-310.pyc +0 -0
  86. wolof_translate-0.0.1/wolof_translate/trainers/transformer_trainer.py +760 -0
  87. wolof_translate-0.0.1/wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  88. wolof_translate-0.0.1/wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  89. wolof_translate-0.0.1/wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  90. wolof_translate-0.0.1/wolof_translate/utils/__init__.py +1 -0
  91. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  92. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  93. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/bucket_iterator.cpython-310.pyc +0 -0
  94. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/database_manager.cpython-310.pyc +0 -0
  95. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/display_predictions.cpython-310.pyc +0 -0
  96. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/download_model.cpython-310.pyc +0 -0
  97. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/evaluate_custom.cpython-310.pyc +0 -0
  98. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/evaluation.cpython-310.pyc +0 -0
  99. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/evaluation.cpython-311.pyc +0 -0
  100. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/extract_new_sentences.cpython-310.pyc +0 -0
  101. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/recuperate_datasets.cpython-310.pyc +0 -0
  102. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/sent_corrections.cpython-310.pyc +0 -0
  103. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/sent_corrections.cpython-311.pyc +0 -0
  104. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/sent_transformers.cpython-310.pyc +0 -0
  105. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/sent_transformers.cpython-311.pyc +0 -0
  106. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/sent_unification.cpython-310.pyc +0 -0
  107. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/split_with_valid.cpython-310.pyc +0 -0
  108. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/split_with_valid.cpython-311.pyc +0 -0
  109. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/tokenize_text.cpython-310.pyc +0 -0
  110. wolof_translate-0.0.1/wolof_translate/utils/__pycache__/training.cpython-310.pyc +0 -0
  111. wolof_translate-0.0.1/wolof_translate/utils/bucket_iterator.py +143 -0
  112. wolof_translate-0.0.1/wolof_translate/utils/database_manager.py +116 -0
  113. wolof_translate-0.0.1/wolof_translate/utils/display_predictions.py +162 -0
  114. wolof_translate-0.0.1/wolof_translate/utils/download_model.py +40 -0
  115. wolof_translate-0.0.1/wolof_translate/utils/evaluate_custom.py +147 -0
  116. wolof_translate-0.0.1/wolof_translate/utils/evaluation.py +74 -0
  117. wolof_translate-0.0.1/wolof_translate/utils/extract_new_sentences.py +810 -0
  118. wolof_translate-0.0.1/wolof_translate/utils/extract_poems.py +60 -0
  119. wolof_translate-0.0.1/wolof_translate/utils/extract_sentences.py +562 -0
  120. wolof_translate-0.0.1/wolof_translate/utils/improvements/__init__.py +0 -0
  121. wolof_translate-0.0.1/wolof_translate/utils/improvements/__pycache__/__init__.cpython-310.pyc +0 -0
  122. wolof_translate-0.0.1/wolof_translate/utils/improvements/__pycache__/__init__.cpython-311.pyc +0 -0
  123. wolof_translate-0.0.1/wolof_translate/utils/improvements/__pycache__/end_marks.cpython-310.pyc +0 -0
  124. wolof_translate-0.0.1/wolof_translate/utils/improvements/__pycache__/end_marks.cpython-311.pyc +0 -0
  125. wolof_translate-0.0.1/wolof_translate/utils/improvements/end_marks.py +45 -0
  126. wolof_translate-0.0.1/wolof_translate/utils/recuperate_datasets.py +94 -0
  127. wolof_translate-0.0.1/wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  128. wolof_translate-0.0.1/wolof_translate/utils/send_model.py +26 -0
  129. wolof_translate-0.0.1/wolof_translate/utils/sent_corrections.py +169 -0
  130. wolof_translate-0.0.1/wolof_translate/utils/sent_transformers.py +27 -0
  131. wolof_translate-0.0.1/wolof_translate/utils/sent_unification.py +97 -0
  132. wolof_translate-0.0.1/wolof_translate/utils/split_with_valid.py +72 -0
  133. wolof_translate-0.0.1/wolof_translate/utils/tokenize_text.py +46 -0
  134. wolof_translate-0.0.1/wolof_translate/utils/training.py +213 -0
  135. wolof_translate-0.0.1/wolof_translate/utils/trunc_hg_training.py +196 -0
  136. wolof_translate-0.0.1/wolof_translate.egg-info/PKG-INFO +6 -0
  137. wolof_translate-0.0.1/wolof_translate.egg-info/SOURCES.txt +138 -0
  138. wolof_translate-0.0.1/wolof_translate.egg-info/dependency_links.txt +1 -0
  139. wolof_translate-0.0.1/wolof_translate.egg-info/requires.txt +25 -0
  140. wolof_translate-0.0.1/wolof_translate.egg-info/top_level.txt +1 -0
@@ -0,0 +1,6 @@
+ Metadata-Version: 2.1
+ Name: wolof_translate
+ Version: 0.0.1
+ Summary: Contain function and classes to process corpora for making translation between wolof text and other languages.
+ Author: Oumar Kane
+ Author-email: oumar.kane@univ-thies.sn
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
@@ -0,0 +1,36 @@
+ from setuptools import setup
+
+ setup(
+     name="wolof_translate",
+     version="0.0.1",
+     author="Oumar Kane",
+     author_email="oumar.kane@univ-thies.sn",
+     description="Contain function and classes to process corpora for making translation between wolof text and other languages.",
+     install_requires=[
+         "accelerate",
+         "torch",
+         "spacy",
+         "nltk",
+         "gensim",
+         "furo",
+         "streamlit",
+         "tokenizers",
+         "tensorboard",
+         "evaluate",
+         "transformers",
+         "pandas",
+         "numpy",
+         "scikit-learn",
+         "matplotlib",
+         "plotly",
+         "sacrebleu",
+         "nlpaug",
+         "wandb",
+         "pytorch-lightning",
+         "selenium",
+         "sentencepiece",
+         "peft",
+         "rouge-score",
+         "sacrebleu"
+     ],
+ )
@@ -0,0 +1,73 @@
+ """Script containing importation
+ ================================
+ """
+
+ # let us import all necessary libraries
+ from transformers import (
+     T5Model,
+     T5ForConditionalGeneration,
+     Seq2SeqTrainer,
+     T5TokenizerFast,
+     set_seed,
+     AdamWeightDecay,
+     get_linear_schedule_with_warmup,
+     get_linear_schedule_with_warmup,
+     get_cosine_schedule_with_warmup,
+     get_constant_schedule_with_warmup,
+     Adafactor,
+ )
+ from wolof_translate.utils.sent_transformers import TransformerSequences
+ from wolof_translate.utils.improvements.end_marks import add_end_mark  # added
+ from torch.nn import TransformerEncoderLayer, TransformerDecoderLayer
+ from torch.utils.data import Dataset, DataLoader, random_split
+ from wolof_translate.data.dataset_v4 import SentenceDataset  # v2 -> v3 -> v4
+ from wolof_translate.utils.sent_corrections import *
+ from sklearn.model_selection import train_test_split
+ from torch.optim.lr_scheduler import _LRScheduler
+ from torch.nn.utils.rnn import pad_sequence
+ from plotly.subplots import make_subplots
+ from nlpaug.augmenter import char as nac
+ from torch.utils.data import DataLoader
+ from torch.nn import functional as F
+ import plotly.graph_objects as go
+ from tokenizers import Tokenizer
+ import torch.distributed as dist
+ import matplotlib.pyplot as plt
+ import pytorch_lightning as lt
+ from tqdm import tqdm, trange
+ from functools import partial
+ from torch.nn import utils
+ from copy import deepcopy
+ from torch import optim
+ from typing import *
+ from torch import nn
+ import pandas as pd
+ import numpy as np
+ import itertools
+ import evaluate
+ import random
+ import string
+ import shutil
+ import wandb
+ import torch
+ import json
+ import copy
+ import os
+
+ ###-----------------------------------------------
+ # Libraries imported from wolof translate
+ from wolof_translate.utils.bucket_iterator import (
+     SequenceLengthBatchSampler,
+     BucketSampler,
+     collate_fn,
+     collate_fn_trunc,
+ )
+ from wolof_translate.trainers.transformer_trainer_custom import (
+     ModelRunner as CustomModelRunner,
+ )
+ from wolof_translate.models.transformers.optimization import TransformerScheduler
+ from wolof_translate.utils.recuperate_datasets import recuperate_datasets
+ from wolof_translate.trainers.transformer_trainer_ml_ import ModelRunner
+ from wolof_translate.utils.evaluate_custom import TranslationEvaluation
+ from wolof_translate.models.transformers.main import Transformer
+ from wolof_translate.utils.split_with_valid import split_data
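Because this __init__.py runs every import above at package import time, a bare import of the package pulls in all of the heavy dependencies (torch, transformers, plotly, wandb, ...) and re-exposes the main entry points as package attributes. A minimal sketch of what becomes reachable after import, assuming the package and the dependencies listed in setup.py are installed:

import wolof_translate as wt   # executes all of the imports shown above

wt.SentenceDataset   # dataset class re-exported from wolof_translate.data.dataset_v4
wt.ModelRunner       # trainer re-exported from wolof_translate.trainers.transformer_trainer_ml_
wt.Transformer       # model re-exported from wolof_translate.models.transformers.main
wt.split_data        # split helper re-exported from wolof_translate.utils.split_with_valid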
File without changes
@@ -0,0 +1,151 @@
+ from wolof_translate.utils.sent_transformers import TransformerSequences
+ from transformers import PreTrainedTokenizerFast
+ from torch.utils.data import Dataset
+ from tokenizers import Tokenizer
+ from typing import *
+ import pandas as pd
+ import torch
+ import re
+
+
+ class SentenceDataset(Dataset):
+     def __init__(
+         self,
+         file_path: str,
+         corpus_1: str = "french_corpus",
+         corpus_2: str = "wolof_corpus",
+         tokenizer_path: str = "wolof-translate/wolof_translate/tokenizers/tokenizer_v1.json",
+         max_len: int = 379,
+         truncation: bool = False,
+         file_sep: str = ",",
+         cls_token: str = "<|endoftext|>",
+         sep_token: str = "<|translateto|>",
+         pad_token: str = "<|pad|>",
+         cp1_transformer: Union[TransformerSequences, None] = None,
+         cp2_transformer: Union[TransformerSequences, None] = None,
+         **kwargs,
+     ):
+
+         # let us recuperate the data frame
+         self.__sentences = pd.read_csv(file_path, sep=file_sep, **kwargs)
+
+         # let us recuperate the tokenizer
+         self.tokenizer = PreTrainedTokenizerFast(
+             tokenizer_file=tokenizer_path,
+             bos_token=cls_token,
+             eos_token=cls_token,
+             pad_token=pad_token,
+         )
+
+         # recuperate the first corpus' sentences
+         self.__sentences_1 = self.__sentences[corpus_1].to_list()
+
+         # recuperate the second corpus' sentences
+         self.__sentences_2 = self.__sentences[corpus_2].to_list()
+
+         # recuperate the special tokens
+         self.cls_token = cls_token
+
+         self.sep_token = sep_token
+
+         self.pad_token = pad_token
+
+         # recuperate the length
+         self.__length = len(self.__sentences_1)
+
+         # recuperate the max id
+         self.max_id = len(self.tokenizer) - 1
+
+         # let us recuperate the max len
+         self.max_len = max_len
+
+         # let us recuperate the truncate argument
+         self.truncation = truncation
+
+         # let us initialize the transformer
+         self.cp1_transformer = cp1_transformer
+
+         self.cp2_transformer = cp2_transformer
+
+     def __getitem__(self, index):
+
+         sentence_1 = self.__sentences_1[index]
+
+         sentence_2 = self.__sentences_2[index]
+
+         # apply transformers if necessary
+         if not self.cp1_transformer is None:
+
+             sentence_1 = self.cp1_transformer(sentence_1)
+
+         if not self.cp2_transformer is None:
+
+             sentence_2 = self.cp2_transformer(sentence_2)
+
+         # let us create the sentence with special tokens
+         sentence = (
+             f"{self.cls_token}{sentence_1}{self.sep_token}{sentence_2}{self.cls_token}"
+         )
+
+         # let us encode the sentence
+         encoding = self.tokenizer(
+             sentence,
+             truncation=self.truncation,
+             max_length=self.max_len,
+             padding="max_length",
+             return_tensors="pt",
+         )
+
+         return encoding.input_ids.squeeze(0), encoding.attention_mask.squeeze(0)
+
+     def __len__(self):
+
+         return self.__length
+
+     def decode(self, ids: torch.Tensor, for_prediction: bool = False):
+
+         if ids.ndim < 2:
+
+             ids = ids.unsqueeze(0)
+
+         ids = ids.tolist()
+
+         for id in ids:
+
+             sentence = self.tokenizer.decode(id)
+
+             if not for_prediction:
+
+                 sentence = sentence.split(f"{self.sep_token}")
+
+             else:
+
+                 try:
+
+                     while self.sep_token in sentence:
+
+                         sentence = re.findall(f"{self.sep_token}(.*)", sentence)[-1]
+
+                 except:
+
+                     sentence = "None"
+
+             if for_prediction:
+
+                 yield sentence.replace(f"{self.cls_token}", "").replace(
+                     f"{self.pad_token}", ""
+                 )
+
+             else:
+
+                 sents = []
+
+                 for sent in sentence:
+
+                     sents.append(
+                         sent.replace(f"{self.cls_token}", "").replace(
+                             f"{self.pad_token}", ""
+                         )
+                     )
+
+                 yield sents
1
+ from wolof_translate.utils.sent_transformers import TransformerSequences
2
+ from transformers import PreTrainedTokenizerFast
3
+ from torch.utils.data import Dataset
4
+ from typing import *
5
+ import pandas as pd
6
+ import torch
7
+ import re
8
+
9
+
10
+ class T5SentenceDataset(Dataset):
11
+ def __init__(
12
+ self,
13
+ data_path: str,
14
+ tokenizer: PreTrainedTokenizerFast,
15
+ corpus_1: str = "french",
16
+ corpus_2: str = "wolof",
17
+ max_len: int = 51,
18
+ truncation: bool = False,
19
+ file_sep: str = ",",
20
+ cp1_transformer: Union[TransformerSequences, None] = None,
21
+ cp2_transformer: Union[TransformerSequences, None] = None,
22
+ **kwargs
23
+ ):
24
+
25
+ # let us recuperate the data frame
26
+ self.__sentences = pd.read_csv(data_path, sep=file_sep, **kwargs)
27
+
28
+ # let us recuperate the tokenizer
29
+ self.tokenizer = tokenizer
30
+
31
+ # recuperate the first corpus' sentences
32
+ self.sentences_1 = self.__sentences[corpus_1].to_list()
33
+
34
+ # recuperate the second corpus' sentences
35
+ self.sentences_2 = self.__sentences[corpus_2].to_list()
36
+
37
+ # recuperate the length
38
+ self.length = len(self.sentences_1)
39
+
40
+ # let us recuperate the max len
41
+ self.max_len = max_len
42
+
43
+ # let us recuperate the truncation argument
44
+ self.truncation = truncation
45
+
46
+ # let us initialize the transformer
47
+ self.cp1_transformer = cp1_transformer
48
+
49
+ self.cp2_transformer = cp2_transformer
50
+
51
+ def __getitem__(self, index):
52
+ """Recuperate ids and attention masks of sentences at index
53
+
54
+ Args:
55
+ index (int): The index of the sentences to recuperate
56
+
57
+ Returns:
58
+ tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
59
+ `the labels' ids`
60
+ """
61
+ sentence_1 = self.sentences_1[index]
62
+
63
+ sentence_2 = self.sentences_2[index]
64
+
65
+ # apply transformers if necessary
66
+ if not self.cp1_transformer is None:
67
+
68
+ sentence_1 = self.cp1_transformer(sentence_1)[0]
69
+
70
+ if not self.cp2_transformer is None:
71
+
72
+ sentence_2 = self.cp2_transformer(sentence_2)[0]
73
+
74
+ sentence_1 = sentence_1 + self.tokenizer.eos_token
75
+
76
+ sentence_2 = sentence_2 + self.tokenizer.eos_token
77
+
78
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
79
+ data = self.tokenizer(
80
+ sentence_1,
81
+ truncation=self.truncation,
82
+ max_length=self.max_len,
83
+ padding="max_length",
84
+ return_tensors="pt",
85
+ text_target=sentence_2,
86
+ )
87
+
88
+ return (
89
+ data.input_ids.squeeze(0),
90
+ data.attention_mask.squeeze(0),
91
+ data.labels.squeeze(0),
92
+ )
93
+
94
+ def __len__(self):
95
+
96
+ return self.length
97
+
98
+ def decode(self, labels: torch.Tensor):
99
+
100
+ if labels.ndim < 2:
101
+
102
+ labels = labels.unsqueeze(0)
103
+
104
+ sentences = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
105
+
106
+ return sentences
107
+
108
+
109
+ class SentenceDataset(T5SentenceDataset):
110
+ def __init__(
111
+ self,
112
+ data_path: str,
113
+ tokenizer: PreTrainedTokenizerFast,
114
+ corpus_1: str = "french",
115
+ corpus_2: str = "wolof",
116
+ max_len: int = 51,
117
+ truncation: bool = False,
118
+ file_sep: str = ",",
119
+ cp1_transformer: Union[TransformerSequences, None] = None,
120
+ cp2_transformer: Union[TransformerSequences, None] = None,
121
+ **kwargs
122
+ ):
123
+
124
+ super().__init__(
125
+ data_path,
126
+ tokenizer,
127
+ corpus_1,
128
+ corpus_2,
129
+ max_len,
130
+ truncation,
131
+ file_sep,
132
+ cp1_transformer,
133
+ cp2_transformer,
134
+ **kwargs
135
+ )
136
+
137
+ def __getitem__(self, index):
138
+ """Recuperate ids and attention masks of sentences at index
139
+
140
+ Args:
141
+ index (int): The index of the sentences to recuperate
142
+
143
+ Returns:
144
+ tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
145
+ `the labels' ids`
146
+ """
147
+ sentence_1 = self.sentences_1[index]
148
+
149
+ sentence_2 = self.sentences_2[index]
150
+
151
+ # apply transformers if necessary
152
+ if not self.cp1_transformer is None:
153
+
154
+ sentence_1 = self.cp1_transformer(sentence_1)[0]
155
+
156
+ if not self.cp2_transformer is None:
157
+
158
+ sentence_2 = self.cp2_transformer(sentence_2)[0]
159
+
160
+ sentence_1 = sentence_1 + self.tokenizer.eos_token
161
+
162
+ sentence_2 = sentence_2 + self.tokenizer.eos_token
163
+
164
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
165
+ data = self.tokenizer(
166
+ sentence_1,
167
+ truncation=self.truncation,
168
+ max_length=self.max_len,
169
+ padding="max_length",
170
+ return_tensors="pt",
171
+ )
172
+
173
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
174
+ labels = self.tokenizer(
175
+ sentence_2,
176
+ truncation=self.truncation,
177
+ max_length=self.max_len,
178
+ padding="max_length",
179
+ return_tensors="pt",
180
+ )
181
+
182
+ return (
183
+ data.input_ids.squeeze(0),
184
+ data.attention_mask.squeeze(0),
185
+ labels.input_ids.squeeze(0),
186
+ labels.attention_mask.squeeze(0),
187
+ )
@@ -0,0 +1,187 @@
1
+ from wolof_translate.utils.sent_transformers import TransformerSequences
2
+ from transformers import PreTrainedTokenizerFast
3
+ from torch.utils.data import Dataset
4
+ from typing import *
5
+ import pandas as pd
6
+ import torch
7
+ import re
8
+
9
+
10
+ class T5SentenceDataset(Dataset):
11
+ def __init__(
12
+ self,
13
+ data_path: str,
14
+ tokenizer: PreTrainedTokenizerFast,
15
+ corpus_1: str = "french",
16
+ corpus_2: str = "wolof",
17
+ max_len: int = 60,
18
+ truncation: bool = False,
19
+ file_sep: str = ",",
20
+ cp1_transformer: Union[TransformerSequences, None] = None,
21
+ cp2_transformer: Union[TransformerSequences, None] = None,
22
+ **kwargs
23
+ ):
24
+
25
+ # let us recuperate the data frame
26
+ self.__sentences = pd.read_csv(data_path, sep=file_sep, **kwargs)
27
+
28
+ # let us recuperate the tokenizer
29
+ self.tokenizer = tokenizer
30
+
31
+ # recuperate the first corpus' sentences
32
+ self.sentences_1 = self.__sentences[corpus_1].to_list()
33
+
34
+ # recuperate the second corpus' sentences
35
+ self.sentences_2 = self.__sentences[corpus_2].to_list()
36
+
37
+ # recuperate the length
38
+ self.length = len(self.sentences_1)
39
+
40
+ # let us recuperate the max len
41
+ self.max_len = max_len + max_len // 6
42
+
43
+ # let us recuperate the truncation argument
44
+ self.truncation = truncation
45
+
46
+ # let us initialize the transformer
47
+ self.cp1_transformer = cp1_transformer
48
+
49
+ self.cp2_transformer = cp2_transformer
50
+
51
+ def __getitem__(self, index):
52
+ """Recuperate ids and attention masks of sentences at index
53
+
54
+ Args:
55
+ index (int): The index of the sentences to recuperate
56
+
57
+ Returns:
58
+ tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
59
+ `the labels' ids`
60
+ """
61
+ sentence_1 = self.sentences_1[index]
62
+
63
+ sentence_2 = self.sentences_2[index]
64
+
65
+ # apply transformers if necessary
66
+ if not self.cp1_transformer is None:
67
+
68
+ sentence_1 = self.cp1_transformer(sentence_1)[0]
69
+
70
+ if not self.cp2_transformer is None:
71
+
72
+ sentence_2 = self.cp2_transformer(sentence_2)[0]
73
+
74
+ sentence_1 = sentence_1 + self.tokenizer.eos_token
75
+
76
+ sentence_2 = sentence_2 + self.tokenizer.eos_token
77
+
78
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
79
+ data = self.tokenizer(
80
+ sentence_1,
81
+ truncation=self.truncation,
82
+ max_length=self.max_len,
83
+ padding="max_length",
84
+ return_tensors="pt",
85
+ text_target=sentence_2,
86
+ )
87
+
88
+ return (
89
+ data.input_ids.squeeze(0),
90
+ data.attention_mask.squeeze(0),
91
+ data.labels.squeeze(0),
92
+ )
93
+
94
+ def __len__(self):
95
+
96
+ return self.length
97
+
98
+ def decode(self, labels: torch.Tensor):
99
+
100
+ if labels.ndim < 2:
101
+
102
+ labels = labels.unsqueeze(0)
103
+
104
+ sentences = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
105
+
106
+ return sentences
107
+
108
+
109
+ class SentenceDataset(T5SentenceDataset):
110
+ def __init__(
111
+ self,
112
+ data_path: str,
113
+ tokenizer: PreTrainedTokenizerFast,
114
+ corpus_1: str = "french",
115
+ corpus_2: str = "wolof",
116
+ max_len: int = 42,
117
+ truncation: bool = False,
118
+ file_sep: str = ",",
119
+ cp1_transformer: Union[TransformerSequences, None] = None,
120
+ cp2_transformer: Union[TransformerSequences, None] = None,
121
+ **kwargs
122
+ ):
123
+
124
+ super().__init__(
125
+ data_path,
126
+ tokenizer,
127
+ corpus_1,
128
+ corpus_2,
129
+ max_len,
130
+ truncation,
131
+ file_sep,
132
+ cp1_transformer,
133
+ cp2_transformer,
134
+ **kwargs
135
+ )
136
+
137
+ def __getitem__(self, index):
138
+ """Recuperate ids and attention masks of sentences at index
139
+
140
+ Args:
141
+ index (int): The index of the sentences to recuperate
142
+
143
+ Returns:
144
+ tuple: The `sentence to translate' ids`, `the attention mask of the sentence to translate`
145
+ `the labels' ids`
146
+ """
147
+ sentence_1 = self.sentences_1[index]
148
+
149
+ sentence_2 = self.sentences_2[index]
150
+
151
+ # apply transformers if necessary
152
+ if not self.cp1_transformer is None:
153
+
154
+ sentence_1 = self.cp1_transformer(sentence_1)[0]
155
+
156
+ if not self.cp2_transformer is None:
157
+
158
+ sentence_2 = self.cp2_transformer(sentence_2)[0]
159
+
160
+ sentence_1 = sentence_1 + self.tokenizer.eos_token
161
+
162
+ sentence_2 = sentence_2 + self.tokenizer.eos_token
163
+
164
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
165
+ data = self.tokenizer(
166
+ sentence_1,
167
+ truncation=self.truncation,
168
+ max_length=self.max_len,
169
+ padding="max_length",
170
+ return_tensors="pt",
171
+ )
172
+
173
+ # let us encode the sentences (we provide the second sentence as labels to the tokenizer)
174
+ labels = self.tokenizer(
175
+ sentence_2,
176
+ truncation=self.truncation,
177
+ max_length=self.max_len,
178
+ padding="max_length",
179
+ return_tensors="pt",
180
+ )
181
+
182
+ return (
183
+ data.input_ids.squeeze(0),
184
+ data.attention_mask.squeeze(0),
185
+ labels.input_ids.squeeze(0),
186
+ labels.attention_mask.squeeze(0),
187
+ )
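Relative to dataset_v2.py, the only behavioural change in this file is that the stored max_len is inflated by a sixth (self.max_len = max_len + max_len // 6), leaving some headroom above the nominal length, and the defaults change to 60 and 42. A short worked example of the effective padding length (illustration only, not code from the package):

# Effective length used for padding/truncation in dataset_v3:
#   T5SentenceDataset default: 60 + 60 // 6 = 70
#   SentenceDataset default:   42 + 42 // 6 = 49
for max_len in (60, 42):
    print(max_len, "->", max_len + max_len // 6)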