wolof_translate-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. wolof_translate/__init__.py +73 -0
  2. wolof_translate/data/__init__.py +0 -0
  3. wolof_translate/data/dataset_v1.py +151 -0
  4. wolof_translate/data/dataset_v2.py +187 -0
  5. wolof_translate/data/dataset_v3.py +187 -0
  6. wolof_translate/data/dataset_v3_2.py +187 -0
  7. wolof_translate/data/dataset_v4.py +202 -0
  8. wolof_translate/data/dataset_v5.py +65 -0
  9. wolof_translate/models/__init__.py +0 -0
  10. wolof_translate/models/transformers/__init__.py +0 -0
  11. wolof_translate/models/transformers/main.py +865 -0
  12. wolof_translate/models/transformers/main_2.py +362 -0
  13. wolof_translate/models/transformers/optimization.py +41 -0
  14. wolof_translate/models/transformers/position.py +46 -0
  15. wolof_translate/models/transformers/size.py +44 -0
  16. wolof_translate/pipe/__init__.py +1 -0
  17. wolof_translate/pipe/nlp_pipeline.py +512 -0
  18. wolof_translate/tokenizers/__init__.py +0 -0
  19. wolof_translate/trainers/__init__.py +0 -0
  20. wolof_translate/trainers/transformer_trainer.py +760 -0
  21. wolof_translate/trainers/transformer_trainer_custom.py +882 -0
  22. wolof_translate/trainers/transformer_trainer_ml.py +925 -0
  23. wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
  24. wolof_translate/utils/__init__.py +1 -0
  25. wolof_translate/utils/bucket_iterator.py +143 -0
  26. wolof_translate/utils/database_manager.py +116 -0
  27. wolof_translate/utils/display_predictions.py +162 -0
  28. wolof_translate/utils/download_model.py +40 -0
  29. wolof_translate/utils/evaluate_custom.py +147 -0
  30. wolof_translate/utils/evaluation.py +74 -0
  31. wolof_translate/utils/extract_new_sentences.py +810 -0
  32. wolof_translate/utils/extract_poems.py +60 -0
  33. wolof_translate/utils/extract_sentences.py +562 -0
  34. wolof_translate/utils/improvements/__init__.py +0 -0
  35. wolof_translate/utils/improvements/end_marks.py +45 -0
  36. wolof_translate/utils/recuperate_datasets.py +94 -0
  37. wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
  38. wolof_translate/utils/send_model.py +26 -0
  39. wolof_translate/utils/sent_corrections.py +169 -0
  40. wolof_translate/utils/sent_transformers.py +27 -0
  41. wolof_translate/utils/sent_unification.py +97 -0
  42. wolof_translate/utils/split_with_valid.py +72 -0
  43. wolof_translate/utils/tokenize_text.py +46 -0
  44. wolof_translate/utils/training.py +213 -0
  45. wolof_translate/utils/trunc_hg_training.py +196 -0
  46. wolof_translate-0.0.1.dist-info/METADATA +31 -0
  47. wolof_translate-0.0.1.dist-info/RECORD +49 -0
  48. wolof_translate-0.0.1.dist-info/WHEEL +5 -0
  49. wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/pipe/nlp_pipeline.py
@@ -0,0 +1,512 @@
# explicit imports for the names used below (some of these may also be
# provided by the `nlp_project` star import)
from collections import Counter
from typing import Callable, List, Union

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
from tqdm import tqdm

from wolof_translate.utils.tokenize_text import tokenization
from nlp_project.processing.utils import get_n_grams, guess_limitations, wordcloud
from nlp_project import *

class TextPipeProcessing:
    """NLP text-processing pipeline. Steps marked with * are mandatory:

    - tokenize_text*
    - create_corpus*
    - create_n_grams
    - set_n_grams_as_corpus
    - reset_corpus
    - create_frequency*
    - show_frequency_out_limits
    - show_most_common_words
    - plot_frequency_histogram
    - show_n_time_frequency_words
    - delete_n_time_frequency_words
    - remove_words
    - recuperate_results*
    - add_results_to_corpus*
    - plot_wordcloud

    The class can also be used as a context manager to record and store a
    pipeline of method calls (see `__enter__`, `__call__` and `__exit__`).
    """

    # class-level registry of stored pipelines, keyed by pipeline name
    pipeline = {}
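    # Illustrative usage sketch (editor's example, not taken from the package;
    # `nlp` is assumed to be a loaded spaCy model):
    #
    #     pipe = TextPipeProcessing(["Mangi dem.", "Dama bëgg dem Ndakaaru."])
    #     pipe.tokenize_text(nlp)
    #     pipe.create_corpus()
    #     pipe.create_frequency()
    #     results, positions = pipe.recuperate_results()
    #     pipe.add_results_to_corpus()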

    def __init__(self, corpus: List[str], name: str = "nlp_pipeline"):
        """Initialize the main attributes.

        Args:
            corpus (List[str]): The list of documents.
            name (str): The name of the pipeline.
        """

        self.corpus = corpus

        self._corpus = None

        self._n_grams = None

        self._old_corpus = None

        self._grams_active = False

        # frequency distributions, filled in by `recuperate_results` and read
        # by `predict_next_word`
        self._bigrams = None

        self._trigrams = None

        self.name = name

    def __enter__(self):

        self.current_pipe = []

        return self

    def __call__(self, method: Callable, get_results: bool = True, *args, **kwargs):
        """Record a method call (and its arguments) in the current pipeline."""
        self.current_pipe.append(
            {"method": method, "args": args, "kwargs": kwargs, "result": get_results}
        )

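    # Illustrative sketch (editor's example): inside the context manager each
    # call is recorded, not executed; `execute_pipeline` replays it later:
    #
    #     with TextPipeProcessing(corpus, name="freq_pipe") as pipe:
    #         pipe(pipe.tokenize_text, True, nlp)
    #         pipe(pipe.create_corpus)
    #         pipe(pipe.create_frequency, False)
    #
    #     results = pipe.execute_pipeline("freq_pipe")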
    def tokenize_text(self, nlp, rm_spaces: bool = True):
        """Tokenize the corpus.

        Args:
            nlp: The spaCy model to use.
            rm_spaces (bool, optional): Whether to remove the spaces. Defaults to True.

        Returns:
            List[List[str]]: One list of tokens per document.
        """
        self._nlp = nlp

        self._tokenizer = lambda texts: tokenization(nlp, texts, rm_spaces)

        self._tokens = self._tokenizer(self.corpus)

        return self._tokens

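    # Editor's note (assumption; `tokenization` is not shown in this diff):
    # judging from the call above and from how `_tokens` is consumed below,
    # it maps raw texts to token lists with the given spaCy model, e.g.:
    #
    #     tokenization(nlp, ["Mangi dem."], True)
    #     # -> [["Mangi", "dem", "."]]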
    def create_corpus(self):
        """Create a flat list of all tokens (duplicates included).

        Returns:
            Tuple[list, nltk.Text]: The list of tokens and the nltk text built from them.
        """
        self._corpus = []

        for document in tqdm(self._tokens):

            self._corpus.extend(document)

        self._corpus_text = nltk.text.Text(self._corpus)

        print(f"Number of words: {len(self._corpus):->16}")
        print(f"Number of unique words: {len(self._corpus_text.vocab()):->16}")

        return self._corpus, self._corpus_text

    def create_n_grams(self, n: int = 2):
        """Create the n-grams of each document.

        Args:
            n (int, optional): The length of a gram. Defaults to 2.

        Returns:
            Tuple[list, nltk.Text]: The list of n-grams and their nltk text format.
        """
        assert n >= 2

        self._n_grams = []

        for document in tqdm(self._tokens):

            n_gram = get_n_grams(document, n)

            self._n_grams.extend(n_gram)

        self._n_grams_text = nltk.text.Text(self._n_grams)

        print(f"Number of {n} grams: {len(self._n_grams):->16}")
        print(f"Number of unique {n} grams: {len(self._n_grams_text.vocab()):->16}")

        return self._n_grams, self._n_grams_text

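    # Editor's note (assumption; `get_n_grams` is not shown in this diff):
    # `recuperate_results` splits n-gram keys on " ", so the helper presumably
    # returns space-joined strings, e.g.:
    #
    #     get_n_grams(["mangi", "dem", "Ndakaaru"], 2)
    #     # -> ["mangi dem", "dem Ndakaaru"]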
    def set_n_grams_as_corpus(self):
        """Use the n-grams as the working corpus.

        Raises:
            AttributeError: `create_n_grams` was not called first.
        """
        if not self._n_grams:

            raise AttributeError(
                "You didn't create the n-grams with the `create_n_grams` method!"
            )

        # save the current corpus before switching to the n-grams
        self._old_corpus = self._corpus

        self._old_corpus_text = self._corpus_text

        self._corpus = self._n_grams

        self._corpus_text = self._n_grams_text

        self._grams_active = True

    def reset_corpus(self):
        """Restore the initial (uni-gram) corpus.

        Raises:
            AttributeError: The corpus was not created yet.
        """
        if not self._old_corpus:

            raise AttributeError(
                "The corpus was not properly created. To create a new corpus from tokens use the `create_corpus` method!"
            )

        self._corpus = self._old_corpus

        self._corpus_text = self._old_corpus_text

        self._grams_active = False

    def create_frequency(self):
        """Create tokens' frequencies from the list of tokens"""
        self._frequency = pd.DataFrame.from_dict(self._corpus_text.vocab(), "index")

        self._frequency.rename({0: "frequency"}, inplace=True, axis=1)

        self._frequency.reset_index(level=0, inplace=True)

        print(self._frequency.head())

    def show_frequency_out_limits(self):
        """Show the frequency box plot and print the frequency fences"""
        fig = px.box(
            data_frame=self._frequency, x="frequency", hover_data=["index", "frequency"]
        )

        # display the box plot
        fig.show()

        self.low, self.high = guess_limitations(self._frequency, "frequency")

        print(f"Low limit: {self.low:->16}")
        print(f"High limit: {self.high:->16}")

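    # Editor's note (assumption): `guess_limitations` lives in
    # `nlp_project.processing.utils`, which is not part of this diff; given
    # the box plot above, it plausibly returns Tukey-style fences, roughly:
    #
    #     def guess_limitations(df, column):
    #         q1, q3 = df[column].quantile([0.25, 0.75])
    #         iqr = q3 - q1
    #         return q1 - 1.5 * iqr, q3 + 1.5 * iqr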
    def show_most_common_words(self, lower_bound: int = 400, n_words: int = 20):
        """Print the most common tokens (these can be n-grams).

        Args:
            lower_bound (int, optional): How many of the most common tokens to collect. Defaults to 400.
            n_words (int, optional): The number of tokens to display. Defaults to 20.
        """
        self._freq_total = Counter(self._corpus_text.vocab())

        self._stopwords_common = list(zip(*self._freq_total.most_common(lower_bound)))[
            0
        ]

        print("Most common words are:")
        print(self._stopwords_common[:n_words])

    def plot_frequency_histogram(self, bottom: int = 8):
        """Plot the histogram of the lowest frequencies.

        Args:
            bottom (int, optional): How many of the lowest distinct frequencies to include. Defaults to 8.
        """
        f_values = self._frequency["frequency"].sort_values().unique()

        bottom_ = self._frequency[self._frequency["frequency"].isin(f_values[:bottom])]

        fig = px.histogram(
            data_frame=bottom_,
            x="frequency",
            title=f"Frequency histogram for the {bottom} lowest frequencies",
            text_auto=True,
            color_discrete_sequence=["indianred"],
        )

        fig.show()

    def show_n_time_frequency_words(
        self, n_time_freq: Union[int, list] = 1, n_words: int = 100
    ):
        """Print the tokens appearing a given number of times (frequency) in the corpus, and their share.

        Args:
            n_time_freq (Union[int, list], optional): The frequency (or frequencies). Defaults to 1.
            n_words (int, optional): The number of words to display. Defaults to 100.
        """
        pd.options.display.max_rows = n_words

        n_time_freq = [n_time_freq] if isinstance(n_time_freq, int) else n_time_freq

        n_time_frequency = self._frequency[
            self._frequency["frequency"].isin(n_time_freq)
        ]

        size = n_time_frequency.shape[0]

        freqs = "/".join(str(freq) for freq in n_time_freq)

        print(
            f"Percentage of words appearing {freqs} times in the dataset: {size / self._frequency.shape[0]:.2%}"
        )

        print(f"Words appearing {freqs} times:")
        print(n_time_frequency.iloc[:n_words, :])

    def delete_n_time_frequency_words(self, n_time_freq: Union[int, list] = 1):
        """Delete the tokens appearing a given number of times in the corpus.

        Args:
            n_time_freq (Union[int, list], optional): The number of times that the tokens appear. Defaults to 1.
        """
        n_time_freq = [n_time_freq] if isinstance(n_time_freq, int) else n_time_freq

        n_time_frequency = self._frequency[
            self._frequency["frequency"].isin(n_time_freq)
        ]

        self._new_frequency = self._frequency.loc[
            ~self._frequency["index"].isin(n_time_frequency["index"].to_list()), :
        ]

        print("The new frequency data frame is stored in the `_new_frequency` attribute.")

        print(f"The number of deleted observations: {n_time_frequency.shape[0]:->16}")

    def remove_words(self, words_to_remove: List[str]):
        """Remove tokens from the frequency data frame.

        Args:
            words_to_remove (List[str]): The list of tokens to remove.
        """
        # fall back to the full frequency table if no deletion happened yet
        self._new_frequency = getattr(self, "_new_frequency", self._frequency).copy()

        self._new_frequency.drop(
            index=self._new_frequency[
                self._new_frequency["index"].isin(words_to_remove)
            ].index,
            inplace=True,
        )

    def recuperate_results(self):
        """Recuperate the results as a frequency distribution over the kept tokens.

        Returns:
            Tuple[nltk.FreqDist, dict]: The frequencies and a mapping from position to token.
        """
        try:
            frequency = self._new_frequency.copy()
        except AttributeError:
            frequency = self._frequency.copy()
        finally:
            print(
                "The recuperate_results method recuperates the last version of the frequency data frame as a FreqDist. Make sure to apply your transformations before calling this method!"
            )

        frequency.set_index("index", inplace=True)

        frequency = frequency.to_dict()

        frequency = frequency["frequency"]

        self._results = nltk.FreqDist(frequency)

        if self._grams_active:

            keys = list(self._results.keys())

            if len(keys[0].split(" ")) == 2:

                self._bigrams = self._results

            elif len(keys[0].split(" ")) == 3:

                self._trigrams = self._results

        self._positions = {
            i: key for i, key in enumerate(self._results)
        }  # positions of tokens begin at 0

        return self._results, self._positions

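    # Illustrative sketch (editor's example, hypothetical tokens and counts)
    # of the returned values:
    #
    #     results, positions = pipe.recuperate_results()
    #     # results   -> nltk.FreqDist({"dem": 42, "ndax": 17, ...})
    #     # positions -> {0: "dem", 1: "ndax", ...}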
    def add_results_to_corpus(self):
        """Rebuild the corpus documents from the final tokens.

        Raises:
            ValueError: Only uni-grams can be added back to the corpus.
        """
        if self._grams_active:

            print("You didn't reset the corpus with the `reset_corpus` method!")

        def clean_text(
            tokens: list, words: Union[nltk.FreqDist, list, set, tuple] = self._results
        ):
            """Clean a given document by keeping only the tokens chosen as representative of the target.

            Args:
                tokens (list): The tokens of the document.
                words (Union[nltk.FreqDist, list, set, tuple]): The tokens that we want to preserve.

            Returns:
                str: The new document.
            """

            if len(list(words.keys())[0].split(" ")) != 1:

                raise ValueError(
                    "Only uni-grams can be provided as results to the data frame text column!"
                )

            # merge hyphenated tokens back into single tokens
            tokens_ = [tokens[0]]

            for i in range(1, len(tokens)):

                if (tokens[i] == "-" and tokens_[-1] != "-") or tokens_[-1][-1] == "-":

                    tokens_[-1] = tokens_[-1] + tokens[i]

                else:

                    tokens_.append(tokens[i])

            # keep only the preserved tokens
            tokens_ = [token for token in tokens_ if token in words]

            return " ".join(tokens_)

        self.corpus = list(map(clean_text, self._tokens))

    def plot_wordcloud(
        self,
        figsize: tuple = (8, 8),
        max_font_size: int = 60,
        max_words: int = 100,
        background_color="white",
    ):
        """Plot a wordcloud of the corpus.

        Args:
            figsize (tuple, optional): The figure size as (width, height). Defaults to (8, 8).
            max_font_size (int, optional): The maximum font size. Defaults to 60.
            max_words (int, optional): The maximum number of words, taken by frequency. Defaults to 100.
            background_color (str, optional): The background color. Defaults to "white".
        """

        wordcloud(
            " ".join(self.corpus),
            figsize=figsize,
            max_font_size=max_font_size,
            max_words=max_words,
            background_color=background_color,
        )

    def predict_next_word(self, text: str):
        """Predict the next word of `text` from the stored bigram and trigram frequencies."""
        if self._bigrams and self._trigrams:

            bigram = " ".join(text.split(" ")[-2:])

            if bigram not in self._bigrams:

                raise KeyError(
                    f"The bigram {bigram} is not identified in the registered bigrams!"
                )

            co_occs = []

            trigrams = []

            for trigram in self._trigrams:

                if trigram.startswith(bigram):

                    freq1 = self._bigrams[bigram]

                    freq2 = self._trigrams[trigram]

                    co_occs.append(freq2 / freq1)

                    trigrams.append(trigram)

            try:

                max_co_occ = np.array(co_occs).argmax()

                max_trigram = trigrams[max_co_occ]

                return max_trigram.split(" ")[-1], co_occs[max_co_occ]

            except ValueError:

                # no registered trigram starts with the bigram

                return "", None

        else:

            raise ValueError(
                "You must create bigrams and trigrams before using them to predict the next word of your text!"
            )

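    # Illustrative sketch (editor's example) of the score used above: for a
    # context ending in "a b", each candidate trigram "a b w" is ranked by
    # count("a b w") / count("a b"), an estimate of P(w | a, b).
    #
    #     pipe.predict_next_word("mangi dem")
    #     # -> ("Ndakaaru", 0.4)   # hypothetical output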
    def display(self, text: str, style="dep"):

        # Create a container object
        doc = self._nlp(text)

        # Render the parse with displaCy
        spacy.displacy.render(doc, style=style)

    def execute_pipeline(self, name: str = "nlp_pipeline"):
        """Execute a stored pipeline.

        Args:
            name (str, optional): The name of the pipeline. Defaults to "nlp_pipeline".

        Raises:
            ValueError: The pipeline name must exist before being recuperated.

        Returns:
            list: The list of results.
        """

        results = []

        # look up the pipeline first, so a KeyError raised by a pipeline
        # method is not mistaken for a missing pipeline name
        try:

            pipeline = self.pipeline[name]

        except KeyError:

            raise ValueError("The pipeline that you specified doesn't exist!")

        for i, pipe in enumerate(pipeline, start=1):

            args = pipe["args"]

            kwargs = pipe["kwargs"]

            method = pipe["method"]

            result = pipe["result"]

            result_ = "True" if result else "False"

            print(f"Method {i}: {method.__name__} -> result = {result_}\n")

            results_ = method(*args, **kwargs)

            print("\n")

            print("#" * 100)

            print("\n")

            if result:

                results.append(results_)

        return results

    def __exit__(self, ctx_ept, ctx_value, ctx_tb):

        self.pipeline[self.name] = self.current_pipe

        print(
            "You can execute the pipeline with the `pipeline_name.execute_pipeline` method! The pipelines are available in the `pipeline` attribute."
        )

        # return a falsy value so any exception propagates
        return False