wolof_translate-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wolof_translate/__init__.py +73 -0
- wolof_translate/data/__init__.py +0 -0
- wolof_translate/data/dataset_v1.py +151 -0
- wolof_translate/data/dataset_v2.py +187 -0
- wolof_translate/data/dataset_v3.py +187 -0
- wolof_translate/data/dataset_v3_2.py +187 -0
- wolof_translate/data/dataset_v4.py +202 -0
- wolof_translate/data/dataset_v5.py +65 -0
- wolof_translate/models/__init__.py +0 -0
- wolof_translate/models/transformers/__init__.py +0 -0
- wolof_translate/models/transformers/main.py +865 -0
- wolof_translate/models/transformers/main_2.py +362 -0
- wolof_translate/models/transformers/optimization.py +41 -0
- wolof_translate/models/transformers/position.py +46 -0
- wolof_translate/models/transformers/size.py +44 -0
- wolof_translate/pipe/__init__.py +1 -0
- wolof_translate/pipe/nlp_pipeline.py +512 -0
- wolof_translate/tokenizers/__init__.py +0 -0
- wolof_translate/trainers/__init__.py +0 -0
- wolof_translate/trainers/transformer_trainer.py +760 -0
- wolof_translate/trainers/transformer_trainer_custom.py +882 -0
- wolof_translate/trainers/transformer_trainer_ml.py +925 -0
- wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
- wolof_translate/utils/__init__.py +1 -0
- wolof_translate/utils/bucket_iterator.py +143 -0
- wolof_translate/utils/database_manager.py +116 -0
- wolof_translate/utils/display_predictions.py +162 -0
- wolof_translate/utils/download_model.py +40 -0
- wolof_translate/utils/evaluate_custom.py +147 -0
- wolof_translate/utils/evaluation.py +74 -0
- wolof_translate/utils/extract_new_sentences.py +810 -0
- wolof_translate/utils/extract_poems.py +60 -0
- wolof_translate/utils/extract_sentences.py +562 -0
- wolof_translate/utils/improvements/__init__.py +0 -0
- wolof_translate/utils/improvements/end_marks.py +45 -0
- wolof_translate/utils/recuperate_datasets.py +94 -0
- wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
- wolof_translate/utils/send_model.py +26 -0
- wolof_translate/utils/sent_corrections.py +169 -0
- wolof_translate/utils/sent_transformers.py +27 -0
- wolof_translate/utils/sent_unification.py +97 -0
- wolof_translate/utils/split_with_valid.py +72 -0
- wolof_translate/utils/tokenize_text.py +46 -0
- wolof_translate/utils/training.py +213 -0
- wolof_translate/utils/trunc_hg_training.py +196 -0
- wolof_translate-0.0.1.dist-info/METADATA +31 -0
- wolof_translate-0.0.1.dist-info/RECORD +49 -0
- wolof_translate-0.0.1.dist-info/WHEEL +5 -0
- wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
wolof_translate/utils/extract_poems.py
@@ -0,0 +1,60 @@
+from typing import *
+
+
+def extract_sentences(poems: List[List[str]], remove: list = ["\n"], sep: str = "_"):
+
+    new_poems = {1: []}
+
+    for poem in poems:
+
+        new_poem = {1: []}
+
+        i = 1
+
+        j = 0
+
+        for line in poem:
+
+            for mark in remove:
+
+                line = line.strip(mark).strip()
+
+            if line == sep:
+
+                i += 1
+
+                j = 0
+
+                new_poem[i] = []
+
+            if line != "" and line != sep:
+
+                if i > 1:
+
+                    try:
+
+                        line = (
+                            line[0].upper() + line[1:]
+                            if new_poem[i - 1][j][0].isupper()
+                            else line[0].lower() + line[1:]
+                        )
+
+                    except IndexError:
+
+                        raise IndexError(
+                            "The number of lines in the different corpora are not the sames !"
+                        )
+
+                new_poem[i].append(line.strip())
+
+                j += 1
+
+        for key in new_poem.keys():
+
+            if not key in new_poems:
+
+                new_poems[key] = []
+
+        new_poems = {k: new_poems[k] + v for k, v in new_poem.items()}
+
+    return new_poems
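This hunk's 60 added lines match wolof_translate/utils/extract_poems.py in the listing above. extract_sentences walks each poem line by line, strips the marks given in remove, starts a new section whenever it meets the sep marker ("_" by default), harmonizes the casing of a line's first letter with the corresponding line of the previous section, and merges everything into a single dict keyed by section number. A minimal usage sketch, not part of the package (the poem lines are hypothetical and the import path is inferred from the listing):

from wolof_translate.utils.extract_poems import extract_sentences

# Two hypothetical aligned "poems"; "_" separates their sections.
french = ["Bonjour le monde\n", "_", "Bonne nuit\n"]
wolof = ["Salaam aleekum\n", "_", "Fanaanal jàmm\n"]

print(extract_sentences([french, wolof]))
# {1: ['Bonjour le monde', 'Salaam aleekum'],
#  2: ['Bonne nuit', 'Fanaanal jàmm']}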
wolof_translate/utils/extract_sentences.py
@@ -0,0 +1,562 @@
+from nlp_project import *
+import pickle
+
+
+class LengthError(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
+
+
+class CommandError(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
+
+
+class ExtractRelatedSentences:
+
+    nb_process = 1
+
+    indices = {}
+
+    def __init__(
+        self,
+        corpora: pd.DataFrame,
+        corpus_1: str = "french_corpus",
+        corpus_2: str = "wolof_corpus",
+    ):
+
+        self.corpora = corpora
+
+        self.corpus_1 = corpus_1
+
+        self.corpus_2 = corpus_2
+
+        self.length = corpora.shape[0]
+
+        self.sentences = {}
+
+        self.passed = {}
+
+    @classmethod
+    def reload(cls, number: int = 1):
+
+        cls.nb_process = number
+
+    @classmethod
+    def store_indices(cls, nb_paragraph, i: int, j: int):
+
+        cls.indices[nb_paragraph] = {"i": i, "j": j}
+
+    def increment(self, number: int = 1):
+
+        ExtractRelatedSentences.nb_process += number
+
+        if ExtractRelatedSentences.nb_process > self.length:
+            pass
+            # raise ValueError("The last paragraph is reached!")
+
+    def decrement(self, number: int = 1):
+
+        ExtractRelatedSentences.nb_process -= number
+
+        if ExtractRelatedSentences.nb_process < 1:
+
+            ExtractRelatedSentences.nb_process = 1
+
+    def add_sentences(self, nb_paragraph: int, sentences: dict):
+
+        self.sentences[nb_paragraph] = sentences
+
+        if len(sentences["1"]) != len(sentences["2"]):
+
+            raise LengthError(
+                "The number of sentences in the two corpora must be equal!"
+            )
+
+    def add_passed(self, nb_paragraph: int, sentences: dict):
+
+        self.sentences[nb_paragraph] = sentences
+
+    def clear_sentences(self, nb_paragraph: int):
+
+        sentences = self.sentences[nb_paragraph]
+
+        clear = input(
+            f"Are you sure you want to remove the following sentences!\
+\n\nOn {self.corpus_1}:\n{sentences['1']}\n\nOn {self.corpus_2}:\n{sentences['2']}\nYes(y), No(n) :"
+        )
+
+        if clear == "y":
+
+            del sentences[nb_paragraph]
+
+            print(f"Sentences at {nb_paragraph} was cleared!")
+
+        elif clear == "n":
+
+            print(f"Process aborted!")
+
+        else:
+
+            raise CommandError(f"You cannot take the command {clear}!")
+
+    def get_sentences(self, nb_paragraph: int):
+
+        return self.sentences[nb_paragraph]
+
+    def unify(self, sentences: list, sentence: str, unification_marks: list):
+
+        for mark in unification_marks:
+
+            begin_mark = False
+
+            end_mark = False
+
+            for letter in sentences[-1]:
+
+                if letter == mark[1]:
+
+                    begin_mark = False
+
+                elif letter == mark[0]:
+
+                    begin_mark = True
+
+            for letter in sentence:
+
+                if letter == mark[1]:
+
+                    end_mark = True
+
+                    break
+
+                else:
+
+                    break
+
+            if sentence != "" and sentence[0].islower():
+
+                return True, " "
+
+            if end_mark or begin_mark:
+
+                space = " " if mark[2] else ""
+
+                return True, space
+
+        return False, " "
+
+    def split(self, paragraph: str, ending_marks: list, unification_marks: list):
+
+        prob_sentences = paragraph.strip()
+
+        for mark in ending_marks:
+
+            if isinstance(prob_sentences, list):
+
+                new_sentences = prob_sentences.copy()
+
+                counter = 0
+
+                for s in prob_sentences:
+
+                    if mark in s:
+
+                        splits = new_sentences[counter].split(mark)
+
+                        sentences = [
+                            sentence.strip() + mark for sentence in splits[:-1]
+                        ] + splits[-1:]
+
+                        new_sentences = (
+                            new_sentences[:counter]
+                            + sentences
+                            + new_sentences[counter + 1 :]
+                        )
+
+                        counter += len(sentences) - 1
+
+                    else:
+
+                        counter += 1
+
+                prob_sentences = new_sentences
+
+            else:
+
+                if mark in prob_sentences:
+
+                    splits = prob_sentences.split(mark)
+
+                    prob_sentences = [
+                        sentence.strip() + mark for sentence in splits[:-1]
+                    ] + splits[-1:]
+
+        new_sentences = []
+
+        counter = 0
+
+        for s in prob_sentences:
+
+            unify, space = False, ""
+
+            if counter != 0:
+
+                unify, space = self.unify(new_sentences, s, unification_marks)
+
+            if s != "":
+
+                if not unify:
+
+                    new_sentences.append(s)
+
+                    counter += 1
+
+                else:
+
+                    new_sentences[-1] = new_sentences[-1] + space + s
+
+        prob_sentences = new_sentences
+
+        return prob_sentences
+
+    def __save(self, storage: str = "data/extractions/new_data/sent_extraction.txt"):
+
+        with open(storage, "wb") as f:
+
+            checkpoints = {
+                "indices": ExtractRelatedSentences.indices,
+                "nb_process": ExtractRelatedSentences.nb_process,
+                "sentences": self.sentences,
+                "passed": self.passed,
+            }
+
+            pickler = pickle.Pickler(f)
+
+            pickler.dump(checkpoints)
+
+    def save_data_frame(
+        self,
+        storage: str = "data/extractions/new_data/sent_extraction.txt",
+        csv_file_path: str = "data/extractions/new_data/sent_extraction.csv",
+        **kwargs,
+    ):
+
+        self.load(storage)
+
+        data_frame = pd.DataFrame.from_dict(self.sentences[1], orient="columns")
+
+        for i in range(2, self.length + 1):
+
+            data_frame = pd.concat(
+                (
+                    data_frame,
+                    pd.DataFrame.from_dict(self.sentences[i], orient="columns"),
+                )
+            )
+
+        data_frame.rename(
+            columns={"1": self.corpus_1, "2": self.corpus_2}, inplace=True
+        )
+
+        data_frame.to_csv(csv_file_path, index=False, **kwargs)
+
+    def load(self, storage: str = "data/extractions/new_data/sent_extraction.txt"):
+
+        with open(storage, "rb") as f:
+
+            depickler = pickle.Unpickler(f)
+
+            checkpoints = depickler.load()
+
+            ExtractRelatedSentences.indices = checkpoints["indices"]
+
+            ExtractRelatedSentences.nb_process = checkpoints["nb_process"]
+
+            self.sentences = checkpoints["sentences"]
+
+            self.passed = checkpoints["passed"]
+
+    def preprocess(
+        self,
+        number: Union[int, None] = None,
+        ending_marks: list = [".", " ?", " !"],
+        unification_marks: List[Tuple] = [
+            ("«", "»", True),
+            ("(", ")", True),
+            ("“", "”", True),
+        ],
+        cr: str = "r",
+        cm1: str = "f",
+        cm2: str = "j",
+        cm3: str = "l",
+        cmp1: str = "y",
+        cmp2: str = "i",
+        cmp3: str = "p",
+        q: str = "q",
+        start_at_last_indices: bool = False,
+        i: int = 0,
+        j: int = 0,
+        auto_save: bool = True,
+        storage: str = "data/extractions/new_data/sent_extraction.txt",
+    ):
+
+        line = number if not number is None else self.nb_process
+
+        process_again = ""
+
+        try:
+            self.load(storage=storage)
+        except:
+            pass
+
+        if line in set(self.sentences):
+
+            process_again = input(
+                f"You have already process the paragraph at line {line}.\nDo you want to modify from the processed sentences ? Yes(y), No(n):"
+            )
+
+        print(f"Preprocessing of paragraph at line {line}")
+
+        if process_again == "n" or process_again == "":
+
+            paragraph1 = str(self.corpora.loc[line - 1, self.corpus_1])
+
+            paragraph2 = str(self.corpora.loc[line - 1, self.corpus_2])
+
+            # let us separate the paragraphs by ending marks
+
+            prob_sentences1 = self.split(paragraph1, ending_marks, unification_marks)
+
+            prob_sentences2 = self.split(paragraph2, ending_marks, unification_marks)
+
+        elif process_again == "y":
+
+            prob_sentences1 = self.sentences[line]["1"]
+
+            prob_sentences2 = self.sentences[line]["2"]
+
+        else:
+
+            raise CommandError(f"You cannot take the command {process_again}!")
+
+        print("\n-----------\n-----------\n")
+
+        print("Do you want to process the following sentences:\n")
+
+        print(f"On {self.corpus_1}: ")
+
+        [print(f"{i}: {sentence}") for i, sentence in enumerate(prob_sentences1)]
+
+        print(f"\nOn {self.corpus_2}: ")
+
+        [print(f"{i}: {sentence}") for i, sentence in enumerate(prob_sentences2)]
+
+        print("\n-----------")
+
+        process = input("Yes(y), Accept all (a) or No(n): ")
+
+        cm = ""
+
+        if process == "y":
+
+            sentences = {"1": [], "2": []}
+
+            passed = {"1": [], "2": []}
+
+            last_sentences = {"1": "", "2": ""}
+
+            if start_at_last_indices and line in set(self.indices):
+
+                i = self.indices[line]["i"]
+
+                j = self.indices[line]["j"]
+
+            while i < len(prob_sentences1) and j < len(prob_sentences2):
+
+                self.store_indices(line, i, j)
+
+                sentence1 = sentence1_ = prob_sentences1[i]
+
+                sentence2 = sentence2_ = prob_sentences2[j]
+
+                if last_sentences["1"] != "":
+
+                    sentence1_ = last_sentences["1"].strip() + " " + sentence1
+
+                if last_sentences["2"] != "":
+
+                    sentence2_ = last_sentences["2"] + " " + sentence2
+
+                print(
+                    f"\nThe current sentences are:\n{self.corpus_1} (index = {i}) : {sentence1_}\n{self.corpus_2} (index = {j}) : {sentence2_}"
+                )
+
+                cm = input(
+                    f"Are they related together ?\n(index = {i}) : {sentence1_}\n{self.corpus_2} (index = {j}) : {sentence2_}\nQuit ({q}), Related ({cr}), Sentence 1 is uncompleted ({cm1}), Sentence 2 is uncompleted ({cm2}), The two sentences are uncompleted ({cm3}),\n Pass sentence 1 ({cmp1}), Pass sentence 2 ({cmp2}), Pass the two sentences ({cmp3}) :"
+                )
+
+                if cm == cr:
+
+                    # clear the last sentences
+                    last_sentences = {"1": "", "2": ""}
+
+                    # add the sentences to the list of related sentences
+                    sentences["1"].append(sentence1_.strip())
+
+                    sentences["2"].append(sentence2_.strip())
+
+                    # Pass to the next sentences
+                    i += 1
+
+                    j += 1
+
+                elif cm == cm1:
+
+                    # The first sentence is added to the last sentence 1
+                    last_sentences["1"] += " " + sentence1
+
+                    # Pass to the next sentence at corpus 1
+                    i += 1
+
+                elif cm == cm2:
+
+                    # The second sentence is added to the last sentence 2
+                    last_sentences["2"] += " " + sentence2
+
+                    # Pass to the next sentence at corpus 2
+                    j += 1
+
+                elif cm == cm3:
+
+                    # The two sentences are added to the last sentences
+                    last_sentences["1"] += " " + sentence1
+
+                    last_sentences["2"] += " " + sentence2
+
+                    # Pass to the next sentences
+                    i += 1
+
+                    j += 1
+
+                elif cm == cmp1:
+
+                    # Clear the last sentence at corpus 1
+                    last_sentences["1"] = ""
+
+                    # Add the sentence 1 to the passed sentences at corpus 1
+                    passed["1"].append(sentence1_)
+
+                    # Pass to the next sentence at corpus 1
+                    i += 1
+
+                elif cm == cmp2:
+
+                    # Clear the last sentence at corpus 2
+                    last_sentences["2"] = ""
+
+                    # Add the sentence 2 to the passed sentences at corpus 2
+                    passed["2"].append(sentence2_)
+
+                    # Pass to the next sentence at corpus 2
+                    j += 1
+
+                elif cm == cmp3:
+
+                    # Clear the last sentences
+                    last_sentences = {"1": "", "2": ""}
+
+                    # Add the two sentences to the passed sentences
+                    passed["1"].append(sentence1_)
+
+                    passed["2"].append(sentence2_)
+
+                    # Pass to the next sentences
+                    i += 1
+
+                    j += 1
+
+                elif cm == q:
+
+                    break
+
+                else:
+
+                    print(f"You cannot take the command {cm} ! Please retry again !")
+
+                    print("\n-----------\n")
+
+                    continue
+
+                print("\nYou have stored the following sentences for the moment :\n")
+
+                print(f"On {self.corpus_1} (length = {len(sentences['1'])}): ")
+
+                print("\n".join(sentences["1"]))
+
+                print(f"\nOn {self.corpus_2} (length = {len(sentences['2'])}): ")
+
+                print("\n".join(sentences["2"]))
+
+                print("\n-----------\n")
+
+                print("You have passed the following sentences :\n")
+
+                print(f"On {self.corpus_1} (length = {len(passed['1'])}): ")
+
+                print("\n".join(passed["1"]))
+
+                print(f"\nOn {self.corpus_2} (length = {len(passed['2'])}): ")
+
+                print("\n".join(passed["2"]))
+
+                print("\n------------------------------------")
+
+                # add the sentences
+                self.add_sentences(line, sentences)
+
+                # add the passed sentences
+                self.add_passed(line, passed)
+
+                if auto_save:
+                    # save the checkpoints
+                    self.__save(storage)
+
+            print("\nFinished!")
+
+            # incrementing the number of processes
+            self.increment()
+
+            # add the passed sentences
+            self.add_passed(line, passed)
+
+            # add the sentences
+            self.add_sentences(line, sentences)
+
+            if auto_save:
+                # save the checkpoints
+                self.__save(storage)
+
+        elif process == "a":
+
+            sentences = {"1": prob_sentences1, "2": prob_sentences2}
+
+            print("\nFinished!")
+
+            # incrementing the number of processes
+            self.increment()
+
+            # add the sentences
+            self.add_sentences(line, sentences)
+
+            if auto_save:
+                # save the checkpoints
+                self.__save(storage)
+
+        elif process == "n" or cm == "q":
+
+            print(f"Process aborted!")
+
+        else:
+
+            raise CommandError(f"You cannot take the command {process}!")
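This hunk's 562 added lines match wolof_translate/utils/extract_sentences.py in the listing. ExtractRelatedSentences is an interactive aligner driven by input(): split() cuts a paragraph into candidate sentences at the ending marks, unify() glues a fragment back onto the previous sentence when a quote or parenthesis pair is left open or the fragment starts in lowercase, and preprocess() walks one DataFrame row pair by pair, asking whether the French and Wolof sentences are related, merging or passing them on command, and pickling checkpoints as it goes. A sketch of one session, not part of the package (the DataFrame content is hypothetical, the import path is inferred from the listing, and the column names are the class defaults):

from wolof_translate.utils.extract_sentences import ExtractRelatedSentences
import pandas as pd

corpora = pd.DataFrame(
    {
        "french_corpus": ["Bonjour. Comment vas-tu ?"],
        "wolof_corpus": ["Salaam aleekum. Na nga def ?"],
    }
)

extractor = ExtractRelatedSentences(corpora)

# Answer "a" to accept every automatically split pair, "r" to confirm one
# pair at a time, "f"/"j"/"l" to merge an uncompleted sentence into the
# next one, "y"/"i"/"p" to pass sentences, and "q" to quit; checkpoints
# are pickled to the default storage path.
extractor.preprocess(number=1)

# Reload the pickled checkpoint and export the confirmed pairs to CSV.
extractor.save_data_frame()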
wolof_translate/utils/improvements/__init__.py
File without changes
wolof_translate/utils/improvements/end_marks.py
@@ -0,0 +1,45 @@
+import pandas as pd
+from typing import *
+
+
+def add_end_mark(
+    sentences: Union[list, str],
+    end_mark: str = ".",
+    end_mark_to_remove: Union[str, None] = None,
+    replace: bool = False,
+    poss_end_marks: list = ["!", "?", ".", "..."],
+):
+
+    if isinstance(sentences, str):
+        sentences = [sentences]
+
+    if replace and end_mark_to_remove is None:
+
+        raise ValueError(
+            "You must provide a end mark to remove if you want to make replacement !"
+        )
+
+    new_sentences = []
+
+    # if replace is chosen we will replace the end to remove by the end mark if not only add end mark to each sentence and remove the end mark to remove
+    for sentence in sentences:
+
+        if replace:
+
+            if sentence[-1] == end_mark_to_remove:
+
+                sentence = sentence[:-1].strip() + end_mark
+
+        else:
+
+            if not end_mark_to_remove is None and sentence[-1] == end_mark_to_remove:
+
+                sentence = sentence[:-1]
+
+            if not sentence[-1] in poss_end_marks:
+
+                sentence = sentence.strip() + end_mark
+
+        new_sentences.append(sentence)
+
+    return new_sentences
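add_end_mark (the 45 added lines of improvements/end_marks.py per the listing) normalizes sentence-final punctuation: with replace=True it swaps a trailing end_mark_to_remove for end_mark; otherwise it drops a trailing end_mark_to_remove when present and appends end_mark to any sentence that ends with none of poss_end_marks. A quick sketch, not part of the package (the sentences are hypothetical and the import path is inferred from the listing):

from wolof_translate.utils.improvements.end_marks import add_end_mark

print(add_end_mark(["Na nga def", "Maa ngi fi rekk !", "Jërëjëf ;"], end_mark_to_remove=";"))
# ['Na nga def.', 'Maa ngi fi rekk !', 'Jërëjëf.']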