wolof-translate 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wolof_translate/__init__.py +73 -0
- wolof_translate/data/__init__.py +0 -0
- wolof_translate/data/dataset_v1.py +151 -0
- wolof_translate/data/dataset_v2.py +187 -0
- wolof_translate/data/dataset_v3.py +187 -0
- wolof_translate/data/dataset_v3_2.py +187 -0
- wolof_translate/data/dataset_v4.py +202 -0
- wolof_translate/data/dataset_v5.py +65 -0
- wolof_translate/models/__init__.py +0 -0
- wolof_translate/models/transformers/__init__.py +0 -0
- wolof_translate/models/transformers/main.py +865 -0
- wolof_translate/models/transformers/main_2.py +362 -0
- wolof_translate/models/transformers/optimization.py +41 -0
- wolof_translate/models/transformers/position.py +46 -0
- wolof_translate/models/transformers/size.py +44 -0
- wolof_translate/pipe/__init__.py +1 -0
- wolof_translate/pipe/nlp_pipeline.py +512 -0
- wolof_translate/tokenizers/__init__.py +0 -0
- wolof_translate/trainers/__init__.py +0 -0
- wolof_translate/trainers/transformer_trainer.py +760 -0
- wolof_translate/trainers/transformer_trainer_custom.py +882 -0
- wolof_translate/trainers/transformer_trainer_ml.py +925 -0
- wolof_translate/trainers/transformer_trainer_ml_.py +1042 -0
- wolof_translate/utils/__init__.py +1 -0
- wolof_translate/utils/bucket_iterator.py +143 -0
- wolof_translate/utils/database_manager.py +116 -0
- wolof_translate/utils/display_predictions.py +162 -0
- wolof_translate/utils/download_model.py +40 -0
- wolof_translate/utils/evaluate_custom.py +147 -0
- wolof_translate/utils/evaluation.py +74 -0
- wolof_translate/utils/extract_new_sentences.py +810 -0
- wolof_translate/utils/extract_poems.py +60 -0
- wolof_translate/utils/extract_sentences.py +562 -0
- wolof_translate/utils/improvements/__init__.py +0 -0
- wolof_translate/utils/improvements/end_marks.py +45 -0
- wolof_translate/utils/recuperate_datasets.py +94 -0
- wolof_translate/utils/recuperate_datasets_trunc.py +85 -0
- wolof_translate/utils/send_model.py +26 -0
- wolof_translate/utils/sent_corrections.py +169 -0
- wolof_translate/utils/sent_transformers.py +27 -0
- wolof_translate/utils/sent_unification.py +97 -0
- wolof_translate/utils/split_with_valid.py +72 -0
- wolof_translate/utils/tokenize_text.py +46 -0
- wolof_translate/utils/training.py +213 -0
- wolof_translate/utils/trunc_hg_training.py +196 -0
- wolof_translate-0.0.1.dist-info/METADATA +31 -0
- wolof_translate-0.0.1.dist-info/RECORD +49 -0
- wolof_translate-0.0.1.dist-info/WHEEL +5 -0
- wolof_translate-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,810 @@
|
|
|
1
|
+
from typing import *
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import pickle
|
|
4
|
+
import re
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NewSentenceExtraction:
    """Semi-automatic extraction of aligned sentence pairs from a bilingual text.

    The raw ``text`` contains pairs of translated sentences joined by a
    separator character (``sent_sep``, ``':'`` by default).  ``get_groups``
    scans the text around each separator to collect candidate groups
    (``"left : right"`` strings); the user then refines and validates them
    interactively (console prompts + CSV round trips) with
    ``replace_groups`` and ``extract_sentences``.

    Progress is checkpointed inside ``save_directory``:

    * ``groups.csv``        -- candidate groups found so far
    * ``extractions.csv``   -- validated sentence pairs
    * ``<checkpoint_name>`` -- pickled state (current group index)

    so an interrupted session can be resumed with :meth:`load`.
    """

    def __init__(
        self,
        text: Union[str, None] = None,
        sent_sep: str = ":",
        corpus_1: str = "wolof",
        corpus_2: str = "french",
        save_directory: str = "data/additional_documents/diagne_sentences/",
        checkpoint_name: str = "new_sentences",
    ):
        """Store the configuration; no file is touched at construction time.

        Args:
            text: raw bilingual text to mine (may stay ``None`` and be set
                later, but :meth:`get_groups` requires it).
            sent_sep: character separating the two halves of a pair.
            corpus_1: name of the left-hand corpus (also a dict key).
            corpus_2: name of the right-hand corpus (also a dict key).
            save_directory: directory receiving the checkpoint files.
                NOTE(review): assumed to exist already -- saving does not
                create it.
            checkpoint_name: file name of the pickled state.
        """
        self.text = text
        self.corpus_1 = corpus_1
        self.corpus_2 = corpus_2
        self.sep = sent_sep
        # candidate "left <sep> right" strings collected so far
        self.groups = []
        # index of the next group to process (checkpointed between sessions)
        self.index = 0
        self.save_directory = save_directory
        self.checkpoint = checkpoint_name
        # validated sentence pairs, one parallel list per corpus
        self.extractions = {corpus_1: [], corpus_2: []}

    def __save(self):
        """Persist groups and extractions as CSV, and the index as a pickle."""
        checkpoints = {
            "index": self.index,
        }

        pd.DataFrame({"groups": self.groups}).to_csv(
            os.path.join(self.save_directory, "groups.csv"), index=False
        )

        pd.DataFrame(self.extractions).to_csv(
            os.path.join(self.save_directory, "extractions.csv"), index=False
        )

        with open(os.path.join(self.save_directory, self.checkpoint), "wb") as f:
            pickle.Pickler(f).dump(checkpoints)

    def sep_with_mark(self, group: str, mark: Union[str, None] = None):
        """Hook for subclasses: split ``group`` on a given end mark."""
        raise NotImplementedError

    def load(self):
        """Restore groups, extractions and current index from ``save_directory``."""
        with open(os.path.join(self.save_directory, self.checkpoint), "rb") as f:
            checkpoints = pickle.Unpickler(f).load()

        try:
            self.extractions = pd.read_csv(
                os.path.join(self.save_directory, "extractions.csv")
            ).to_dict("list")
        except Exception:
            # best effort: extractions.csv may not have been written yet
            pass

        self.groups = pd.read_csv(os.path.join(self.save_directory, "groups.csv"))[
            "groups"
        ].to_list()

        self.index = checkpoints["index"]

    def add_groups(self, new_groups: list):
        """Append ``new_groups`` to the current groups and checkpoint."""
        self.groups += new_groups
        self.__save()

    def get_groups(
        self,
        stop_criterions: Union[list, None] = None,
        comparisons: Union[list, None] = None,
    ):
        """Scan ``self.text`` and collect candidate groups around each separator.

        For every occurrence of ``self.sep`` the characters on each side are
        accumulated until one of the ``stop_criterions`` strings (or an edge
        of the text) is met; the two halves are joined back into a
        ``"left <sep> right"`` group.  Groups already present in
        ``comparisons`` or in ``self.groups`` are skipped.  The state is
        checkpointed after each separator is processed.

        Args:
            stop_criterions: strings delimiting a sentence half
                (default ``[" ", "\\n"]``).
            comparisons: containers of already-recuperated groups to skip.
        """
        # avoid mutable default arguments
        if stop_criterions is None:
            stop_criterions = [" ", "\n"]
        if comparisons is None:
            comparisons = []

        assert self.text is not None

        def hits_left(pos: int) -> bool:
            # True when a stop string ends exactly at position ``pos``
            return any(
                self.text[pos - len(c) + 1 : pos + 1] == c for c in stop_criterions
            )

        def hits_right(pos: int) -> bool:
            # True when a stop string starts exactly at position ``pos``
            return any(self.text[pos : pos + len(c)] == c for c in stop_criterions)

        i = 0
        a = 0
        g = 1

        while i < len(self.text):
            letter = self.text[i]

            if letter == self.sep:
                print(f"Extraction of group number {g}\n")

                b = i - 1  # index of letters before the current letter
                a = i + 1  # index of letters after the current letter

                corpus_1_s = []  # letters of the left sentence
                corpus_2_s = []  # letters of the right sentence

                # walk left until a stop criterion; the ``b >= 0`` guard
                # fixes a wrap-around to the end of the string through
                # negative indexing when no criterion precedes the separator
                stop = hits_left(b)
                while not stop and b >= 0:
                    corpus_1_s.append(self.text[b])
                    b -= 1
                    stop = hits_left(b)

                # walk right until a stop criterion; the bound guard fixes
                # an IndexError when the text ends without a criterion
                stop = hits_right(a)
                while not stop and a < len(self.text):
                    corpus_2_s.append(self.text[a])
                    a += 1
                    stop = hits_right(a)

                # the left half was accumulated backwards
                corpus_1_s.reverse()

                current_sentence = (
                    "".join(corpus_1_s).strip()
                    + f" {self.sep} "
                    + "".join(corpus_2_s).strip()
                )

                # keep only groups whose both halves are non empty
                # (BUG FIX: the right-hand side is now stripped too,
                # consistently with the left-hand side)
                if (
                    "".join(corpus_1_s).strip() != ""
                    and "".join(corpus_2_s).strip() != ""
                ):
                    # verify that it was not already manually recuperated
                    not_recuperated = True
                    for comparison in comparisons:
                        if current_sentence in comparison:
                            not_recuperated = False
                    # verify that it is not already in the extracted groups
                    for group in self.groups:
                        if current_sentence in group:
                            not_recuperated = False
                    if not_recuperated:
                        self.groups.append(current_sentence.strip())

                g += 1

                print("Successfully extracted !!\n")
                print("-----------------\n")

                # resume scanning right after the group we just consumed
                i = a - 1
                self.__save()

            i += 1

    def replace_groups(
        self,
        re_match: str,
        delete_re: Union[str, None] = None,
        n_replace_max: int = 1,
        load: bool = True,
        save: bool = False,
        manual_replace: bool = False,
        csv_file: str = "founded.csv",
        force_replace: bool = False,
    ):
        """Interactively modify the groups matching a regular expression.

        Args:
            re_match: regex selecting the groups to modify.
            delete_re: regex whose matches are deletion candidates inside
                each selected group (the user may override it at the prompt).
            n_replace_max: maximum number of occurrences removed per match.
            load: reload the checkpointed state first.
            save: offer to checkpoint the modified groups at the end.
            manual_replace: edit the groups by hand through ``csv_file``.
            csv_file: scratch CSV used for manual edition.
            force_replace: apply deletions without asking.

        Returns:
            dict with the matched groups (``founded``), the replacement
            mapping (``result``) and the ``replaced`` / ``not_replaced``
            sets.
        """
        # we load the data
        if load:
            self.load()

        # find the groups matching the match regex
        founded = [
            (i, self.groups[i])
            for i in range(len(self.groups))
            if re.match(re_match, self.groups[i])
        ]

        print(
            f"Found groups matching the regular expression {re_match} are the followings:\n"
        )
        for found in founded:
            print(f"- {found[1]}")

        print("\n----------------------\n")

        # if regex for deletion are provided we replace those that will be
        # found with a max number of replace
        not_replaced = set()
        replaced = set()
        result = {}

        delete_re_ = input(
            "Do you want to change the deletion' regex expression -> provide one if yes or give empty string ('') if not : "
        )
        if delete_re_ != "":
            delete_re = delete_re_

        if delete_re is not None or manual_replace:
            for i in range(len(founded)):
                f = founded[i][1]
                index = founded[i][0]

                m_replace = "n"

                if not force_replace and manual_replace:
                    print(f"You will modify the following group:\n {f}")
                    m_replace = input(
                        f"\nDo you want to make a manual replacement of the group {f} -> Yes(y) or No(n). If you want to quit, press q!"
                    )
                    if m_replace == "q":
                        break
                    while m_replace not in ["y", "n"]:
                        # BUG FIX: the retry used to assign ``replace_r``,
                        # leaving ``m_replace`` unchanged (infinite loop)
                        m_replace = input(
                            "You must provide a response between Yes(y), No(n)!"
                        )

                if m_replace != "n":
                    print(
                        f"The manual modification of the group\n {f}\n is done in the following file: {csv_file}\n!If you want to provide multiple new groups please make them in different lines"
                    )
                    finish = "n"
                    pd.DataFrame({"to_modify": [f]}).to_csv(csv_file, index=False)
                    while finish == "n":
                        finish = input(
                            "Did you finish to replace -> No(n) if you didn't finish yet, click any another key if Yes(y) : "
                        )
                    f = pd.read_csv(csv_file)["to_modify"].to_list()
                    print("\n--------\n")

                if delete_re is not None and m_replace in ["n", ""]:
                    to_replace = set(re.findall(delete_re, f))
                    replace_r = None
                    for r in to_replace:
                        if force_replace:
                            f = f.replace(r, "", n_replace_max)
                            replaced.add(f)
                        else:
                            replace_r = input(
                                f"Do you want to replace the {r} string in the group:\n {f} ? Yes(y) or No(n). If you want to quit, press q!"
                            )
                            # BUG FIX: the quit test used the stale
                            # ``m_replace`` value, so pressing q here was
                            # never honored
                            if replace_r == "q":
                                break
                            while replace_r not in ["y", "n"]:
                                replace_r = input(
                                    "You must provide a response between Yes(y) and No(n)!"
                                )
                            if replace_r == "y":
                                f = f.replace(r, "", n_replace_max)
                                replaced.add(f)
                            else:
                                not_replaced.add(f)
                    if replace_r is not None and replace_r == "q":
                        break

                # normalize to a list of stripped groups before splicing
                if isinstance(f, str):
                    f = [f.strip()]
                else:
                    f = [f_.strip() for f_ in f]

                try:
                    self.groups = self.groups[:index] + f + self.groups[index + 1 :]
                except IndexError:
                    self.groups = self.groups[:index] + f

                # shift the indices of the remaining matches when the group
                # was replaced by several new ones
                if len(f) > 1 and i != len(founded) - 1:
                    for j in range(i + 1, len(founded)):
                        founded[j] = (founded[j][0] + len(f) - 1, founded[j][1])

                result[index] = f

        if save:
            print("Final result:")
            for _, v in result.items():
                print(v)

            save_result = input("Do you want to save the result ? Yes(y) or No(n)")
            while save_result not in ["y", "n"]:
                # BUG FIX: the retry used to assign ``replace_r``, leaving
                # ``save_result`` unchanged (infinite loop)
                save_result = input(
                    "You must provide a response between Yes(y) or No(n) !"
                )
            if save_result == "y":
                self.__save()

        return {
            "founded": founded,
            "result": result,
            "replaced": replaced,
            "not_replaced": not_replaced,
        }

    def extraction_commands(
        self,
        add_end_mark_cmd: str = "a",
        pass_cmd: str = "p",
        add_end_mark_on_all: str = "l",
        add_upper_cmd: str = "u",
        add_upper_on_all: str = "o",
        sep_cmd: str = "_",
        quit_cmd: str = "q",
    ):
        """Prompt the user for a batch command and validate it.

        Returns:
            ``(cms, error)`` where ``cms`` is the raw input split on
            ``sep_cmd`` and ``error`` tells the caller to prompt again.
        """
        # recuperate the current command
        cm = input(
            f"Choose one of the following commands: \n- {add_end_mark_cmd}+group_nb1,group_nb2:mark|group_nb3,group_nb4:mark|...(or group_nb1-group_nbn:mark) : To add end mark on specific groups\
            \n- {add_end_mark_on_all}+mark : To add end mark of all groups, \n- {add_upper_cmd}+group_nb1,group_nb2,group_nb3,group_nb4,...(or group_nb1-group_nbn) : To uppercase the first letter of specific groups\
            \n- {add_upper_on_all} : To uppercase the first letter of all the groups\
            \n- {pass_cmd} : To accept all of the groups\
            \n- {quit_cmd} : To stop the process\
            \n- You can combine all two commands by underscore {sep_cmd} excepted for the two last commands !"
        )

        cms = cm.split(sep_cmd)
        error = False

        end_mark_cmds = [add_end_mark_cmd, add_end_mark_on_all]
        upper_cmds = [add_upper_cmd, add_upper_on_all]

        if len(cms) == 2:
            p_cm = [cms[0].split("+")[0], cms[1].split("+")[0]]

            if pass_cmd in p_cm or quit_cmd in p_cm or sep_cmd in p_cm:
                print(
                    f"You cannot provide {pass_cmd}, {quit_cmd} or {sep_cmd} in combined commands !"
                )
                error = True
            elif (p_cm[0] in end_mark_cmds and p_cm[1] in end_mark_cmds) or (
                p_cm[0] in upper_cmds and p_cm[1] in upper_cmds
            ):
                # BUG FIX: the original condition flagged the valid
                # end-mark + upper combination instead of two commands of
                # the same type, and never set ``error``
                print(
                    "You cannot combine the same type of command: Type of commands are 'end mark' and 'upper'"
                )
                error = True

        elif len(cms) == 1:
            if cms[0].split("+")[0] not in [
                add_end_mark_cmd,
                add_end_mark_on_all,
                add_upper_cmd,
                add_upper_on_all,
                pass_cmd,
                quit_cmd,
            ]:
                print("You didn't provide a right command ! Please retry")
                error = True

        else:
            print("You cannot provide more than 2 or 0 commands !")
            # BUG FIX: flag the error so the caller prompts again
            error = True

        return cms, error

    def split_group(self, group: Union[list, str]):
        """Split each group into its two sentences.

        The split is done on the separator character; when a group contains
        several separators, the halves are balanced around the middle one.

        Returns:
            dict mapping each corpus name to its list of stripped sentences.
        """
        if isinstance(group, str):
            group = [group]

        sents = {self.corpus_1: [], self.corpus_2: []}

        for g in group:
            # GENERALIZATION: split on the configured separator instead of a
            # hard-coded colon (identical behavior for the ':' default)
            splits = g.split(self.sep)
            middle = len(splits) // 2

            cp1_corpus = "".join(splits[:middle])
            cp2_corpus = "".join(splits[middle:])

            sents[self.corpus_1].append(cp1_corpus.strip())
            sents[self.corpus_2].append(cp2_corpus.strip())

        return sents

    def add_end_mark(self, batch: dict, command: str):
        """Append an end mark to selected pairs of ``batch``.

        ``command`` looks like ``"1,3:."`` or ``"2-5:!"``; several
        transformations can be chained with ``'|'``.  Group numbers are
        1-based.
        """
        # recuperate the marks with groups and apply the transformations
        tfs = command.split("|")

        for tf in tfs:
            spec = tf.split(":")[0]

            if "-" in tf:
                bounds = spec.split("-")
                numbers = list(range(int(bounds[0]), int(bounds[1]) + 1))
            else:
                numbers = [int(nb) for nb in spec.split(",")]

            mark = tf.split(":")[1]

            for nb in numbers:
                batch[self.corpus_1][nb - 1] += mark
                batch[self.corpus_2][nb - 1] += mark

        return batch

    def add_upper(self, batch: dict, command: str):
        """Uppercase the first letter of selected pairs of ``batch``.

        ``command`` looks like ``"1,3"`` or ``"2-5"``; several
        transformations can be chained with ``'|'``.  Group numbers are
        1-based.
        """
        # recuperate the groups and apply the transformations
        tfs = command.split("|")

        for tf in tfs:
            if "-" in tf:
                bounds = tf.split("-")
                numbers = list(range(int(bounds[0]), int(bounds[1]) + 1))
            else:
                numbers = [int(nb) for nb in tf.split(",")]

            for nb in numbers:
                batch[self.corpus_1][nb - 1] = (
                    batch[self.corpus_1][nb - 1][0].upper()
                    + batch[self.corpus_1][nb - 1][1:]
                )
                batch[self.corpus_2][nb - 1] = (
                    batch[self.corpus_2][nb - 1][0].upper()
                    + batch[self.corpus_2][nb - 1][1:]
                )

        return batch

    def inner_command(self, batch: dict):
        """Apply per-row corrections written by the user in the batch CSV.

        Inside the second-corpus sentence, a ``Mark[<mark>]`` token appends
        ``<mark>`` to both sentences and an ``Upper`` token uppercases the
        first letter of both sentences; the tokens themselves are removed.
        """
        cp1_sents = batch[self.corpus_1]
        cp2_sents = batch[self.corpus_2]

        for i in range(len(cp1_sents)):
            cp1_sent = cp1_sents[i]
            cp2_sent = cp2_sents[i]

            if re.match(r".*Mark\[.*\].*", cp2_sent):
                mark = re.findall(r"Mark\[.*\]", cp2_sent)[0]
                mark = mark.replace("Mark[", "").replace("]", "")
                cp1_sent = cp1_sent + mark
                cp2_sent = re.sub(r"Mark\[.*\]", "", cp2_sent, count=1) + mark

            if re.match(r".*Upper", cp2_sent):
                cp1_sent = cp1_sent[0].upper() + cp1_sent[1:]
                cp2_sent = (
                    cp2_sent[0].upper() + re.sub(r"Upper", "", cp2_sent, count=1)[1:]
                )

            cp1_sents[i] = cp1_sent
            cp2_sents[i] = cp2_sent

        batch[self.corpus_1] = cp1_sents
        batch[self.corpus_2] = cp2_sents

        return batch

    def extract_sentences(
        self,
        group_range: Union[tuple, None] = None,
        add_end_mark_cmd: str = "a",
        pass_cmd: str = "p",
        add_end_mark_on_all: str = "l",
        add_upper_cmd: str = "u",
        add_upper_on_all: str = "o",
        sep_cmd: str = "_",
        quit_cmd: str = "q",
        batch_size: int = 30,
        load: bool = True,
        save: bool = False,
        csv_file: str = "batch.csv",
        last_checkpoint: bool = True,
    ):
        """Validate the groups batch by batch, turning them into sentence pairs.

        Each batch is written to ``csv_file`` for optional hand edition, a
        command is read from the console (see :meth:`extraction_commands`),
        the per-row ``Mark[...]`` / ``Upper`` corrections are applied, and
        the result may be appended to ``self.extractions`` and checkpointed.

        Args:
            group_range: inclusive ``(start, stop)`` indices into
                ``self.groups``; defaults to the last checkpointed index up
                to the last group.
            batch_size: number of groups processed per batch.
            load: reload the checkpointed state first.
            save: offer to save each batch into the extractions.
            csv_file: scratch CSV used for hand edition.
            last_checkpoint: allow defaulting ``group_range`` from the
                checkpointed index.

        Raises:
            ValueError: when ``group_range`` is ``None`` and
                ``last_checkpoint`` is ``False``.
        """
        # we load the data
        if load:
            self.load()

        # default range: from the last checkpointed index to the last group
        if group_range is None:
            if last_checkpoint:
                group_range = (self.index, len(self.groups) - 1)
            else:
                # BUG FIX: only raise when no explicit range was given
                raise ValueError(
                    "You must provide a group range if last checkpoint is to False !"
                )

        # change the number of displayed lines
        pd.options.display.max_rows = batch_size

        groups = self.groups[group_range[0] : group_range[1] + 1]

        # initialize the sub corpora
        sub_corpora = {self.corpus_1: [], self.corpus_2: []}

        i = 0

        # for each batch we will add the groups in a csv file and take a command
        for b in range(0, len(groups), batch_size):
            # recuperate a batch and advance the checkpoint index
            batch_ = groups[b : b + batch_size]
            self.index += len(batch_)

            # split each group into two sentences and expose them as a CSV
            batch = self.split_group(batch_)
            pd.DataFrame(batch).to_csv(csv_file, index=False)

            print(
                f"Which of the groups of batch number {i+1} do you consider to be complete sentences (see the file {csv_file}) ?"
            )

            # prompt until a valid command is given (unified retry loop;
            # the original only caught IndexError on retries)
            cms = []
            error = True
            while error:
                error = False
                try:
                    cms, error = self.extraction_commands(
                        add_end_mark_cmd,
                        pass_cmd,
                        add_end_mark_on_all,
                        add_upper_cmd,
                        add_upper_on_all,
                        sep_cmd,
                        quit_cmd,
                    )
                except Exception:
                    print("You didn't provide a right group number !")
                    error = True

            # recuperate the batch (possibly edited by hand in the CSV)
            batch = pd.read_csv(csv_file).to_dict("list")

            # add corrections
            batch = self.inner_command(batch)

            cm_type = ""
            quit_ = "n"

            for cm in cms:
                cm_type = cm.split("+")[0]

                if cm_type == add_end_mark_cmd:
                    batch = self.add_end_mark(batch, cm.split("+")[1])

                elif cm_type == add_end_mark_on_all:
                    mark = cm.split("+")[1]
                    batch = self.add_end_mark(
                        batch,
                        ",".join(
                            [str(nb) for nb in range(1, len(batch[self.corpus_1]) + 1)]
                        )
                        + f":{mark}",
                    )

                elif cm_type == add_upper_cmd:
                    batch = self.add_upper(batch, cm.split("+")[1])

                elif cm_type == add_upper_on_all:
                    batch = self.add_upper(
                        batch,
                        ",".join(
                            [str(nb) for nb in range(1, len(batch[self.corpus_1]) + 1)]
                        ),
                    )

                elif cm_type == quit_cmd:
                    quit_ = input("Are you sure you want to quit: Yes(y) or No(n)")
                    while quit_ not in ["y", "n"]:
                        quit_ = input("Are you sure you want to quit: Yes(y) or No(n)")
                    if quit_ == "y":
                        break

            print("\nBatch result")
            print(pd.DataFrame(batch).head(batch_size))
            print("\n--------------------\n\n")

            # add the batch to the sub corpora
            sub_corpora[self.corpus_1].extend(batch[self.corpus_1])
            sub_corpora[self.corpus_2].extend(batch[self.corpus_2])

            if cm_type == quit_cmd and quit_ == "y":
                break
            elif save:
                save_ = input("Do you want to save the result ? Yes(y) or No(n)")
                while save_ not in ["y", "n"]:
                    save_ = input("Do you want to save the result ? Yes(y) or No(n)")
                if save_ == "y":
                    self.extractions[self.corpus_1].extend(batch[self.corpus_1])
                    self.extractions[self.corpus_2].extend(batch[self.corpus_2])
                    self.__save()

            i += 1

        print("Finished !")

    def remove_duplicated_sentences(self, save: bool = False):
        """Drop duplicated sentence pairs from the extractions.

        Args:
            save: checkpoint the de-duplicated extractions afterwards.
        """
        # we load the data
        self.load()

        # use pandas to delete the duplicated rows
        extractions = pd.DataFrame(self.extractions)
        extractions.drop_duplicates(inplace=True)
        self.extractions = extractions.to_dict("list")

        # save the sentences
        if save:
            self.__save()