SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
nlptools/salma/views.py
ADDED
@@ -0,0 +1,459 @@
import json
from nlptools.salma import settings
from nlptools.salma.wsd import normalizearabert
from nlptools.salma.wsd import GlossPredictor
from nlptools.utils.parser import arStrip
from nlptools.morphology.tokenizers_words import simple_word_tokenize
from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
from nlptools.morphology.morph_analyzer import analyze
#from nlptools.arabiner.bin.infer import ner

def delete_form_list(position, word_lemma):
    #"""
    #Remove specific elements from the word_lemma list based on the given position.
    #
    #Parameters:
    #position (int): The current position in the input sentence.
    #word_lemma (list): List of word lemma details.
    #
    #Returns:
    #list: Updated word_lemma list with the specific elements removed.
    #list: The list of removed elements.
    #int: The new position in the input sentence.
    #"""
    tmp_word_lemma = []
    output = []
    for wordLemma in word_lemma:
        if position == int(wordLemma[2]): # start
            word = wordLemma[0]
            gloss = wordLemma[1]
            position = int(wordLemma[3])
            concept_count = int(wordLemma[4])
            undiac_multi_word_lemma = wordLemma[5]
            multi_word_lemma = wordLemma[6]
            output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])# word
        elif position < int(wordLemma[2]):
            tmp_word_lemma.append(wordLemma)
    return tmp_word_lemma, output, position

def find_two_word_lemma(input_sentence):
    #"""
    #Find two-word lemmas in the input sentence using the ALMA_multi_word function.
    #
    #Parameters:
    #input_sentence (list): Tokenized input sentence.
    #
    #Returns:
    #list: List of details of found two-word lemmas.
    #"""
    i = 0
    output = []
    length = len(input_sentence)
    while i < length - 1:
        two_grams = input_sentence[i] +" "+ input_sentence[i + 1]
        data = ALMA_multi_word(two_grams)
        try :
            glosses_list = []
            concept_count = 0
            ids = data[0]["ids"]
            for lemma_id in ids:
                if lemma_id in settings.glosses_dic.keys():
                    value = settings.glosses_dic[lemma_id]
                    glosses_list.append(json.loads(value[1]))
                    concept_count = concept_count + value[0]

            # found two_grams
            #found_2Word_lemma = [two_grams,data[0]['glosses'], i, i + 1,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            output.append(found_2Word_lemma)
            i = i + 1
        except: # no record found on this multi_lema
            i = i + 1
    return output


def find_three_word_lemma(input_sentence):
    i = 0
    output = []
    length = len(input_sentence)
    while i < length - 2:
        three_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2]
        data = ALMA_multi_word(three_grams)
        try:
            glosses_list = []
            concept_count = 0
            ids = data[0]["ids"]
            for lemma_id in ids:
                if lemma_id in settings.glosses_dic.keys():
                    value = settings.glosses_dic[lemma_id]
                    glosses_list.append(json.loads(value[1]))
                    concept_count = concept_count + value[0]

            #found_3Word_lemma = [three_grams, data[0]['glosses'], i, i + 2,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            output.append(found_3Word_lemma)
            i = i + 1
        except:
            i = i + 1
    return output

def find_four_word_lemma(input_sentence):
    i = 0
    output = []
    length = len(input_sentence)
    while i < length - 3:
        four_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3]
        data = ALMA_multi_word(four_grams)
        try:
            glosses_list = []
            concept_count = 0
            ids = data[0]["ids"]
            for lemma_id in ids:
                if lemma_id in settings.glosses_dic.keys():
                    value = settings.glosses_dic[lemma_id]
                    glosses_list.append(json.loads(value[1]))
                    concept_count = concept_count + value[0]
            #found_4Word_lemma = [four_grams, data[0]['glosses'], i, i + 3,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            output.append(found_4Word_lemma)
            i = i + 1
        except:
            i = i + 1
    return output


def find_five_word_lemma(input_sentence):
    i = 0
    output = []
    length = len(input_sentence)
    while i < length - 4:
        five_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3] + " "+ input_sentence[i + 4]
        data = ALMA_multi_word(five_grams)
        try:
            glosses_list = []
            concept_count = 0
            ids = data[0]["ids"]
            for lemma_id in ids:
                if lemma_id in settings.glosses_dic.keys():
                    value = settings.glosses_dic[lemma_id]
                    glosses_list.append(json.loads(value[1]))
                    concept_count = concept_count + value[0]
            #found_5Word_lemma = [five_grams, data[0]['glosses'], i, i + 4,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
            output.append(found_5Word_lemma)
            i = i + 1
        except:
            i = i + 1
    return output

def find_named_entities(string):
    #"""
    # Find named entities in the input string using a NER tool.
    #
    # Parameters:
    # string (str): Input string.
    #
    # Returns:
    # list: List of details of found named entities.
    # """
    found_entities = []
    #entites = ner(string, "4")
    entites = []
    tag_gloss = {
        "PERS": "اسم شخص",
        "ORG": "اسم مؤسسة",
        #"NORP": "مجموعة من الناس",
        #"OCC": "منصب/مسمى وظيفي",
        "LOC": "اسم منطقة جغرافية",
        "FAC": "اسم لمَعلَم",
        #"EVENT": "حدث",
        "DATE": "فترة زمنية تدل على تاريخ",
        "UNIT": "وحدة قياس",
        "CURR": "عملة",
        "GPE": "اسم بلد، له حدود إدارية/جيوسياسية",
        "TIME": "فترة زمنية تدل على الوقت",
        "CARDINAL": "عدد يدل على معدود",
        "ORDINAL": "رقم، لا يدل على معدود",
        "PERCENT": "نسبة مئوية",
        "QUANTITY": "كمية",
        "MONEY": "مبلغ مالي",
        "LANGUAGE": "اسم للغة طبيعية",
        "PRODUCT": "اسم منتج",
        "LAW": "قانون"
    }

    for entity in entites:
        gloss_ner = ""
        if entity[1] in tag_gloss.keys():
            gloss_ner = tag_gloss[entity[1]]

        if gloss_ner != "":
            gloss = [{'concept_id': '', 'resource_id': '', 'resource_name': '', 'gloss': gloss_ner}]
            entity = [entity[0],gloss,int(entity[2]), int(entity[3]),1,arStrip(entity[0],True,True,True,False,True,False),entity[0]]
            found_entities.append(entity)
    return found_entities


def find_glosses_using_ALMA(word):

    data = analyze(word)
    Diac_lemma = ""
    pos = ""
    Undiac_lemma = ""
    glosses = []
    Diac_lemma = data[0][1]
    pos = data[0][2]
    Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False) # Remove diacs , smallDiacs , shaddah , digit , alif , specialChars
    #"""
    # Find glosses for the given word using the ALMA tool.
    #
    # Parameters:
    # word (str): Input word.
    #
    # Returns:
    # tuple: Details of the word including glosses, lemmas, and POS.
    # """
    ids = []
    glosses_list = []
    concept_count = 0
    for line in data:
        lemma_id = line[3]
        ids.append(lemma_id)

    for lemma_id in ids:
        if lemma_id in settings.glosses_dic.keys():
            value = settings.glosses_dic[lemma_id]
            glosses_list.append(json.loads(value[1]))
            concept_count = concept_count + value[0]

    #glosses = data[0][4]
    #concept_count = data[0][3]
    return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses

def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
    #"""
    # Disambiguate glosses using the SALMA tool.
    #
    # Parameters:
    # glosses (list): List of glosses.
    # Diac_lemma (str): Diacritic lemma of the word.
    # Undiac_lemma (str): Undiacritic lemma of the word.
    # word (str): The word being analyzed.
    # sentence (str): The sentence containing the word.
    #
    # Returns:
    # dict: Disambiguated gloss details.
    # """
    word = normalizearabert(word)
    glosses_dictionary = {}
    if glosses != None:
        for gloss in glosses:
            glosses_dictionary.update({gloss['concept_id'] : gloss['gloss']})
        concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma,word,sentence,glosses_dictionary)

        my_json = {}
        my_json['Concept_id'] = concept_id
        my_json['Gloss'] = gloss
        my_json['word'] = word
        my_json['Undiac_lemma'] = Undiac_lemma
        my_json['Diac_lemma'] = Diac_lemma
        return my_json
    else:
        my_json = {}
        my_json['word'] = word
        my_json['Undiac_lemma'] = Undiac_lemma
        my_json['Diac_lemma'] = Diac_lemma
        return my_json


def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner):
    output_list = []
    position = 0
    while position < len(input_sentence):
        flag = "False"
        output_from5word = delete_form_list(position, five_word_lemma)
        five_word_lemma = output_from5word[0]
        if output_from5word[1] != []: # output
            position = output_from5word[2]
            flag = "True"
            my_json = {}
            my_json['word'] = output_from5word[1][0][0]
            my_json['concept_count'] = output_from5word[1][0][2]
            my_json['glosses'] = output_from5word[1][0][1]
            my_json['Diac_lemma'] = output_from5word[1][0][4]
            my_json['Undiac_lemma'] = output_from5word[1][0][3]
            output_list.append(my_json)
            position = position + 1

        output_from4word = delete_form_list(position, four_word_lemma)
        four_word_lemma = output_from4word[0]
        if output_from4word[1] != []: # output
            position = output_from4word[2]
            flag = "True"
            my_json = {}
            my_json['word'] = output_from4word[1][0][0]
            my_json['concept_count'] = output_from4word[1][0][2]
            my_json['glosses'] = output_from4word[1][0][1]
            my_json['Diac_lemma'] = output_from4word[1][0][4]
            my_json['Undiac_lemma'] = output_from4word[1][0][3]
            output_list.append(my_json)
            position = position + 1

        output_from3word = delete_form_list(position, three_word_lemma)
        three_word_lemma = output_from3word[0]
        if output_from3word[1] != []: # output
            position = output_from3word[2]
            flag = "True"
            my_json = {}
            my_json['word'] = output_from3word[1][0][0]
            my_json['concept_count'] = output_from3word[1][0][2]
            my_json['glosses'] = output_from3word[1][0][1]
            my_json['Diac_lemma'] = output_from3word[1][0][4]
            my_json['Undiac_lemma'] = output_from3word[1][0][3]
            output_list.append(my_json)
            position = position + 1

        output_from2Word = delete_form_list(position, two_word_lemma)
        two_word_lemma = output_from2Word[0]
        if output_from2Word[1] != []:
            position = output_from2Word[2]
            flag = "True"
            my_json = {}
            word = output_from2Word[1][0][0]
            my_json['word'] = word
            my_json['concept_count'] = output_from2Word[1][0][2]
            my_json['glosses'] = output_from2Word[1][0][1]
            my_json['Diac_lemma'] = output_from2Word[1][0][4]
            my_json['Undiac_lemma'] = output_from2Word[1][0][3]
            output_list.append(my_json)
            position = position + 1

        output_from_ner = delete_form_list(position, ner)
        ner = output_from_ner[0]
        if output_from_ner[1] != []:
            position = output_from_ner[2]
            flag = "True"
            my_json = {}
            word = output_from_ner[1][0][0]
            my_json['word'] = word
            my_json['concept_count'] = output_from_ner[1][0][2]
            my_json['glosses'] = output_from_ner[1][0][1]
            my_json['Diac_lemma'] = output_from_ner[1][0][4]
            my_json['Undiac_lemma'] = output_from_ner[1][0][3]
            output_list.append(my_json)
            position = position + 1

        if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
            word = input_sentence[position]
            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
            my_json = {}
            my_json['word'] = word
            my_json['concept_count'] = concept_count
            my_json['glosses'] = glosses
            my_json['Diac_lemma'] = Diac_lemma
            my_json['Undiac_lemma'] = Undiac_lemma
            output_list.append(my_json)
            position = position + 1
    return output_list

def disambiguate_glosses_main(word, sentence):
    concept_count = word['concept_count']
    if concept_count == 0:
        my_json = {}
        my_json['word'] = word['word']
        my_json['Diac_lemma'] = word['Diac_lemma']
        my_json['Undiac_lemma'] = word['Undiac_lemma']
        return my_json
    elif concept_count == 1:
        my_json = {}
        my_json['word'] = word['word']
        glosses = word['glosses'][0]
        my_json['Gloss'] = glosses['gloss']
        my_json['Concept_id'] = glosses['concept_id']
        my_json['Diac_lemma'] = word['Diac_lemma']
        my_json['Undiac_lemma'] = word['Undiac_lemma']
        return my_json
    else:
        input_word = word['word']
        concept_count = word['concept_count']
        glosses = word['glosses']
        Diac_lemma = word['Diac_lemma']
        Undiac_lemma = word['Undiac_lemma']
        return disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)

def WSD(sentence):

    input_sentence = simple_word_tokenize(sentence)

    five_word_lemma = find_five_word_lemma(input_sentence)

    four_word_lemma = find_four_word_lemma(input_sentence)

    three_word_lemma = find_three_word_lemma(input_sentence)

    two_word_lemma = find_two_word_lemma(input_sentence)

    ner = find_named_entities(" ".join(input_sentence))

    output_list = find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner)

    results = []
    for word in output_list:
        results.append(disambiguate_glosses_main(word, sentence))
    return results


def SALMA(sentence):
    """
    This method disambiguates words within a sentence.

    Args:
        sentence (:obj:`str`): The Arabic text to be disambiguated; it should be limited to less than 500 characters.

    Returns:
        :obj:`list`: The JSON output includes a list of words, with each word having a gloss if it exists or a lemma if no gloss is found.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.salma.views import SALMA
        JSON = SALMA("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
        print(JSON["resp"])

        #output
        [
        {
            "Concept_id": "303019218",
            "Gloss": "ذهَب إلى عملِه:- قصَده، توجَّه إليه \"ذهَب إلى الجامعة/ بيروت - اذهَب إلى أبيك والتمس منه الصفح - ذهَب إلى قول فلان أخذ به - <اذْهَبْ إِلَى فِرْعَوْنَ إِنَّهُ طَغَى> طه/ 24 \". ذهَب رأسًا إليه",
            "word": "ذهبت",
            "Undiac_lemma": "ذهب",
            "Diac_lemma": "ذَهَبَ۪ 1"
        },
        {
            "word": "إلى",
            "Diac_lemma": "إِلَى 1",
            "Undiac_lemma": "الى"
        },
        {
            "word": "جامعة بيرزيت",
            "Gloss": "جامعة فلسطينية تقع في بلدة بيرزيت، قرب مدينة رام الله، ويعود تاريخها إلى عام 1924 عندما تأسست كمدرسة ابتدائية ثم أصبحت جامعة عام 1975",
            "Concept_id": "334000099",
            "Diac_lemma": "جامِعَة بيرزَيت",
            "Undiac_lemma": "جامعة بيرزيت"
        }
        ]
    """
    if len(sentence) > 500:
        content = {"statusText":"Input is too long","statusCode":-7}
        return content
    else:
        results = WSD(sentence)
        return {"resp": results, "statusText":"OK","statusCode":0}
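
For orientation, each candidate entry that find_two_word_lemma through find_five_word_lemma and find_named_entities build above is a 7-item list of [phrase, glosses, start position, end position, concept_count, undiacritized lemma, diacritized lemma], and delete_form_list consumes an entry when its start position matches the current token position. The following minimal sketch illustrates that contract; it is not part of the package, and while the phrase, concept id, and lemmas are borrowed from the SALMA docstring example, the gloss text is a placeholder:

    from nlptools.salma.views import delete_form_list

    # Hypothetical two-word entry covering token positions 2-3 of a tokenized sentence.
    two_word_lemma = [
        ["جامعة بيرزيت",                                 # matched phrase
         [{"concept_id": "334000099", "gloss": "..."}],  # candidate glosses (placeholder gloss)
         2, 3,                                           # start and end token positions
         1,                                              # concept_count
         "جامعة بيرزيت",                                 # undiacritized multi-word lemma
         "جامِعَة بيرزَيت"],                              # diacritized multi-word lemma
    ]

    remaining, matched, new_position = delete_form_list(2, two_word_lemma)
    # matched      -> [[phrase, glosses, concept_count, undiac_lemma, diac_lemma]]
    # new_position -> 3, so find_glosses advances past the whole multi-word expression
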
nlptools/salma/wsd.py
ADDED
@@ -0,0 +1,126 @@
from nlptools.salma import settings
import re
import warnings
warnings.filterwarnings("ignore")
import torch
import numpy as np
import pandas as pd
from nlptools.arabert.preprocess import ArabertPreprocessor

def normalizearabert(s):
    model_name = 'aubmindlab/bert-base-arabertv02'
    arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])
    return arabert_prep.preprocess(str(s))



def glosses1(dfcand,target):
    # """
    # takes a dataframe
    # returns
    # 'none' if the maximum logistic regression score for the TRUE class is less than -2 OR
    # the predicted gloss having the maximum logistic regression score
    # """

    wic_c = []
    wic_c, _ = read_data(dfcand,normalizearabert,target)
    tokenizedwic_c = np.array([settings.tokenizer.encode(x, max_length=512,padding='max_length',truncation='longest_first',add_special_tokens=True) for x in wic_c])
    max_len = 512
    segmentswic = torch.tensor([get_segments(settings.tokenizer.convert_ids_to_tokens(i),max_len) for i in tokenizedwic_c])
    paddedwic = tokenizedwic_c
    attention_maskwic = np.where(paddedwic != 0, 1, 0)
    input_idswic = torch.tensor(paddedwic)
    attention_maskwic = torch.tensor(attention_maskwic)
    settings.model = settings.model.eval()
    wicpredictions , wictrue_labels = [], []
    b_input_ids = input_idswic
    b_input_mask = attention_maskwic
    b_input_seg = segmentswic

    with torch.no_grad():
        outputs = settings.model(b_input_ids,token_type_ids=b_input_seg,attention_mask=b_input_mask)

    logits = outputs[0]
    wicpredictions.append(logits)
    wicflat_predictions = np.concatenate(wicpredictions, axis=0)

    return dfcand['Concept_id'].to_list()[np.argmax(wicflat_predictions, axis=0).flatten()[1]],dfcand['Gloss'].to_list()[np.argmax(wicflat_predictions, axis=0).flatten()[1]]

def read_data(data,normalize,target):
    c = []
    labels = []
    for i,row in data.iterrows():

        example = normalize(row['Example'])
        gloss = normalize(row['Gloss'])
        label = row['Label']

        c.append('{} [SEP] {}: {}'.format(example,target,gloss))
        if label == 1.0:
            labels.append(1)
        else:
            labels.append(0)
    return c,labels

def inserttag1(sentence,tag,start,end):
    before = sentence[:start]
    after = sentence[end:]
    target = sentence[start:end]
    return before+tag+sentence[start:end]+tag+after

def get_segments(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

def senttarget(target,example):
    start = -1
    try:
        start = example.index(target)
    except ValueError:
        return -1
    end = example.index(target)+len(target)
    return inserttag1(example,"[UNUSED0]",start,end)


def GlossPredictor(diac_lemma, undiac_lemma,target,example,glosses):
    # """
    # takes
    # a lemma
    # corresponding target word
    # an example
    # glosses as a dictionary, for example:
    # glosses = {"Concept_id1": "gloss1", "Concept_id2": "gloss2", "Concept_id3": "gloss3"}
    # returns
    # -1 if the example does not contain the target word OR
    # 'none' if no records in dftrue for the lemma and if the maximum logistic regression score for the TRUE class is less than -2 OR
    # the predicted gloss for the target word
    #
    # """
    example = senttarget(target,example)
    if example == -1:
        return -1,-1

    data = []
    for g in glosses:
        data.append([g,diac_lemma,undiac_lemma, glosses[g], target,example,0,1,'','',''])
    dfcolumns = ['Concept_id', 'Diac_lemma', 'Undiac_lemma', 'Gloss', 'Target', 'Example', 'Is_training', 'Label', 'concept_id', 'lemma_id', 'POS']
    dfcand = pd.DataFrame(data,columns=dfcolumns)


    if len(dfcand) > 0:
        dfcand['Example'] = dfcand['Example'].apply(lambda x: example)
        dfcand['Target'] = dfcand['Target'].apply(lambda x: target)
        dfcand = dfcand.drop_duplicates()

        dfcand['Example'] = dfcand['Example'].apply(lambda x: x.upper())
        dfcand['Example'] = dfcand['Example'].apply(lambda x: re.sub(r'^((.?\[UNUSED0\].?){1})\[UNUSED0\]', r'\1[UNUSED1]', x) )
        return glosses1(dfcand,target)
    else:
        return 'none','none'
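
For reference, glosses1 above scores sentence/gloss pairs that senttarget and read_data assemble as "<example with the target wrapped in [UNUSED0] markers> [SEP] <target>: <gloss>". Below is a minimal sketch of that preprocessing path, kept model-free by passing str as the normalizer instead of normalizearabert; the concept id, gloss text, and example sentence are placeholders, and running it assumes SinaTools and its AraBERT dependencies are installed:

    import pandas as pd
    from nlptools.salma.wsd import senttarget, read_data

    # One hypothetical candidate gloss for the target word "ذهبت".
    columns = ['Concept_id', 'Diac_lemma', 'Undiac_lemma', 'Gloss', 'Target',
               'Example', 'Is_training', 'Label', 'concept_id', 'lemma_id', 'POS']
    df = pd.DataFrame(
        [["Concept_id1", "ذَهَبَ", "ذهب", "gloss1", "ذهبت", "ذهبت إلى الجامعة", 0, 1, "", "", ""]],
        columns=columns,
    )

    # Wrap the target word inside the example with [UNUSED0], as GlossPredictor does.
    df['Example'] = df['Example'].apply(lambda x: senttarget("ذهبت", x))

    # Build the "<example> [SEP] <target>: <gloss>" strings that glosses1 feeds to the model.
    pairs, labels = read_data(df, str, "ذهبت")
    print(pairs[0])   # [UNUSED0]ذهبت[UNUSED0] إلى الجامعة [SEP] ذهبت: gloss1
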
nlptools/utils/corpus_tokenizer.py
ADDED
@@ -0,0 +1,73 @@
import os
import csv
from nlptools.utils.sentence_tokenizer import sent_tokenize
from nlptools.morphology.tokenizers_words import simple_word_tokenize

def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
    """
    This method receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in a CSV file.

    Args:
        dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.
        output_csv (:obj:`str`): The name of the output CSV file, which will be generated in the current directory where this function is used.
        row_id (:obj:`int`): Specifies the row_id you wish to start with; the default value is 1.
        global_sentence_id (:obj:`int`): Specifies the global_sentence_id you wish to start with; the default value is 1.

    Returns:
        csv file (:obj:`str`): The CSV file contains the following fields:
            * Row_ID (primary key, unique for all records in outputfile)
            * Docs_Sentence_Word_ID (DirectoryName_FileName_GlobalSentenceID_SentenceID_WordPosition)
            * GlobalSentenceID (Integer, a unique identifier for each sentence in the entire file)
            * SentenceID (Integer, a unique identifier for each file within the CSV file)
            * Sentence (Generated text that forms a sentence)
            * Word Position (Integer, the position of each word within the sentence)
            * Word (Each row contains a word from the generated sentence).

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils.corpus_tokenizer import corpus_tokenizer
        corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)

        #output
        # csv file called: ouputFile.csv
        # For example, if the 'History' directory contains 2 files named 'h1.txt' and 'h2.txt'.
        # The output file will contain:
        # Row_ID, Docs_Sentence_Word_ID, Global Sentence ID, Sentence ID, Sentence, Word Position, Word
        # 1,History_h1_1_1_1,1,1,الطيور الضارة ومكافحتها,1,الطيور
        # 2,History_h1_1_1_2,1,1,الطيور الضارة ومكافحتها,2,الضارة
        # 3,History_h1_1_1_3,1,1,الطيور الضارة ومكافحتها,3,ومكافحتها
        # 4,History_h2_2_1_1,1,1,بشكل عام,1,بشكل
        # 5,History_h2_2_1_2,1,1,بشكل عام,2,عام
    """
    row_id = row_id - 1
    global_sentence_id = global_sentence_id - 1
    with open(output_csv, 'w', newline='', encoding="utf-8") as csvfile:
        fieldnames = ['Row_ID', 'Docs_Sentence_Word_ID', 'Global Sentence ID', 'Sentence ID', 'Sentence', 'Word Position', 'Word']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for root, dirs, files in os.walk(dir_path):
            for file in files:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding="utf-8") as f:
                        content = f.read()
                        sentences = sent_tokenize(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
                        for sentence_id, sentence in enumerate(sentences, start=1):
                            words = simple_word_tokenize(sentence)
                            global_sentence_id += 1
                            for word_pos, word in enumerate(words, start=1):
                                row_id += 1
                                dir_name = os.path.basename(root)
                                doc_sentence_filename = file.split(".txt")[0]
                                docs_sentence_word_id = f"{dir_name}_{doc_sentence_filename}_{global_sentence_id}_{sentence_id}_{word_pos}"
                                writer.writerow({'Row_ID': row_id,
                                                 'Docs_Sentence_Word_ID': docs_sentence_word_id,
                                                 'Global Sentence ID': global_sentence_id,
                                                 'Sentence ID': sentence_id,
                                                 'Sentence': sentence,
                                                 'Word Position': word_pos,
                                                 'Word': word})