SinaTools-0.1.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
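All 122 files ship under the single top-level package `nlptools` (top_level.txt contains one entry). As a rough orientation only, a few import paths taken from the file list above and the docstrings shown further below (not an exhaustive API, and module contents beyond what this diff shows are not implied):

    # Orientation sketch; import paths mirror the file list and docstrings in this diff.
    from nlptools.salma.views import SALMA                          # BERT-based word sense disambiguation
    from nlptools.utils.corpus_tokenizer import corpus_tokenizer    # directory of .txt files -> tokenized CSV
    from nlptools.morphology.morph_analyzer import analyze          # single-word morphological analysis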
nlptools/salma/views.py ADDED
@@ -0,0 +1,459 @@
+ import json
+ from nlptools.salma import settings
+ from nlptools.salma.wsd import normalizearabert
+ from nlptools.salma.wsd import GlossPredictor
+ from nlptools.utils.parser import arStrip
+ from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
+ from nlptools.morphology.morph_analyzer import analyze
+ #from nlptools.arabiner.bin.infer import ner
+ 
+ def delete_form_list(position, word_lemma):
+     """
+     Remove the entries of word_lemma that start at the given position.
+ 
+     Parameters:
+     position (int): The current position in the input sentence.
+     word_lemma (list): List of word-lemma details.
+ 
+     Returns:
+     list: Updated word_lemma list with the matched entries removed.
+     list: The list of removed entries.
+     int: The new position in the input sentence.
+     """
+     tmp_word_lemma = []
+     output = []
+     for wordLemma in word_lemma:
+         if position == int(wordLemma[2]):  # entry starts at the current position
+             word = wordLemma[0]
+             gloss = wordLemma[1]
+             position = int(wordLemma[3])
+             concept_count = int(wordLemma[4])
+             undiac_multi_word_lemma = wordLemma[5]
+             multi_word_lemma = wordLemma[6]
+             output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])
+         elif position < int(wordLemma[2]):
+             tmp_word_lemma.append(wordLemma)
+     return tmp_word_lemma, output, position
+ 
+ def find_two_word_lemma(input_sentence):
+     """
+     Find two-word lemmas in the input sentence using the ALMA_multi_word function.
+ 
+     Parameters:
+     input_sentence (list): Tokenized input sentence.
+ 
+     Returns:
+     list: List of details of the found two-word lemmas.
+     """
+     i = 0
+     output = []
+     length = len(input_sentence)
+     while i < length - 1:
+         two_grams = input_sentence[i] + " " + input_sentence[i + 1]
+         data = ALMA_multi_word(two_grams)
+         try:
+             glosses_list = []
+             concept_count = 0
+             ids = data[0]["ids"]
+             for lemma_id in ids:
+                 if lemma_id in settings.glosses_dic.keys():
+                     value = settings.glosses_dic[lemma_id]
+                     glosses_list.append(json.loads(value[1]))
+                     concept_count = concept_count + value[0]
+ 
+             # found a two-gram entry: [ngram, glosses, start, end, concept_count, undiacritized lemma, lemma]
+             found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
+             output.append(found_2Word_lemma)
+             i = i + 1
+         except:  # no record found for this multi-word lemma
+             i = i + 1
+     return output
+ 
+ 
+ def find_three_word_lemma(input_sentence):
+     """Find three-word lemmas in the input sentence using the ALMA_multi_word function."""
+     i = 0
+     output = []
+     length = len(input_sentence)
+     while i < length - 2:
+         three_grams = input_sentence[i] + " " + input_sentence[i + 1] + " " + input_sentence[i + 2]
+         data = ALMA_multi_word(three_grams)
+         try:
+             glosses_list = []
+             concept_count = 0
+             ids = data[0]["ids"]
+             for lemma_id in ids:
+                 if lemma_id in settings.glosses_dic.keys():
+                     value = settings.glosses_dic[lemma_id]
+                     glosses_list.append(json.loads(value[1]))
+                     concept_count = concept_count + value[0]
+ 
+             found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
+             output.append(found_3Word_lemma)
+             i = i + 1
+         except:  # no record found for this multi-word lemma
+             i = i + 1
+     return output
+ 
+ def find_four_word_lemma(input_sentence):
+     """Find four-word lemmas in the input sentence using the ALMA_multi_word function."""
+     i = 0
+     output = []
+     length = len(input_sentence)
+     while i < length - 3:
+         four_grams = input_sentence[i] + " " + input_sentence[i + 1] + " " + input_sentence[i + 2] + " " + input_sentence[i + 3]
+         data = ALMA_multi_word(four_grams)
+         try:
+             glosses_list = []
+             concept_count = 0
+             ids = data[0]["ids"]
+             for lemma_id in ids:
+                 if lemma_id in settings.glosses_dic.keys():
+                     value = settings.glosses_dic[lemma_id]
+                     glosses_list.append(json.loads(value[1]))
+                     concept_count = concept_count + value[0]
+ 
+             found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
+             output.append(found_4Word_lemma)
+             i = i + 1
+         except:  # no record found for this multi-word lemma
+             i = i + 1
+     return output
+ 
+ 
+ def find_five_word_lemma(input_sentence):
+     """Find five-word lemmas in the input sentence using the ALMA_multi_word function."""
+     i = 0
+     output = []
+     length = len(input_sentence)
+     while i < length - 4:
+         five_grams = input_sentence[i] + " " + input_sentence[i + 1] + " " + input_sentence[i + 2] + " " + input_sentence[i + 3] + " " + input_sentence[i + 4]
+         data = ALMA_multi_word(five_grams)
+         try:
+             glosses_list = []
+             concept_count = 0
+             ids = data[0]["ids"]
+             for lemma_id in ids:
+                 if lemma_id in settings.glosses_dic.keys():
+                     value = settings.glosses_dic[lemma_id]
+                     glosses_list.append(json.loads(value[1]))
+                     concept_count = concept_count + value[0]
+ 
+             found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
+             output.append(found_5Word_lemma)
+             i = i + 1
+         except:  # no record found for this multi-word lemma
+             i = i + 1
+     return output
+ 
+ def find_named_entities(string):
+     """
+     Find named entities in the input string using a NER tool.
+ 
+     Parameters:
+     string (str): Input string.
+ 
+     Returns:
+     list: List of details of the found named entities.
+     """
+     found_entities = []
+     # NER is currently disabled; `entities` stays empty until the arabiner inference import is enabled.
+     #entities = ner(string, "4")
+     entities = []
+     tag_gloss = {
+         "PERS": "اسم شخص",
+         "ORG": "اسم مؤسسة",
+         #"NORP": "مجموعة من الناس",
+         #"OCC": "منصب/مسمى وظيفي",
+         "LOC": "اسم منطقة جغرافية",
+         "FAC": "اسم لمَعلَم",
+         #"EVENT": "حدث",
+         "DATE": "فترة زمنية تدل على تاريخ",
+         "UNIT": "وحدة قياس",
+         "CURR": "عملة",
+         "GPE": "اسم بلد، له حدود إدارية/جيوسياسية",
+         "TIME": "فترة زمنية تدل على الوقت",
+         "CARDINAL": "عدد يدل على معدود",
+         "ORDINAL": "رقم، لا يدل على معدود",
+         "PERCENT": "نسبة مئوية",
+         "QUANTITY": "كمية",
+         "MONEY": "مبلغ مالي",
+         "LANGUAGE": "اسم للغة طبيعية",
+         "PRODUCT": "اسم منتج",
+         "LAW": "قانون"
+     }
+ 
+     for entity in entities:
+         gloss_ner = ""
+         if entity[1] in tag_gloss.keys():
+             gloss_ner = tag_gloss[entity[1]]
+ 
+         if gloss_ner != "":
+             gloss = [{'concept_id': '', 'resource_id': '', 'resource_name': '', 'gloss': gloss_ner}]
+             entity = [entity[0], gloss, int(entity[2]), int(entity[3]), 1, arStrip(entity[0], True, True, True, False, True, False), entity[0]]
+             found_entities.append(entity)
+     return found_entities
+ 
+ 
+ def find_glosses_using_ALMA(word):
+     """
+     Find glosses for the given word using the ALMA morphological analyzer.
+ 
+     Parameters:
+     word (str): Input word.
+ 
+     Returns:
+     tuple: word, undiacritized lemma, diacritized lemma, POS, concept count and the collected glosses.
+     """
+     data = analyze(word)
+     Diac_lemma = data[0][1]
+     pos = data[0][2]
+     # Remove diacritics, small diacritics, shaddah, digits, alif variants and special characters
+     Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False)
+ 
+     ids = []
+     glosses_list = []
+     concept_count = 0
+     for line in data:
+         lemma_id = line[3]
+         ids.append(lemma_id)
+ 
+     for lemma_id in ids:
+         if lemma_id in settings.glosses_dic.keys():
+             value = settings.glosses_dic[lemma_id]
+             glosses_list.append(json.loads(value[1]))
+             concept_count = concept_count + value[0]
+ 
+     return word, Undiac_lemma, Diac_lemma, pos, concept_count, glosses_list
+ 
+ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
+     """
+     Disambiguate glosses using the SALMA gloss predictor.
+ 
+     Parameters:
+     glosses (list): List of candidate glosses.
+     Diac_lemma (str): Diacritized lemma of the word.
+     Undiac_lemma (str): Undiacritized lemma of the word.
+     word (str): The word being analyzed.
+     sentence (str): The sentence containing the word.
+ 
+     Returns:
+     dict: Disambiguated gloss details.
+     """
+     word = normalizearabert(word)
+     glosses_dictionary = {}
+     if glosses is not None:
+         for gloss in glosses:
+             glosses_dictionary.update({gloss['concept_id']: gloss['gloss']})
+         concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma, word, sentence, glosses_dictionary)
+ 
+         my_json = {}
+         my_json['Concept_id'] = concept_id
+         my_json['Gloss'] = gloss
+         my_json['word'] = word
+         my_json['Undiac_lemma'] = Undiac_lemma
+         my_json['Diac_lemma'] = Diac_lemma
+         return my_json
+     else:
+         my_json = {}
+         my_json['word'] = word
+         my_json['Undiac_lemma'] = Undiac_lemma
+         my_json['Diac_lemma'] = Diac_lemma
+         return my_json
+ 
+ 
+ def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner):
+     """
+     Walk over the tokenized sentence and, at each position, attach gloss candidates from the
+     multi-word lemma matches (checked from 5-grams down to 2-grams), then from named entities,
+     falling back to single-word ALMA analysis when nothing matches.
+     """
+     output_list = []
+     position = 0
+     while position < len(input_sentence):
+         flag = "False"
+         output_from5word = delete_form_list(position, five_word_lemma)
+         five_word_lemma = output_from5word[0]
+         if output_from5word[1] != []:
+             position = output_from5word[2]
+             flag = "True"
+             my_json = {}
+             my_json['word'] = output_from5word[1][0][0]
+             my_json['concept_count'] = output_from5word[1][0][2]
+             my_json['glosses'] = output_from5word[1][0][1]
+             my_json['Diac_lemma'] = output_from5word[1][0][4]
+             my_json['Undiac_lemma'] = output_from5word[1][0][3]
+             output_list.append(my_json)
+             position = position + 1
+ 
+         output_from4word = delete_form_list(position, four_word_lemma)
+         four_word_lemma = output_from4word[0]
+         if output_from4word[1] != []:
+             position = output_from4word[2]
+             flag = "True"
+             my_json = {}
+             my_json['word'] = output_from4word[1][0][0]
+             my_json['concept_count'] = output_from4word[1][0][2]
+             my_json['glosses'] = output_from4word[1][0][1]
+             my_json['Diac_lemma'] = output_from4word[1][0][4]
+             my_json['Undiac_lemma'] = output_from4word[1][0][3]
+             output_list.append(my_json)
+             position = position + 1
+ 
+         output_from3word = delete_form_list(position, three_word_lemma)
+         three_word_lemma = output_from3word[0]
+         if output_from3word[1] != []:
+             position = output_from3word[2]
+             flag = "True"
+             my_json = {}
+             my_json['word'] = output_from3word[1][0][0]
+             my_json['concept_count'] = output_from3word[1][0][2]
+             my_json['glosses'] = output_from3word[1][0][1]
+             my_json['Diac_lemma'] = output_from3word[1][0][4]
+             my_json['Undiac_lemma'] = output_from3word[1][0][3]
+             output_list.append(my_json)
+             position = position + 1
+ 
+         output_from2Word = delete_form_list(position, two_word_lemma)
+         two_word_lemma = output_from2Word[0]
+         if output_from2Word[1] != []:
+             position = output_from2Word[2]
+             flag = "True"
+             my_json = {}
+             word = output_from2Word[1][0][0]
+             my_json['word'] = word
+             my_json['concept_count'] = output_from2Word[1][0][2]
+             my_json['glosses'] = output_from2Word[1][0][1]
+             my_json['Diac_lemma'] = output_from2Word[1][0][4]
+             my_json['Undiac_lemma'] = output_from2Word[1][0][3]
+             output_list.append(my_json)
+             position = position + 1
+ 
+         output_from_ner = delete_form_list(position, ner)
+         ner = output_from_ner[0]
+         if output_from_ner[1] != []:
+             position = output_from_ner[2]
+             flag = "True"
+             my_json = {}
+             word = output_from_ner[1][0][0]
+             my_json['word'] = word
+             my_json['concept_count'] = output_from_ner[1][0][2]
+             my_json['glosses'] = output_from_ner[1][0][1]
+             my_json['Diac_lemma'] = output_from_ner[1][0][4]
+             my_json['Undiac_lemma'] = output_from_ner[1][0][3]
+             output_list.append(my_json)
+             position = position + 1
+ 
+         if flag == "False":  # not found in the NER results or the multi-word dictionary, ask ALMA
+             word = input_sentence[position]
+             word, Undiac_lemma, Diac_lemma, pos, concept_count, glosses = find_glosses_using_ALMA(word)
+             my_json = {}
+             my_json['word'] = word
+             my_json['concept_count'] = concept_count
+             my_json['glosses'] = glosses
+             my_json['Diac_lemma'] = Diac_lemma
+             my_json['Undiac_lemma'] = Undiac_lemma
+             output_list.append(my_json)
+             position = position + 1
+     return output_list
+ 
+ def disambiguate_glosses_main(word, sentence):
+     """Select a gloss for one word entry: no candidates, a single candidate, or SALMA disambiguation."""
+     concept_count = word['concept_count']
+     if concept_count == 0:
+         my_json = {}
+         my_json['word'] = word['word']
+         my_json['Diac_lemma'] = word['Diac_lemma']
+         my_json['Undiac_lemma'] = word['Undiac_lemma']
+         return my_json
+     elif concept_count == 1:
+         my_json = {}
+         my_json['word'] = word['word']
+         glosses = word['glosses'][0]
+         my_json['Gloss'] = glosses['gloss']
+         my_json['Concept_id'] = glosses['concept_id']
+         my_json['Diac_lemma'] = word['Diac_lemma']
+         my_json['Undiac_lemma'] = word['Undiac_lemma']
+         return my_json
+     else:
+         input_word = word['word']
+         glosses = word['glosses']
+         Diac_lemma = word['Diac_lemma']
+         Undiac_lemma = word['Undiac_lemma']
+         return disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
+ 
+ def WSD(sentence):
+     """Run the full word sense disambiguation pipeline over a sentence and return one entry per word."""
+     input_sentence = simple_word_tokenize(sentence)
+ 
+     five_word_lemma = find_five_word_lemma(input_sentence)
+     four_word_lemma = find_four_word_lemma(input_sentence)
+     three_word_lemma = find_three_word_lemma(input_sentence)
+     two_word_lemma = find_two_word_lemma(input_sentence)
+     ner = find_named_entities(" ".join(input_sentence))
+ 
+     output_list = find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner)
+ 
+     results = []
+     for word in output_list:
+         results.append(disambiguate_glosses_main(word, sentence))
+     return results
+ 
+ 
+ def SALMA(sentence):
+     """
+     This method disambiguates the words within a sentence.
+ 
+     Args:
+         sentence (:obj:`str`): The Arabic text to be disambiguated; it must not exceed 500 characters.
+ 
+     Returns:
+         :obj:`dict`: A JSON-like dict whose ``resp`` field is a list of words, each with a gloss if one
+         is found or only its lemmas otherwise, plus ``statusText`` and ``statusCode`` fields.
+ 
+     **Example:**
+ 
+     .. highlight:: python
+     .. code-block:: python
+ 
+         from nlptools.salma.views import SALMA
+         JSON = SALMA("ذهبت إلى جامعة بيرزيت.")
+         print(JSON["resp"])
+ 
+         #output
+         [
+             {
+                 "Concept_id": "303019218",
+                 "Gloss": "ذهَب إلى عملِه:- قصَده، توجَّه إليه "ذهَب إلى الجامعة/ بيروت - اذهَب إلى أبيك والتمس منه الصفح - ذهَب إلى قول فلان أخذ به - <اذْهَبْ إِلَى فِرْعَوْنَ إِنَّهُ طَغَى> طه/ 24 ". ذهَب رأسًا إليه",
+                 "word": "ذهبت",
+                 "Undiac_lemma": "ذهب",
+                 "Diac_lemma": "ذَهَبَ۪ 1"
+             },
+             {
+                 "word": "إلى",
+                 "Diac_lemma": "إِلَى 1",
+                 "Undiac_lemma": "الى"
+             },
+             {
+                 "word": "جامعة بيرزيت",
+                 "Gloss": "جامعة فلسطينية تقع في بلدة بيرزيت، قرب مدينة رام الله، ويعود تاريخها إلى عام 1924 عندما تأسست كمدرسة ابتدائية ثم أصبحت جامعة عام 1975",
+                 "Concept_id": "334000099",
+                 "Diac_lemma": "جامِعَة بيرزَيت",
+                 "Undiac_lemma": "جامعة بيرزيت"
+             }
+         ]
+     """
+     if len(sentence) > 500:
+         content = {"statusText": "Input is too long", "statusCode": -7}
+         return content
+     else:
+         results = WSD(sentence)
+         return {"resp": results, "statusText": "OK", "statusCode": 0}
nlptools/salma/wsd.py ADDED
@@ -0,0 +1,126 @@
+ from nlptools.salma import settings
+ import re
+ import warnings
+ warnings.filterwarnings("ignore")
+ import torch
+ import numpy as np
+ import pandas as pd
+ from nlptools.arabert.preprocess import ArabertPreprocessor
+ 
+ def normalizearabert(s):
+     """Normalize a string with the AraBERTv02 preprocessor."""
+     model_name = 'aubmindlab/bert-base-arabertv02'
+     arabert_prep = ArabertPreprocessor(model_name.split("/")[-1])
+     return arabert_prep.preprocess(str(s))
+ 
+ 
+ def glosses1(dfcand, target):
+     """
+     Take a dataframe of candidate glosses and return the (Concept_id, Gloss) pair whose
+     TRUE-class logit is highest according to the fine-tuned model in settings.model.
+     """
+     wic_c, _ = read_data(dfcand, normalizearabert, target)
+     tokenizedwic_c = np.array([settings.tokenizer.encode(x, max_length=512, padding='max_length', truncation='longest_first', add_special_tokens=True) for x in wic_c])
+     max_len = 512
+     segmentswic = torch.tensor([get_segments(settings.tokenizer.convert_ids_to_tokens(i), max_len) for i in tokenizedwic_c])
+     paddedwic = tokenizedwic_c
+     attention_maskwic = np.where(paddedwic != 0, 1, 0)
+     input_idswic = torch.tensor(paddedwic)
+     attention_maskwic = torch.tensor(attention_maskwic)
+     settings.model = settings.model.eval()
+     wicpredictions = []
+     b_input_ids = input_idswic
+     b_input_mask = attention_maskwic
+     b_input_seg = segmentswic
+ 
+     with torch.no_grad():
+         outputs = settings.model(b_input_ids, token_type_ids=b_input_seg, attention_mask=b_input_mask)
+ 
+     logits = outputs[0]
+     wicpredictions.append(logits)
+     wicflat_predictions = np.concatenate(wicpredictions, axis=0)
+ 
+     best = np.argmax(wicflat_predictions, axis=0).flatten()[1]  # row with the highest TRUE-class logit
+     return dfcand['Concept_id'].to_list()[best], dfcand['Gloss'].to_list()[best]
+ 
+ def read_data(data, normalize, target):
+     """Build 'example [SEP] target: gloss' pairs and binary labels from a candidate dataframe."""
+     c = []
+     labels = []
+     for i, row in data.iterrows():
+         example = normalize(row['Example'])
+         gloss = normalize(row['Gloss'])
+         label = row['Label']
+ 
+         c.append('{} [SEP] {}: {}'.format(example, target, gloss))
+         if label == 1.0:
+             labels.append(1)
+         else:
+             labels.append(0)
+     return c, labels
+ 
+ def inserttag1(sentence, tag, start, end):
+     """Wrap the span sentence[start:end] with the given tag."""
+     before = sentence[:start]
+     after = sentence[end:]
+     return before + tag + sentence[start:end] + tag + after
+ 
+ def get_segments(tokens, max_seq_length):
+     """Return BERT segment ids: 0 up to and including the first [SEP], 1 afterwards, padded with 0."""
+     if len(tokens) > max_seq_length:
+         raise IndexError("Token length more than max seq length!")
+     segments = []
+     current_segment_id = 0
+     for token in tokens:
+         segments.append(current_segment_id)
+         if token == "[SEP]":
+             current_segment_id = 1
+     return segments + [0] * (max_seq_length - len(tokens))
+ 
+ def senttarget(target, example):
+     """Mark the first occurrence of the target word in the example with [UNUSED0] tags, or return -1 if absent."""
+     try:
+         start = example.index(target)
+     except ValueError:
+         return -1
+     end = start + len(target)
+     return inserttag1(example, "[UNUSED0]", start, end)
+ 
+ 
+ def GlossPredictor(diac_lemma, undiac_lemma, target, example, glosses):
+     """
+     Predict the gloss of a target word within an example sentence.
+ 
+     Takes the diacritized and undiacritized lemma, the target word, the example sentence and a
+     glosses dictionary, for example:
+         glosses = {"Concept_id1": "gloss1", "Concept_id2": "gloss2", "Concept_id3": "gloss3"}
+ 
+     Returns:
+         (-1, -1) if the example does not contain the target word,
+         ('none', 'none') if no candidate glosses were supplied,
+         otherwise the (Concept_id, Gloss) of the predicted gloss for the target word.
+     """
+     example = senttarget(target, example)
+     if example == -1:
+         return -1, -1
+ 
+     data = []
+     for g in glosses:
+         data.append([g, diac_lemma, undiac_lemma, glosses[g], target, example, 0, 1, '', '', ''])
+     dfcolumns = ['Concept_id', 'Diac_lemma', 'Undiac_lemma', 'Gloss', 'Target', 'Example', 'Is_training', 'Label', 'concept_id', 'lemma_id', 'POS']
+     dfcand = pd.DataFrame(data, columns=dfcolumns)
+ 
+     if len(dfcand) > 0:
+         dfcand['Example'] = dfcand['Example'].apply(lambda x: example)
+         dfcand['Target'] = dfcand['Target'].apply(lambda x: target)
+         dfcand = dfcand.drop_duplicates()
+ 
+         dfcand['Example'] = dfcand['Example'].apply(lambda x: x.upper())
+         # if the two [UNUSED0] markers land (almost) back to back at the start of the example,
+         # rename the second one to [UNUSED1]
+         dfcand['Example'] = dfcand['Example'].apply(lambda x: re.sub(r'^((.?\[UNUSED0\].?){1})\[UNUSED0\]', r'\1[UNUSED1]', x))
+         return glosses1(dfcand, target)
+     else:
+         return 'none', 'none'
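For orientation, a hedged sketch of calling GlossPredictor directly, mirroring how disambiguate_glosses_using_SALMA in views.py invokes it; the concept ids and glosses below are placeholders, and settings.model and settings.tokenizer must already be loaded:

    # Illustrative call only; the glosses dictionary below uses placeholder ids and texts.
    from nlptools.salma.wsd import GlossPredictor, normalizearabert

    glosses = {"Concept_id1": "gloss1", "Concept_id2": "gloss2"}
    concept_id, gloss = GlossPredictor(
        "ذَهَبَ",                     # diacritized lemma
        "ذهب",                       # undiacritized lemma
        normalizearabert("ذهبت"),    # target word, normalized as views.py does
        "ذهبت إلى جامعة بيرزيت.",    # example sentence containing the target
        glosses,
    )
    # (-1, -1) if the target is not found in the sentence, ('none', 'none') if no glosses
    # were supplied, otherwise the selected (concept_id, gloss) pair.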
nlptools/utils/__init__.py ADDED
File without changes
nlptools/utils/corpus_tokenizer.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import csv
+ from nlptools.utils.sentence_tokenizer import sent_tokenize
+ from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ 
+ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
+     """
+     This method receives a directory and tokenizes all .txt files within it, including files in its
+     subdirectories. The results are stored in a CSV file.
+ 
+     Args:
+         dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.
+         output_csv (:obj:`str`): The name of the output CSV file, which will be generated in the current directory where this function is used.
+         row_id (:obj:`int`): The row_id to start from; the default value is 1.
+         global_sentence_id (:obj:`int`): The global_sentence_id to start from; the default value is 1.
+ 
+     Returns:
+         None. A CSV file named ``output_csv`` is written with the following fields:
+             * Row_ID (primary key, unique for all records in the output file)
+             * Docs_Sentence_Word_ID (DirectoryName_FileName_GlobalSentenceID_SentenceID_WordPosition)
+             * Global Sentence ID (Integer, a unique identifier for each sentence across the entire output)
+             * Sentence ID (Integer, a unique identifier for each sentence within its file)
+             * Sentence (the text of the sentence the word belongs to)
+             * Word Position (Integer, the position of each word within the sentence)
+             * Word (each row contains one word of the sentence).
+ 
+     **Example:**
+ 
+     .. highlight:: python
+     .. code-block:: python
+ 
+         from nlptools.utils.corpus_tokenizer import corpus_tokenizer
+         corpus_tokenizer(dir_path="History", output_csv="outputFile.csv", row_id = 1, global_sentence_id = 1)
+ 
+         #output
+         # csv file called: outputFile.csv
+         # For example, if the 'History' directory contains 2 files named 'h1.txt' and 'h2.txt',
+         # the output file will contain:
+         # Row_ID, Docs_Sentence_Word_ID, Global Sentence ID, Sentence ID, Sentence, Word Position, Word
+         # 1,History_h1_1_1_1,1,1,الطيور الضارة ومكافحتها,1,الطيور
+         # 2,History_h1_1_1_2,1,1,الطيور الضارة ومكافحتها,2,الضارة
+         # 3,History_h1_1_1_3,1,1,الطيور الضارة ومكافحتها,3,ومكافحتها
+         # 4,History_h2_2_1_1,2,1,بشكل عام,1,بشكل
+         # 5,History_h2_2_1_2,2,1,بشكل عام,2,عام
+     """
+     # start counters one below the requested values; they are incremented before each write
+     row_id = row_id - 1
+     global_sentence_id = global_sentence_id - 1
+     with open(output_csv, 'w', newline='', encoding="utf-8") as csvfile:
+         fieldnames = ['Row_ID', 'Docs_Sentence_Word_ID', 'Global Sentence ID', 'Sentence ID', 'Sentence', 'Word Position', 'Word']
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+ 
+         for root, dirs, files in os.walk(dir_path):
+             for file in files:
+                 if file.endswith('.txt'):
+                     file_path = os.path.join(root, file)
+                     with open(file_path, 'r', encoding="utf-8") as f:
+                         content = f.read()
+                         sentences = sent_tokenize(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
+                         for sentence_id, sentence in enumerate(sentences, start=1):
+                             words = simple_word_tokenize(sentence)
+                             global_sentence_id += 1
+                             for word_pos, word in enumerate(words, start=1):
+                                 row_id += 1
+                                 dir_name = os.path.basename(root)
+                                 doc_sentence_filename = file.split(".txt")[0]
+                                 docs_sentence_word_id = f"{dir_name}_{doc_sentence_filename}_{global_sentence_id}_{sentence_id}_{word_pos}"
+                                 writer.writerow({'Row_ID': row_id,
+                                                  'Docs_Sentence_Word_ID': docs_sentence_word_id,
+                                                  'Global Sentence ID': global_sentence_id,
+                                                  'Sentence ID': sentence_id,
+                                                  'Sentence': sentence,
+                                                  'Word Position': word_pos,
+                                                  'Word': word})
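Because row_id and global_sentence_id are exposed as start values, separate corpora can be written to separate CSVs with non-overlapping identifiers. A small hedged sketch; the directory names and the chosen start values below are illustrative only:

    # Illustrative only: directory names and start values are made up.
    from nlptools.utils.corpus_tokenizer import corpus_tokenizer

    # First corpus starts from the defaults.
    corpus_tokenizer(dir_path="History", output_csv="history.csv", row_id=1, global_sentence_id=1)

    # A second corpus can continue the numbering by passing explicit start values,
    # e.g. taken from the last Row_ID / Global Sentence ID written to history.csv.
    corpus_tokenizer(dir_path="Geography", output_csv="geography.csv", row_id=501, global_sentence_id=41)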