SinaTools 0.1.20__py2.py3-none-any.whl → 0.1.21__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: SinaTools
- Version: 0.1.20
+ Version: 0.1.21
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
  Home-page: https://github.com/SinaLab/sinatools
  License: MIT license
@@ -1,5 +1,5 @@
- SinaTools-0.1.20.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
- sinatools/VERSION,sha256=Cl3YqOjNQlou69HucXrHnlE2wQ5-6XQ2C-f24WemaN4,6
+ SinaTools-0.1.21.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/VERSION,sha256=xAJpR_QiE53NSmTRvuvj7nJV3dmZVUy0LU-CE-qYKgs,6
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
@@ -109,14 +109,14 @@ sinatools/utils/text_dublication_detector.py,sha256=6yAOUtdw4TKiJkUPDDi3oK7CEoIu
  sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
  sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
  sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
- sinatools/wsd/__init__.py,sha256=5Ondsp-Xe9YxVjRlTc4nLrxu6xiyML7B3bQ3EZ44uEM,327
- sinatools/wsd/disambiguator.py,sha256=7CDlcIM_O7J9wVIrHjauAHDhsKqnia-zLT7NLnDrLys,19999
- sinatools/wsd/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
+ sinatools/wsd/__init__.py,sha256=yV-SQSCzSrjbNkciMbDCqzGZ_EESchL7rlJk56uibVI,309
+ sinatools/wsd/disambiguator.py,sha256=8HrVAGpEQyrzwiuEreLX9X82WSL-U2Aeca0ttrtIw2Y,19998
+ sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
- SinaTools-0.1.20.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
- SinaTools-0.1.20.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
- SinaTools-0.1.20.dist-info/METADATA,sha256=mAAUGGq-SXM-psy8aOg0DOucVqbbJLkx7oDZGREno1Q,953
- SinaTools-0.1.20.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
- SinaTools-0.1.20.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
- SinaTools-0.1.20.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
- SinaTools-0.1.20.dist-info/RECORD,,
+ SinaTools-0.1.21.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.21.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.21.dist-info/METADATA,sha256=BrzKHUWwumfn52P0RM00koZDTl0txyL-cfr1bho-8mQ,953
+ SinaTools-0.1.21.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.21.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+ SinaTools-0.1.21.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+ SinaTools-0.1.21.dist-info/RECORD,,
sinatools/VERSION CHANGED
@@ -1 +1 @@
- 0.1.20
+ 0.1.21
sinatools/wsd/__init__.py CHANGED
@@ -3,9 +3,9 @@ import pickle
  from sinatools.DataDownload import downloader
  import os
 
- settings.glosses_dic = {}
+ glosses_dic = {}
  filename = 'glosses_dic.pickle'
  path =downloader.get_appdatadir()
  file_path = os.path.join(path, filename)
  with open(file_path, 'rb') as f:
- settings.glosses_dic = pickle.load(f)
+ glosses_dic = pickle.load(f)
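For orientation, a minimal sketch of the module-level loading in sinatools/wsd/__init__.py after this change, reconstructed from the hunk above (lines outside the hunk are assumed unchanged, and indentation is restored): the gloss dictionary now lives in the package as glosses_dic instead of being attached to the settings module.

    import os
    import pickle

    from sinatools.DataDownload import downloader

    # Gloss dictionary is now a module-level variable of sinatools.wsd,
    # loaded once at import time from the downloaded pickle file
    # (previously stored as settings.glosses_dic).
    glosses_dic = {}
    filename = 'glosses_dic.pickle'
    path = downloader.get_appdatadir()
    file_path = os.path.join(path, filename)
    with open(file_path, 'rb') as f:
        glosses_dic = pickle.load(f)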
sinatools/wsd/disambiguator.py CHANGED
@@ -7,6 +7,7 @@ from sinatools.utils.tokenizers_words import simple_word_tokenize
  from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
  from sinatools.morphology.morph_analyzer import analyze
  from sinatools.ner.entity_extractor import extract
+ from . import glosses_dic
 
 
  def distill_entities(entities):
@@ -135,12 +136,9 @@ def find_two_word_lemma(input_sentence):
  glosses_list = []
  concept_count = 0
  ids = data[0]["ids"]
- for lemma_id in ids:
- if lemma_id in settings.glosses_dic.keys():
- value = settings.glosses_dic[lemma_id]
- glosses_list.append(json.loads(value[1]))
- concept_count = concept_count + value[0]
-
+ for concepts in ids:
+ glosses_list.append(json.loads(concepts))
+ concept_count = concept_count + data[0]["POS"]
  found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
  output.append(found_2Word_lemma)
  i = i + 1
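The same edit recurs in each of the multi-word lemma finders (two- through five-word, this hunk and the three below): the per-lemma lookup in settings.glosses_dic is dropped, and each entry of data[0]["ids"] is parsed directly as a JSON gloss list. A rough sketch of the shared pattern, with data standing for the ALMA_multi_word result as in the hunks and indentation restored:

    glosses_list = []
    concept_count = 0
    ids = data[0]["ids"]
    for concepts in ids:
        # each entry in "ids" is now a JSON-encoded gloss list; no dictionary lookup
        glosses_list.append(json.loads(concepts))
        # the running count is now accumulated from the entry's "POS" field
        concept_count = concept_count + data[0]["POS"]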
@@ -160,12 +158,9 @@ def find_three_word_lemma(input_sentence):
  glosses_list = []
  concept_count = 0
  ids = data[0]["ids"]
- for lemma_id in ids:
- if lemma_id in settings.glosses_dic.keys():
- value = settings.glosses_dic[lemma_id]
- glosses_list.append(json.loads(value[1]))
- concept_count = concept_count + value[0]
-
+ for concepts in ids:
+ glosses_list.append(json.loads(concepts))
+ concept_count = concept_count + data[0]["POS"]
  found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
  output.append(found_3Word_lemma)
  i = i + 1
@@ -184,11 +179,9 @@ def find_four_word_lemma(input_sentence):
  glosses_list = []
  concept_count = 0
  ids = data[0]["ids"]
- for lemma_id in ids:
- if lemma_id in settings.glosses_dic.keys():
- value = settings.glosses_dic[lemma_id]
- glosses_list.append(json.loads(value[1]))
- concept_count = concept_count + value[0]
+ for concepts in ids:
+ glosses_list.append(json.loads(concepts))
+ concept_count = concept_count + data[0]["POS"]
  found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
  output.append(found_4Word_lemma)
  i = i + 1
@@ -208,11 +201,9 @@ def find_five_word_lemma(input_sentence):
  glosses_list = []
  concept_count = 0
  ids = data[0]["ids"]
- for lemma_id in ids:
- if lemma_id in settings.glosses_dic.keys():
- value = settings.glosses_dic[lemma_id]
- glosses_list.append(json.loads(value[1]))
- concept_count = concept_count + value[0]
+ for concepts in ids:
+ glosses_list.append(json.loads(concepts))
+ concept_count = concept_count + data[0]["POS"]
  found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
  output.append(found_5Word_lemma)
  i = i + 1
@@ -276,16 +267,18 @@ def find_glosses_using_ALMA(word):
  pos = data[0]["pos"]
  Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False) # Remove diacs , smallDiacs , shaddah , digit , alif , specialChars
  ids = []
- glosses_list = []
+ # glosses_list = []
  concept_count = 0
  lemma_id = data[0]["lemma_id"]
- if lemma_id in settings.glosses_dic.keys():
- value = settings.glosses_dic[lemma_id]
- glosses_list.append(json.loads(value[1]))
- concept_count = concept_count + value[0]
 
- return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses
+ if lemma_id in glosses_dic.keys():
+ value = glosses_dic[lemma_id]
+ glosses= json.loads(value[1])
+ # glosses_list.append(json.loads(value[1]))
+ concept_count = concept_count + value[0]
 
+ return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses
+
  def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
  word = normalizearabert(word)
  glosses_dictionary = {}
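For the single-word path, the net effect of this hunk is roughly the sketch below (reconstructed with indentation restored; how glosses is initialised when the lemma id is missing is not visible in the diff and is an assumption here): find_glosses_using_ALMA now reads the module-level glosses_dic and assigns the parsed glosses directly instead of appending to a list.

    concept_count = 0
    lemma_id = data[0]["lemma_id"]
    glosses = None  # assumed default; not shown in the hunk
    if lemma_id in glosses_dic.keys():
        value = glosses_dic[lemma_id]
        glosses = json.loads(value[1])            # value[1]: JSON-encoded glosses
        concept_count = concept_count + value[0]  # value[0]: concept count

    return word, Undiac_lemma, Diac_lemma, pos, concept_count, glosses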
@@ -309,7 +302,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
  return my_json
 
 
- def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner):
+ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
  output_list = []
  position = 0
  while position < len(input_sentence):
@@ -376,7 +369,7 @@ def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lem
  position = position + 1
 
 
-
+
  output_from_ner = delete_form_list(position, ner)
  ner = output_from_ner[0]
  if output_from_ner[1] != []:
@@ -385,11 +378,13 @@ def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lem
  my_json = {}
  word = output_from_ner[1][0][0]
  my_json['word'] = word
- my_json['concept_count'] = output_from_ner[1][0][2]
+ # my_json['concept_count'] = output_from_ner[1][0][2]
+ my_json['concept_count'] = '*'
  my_json['glosses'] = output_from_ner[1][0][1]
  my_json['Diac_lemma'] = output_from_ner[1][0][4]
  my_json['Undiac_lemma'] = output_from_ner[1][0][3]
  output_list.append(my_json)
+ # print("output list: ", output_list)
  position = position + 1
 
  if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
@@ -417,6 +412,15 @@ def disambiguate_glosses_main(word, sentence):
  my_json = {}
  my_json['word'] = word['word']
  glosses = word['glosses'][0]
+ # my_json['Gloss'] = glosses['gloss']
+ my_json['Concept_id'] = glosses['concept_id']
+ my_json['Diac_lemma'] = word['Diac_lemma']
+ my_json['Undiac_lemma'] = word['Undiac_lemma']
+ return my_json
+ elif concept_count == '*':
+ my_json = {}
+ my_json['word'] = word['word']
+ glosses = word['glosses'][0]
  my_json['Gloss'] = glosses['gloss']
  my_json['Concept_id'] = glosses['concept_id']
  my_json['Diac_lemma'] = word['Diac_lemma']
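Read together with the find_glosses change above (named-entity matches now get concept_count = '*'), the new branch in disambiguate_glosses_main returns the first gloss for such words directly, without going through the SALMA disambiguation step. A sketch of that branch alone, with indentation restored (the surrounding conditions are only partially visible in the hunk):

    elif concept_count == '*':
        # Named-entity match from find_glosses: take the first gloss as-is,
        # skipping the BERT-based disambiguation.
        my_json = {}
        my_json['word'] = word['word']
        glosses = word['glosses'][0]
        my_json['Gloss'] = glosses['gloss']
        my_json['Concept_id'] = glosses['concept_id']
        my_json['Diac_lemma'] = word['Diac_lemma']
        my_json['Undiac_lemma'] = word['Undiac_lemma']
        return my_json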
@@ -444,8 +448,7 @@ def WSD(sentence):
 
  ner = find_named_entities(" ".join(input_sentence))
 
- output_list = find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lemma, five_word_lemma, ner)
-
+ output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner)
  results = []
  for word in output_list:
  results.append(disambiguate_glosses_main(word, sentence))
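For context, the function touched by this last hunk is the module's top-level entry point. A hedged usage sketch: the import path follows the file layout in the RECORD above, the shape of the returned list is inferred from disambiguate_glosses_main (the return statement itself is outside the hunk), and the SALMA model plus glosses_dic.pickle must already have been fetched via sinatools.DataDownload.

    from sinatools.wsd.disambiguator import WSD

    # Tokenizes the sentence, finds multi-word lemmas and named entities,
    # then disambiguates each word against the loaded gloss dictionary.
    results = WSD("جامعة بيرزيت تقع قرب رام الله")
    for entry in results:
        print(entry.get('word'), entry.get('Concept_id'))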
sinatools/wsd/settings.py CHANGED
@@ -9,7 +9,6 @@ import pandas as pd
  from sinatools.DataDownload import downloader
  import os
 
- glosses_dic = {}
 
  model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
  path =downloader.get_appdatadir()
@@ -21,11 +20,6 @@ tokenizer_file_path = os.path.join(path, tokenizer_file_name)
 
  dftrue = pd.DataFrame()
 
- # model = BertForSequenceClassification.from_pretrained('{}'.format("bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"),
- # output_hidden_states = True,
- # num_labels=2
- # )
-
  model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
 
  tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))