SinaTools-0.1.33-py2.py3-none-any.whl → SinaTools-0.1.34-py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
SinaTools-0.1.34.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.33
+Version: 0.1.34
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
SinaTools-0.1.34.dist-info/RECORD CHANGED
@@ -1,10 +1,10 @@
-SinaTools-0.1.33.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
-sinatools/VERSION,sha256=bJzhviDvRLWnGN5ta-YXGMlqn4-UFzm_e_QbUFvKv1I,6
+SinaTools-0.1.34.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=hygBh9__JFOajJA1gAKoJF_AUzBnbP5eCrBYLp3dwDI,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=TzS0XjYDhusRBb2CRX1EjKjORa0wI6me_XoZ09dY4R8,2397
+sinatools/CLI/DataDownload/download_files.py,sha256=u_DFXbHcIU_4Ub5Y0cL9_p1hL8h6LLWPemn9Al-XFgc,2603
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
 sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
 sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
@@ -20,7 +20,7 @@ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1
 sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
 sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
 sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/DataDownload/downloader.py,sha256=3UkRRH4TLbut10V1BgWO3EqJQaHVBqr6pAj7Fn4AQZ8,6511
+sinatools/DataDownload/downloader.py,sha256=VdUNgSqMKz1J-DuQD_eS1U2KWqEpy94WlSJ0pPODLig,7833
 sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
 sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
 sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -97,7 +97,7 @@ sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh
 sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
 sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
 sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
-sinatools/relations/relation_extractor.py,sha256=gADRNy0LZvJ021UVgSuV4DfHodRJ8bM7FeCkdV4DeeY,9719
+sinatools/relations/relation_extractor.py,sha256=UuDlaaR0ch9BFv4sBF1tr7P-P9xq8oRZF41tAze6_ok,9751
 sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
 sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
 sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
@@ -115,13 +115,13 @@ sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
 sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
 sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
+sinatools/wsd/disambiguator.py,sha256=9ottQn_WwOFX5Trr0Rpg66-Jpaln5yJduFqP6cdOOBA,22616
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.33.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
-SinaTools-0.1.33.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
-SinaTools-0.1.33.dist-info/METADATA,sha256=WjUqSrwvqgsY3foTp7i3axxereSFYQOmIliv5uZ6tIY,3267
-SinaTools-0.1.33.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
-SinaTools-0.1.33.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
-SinaTools-0.1.33.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
-SinaTools-0.1.33.dist-info/RECORD,,
+SinaTools-0.1.34.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.34.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.34.dist-info/METADATA,sha256=lzqCZL8XdEQ2ZqcXH5WsoUmLBwv9TklIItPwCB0MqKc,3267
+SinaTools-0.1.34.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.34.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
+SinaTools-0.1.34.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.34.dist-info/RECORD,,
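Each RECORD entry has the form path,sha256=<digest>,size, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash, per the wheel spec; that is why every edited or renamed file above carries a new hash. A minimal sketch for recomputing a digest when checking such an entry (the helper name is ours, not part of SinaTools):

    import base64
    import hashlib

    def record_digest(data: bytes) -> str:
        # Unpadded URL-safe base64 of the SHA-256 hash: the encoding
        # used by the sha256= fields in wheel RECORD files.
        digest = hashlib.sha256(data).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # The 6-byte sinatools/VERSION file is presumably exactly b"0.1.34",
    # so record_digest(b"0.1.34") should reproduce the value recorded above.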
sinatools/CLI/DataDownload/download_files.py CHANGED
@@ -34,6 +34,7 @@ import argparse
 from sinatools.DataDownload.downloader import download_file
 from sinatools.DataDownload.downloader import download_files
 from sinatools.DataDownload.downloader import get_appdatadir
+from sinatools.DataDownload.downloader import download_folder_from_hf
 from sinatools.DataDownload.downloader import urls
 
 
@@ -51,15 +52,16 @@ def main():
     for file in args.files:
         print("file: ", file)
         if file == "wsd":
-            download_file(urls["morph"])
-            download_file(urls["ner"])
-            download_file(urls["wsd_model"])
-            download_file(urls["wsd_tokenizer"])
-            download_file(urls["one_gram"])
-            download_file(urls["five_grams"])
-            download_file(urls["four_grams"])
-            download_file(urls["three_grams"])
-            download_file(urls["two_grams"])
+            #download_file(urls["morph"])
+            #download_file(urls["ner"])
+            #download_file(urls["wsd_model"])
+            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
+            #download_file(urls["wsd_tokenizer"])
+            #download_file(urls["one_gram"])
+            #download_file(urls["five_grams"])
+            #download_file(urls["four_grams"])
+            #download_file(urls["three_grams"])
+            #download_file(urls["two_grams"])
         elif file == "synonyms":
             download_file(urls["graph_l2"])
             download_file(urls["graph_l3"])
sinatools/DataDownload/downloader.py CHANGED
@@ -8,8 +8,8 @@ import tarfile
 urls = {
     'morph': 'https://sina.birzeit.edu/lemmas_dic.pickle',
     'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
-    'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
-    'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
+    # 'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
+    # 'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
     'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
     'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
     'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
@@ -184,4 +184,35 @@ def download_files():
     None
     """
     for url in urls.values():
-        download_file(url)
+        download_file(url)
+
+
+def download_folder_from_hf(repo_url, folder_name):
+
+    # Hugging Face API to fetch files from the repository
+    api_url = f"https://huggingface.co/api/models/{repo_url}/tree/main/{folder_name}"
+
+    # Make the request to get the folder structure
+    response = requests.get(api_url)
+    if response.status_code != 200:
+        print(f"Failed to fetch folder contents. Status code: {response.status_code}")
+        return
+
+    folder_content = response.json()
+
+    # Download each file in the folder
+    for file_info in folder_content:
+        file_name = file_info["path"]
+        file_url = f"https://huggingface.co/{repo_url}/resolve/main/{file_name}"
+
+        # Download the file and save it to the output directory
+        file_response = requests.get(file_url)
+        if file_response.status_code == 200:
+            # Create any necessary directories
+            output_file_path = os.path.join(get_appdatadir(), file_name)
+            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+            with open(output_file_path, 'wb') as f:
+                f.write(file_response.content)
+            print(f"Downloaded: {file_name}")
+        else:
+            print(f"Failed to download {file_name}. Status code: {file_response.status_code}")
sinatools/VERSION CHANGED
@@ -1 +1 @@
-0.1.33
+0.1.34
sinatools/relations/relation_extractor.py CHANGED
@@ -193,7 +193,7 @@ def event_argument_relation_extraction(documnet):
             score = predicted_relation[0][0]['score']
             if score > 0.50:
                 triple_id+=1
-                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,}}
+                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,},"confidence": f"{score: .2f}"}
                 output_list.append(relation)
 
-    return output_list
+    return output_list
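The functional change is the added "confidence" field, serialized with f"{score: .2f}"; note the space in the format spec, which renders a leading space before the digits. A hypothetical triple after this change, with made-up IDs, types, and labels; only the shape and field names come from the code:

    # Hypothetical output triple; values are illustrative.
    {
        "TripleID": 1,
        "Subject": {"ID": 0, "Type": "...", "Label": "..."},
        "Relation": "...",
        "Object": {"ID": 1, "Type": "...", "Label": "..."},
        "confidence": " 0.87",
    }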
sinatools/wsd/disambiguator.py CHANGED
@@ -8,6 +8,10 @@ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
 from sinatools.ner.entity_extractor import extract
 from . import glosses_dic
+import time
+#import concurrent
+#import threading
+import multiprocessing
 
 
 def distill_entities(entities):
@@ -256,7 +260,7 @@ def find_named_entities(string):
     return found_entities
 
 
-def find_glosses_using_ALMA(word):
+def find_glosses_using_ALMA(word, glosses_dic):
 
     data = analyze(word, language ='MSA', task ='full', flag="1")
     Diac_lemma = ""
@@ -302,7 +306,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
     return my_json
 
 
-def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
+def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner, glosses_dic):
     output_list = []
     position = 0
     while position < len(input_sentence):
@@ -389,7 +393,7 @@ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemm
 
         if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
             word = input_sentence[position]
-            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
+            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word, glosses_dic)
             my_json = {}
             my_json['word'] = word
             my_json['concept_count'] = concept_count
@@ -432,26 +436,95 @@ def disambiguate_glosses_main(word, sentence):
     glosses = word['glosses']
     Diac_lemma = word['Diac_lemma']
     Undiac_lemma = word['Undiac_lemma']
-    return disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
+    start = time.time()
+    x = disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
+    end = time.time()
+    print(f"disambiguate time: {end - start}")
+    return x
+
+
+def init_resources():
+    global glosses_dic
+
+
+# Wrapper function for multiprocessing
+def disambiguate_glosses_in_parallel(word_and_sentence):
+    word, sentence = word_and_sentence
+    return disambiguate_glosses_main(word, sentence)
 
 def WSD(sentence):
-
+    start = time.time()
     input_sentence = simple_word_tokenize(sentence)
-
+    end = time.time()
+    print(f"tokenizer time: {end - start}")
+
+    start = time.time()
     five_word_lemma = find_five_word_lemma(input_sentence)
+    end = time.time()
+    print(f"5grams time: {end - start}")
 
+    start = time.time()
     four_word_lemma = find_four_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"4grams time: {end - start}")
+
+    start = time.time()
     three_word_lemma = find_three_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"3grams time: {end - start}")
+
+    start = time.time()
     two_word_lemma = find_two_word_lemma(input_sentence)
-
-    ner = find_named_entities(" ".join(input_sentence))
+    end = time.time()
+    print(f"2grams time: {end - start}")
 
-    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner)
-    results = []
-    for word in output_list:
-        results.append(disambiguate_glosses_main(word, sentence))
+    start = time.time()
+    ner = find_named_entities(" ".join(input_sentence))
+    end = time.time()
+    print(f"ner time: {end - start}")
+
+
+    start = time.time()
+    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner, glosses_dic_shared)
+    end = time.time()
+    print(f"lookup time: {end - start}")
+
+    # for word in output_list:
+    #     start = time.time()
+    #     results.append(disambiguate_glosses_main(word, sentence))
+    #     end = time.time()
+    #     print(f"disambiguate time: {end - start}")
+    # return results
+
+    # with concurrent.futures.ProcessPoolExecutor() as executor:
+    #     results = list(executor.map(lambda word: disambiguate_glosses_main(word, sentence), output_list))
+    # return results
+
+    # Create and start threads
+    # for word in output_list:
+    #     thread = threading.Thread(target=worker, args=(word, sentence))
+    #     threads.append(thread)
+    #     thread.start()
+    #
+    # for thread in threads:
+    #     thread.join()
+    #
+    # return threading_results
+
+    # Number of CPUs
+    num_cpus = multiprocessing.cpu_count()
+    print("num_cpus : ", num_cpus)
+
+    # Create a manager to hold shared data
+    # with multiprocessing.Manager() as manager:
+    #     glosses_dic_shared = manager.dict(glosses_dic)
+    #     with multiprocessing.Pool(num_cpus) as pool:
+    #         arguments = [(word, sentence) for word in output_list]
+    #         results = pool.starmap(disambiguate_glosses_main, arguments)
+
+    with multiprocessing.Pool(initializer=init_resources) as pool:
+        # Map the list of words to the disambiguation function in parallel
+        results = pool.map(disambiguate_glosses_in_parallel, [(word, sentence) for word in output_list])
     return results
 
 
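Three things in this hunk look unfinished: WSD passes glosses_dic_shared to find_glosses, but that name is only assigned inside the commented-out Manager block (a NameError as committed); init_resources declares global glosses_dic without binding anything; and num_cpus is computed and printed but never passed to the Pool. A minimal working version of the pool-initializer pattern the code appears to be reaching for might look like the sketch below; load_glosses_dic stands in for however the package materializes its module-level glosses_dic import, so treat the names as assumptions:

    import multiprocessing

    _glosses_dic = None

    def load_glosses_dic():
        # Hypothetical stand-in for the package's `from . import glosses_dic`.
        return {}

    def init_resources():
        # Runs once in each worker process, so the dictionary is loaded
        # per worker instead of being pickled along with every task.
        global _glosses_dic
        _glosses_dic = load_glosses_dic()

    def disambiguate_in_parallel(word_and_sentence):
        word, sentence = word_and_sentence
        # Worker-side call into the package's disambiguation function,
        # which would read _glosses_dic via module globals.
        return disambiguate_glosses_main(word, sentence)

    def run_wsd(output_list, sentence):
        with multiprocessing.Pool(initializer=init_resources) as pool:
            return pool.map(disambiguate_in_parallel,
                            [(word, sentence) for word in output_list])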
@@ -497,5 +570,8 @@ def disambiguate(sentence):
         content = ["Input is too long"]
         return content
     else:
+        start = time.time()
         results = WSD(sentence)
+        end = time.time()
+        print(f"WSD total time: {end - start}")
         return results
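The same start/end/print pattern now appears repeatedly across WSD, disambiguate_glosses_main, and disambiguate. If this instrumentation is kept, a small context manager would express it once; a sketch matching the print format used above:

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label):
        # Print the wall-clock time of the wrapped block in the same
        # "<label> time: <seconds>" format used by the diff above.
        start = time.time()
        try:
            yield
        finally:
            print(f"{label} time: {time.time() - start}")

    # e.g. inside WSD():
    # with timed("tokenizer"):
    #     input_sentence = simple_word_tokenize(sentence)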