SinaTools 0.1.33__py2.py3-none-any.whl → 0.1.34__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/METADATA +1 -1
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/RECORD +13 -13
- sinatools/CLI/DataDownload/download_files.py +11 -9
- sinatools/DataDownload/downloader.py +34 -3
- sinatools/VERSION +1 -1
- sinatools/relations/relation_extractor.py +2 -2
- sinatools/wsd/disambiguator.py +90 -14
- {SinaTools-0.1.33.data → SinaTools-0.1.34.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/top_level.txt +0 -0
{SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.33
+Version: 0.1.34
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
```
{SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/RECORD CHANGED
```diff
@@ -1,10 +1,10 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.34.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=hygBh9__JFOajJA1gAKoJF_AUzBnbP5eCrBYLp3dwDI,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=u_DFXbHcIU_4Ub5Y0cL9_p1hL8h6LLWPemn9Al-XFgc,2603
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
 sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
 sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
@@ -20,7 +20,7 @@ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1
 sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
 sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
 sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/DataDownload/downloader.py,sha256=
+sinatools/DataDownload/downloader.py,sha256=VdUNgSqMKz1J-DuQD_eS1U2KWqEpy94WlSJ0pPODLig,7833
 sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
 sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
 sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -97,7 +97,7 @@ sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh
 sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
 sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
 sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
-sinatools/relations/relation_extractor.py,sha256=
+sinatools/relations/relation_extractor.py,sha256=UuDlaaR0ch9BFv4sBF1tr7P-P9xq8oRZF41tAze6_ok,9751
 sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
 sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
 sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
@@ -115,13 +115,13 @@ sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
 sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
 sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=
+sinatools/wsd/disambiguator.py,sha256=9ottQn_WwOFX5Trr0Rpg66-Jpaln5yJduFqP6cdOOBA,22616
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+SinaTools-0.1.34.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.34.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.34.dist-info/METADATA,sha256=lzqCZL8XdEQ2ZqcXH5WsoUmLBwv9TklIItPwCB0MqKc,3267
+SinaTools-0.1.34.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.34.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
+SinaTools-0.1.34.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.34.dist-info/RECORD,,
```
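Each RECORD line has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (the wheel RECORD convention). A quick sketch for recomputing one entry's digest:

```python
# Sketch: recompute a wheel RECORD digest (urlsafe base64 of SHA-256, "=" padding stripped).
import base64
import hashlib

def record_digest(path):
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# record_digest("sinatools/VERSION") should match the hash in the RECORD line above.
```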
sinatools/CLI/DataDownload/download_files.py CHANGED
```diff
@@ -34,6 +34,7 @@ import argparse
 from sinatools.DataDownload.downloader import download_file
 from sinatools.DataDownload.downloader import download_files
 from sinatools.DataDownload.downloader import get_appdatadir
+from sinatools.DataDownload.downloader import download_folder_from_hf
 from sinatools.DataDownload.downloader import urls
 
 
@@ -51,15 +52,16 @@ def main():
     for file in args.files:
         print("file: ", file)
         if file == "wsd":
-            download_file(urls["morph"])
-            download_file(urls["ner"])
-            download_file(urls["wsd_model"])
-
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
+            #download_file(urls["morph"])
+            #download_file(urls["ner"])
+            #download_file(urls["wsd_model"])
+            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
+            #download_file(urls["wsd_tokenizer"])
+            #download_file(urls["one_gram"])
+            #download_file(urls["five_grams"])
+            #download_file(urls["four_grams"])
+            #download_file(urls["three_grams"])
+            #download_file(urls["two_grams"])
         elif file == "synonyms":
             download_file(urls["graph_l2"])
             download_file(urls["graph_l3"])
```
sinatools/DataDownload/downloader.py CHANGED
```diff
@@ -8,8 +8,8 @@ import tarfile
 urls = {
     'morph': 'https://sina.birzeit.edu/lemmas_dic.pickle',
     'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
-    'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
-    'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
+    # 'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
+    # 'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
     'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
     'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
     'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
@@ -184,4 +184,35 @@ def download_files():
     None
     """
     for url in urls.values():
-        download_file(url)
+        download_file(url)
+
+
+def download_folder_from_hf(repo_url, folder_name):
+
+    # Hugging Face API to fetch files from the repository
+    api_url = f"https://huggingface.co/api/models/{repo_url}/tree/main/{folder_name}"
+
+    # Make the request to get the folder structure
+    response = requests.get(api_url)
+    if response.status_code != 200:
+        print(f"Failed to fetch folder contents. Status code: {response.status_code}")
+        return
+
+    folder_content = response.json()
+
+    # Download each file in the folder
+    for file_info in folder_content:
+        file_name = file_info["path"]
+        file_url = f"https://huggingface.co/{repo_url}/resolve/main/{file_name}"
+
+        # Download the file and save it to the output directory
+        file_response = requests.get(file_url)
+        if file_response.status_code == 200:
+            # Create any necessary directories
+            output_file_path = os.path.join(get_appdatadir(), file_name)
+            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+            with open(output_file_path, 'wb') as f:
+                f.write(file_response.content)
+            print(f"Downloaded: {file_name}")
+        else:
+            print(f"Failed to download {file_name}. Status code: {file_response.status_code}")
```
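The new `download_folder_from_hf` reads each file fully into memory via `response.content` before writing it out, which is expensive for large model weights. A hedged sketch of a streaming variant against the same Hugging Face endpoints (`dest_dir` and the 1 MiB `chunk_size` are illustrative choices, not part of SinaTools):

```python
# Sketch only: stream each file to disk instead of buffering it in memory.
import os
import requests

def download_folder_from_hf_streaming(repo_id, folder_name, dest_dir):
    # List the folder via the same Hugging Face tree API used above
    api_url = f"https://huggingface.co/api/models/{repo_id}/tree/main/{folder_name}"
    listing = requests.get(api_url, timeout=30)
    listing.raise_for_status()

    for file_info in listing.json():
        file_name = file_info["path"]
        file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
        out_path = os.path.join(dest_dir, file_name)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # stream=True keeps multi-GB checkpoints out of RAM
        with requests.get(file_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(out_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
```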
sinatools/VERSION CHANGED
```diff
@@ -1 +1 @@
-0.1.33
+0.1.34
```
sinatools/relations/relation_extractor.py CHANGED
```diff
@@ -193,7 +193,7 @@ def event_argument_relation_extraction(documnet):
             score = predicted_relation[0][0]['score']
             if score > 0.50:
                 triple_id+=1
-                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,}}
+                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,},"confidence": f"{score: .2f}"}
                 output_list.append(relation)
 
-    return output_list
+    return output_list
```
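The only functional change here is the added `"confidence"` key. Note that `f"{score: .2f}"` uses the space flag, so positive scores are rendered with a leading space:

```python
# Illustration of the new "confidence" formatting; 0.7342 is a made-up score.
score = 0.7342
print(f"{score: .2f}")  # " 0.73" (space flag pads non-negative numbers)
print(f"{score:.2f}")   # "0.73"
```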
sinatools/wsd/disambiguator.py CHANGED
```diff
@@ -8,6 +8,10 @@ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
 from sinatools.ner.entity_extractor import extract
 from . import glosses_dic
+import time
+#import concurrent
+#import threading
+import multiprocessing
 
 
 def distill_entities(entities):
@@ -256,7 +260,7 @@ def find_named_entities(string):
     return found_entities
 
 
-def find_glosses_using_ALMA(word):
+def find_glosses_using_ALMA(word, glosses_dic):
 
     data = analyze(word, language ='MSA', task ='full', flag="1")
     Diac_lemma = ""
@@ -302,7 +306,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
     return my_json
 
 
-def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
+def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner, glosses_dic):
     output_list = []
     position = 0
     while position < len(input_sentence):
@@ -389,7 +393,7 @@ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemm
 
         if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
             word = input_sentence[position]
-            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
+            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word, glosses_dic)
             my_json = {}
             my_json['word'] = word
             my_json['concept_count'] = concept_count
@@ -432,26 +436,95 @@ def disambiguate_glosses_main(word, sentence):
     glosses = word['glosses']
     Diac_lemma = word['Diac_lemma']
     Undiac_lemma = word['Undiac_lemma']
-
+    start = time.time()
+    x = disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
+    end = time.time()
+    print(f"disambiguate time: {end - start}")
+    return x
+
+
+def init_resources():
+    global glosses_dic
+
+
+# Wrapper function for multiprocessing
+def disambiguate_glosses_in_parallel(word_and_sentence):
+    word, sentence = word_and_sentence
+    return disambiguate_glosses_main(word, sentence)
 
 def WSD(sentence):
-
+    start = time.time()
     input_sentence = simple_word_tokenize(sentence)
-
+    end = time.time()
+    print(f"tokenizer time: {end - start}")
+
+    start = time.time()
     five_word_lemma = find_five_word_lemma(input_sentence)
+    end = time.time()
+    print(f"5grams time: {end - start}")
 
+    start = time.time()
     four_word_lemma = find_four_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"4grams time: {end - start}")
+
+    start = time.time()
     three_word_lemma = find_three_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"3grams time: {end - start}")
+
+    start = time.time()
     two_word_lemma = find_two_word_lemma(input_sentence)
-
-
+    end = time.time()
+    print(f"2grams time: {end - start}")
 
-
-
-
-
+    start = time.time()
+    ner = find_named_entities(" ".join(input_sentence))
+    end = time.time()
+    print(f"ner time: {end - start}")
+
+
+    start = time.time()
+    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner, glosses_dic_shared)
+    end = time.time()
+    print(f"lookup time: {end - start}")
+
+    # for word in output_list:
+    #     start = time.time()
+    #     results.append(disambiguate_glosses_main(word, sentence))
+    #     end = time.time()
+    #     print(f"disambiguate time: {end - start}")
+    # return results
+
+    # with concurrent.futures.ProcessPoolExecutor() as executor:
+    #     results = list(executor.map(lambda word: disambiguate_glosses_main(word, sentence), output_list))
+    # return results
+
+    # Create and start threads
+    # for word in output_list:
+    #     thread = threading.Thread(target=worker, args=(word, sentence))
+    #     threads.append(thread)
+    #     thread.start()
+    #
+    # for thread in threads:
+    #     thread.join()
+    #
+    # return threading_results
+
+    # Number of CPUs
+    num_cpus = multiprocessing.cpu_count()
+    print("num_cpus : ", num_cpus)
+
+    # Create a manager to hold shared data
+    # with multiprocessing.Manager() as manager:
+    #     glosses_dic_shared = manager.dict(glosses_dic)
+    #     with multiprocessing.Pool(num_cpus) as pool:
+    #         arguments = [(word, sentence) for word in output_list]
+    #         results = pool.starmap(disambiguate_glosses_main, arguments)
+
+    with multiprocessing.Pool(initializer=init_resources) as pool:
+        # Map the list of words to the disambiguation function in parallel
+        results = pool.map(disambiguate_glosses_in_parallel, [(word, sentence) for word in output_list])
     return results
 
 
@@ -497,5 +570,8 @@ def disambiguate(sentence):
         content = ["Input is too long"]
         return content
     else:
+        start = time.time()
        results = WSD(sentence)
+        end = time.time()
+        print(f"WSD total time: {end - start}")
        return results
```
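The parallel path in `WSD` uses `multiprocessing.Pool`'s `initializer` hook so each worker process sets up heavy resources once rather than receiving them pickled with every task. A minimal, self-contained sketch of that pattern (the dictionary and task here are illustrative, not SinaTools code):

```python
import multiprocessing

_glosses = None  # worker-local global, populated once per process

def init_worker():
    # Runs once in each worker process: the place to load large read-only resources.
    global _glosses
    _glosses = {"bank": ["river bank", "financial institution"]}  # stand-in for a big dict

def lookup(word):
    # Tasks read the worker-local resource instead of having it shipped per call.
    return word, _glosses.get(word, [])

if __name__ == "__main__":
    with multiprocessing.Pool(initializer=init_worker) as pool:
        print(pool.map(lookup, ["bank", "tree"]))
```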