SinaTools-0.1.11-py2.py3-none-any.whl → SinaTools-0.1.12-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/METADATA +2 -3
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/RECORD +47 -26
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/entry_points.txt +7 -3
- sinatools/CLI/DataDownload/download_files.py +0 -10
- sinatools/CLI/ner/corpus_entity_extractor.py +6 -6
- sinatools/CLI/ner/entity_extractor.py +18 -42
- sinatools/CLI/utils/arStrip.py +8 -8
- sinatools/CLI/utils/implication.py +0 -8
- sinatools/CLI/utils/jaccard.py +5 -14
- sinatools/CLI/utils/remove_latin.py +2 -2
- sinatools/CLI/utils/text_dublication_detector.py +25 -0
- sinatools/VERSION +1 -1
- sinatools/morphology/ALMA_multi_word.py +14 -16
- sinatools/morphology/__init__.py +32 -31
- sinatools/ner/__init__.py +28 -2
- sinatools/ner/data/__init__.py +1 -0
- sinatools/ner/data/datasets.py +146 -0
- sinatools/ner/data/transforms.py +118 -0
- sinatools/ner/data.py +124 -0
- sinatools/ner/data_format.py +124 -0
- sinatools/ner/datasets.py +146 -0
- sinatools/ner/entity_extractor.py +34 -54
- sinatools/ner/helpers.py +86 -0
- sinatools/ner/metrics.py +69 -0
- sinatools/ner/nn/BaseModel.py +22 -0
- sinatools/ner/nn/BertNestedTagger.py +34 -0
- sinatools/ner/nn/BertSeqTagger.py +17 -0
- sinatools/ner/nn/__init__.py +3 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -0
- sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- sinatools/ner/trainers/BertTrainer.py +163 -0
- sinatools/ner/trainers/__init__.py +3 -0
- sinatools/ner/transforms.py +119 -0
- sinatools/semantic_relatedness/__init__.py +20 -0
- sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- sinatools/synonyms/__init__.py +18 -0
- sinatools/synonyms/synonyms_generator.py +192 -0
- sinatools/utils/text_dublication_detector.py +110 -0
- sinatools/wsd/__init__.py +11 -0
- sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
- sinatools/{salma → wsd}/wsd.py +1 -1
- sinatools/CLI/salma/salma_tools.py +0 -68
- sinatools/salma/__init__.py +0 -12
- sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/top_level.txt +0 -0
- /sinatools/{salma → wsd}/settings.py +0 -0
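The substantive change in this release is the move of the word sense disambiguation (SALMA) code from `sinatools.salma` to `sinatools.wsd`, with the public entry point renamed from `SALMA` to `disambiguate`. A minimal usage sketch, inferred from the renamed paths and the docstring in the diff below (confirm the exact signature against the released 0.1.12 package):

```python
# Sketch of the 0.1.12 call path, based on the renamed module shown in this diff.
# The previous entry point lived in sinatools.salma.views (removed in 0.1.12).
from sinatools.wsd.disambiguator import disambiguate

# Per the diff, inputs longer than 500 characters return ["Input is too long"].
result = disambiguate("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
print(result)  # a list of dicts with Concept_id, word, Undiac_lemma and Diac_lemma
```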
sinatools/{salma/views.py → wsd/disambiguator.py} RENAMED

@@ -1,57 +1,136 @@
 import json
-from sinatools.
-from sinatools.
-from sinatools.
+from sinatools.wsd import settings
+from sinatools.wsd.wsd import normalizearabert
+from sinatools.wsd.wsd import GlossPredictor
 from sinatools.utils.parser import arStrip
 from sinatools.utils.tokenizers_words import simple_word_tokenize
 from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
-
+from sinatools.ner.entity_extractor import extract
+
+
+def distill_entities(entities):
+    list_output = list()
+
+    temp_entities = sortTags(entities)
+
+    temp_list = list()
+
+    temp_list.append(["", "", 0, 0])
+    word_position = 0
+
+    for entity in temp_entities:
+        counter_tag = 0
+        for tag in str(entity[1]).split():
+            if counter_tag >= len(temp_list):
+                temp_list.append(["", "", 0, 0])
+
+            if "O" == tag and word_position != 0:
+                for j in range(0, len(temp_list)):
+                    if temp_list[j][1] != "":
+                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                    temp_list[j][0] = ""
+                    temp_list[j][1] = ""
+                    temp_list[j][2] = word_position
+                    temp_list[j][3] = word_position
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                if temp_list[counter_tag][1] != "":
+                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                temp_list[counter_tag][0] = str(entity[0]) + " "
+                temp_list[counter_tag][1] = str(tag).split("-")[1]
+                temp_list[counter_tag][2] = word_position
+                temp_list[counter_tag][3] = word_position
+
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                for j in range(counter_tag,len(temp_list)):
+                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                        temp_list[j][0] += str(entity[0]) + " "
+                        temp_list[j][3] += 1
+                        break
+                    else:
+                        if temp_list[j][1] != "":
+                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                        temp_list[j][0] = ""
+                        temp_list[j][1] = ""
+                        temp_list[j][2] = word_position
+                        temp_list[j][3] = word_position
+            counter_tag += 1
+        word_position += 1
+    for j in range(0, len(temp_list)):
+        if temp_list[j][1] != "":
+            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+    return sorted(list_output, key=lambda x: (x[2]))
+
+
+def sortTags(entities):
+    temp_entities = entities
+    temp_counter = 0
+    for entity in temp_entities:
+        tags = entity[1].split()
+        for tag in tags:
+            if temp_counter != 0:
+                if "I-" == tag[0:2]:
+                    counter_of_this_tag = 0
+                    counter_of_previous_tag = 0
+                    for word in tags:
+                        if tag.split("-")[1] in word:
+                            counter_of_this_tag+=1
+                    for word in temp_entities[temp_counter-1][1].split():
+                        if tag.split("-")[1] in word:
+                            counter_of_previous_tag+=1
+                    if counter_of_previous_tag > counter_of_this_tag:
+                        tags.append("I-"+tag.split("-")[1])
+        tags.sort()
+        tags.reverse()
+        if temp_counter != 0:
+            this_tags = tags
+            previous_tags = temp_entities[temp_counter - 1][1].split()
+            sorted_tags = list()
+
+            if "O" not in this_tags and "O" not in previous_tags:
+                index = 0
+                for i in previous_tags:
+                    j = 0
+                    while this_tags and j < len(this_tags):
+                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                            sorted_tags.insert(index, this_tags.pop(j))
+                            break
+                        elif this_tags[j][0:2] == "B-":
+                            break
+                        j += 1
+                    index += 1
+                sorted_tags += this_tags
+                tags = sorted_tags
+        str_tag = " "
+        str_tag = str_tag.join(tags)
+        str_tag = str_tag.strip()
+        temp_entities[temp_counter][1] = str_tag
+        temp_counter += 1
+    return temp_entities
 
 def delete_form_list(position, word_lemma):
-    #"""
-    #Remove specific elements from the word_lemma list based on the given position.
-    #
-    #Parameters:
-    #position (int): The current position in the input sentence.
-    #word_lemma (list): List of word lemma details.
-    #
-    #Returns:
-    #list: Updated word_lemma list with the specific elements removed.
-    #list: The list of removed elements.
-    #int: The new position in the input sentence.
-    #"""
     tmp_word_lemma = []
     output = []
     for wordLemma in word_lemma:
-        if position == int(wordLemma[2]):
+        if position == int(wordLemma[2]):
             word = wordLemma[0]
             gloss = wordLemma[1]
             position = int(wordLemma[3])
             concept_count = int(wordLemma[4])
             undiac_multi_word_lemma = wordLemma[5]
             multi_word_lemma = wordLemma[6]
-            output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])
+            output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])
         elif position < int(wordLemma[2]):
             tmp_word_lemma.append(wordLemma)
     return tmp_word_lemma, output, position
 
 def find_two_word_lemma(input_sentence):
-    #"""
-    #Find two-word lemmas in the input sentence using the ALMA_multi_word function.
-    #
-    #Parameters:
-    #input_sentence (list): Tokenized input sentence.
-    #
-    #Returns:
-    #list: List of details of found two-word lemmas.
-    #"""
     i = 0
     output = []
     length = len(input_sentence)
     while i < length - 1:
         two_grams = input_sentence[i] +" "+ input_sentence[i + 1]
-        data = ALMA_multi_word(two_grams)
+        data = ALMA_multi_word(two_grams, 2)
         try :
             glosses_list = []
             concept_count = 0
@@ -62,12 +141,10 @@ def find_two_word_lemma(input_sentence):
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
 
-            # found two_grams
-            #found_2Word_lemma = [two_grams,data[0]['glosses'], i, i + 1,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_2Word_lemma)
             i = i + 1
-        except:
+        except:
             i = i + 1
     return output
 
@@ -78,7 +155,7 @@ def find_three_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 2:
         three_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2]
-        data = ALMA_multi_word(three_grams)
+        data = ALMA_multi_word(three_grams, 3)
         try:
             glosses_list = []
             concept_count = 0
@@ -89,7 +166,6 @@ def find_three_word_lemma(input_sentence):
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
 
-            #found_3Word_lemma = [three_grams, data[0]['glosses'], i, i + 2,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_3Word_lemma)
             i = i + 1
@@ -103,7 +179,7 @@ def find_four_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 3:
         four_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3]
-        data = ALMA_multi_word(four_grams)
+        data = ALMA_multi_word(four_grams, 4)
         try:
             glosses_list = []
             concept_count = 0
@@ -113,7 +189,6 @@ def find_four_word_lemma(input_sentence):
                 value = settings.glosses_dic[lemma_id]
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
-            #found_4Word_lemma = [four_grams, data[0]['glosses'], i, i + 3,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_4Word_lemma)
             i = i + 1
@@ -128,7 +203,7 @@ def find_five_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 4:
         five_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3] + " "+ input_sentence[i + 4]
-        data = ALMA_multi_word(five_grams)
+        data = ALMA_multi_word(five_grams, 5)
         try:
             glosses_list = []
             concept_count = 0
@@ -138,7 +213,6 @@ def find_five_word_lemma(input_sentence):
                 value = settings.glosses_dic[lemma_id]
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
-            #found_5Word_lemma = [five_grams, data[0]['glosses'], i, i + 4,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_5Word_lemma)
             i = i + 1
@@ -146,19 +220,16 @@ def find_five_word_lemma(input_sentence):
         i = i + 1
     return output
 
+def jsons_to_list_of_lists(json_list):
+    return [[d['token'], d['tags']] for d in json_list]
+
 def find_named_entities(string):
-    #"""
-    # Find named entities in the input string using a NER tool.
-    #
-    # Parameters:
-    # string (str): Input string.
-    #
-    # Returns:
-    # list: List of details of found named entities.
-    # """
     found_entities = []
-
-
+
+    ner_entites = extract(string)
+    list_of_entites = jsons_to_list_of_lists(ner_entites)
+    entites = distill_entities(list_of_entites)
+
     tag_gloss = {
         "PERS": "اسم شخص",
         "ORG": "اسم مؤسسة",
@@ -196,54 +267,26 @@ def find_named_entities(string):
 
 def find_glosses_using_ALMA(word):
 
-    data = analyze(word)
+    data = analyze(word, language ='MSA', task ='lemmatization', flag="1")
     Diac_lemma = ""
     pos = ""
     Undiac_lemma = ""
     glosses = []
-    Diac_lemma = data[0][
-    pos = data[0][
+    Diac_lemma = data[0]["lemma"]
+    pos = data[0]["pos"]
     Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False) # Remove diacs , smallDiacs , shaddah , digit , alif , specialChars
-    #"""
-    # Find glosses for the given word using the ALMA tool.
-    #
-    # Parameters:
-    # word (str): Input word.
-    #
-    # Returns:
-    # tuple: Details of the word including glosses, lemmas, and POS.
-    # """
     ids = []
     glosses_list = []
    	concept_count = 0
-
-
-
-
-
-
-    value = settings.glosses_dic[lemma_id]
-    glosses_list.append(json.loads(value[1]))
-    concept_count = concept_count + value[0]
-
-    #glosses = data[0][4]
-    #concept_count = data[0][3]
+    lemma_id = data[0]["lemma_id"]
+    if lemma_id in settings.glosses_dic.keys():
+        value = settings.glosses_dic[lemma_id]
+        glosses_list.append(json.loads(value[1]))
+        concept_count = concept_count + value[0]
+
     return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses
 
 def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
-    #"""
-    # Disambiguate glosses using the SALMA tool.
-    #
-    # Parameters:
-    # glosses (list): List of glosses.
-    # Diac_lemma (str): Diacritic lemma of the word.
-    # Undiac_lemma (str): Undiacritic lemma of the word.
-    # word (str): The word being analyzed.
-    # sentence (str): The sentence containing the word.
-    #
-    # Returns:
-    # dict: Disambiguated gloss details.
-    # """
     word = normalizearabert(word)
     glosses_dictionary = {}
     if glosses != None:
@@ -253,7 +296,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
 
             my_json = {}
             my_json['Concept_id'] = concept_id
-
+            # my_json['Gloss'] = gloss
             my_json['word'] = word
             my_json['Undiac_lemma'] = Undiac_lemma
             my_json['Diac_lemma'] = Diac_lemma
@@ -409,7 +452,7 @@ def WSD(sentence):
     return results
 
 
-def 
+def disambiguate(sentence):
     """
     This method disambiguate words within a sentence.
 
@@ -424,15 +467,14 @@ def SALMA(sentence):
     .. highlight:: python
     .. code-block:: python
 
-        from sinatools.
-
-        print(
+        from sinatools.wsd.disambiguator import disambiguate
+        result = disambiguate("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
+        print(result)
 
     #output
     [
     {
     "Concept_id": "303019218",
-    "Gloss": "ذهَب إلى عملِه:- قصَده، توجَّه إليه \"ذهَب إلى الجامعة/ بيروت - اذهَب إلى أبيك والتمس منه الصفح - ذهَب إلى قول فلان أخذ به - <اذْهَبْ إِلَى فِرْعَوْنَ إِنَّهُ طَغَى> طه/ 24 \". ذهَب رأسًا إليه",
     "word": "ذهبت",
     "Undiac_lemma": "ذهب",
     "Diac_lemma": "ذَهَبَ۪ 1"
@@ -444,7 +486,6 @@ def SALMA(sentence):
     },
     {
     "word": "جامعة بيرزيت",
-    "Gloss": جامعة فلسطينية تقع في بلدة بيرزيت، قرب مدينة رام الله، ويعود تاريخها إلى عام 1924 عندما تأسست كمدرسة ابتدائية ثم أصبحت جامعة عام 1975,
     "Concept_id": "334000099",
     "Diac_lemma": جامِعَة بيرزَيت,
     "Undiac_lemma": "جامعة بيرزيت"
@@ -452,8 +493,8 @@ def SALMA(sentence):
     ]
     """
     if len(sentence) > 500:
-        content =
+        content = ["Input is too long"]
         return content
     else:
         results = WSD(sentence)
-        return
+        return results
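In the new `find_named_entities` path above, the output of `extract(string)` is converted by `jsons_to_list_of_lists` into `[[token, tags], ...]` pairs and then merged by `distill_entities` into spans of the form `[text, tag, start_word, end_word]`. A hand-traced illustration of the flat, single-tag-per-token case (the tokens and tags below are made-up inputs, not taken from the package's own tests):

```python
# Illustration of distill_entities on the [[token, tags], ...] shape produced by
# jsons_to_list_of_lists in the diff above. Hand-traced, not run against the
# released 0.1.12 package.
from sinatools.wsd.disambiguator import distill_entities

entities = [
    ["ذهبت", "O"],
    ["إلى", "O"],
    ["جامعة", "B-ORG"],
    ["بيرزيت", "I-ORG"],
]

spans = distill_entities(entities)
# Expected: [["جامعة بيرزيت", "ORG", 2, 3]]
# The B-ORG/I-ORG run is merged into one span covering word positions 2 to 3.
print(spans)
```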
sinatools/{salma → wsd}/wsd.py RENAMED

sinatools/CLI/salma/salma_tools.py DELETED

@@ -1,68 +0,0 @@
-"""
-SALMA CLI
-
-About:
-------
-The SALMA command line interface (CLI) is a tool designed to utilize the SALMA function for processing Arabic sentences. This CLI allows users to input an Arabic sentence and receive a structured response that includes the processing result of the SALMA function.
-
-Usage:
-------
-Below is the usage information that can be generated by running the command with the --help option.
-
-.. code-block:: none
-
-    salma --text=TEXT
-    salma --file=INPUT_FILE
-
-Options:
---------
-.. code-block:: none
-
-    --text
-        The Arabic sentence to be processed by the SALMA function.
-    --file
-        The text file to be processed by the SALMA function.
-
-Examples:
----------
-.. code-block:: none
-
-    salma --text "your Arabic sentence here"
-    salma --file "path/to/your/file.txt"
-
-Note:
------
-
-.. code-block:: none
-
-    - The input sentence should be provided in Arabic.
-    - It is recommended that the length of the input sentence does not exceed 500 characters to ensure optimal performance and accurate results.
-
-"""
-
-import argparse
-import json
-from sinatools.salma.views import SALMA
-from sinatools.utils.readfile import read_file
-
-def main():
-    parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
-
-    parser.add_argument('--text', type=str, help='Input sentence to process')
-    parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process')
-
-    args = parser.parse_args()
-
-    if args.text is None and args.file is None:
-        print("Either --text or --file argument must be provided.")
-        return
-
-    text_content = args.text if args.text else " ".join(read_file(args.file))
-    result = SALMA(text_content)
-    print(json.dumps(result, ensure_ascii=False, indent=4))
-
-if __name__ == "__main__":
-    main()
-
-#salma --text "your Arabic sentence here"
-#salma --file "path/to/your/file.txt"
sinatools/salma/__init__.py DELETED

@@ -1,12 +0,0 @@
-from sinatools.salma import settings
-import pickle
-from sinatools.DataDownload import downloader
-import os
-
-#filename = 'glosses_dic.pickle'
-#path =downloader.get_appdatadir()
-#file_path = os.path.join(path, filename)
-#with open(file_path, 'rb') as f:
-#    #Load the serialized data from the file
-#    settings.glosses_dic = pickle.load(f)
-settings.glosses_dic = {}
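The deleted `sinatools/salma/__init__.py` left `settings.glosses_dic` as an empty dict and kept the pickle-loading code commented out; per the file list, WSD package initialisation now lives in `sinatools/wsd/__init__.py` (+11 lines), which this diff does not show. For reference, a sketch of that loader reconstructed from the commented-out lines in the deleted file (the pickle filename and the `wsd` import path are assumptions):

```python
# Reconstructed from the commented-out loader in the deleted salma/__init__.py.
# Not the actual 0.1.12 sinatools/wsd/__init__.py, which is not shown in this diff.
import os
import pickle

from sinatools.DataDownload import downloader
from sinatools.wsd import settings

filename = 'glosses_dic.pickle'        # filename used in the old commented-out code
path = downloader.get_appdatadir()     # SinaTools data directory
file_path = os.path.join(path, filename)

with open(file_path, 'rb') as f:
    # glosses_dic maps lemma_id -> (concept_count, glosses_json), per the lookups
    # in find_glosses_using_ALMA above.
    settings.glosses_dic = pickle.load(f)
```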
sinatools/utils/utils.py DELETED

Files without content changes: {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml, the AUTHORS.rst, LICENSE, WHEEL and top_level.txt dist-info files, and /sinatools/{salma → wsd}/settings.py (all listed above with +0 -0).