SinaTools 0.1.29__py2.py3-none-any.whl → 0.1.31__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: SinaTools
- Version: 0.1.29
+ Version: 0.1.31
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
  Home-page: https://github.com/SinaLab/sinatools
  License: MIT license
@@ -1,5 +1,5 @@
- SinaTools-0.1.29.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
- sinatools/VERSION,sha256=3gAvtibHsL3Zih60tzJshU5QcbL40f0qBmAAXPGrB-Q,6
+ SinaTools-0.1.31.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/VERSION,sha256=0G-86l6j71-98w4IH9k4eO_HB6ywVt1xyOn6MUmJ0i4,6
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
@@ -20,7 +20,7 @@ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1
  sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
  sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
  sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sinatools/DataDownload/downloader.py,sha256=6xH55WlDhgtImPRFQ0AaeDFJjL8OMNU29x61PL8mZ2w,6468
+ sinatools/DataDownload/downloader.py,sha256=3UkRRH4TLbut10V1BgWO3EqJQaHVBqr6pAj7Fn4AQZ8,6511
  sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
  sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
  sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -75,8 +75,8 @@ sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKwe
  sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
  sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
  sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
- sinatools/morphology/morph_analyzer.py,sha256=3B-ewxFg_If83oYlk1bDdVS1clb-mgyAF4WgAMqcAVI,7009
- sinatools/ner/__init__.py,sha256=CLPaqUcvPGAA4lU-6hjAqjNfKJ5WtwRfsma6QkYZHEk,1379
+ sinatools/morphology/morph_analyzer.py,sha256=XrLkFqI89GmQuRyZB5X7GNIpfedfGNnQwHzrz5bDu5A,7190
+ sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
  sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
  sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
  sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
@@ -96,6 +96,8 @@ sinatools/ner/trainers/BaseTrainer.py,sha256=Ifz4SeTxJwVn1_uWZ3I9KbcSo2hLPN3ojsI
  sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
  sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
  sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
+ sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
+ sinatools/relations/relation_extractor.py,sha256=gADRNy0LZvJ021UVgSuV4DfHodRJ8bM7FeCkdV4DeeY,9719
  sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
  sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
  sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
@@ -116,10 +118,10 @@ sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
  sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
  sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
- SinaTools-0.1.29.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
- SinaTools-0.1.29.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
- SinaTools-0.1.29.dist-info/METADATA,sha256=IorbBd2klVKi0amBxKMKIEgyJHjRxcpJNxAQBgyNn04,3267
- SinaTools-0.1.29.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
- SinaTools-0.1.29.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
- SinaTools-0.1.29.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
- SinaTools-0.1.29.dist-info/RECORD,,
+ SinaTools-0.1.31.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.31.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.31.dist-info/METADATA,sha256=oT_7vNHXOs1oX4m07uSty-cASC1eBGYEDCfKT4W1mio,3267
+ SinaTools-0.1.31.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.31.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+ SinaTools-0.1.31.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+ SinaTools-0.1.31.dist-info/RECORD,,
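
The sha256= values in RECORD follow the wheel convention (PEP 376/427): the urlsafe-base64 encoding of the file's SHA-256 digest with trailing '=' padding stripped, followed by the file size in bytes. A minimal sketch for checking an entry:

    import base64
    import hashlib

    def record_hash(path):
        # Wheel RECORD-style hash: urlsafe base64 of the SHA-256 digest, no '=' padding.
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # For the extracted 0.1.31 wheel, record_hash("sinatools/VERSION") should match the
    # RECORD entry above (size 6, i.e. the 6 bytes "0.1.31").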
sinatools/DataDownload/downloader.py CHANGED
@@ -10,13 +10,14 @@ urls = {
  'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
  'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
  'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
- 'glosses_dic': 'https://sina.birzeit.edu/glosses_dic.pickle',
+ 'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
  'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
  'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
  'three_grams':'https://sina.birzeit.edu/three_grams.pickle',
  'two_grams':'https://sina.birzeit.edu/two_grams.pickle',
- 'synonyms_level2':'https://sina.birzeit.edu/graph_l2.pkl',
- 'synonyms_level3':'https://sina.birzeit.edu/graph_l3.pkl'
+ 'graph_l2':'https://sina.birzeit.edu/graph_l2.pkl',
+ 'graph_l3':'https://sina.birzeit.edu/graph_l3.pkl',
+ 'relation':'https://sina.birzeit.edu/relation_model.zip'
  }
 
  def get_appdatadir():
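
downloader.py keys every downloadable resource by name; 0.1.31 replaces the glosses_dic entry with one_gram, renames the synonyms graph keys to match their file names, and adds the relation_model archive used by the new relations package. As a hedged sketch (the helper below is illustrative, not SinaTools API), fetching one resource into the data directory could look like:

    import os
    import urllib.request

    from sinatools.DataDownload.downloader import get_appdatadir, urls

    def fetch_resource(key):
        # Hypothetical helper: download one of the URLs above into the SinaTools data dir.
        dest = os.path.join(get_appdatadir(), os.path.basename(urls[key]))
        if not os.path.exists(dest):
            urllib.request.urlretrieve(urls[key], dest)
        return dest

    # e.g. fetch_resource('relation') -> .../relation_model.zip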
sinatools/VERSION CHANGED
@@ -1 +1 @@
- 0.1.29
+ 0.1.31
@@ -3,6 +3,7 @@ from sinatools.utils.tokenizers_words import simple_word_tokenize
  from sinatools.utils.parser import arStrip
  from sinatools.utils.charsets import AR_CHARSET, AR_DIAC_CHARSET
  from sinatools.DataDownload.downloader import get_appdatadir
+ from sinatools.morphology.morph_analyzer import remove_punctuation
  from . import dictionary
 
  _IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
@@ -98,13 +99,16 @@ def analyze(text, language ='MSA', task ='full', flag="1"):
  token = arStrip(token , False , True , False , False , False , False)
  token = re.sub('[ٱ]','ﺍ',token)
  # token, freq, lemma, lemma_id, root, pos
- solution = [token, 0, token+"_0", 0, token, ""]
+ solution = [token, 0, token, 0, token, ""]
 
  if token.isdigit():
- solution[5] = "digit" #pos
+ solution[5] = "رقم" #pos
+
+ elif remove_punctuation(token).strip() == "":
+ solution[5] = "علامة ترقيم" #pos
 
  elif not _is_ar(token):
- solution[5] = "Foreign" #pos
+ solution[5] = "أجنبي" #pos
 
  else:
  result_token = find_solution(token,language,flag)
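
The analyze() changes swap the English placeholder POS labels for Arabic ones ("رقم" = number, "أجنبي" = foreign) and add a branch that labels punctuation-only tokens "علامة ترقيم" (punctuation mark) via the new remove_punctuation import. A self-contained sketch of that fallback, with remove_punctuation stubbed (the stub and its punctuation set are assumptions, not the SinaTools implementation):

    import string

    AR_PUNCT = "،؛؟«»"  # assumed Arabic punctuation; the real set lives in remove_punctuation

    def remove_punctuation(text):
        # Stand-in for sinatools.morphology.morph_analyzer.remove_punctuation
        return "".join(ch for ch in text if ch not in string.punctuation + AR_PUNCT)

    def fallback_pos(token, is_arabic):
        # Mirrors the new branches in analyze().
        if token.isdigit():
            return "رقم"           # number
        if remove_punctuation(token).strip() == "":
            return "علامة ترقيم"   # punctuation mark
        if not is_arabic:
            return "أجنبي"         # foreign
        return ""                  # otherwise analyze() consults the dictionary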
sinatools/ner/__init__.py CHANGED
@@ -7,8 +7,6 @@ import torch
  import pickle
  import json
  from argparse import Namespace
- from transformers import pipeline
- #from transformers import AutoModelForSequenceClassification
 
  tagger = None
  tag_vocab = None
@@ -38,6 +36,3 @@ if torch.cuda.is_available():
  train_config.trainer_config["kwargs"]["model"] = model
  tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
  tagger.load(os.path.join(model_path,"checkpoints"))
-
- pipe = pipeline("sentiment-analysis", model= os.path.join(path, "best_model"), return_all_scores =True, max_length=128, truncation=True)
- #pipe = AutoModelForSequenceClassification.from_pretrained(os.path.join(path, "best_model"))
sinatools/relations/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from sinatools.DataDownload import downloader
+ import os
+ from transformers import pipeline
+
+ path = downloader.get_appdatadir()
+
+ pipe = pipeline("sentiment-analysis", model=os.path.join(path, "relation_model"), return_all_scores=True, max_length=128, truncation=True)
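
The classifier pipeline moves out of sinatools/ner/__init__.py (where it loaded "best_model") into this new relations package, which loads the downloaded relation_model instead. relation_extractor.py consumes it via `from . import pipe`. With return_all_scores=True, each call yields one list of {'label', 'score'} dicts per input; a hedged usage sketch (label names depend on the relation_model and are not documented here):

    from sinatools.relations import pipe

    relation_sentence = "[CLS] ... [SEP] ... ..."  # template built by relation_extractor
    scores = pipe(relation_sentence)
    best = max(scores[0], key=lambda s: s["score"])  # scores[0]: all class scores for the one input
    print(best["label"], best["score"])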
sinatools/relations/relation_extractor.py ADDED
@@ -0,0 +1,199 @@
+ from urllib.request import Request, urlopen
+ from sinatools.ner.entity_extractor import extract
+ from sinatools.utils.tokenizer import sentence_tokenizer
+ from . import pipe
+
+ # ============================ Extract entities and their types ========================
+ def jsons_to_list_of_lists(json_list):
+     return [[d['token'], d['tags']] for d in json_list]
+
+ def entities_and_types(sentence):
+     output_list = jsons_to_list_of_lists(extract(sentence))
+     json_short = distill_entities(output_list)
+
+     entities = {}
+     for entity in json_short:
+         name = entity[0]
+         entity_type = entity[1]
+         entities[name] = entity_type
+
+     return entities
+
+ def distill_entities(entities):
+     # The list that collects the distilled output
+     list_output = list()
+
+     # Sort the tags and keep the result in temp_entities
+     temp_entities = sortTags(entities)
+
+     # A working list used to build the output
+     temp_list = list()
+
+     # Initialize temp_list
+     temp_list.append(["", "", 0, 0])
+     word_position = 0
+
+     # For each entity, convert its IOB tags into a distilled list.
+     for entity in temp_entities:
+         # Tag counter for this entity
+         counter_tag = 0
+         # For each tag
+         for tag in str(entity[1]).split():
+             # If the tag counter reaches the length of temp_list, append an empty slot
+             if counter_tag >= len(temp_list):
+                 temp_list.append(["", "", 0, 0])
+
+             # If the tag is O and the word position is not zero, move every
+             # non-empty element of temp_list into the output list
+             if "O" == tag and word_position != 0:
+                 for j in range(0, len(temp_list)):
+                     if temp_list[j][1] != "":
+                         list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                     temp_list[j][0] = ""
+                     temp_list[j][1] = ""
+                     temp_list[j][2] = word_position
+                     temp_list[j][3] = word_position
+             # If the tag is not O, splits on '-' into exactly two parts,
+             # and the first part of the split is B
+             elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                 # If the slot at counter_tag is not empty, flush it to the output list,
+                 # then reinitialize it with the new string and tag
+                 if temp_list[counter_tag][1] != "":
+                     list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                 temp_list[counter_tag][0] = str(entity[0]) + " "
+                 temp_list[counter_tag][1] = str(tag).split("-")[1]
+                 temp_list[counter_tag][2] = word_position
+                 temp_list[counter_tag][3] = word_position
+
+             # If the tag is not O, splits on '-' into exactly two parts,
+             # and the first part of the split is I
+             elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                 # For each slot of temp_list, check whether it carries the same tag as this one;
+                 # if so extend it, otherwise flush it to the output list and check the next slot
+                 for j in range(counter_tag, len(temp_list)):
+                     if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                         temp_list[j][0] += str(entity[0]) + " "
+                         temp_list[j][3] += 1
+                         break
+                     else:
+                         if temp_list[j][1] != "":
+                             list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                         temp_list[j][0] = ""
+                         temp_list[j][1] = ""
+                         temp_list[j][2] = word_position
+                         temp_list[j][3] = word_position
+             counter_tag += 1
+         word_position += 1
+     # At the end of the loop above some values may remain in temp_list;
+     # save them to the output list
+     for j in range(0, len(temp_list)):
+         if temp_list[j][1] != "":
+             list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+     return sorted(list_output, key=lambda x: (x[2]))
+
+ def sortTags(entities):
+     temp_entities = entities
+     temp_counter = 0
+     # For each entity, sort its tags: first check whether the previous tags
+     # have the same count as this tag, then sort the tags and verify they are correct
+     for entity in temp_entities:
+         tags = entity[1].split()
+         for tag in tags:
+             # If the counter is not 0, continue
+             if temp_counter != 0:
+                 # If this tag starts with I-, count how many matching tags appear
+                 # in this tag list and how many appear in the previous tag list
+                 if "I-" == tag[0:2]:
+                     counter_of_this_tag = 0
+                     counter_of_previous_tag = 0
+                     for word in tags:
+                         if tag.split("-")[1] in word:
+                             counter_of_this_tag += 1
+                     for word in temp_entities[temp_counter-1][1].split():
+                         if tag.split("-")[1] in word:
+                             counter_of_previous_tag += 1
+                     # If the previous tag count is larger than this tag count,
+                     # add an I- tag to this tag list
+                     if counter_of_previous_tag > counter_of_this_tag:
+                         tags.append("I-"+tag.split("-")[1])
+         # Sort the tags
+         tags.sort()
+         # Reverse the tags because they should begin with I
+         tags.reverse()
+         # If the counter is not 0 we can continue
+         if temp_counter != 0:
+             this_tags = tags
+             previous_tags = temp_entities[temp_counter - 1][1].split()
+             sorted_tags = list()
+
+             # If neither this tag list nor the previous one contains O, continue;
+             # otherwise ignore this tag
+             if "O" not in this_tags and "O" not in previous_tags:
+                 index = 0
+                 # For each previous tag, order this tag list by the previous tags; if it is I or B we can skip it
+                 for i in previous_tags:
+                     j = 0
+                     while this_tags and j < len(this_tags):
+                         if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                             sorted_tags.insert(index, this_tags.pop(j))
+                             break
+                         elif this_tags[j][0:2] == "B-":
+                             break
+                         j += 1
+                     index += 1
+                 sorted_tags += this_tags
+                 tags = sorted_tags
+         str_tag = " "
+         str_tag = str_tag.join(tags)
+         str_tag = str_tag.strip()
+         temp_entities[temp_counter][1] = str_tag
+         temp_counter += 1
+     return temp_entities
+
+ # ============= Prepare templates and categorize extracted entities ================
+ temp03 = {'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
+ categories = {
+     'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
+     'location': ['LOC', 'FAC', 'GPE'],
+     'happened at': ['DATE', 'TIME']
+ }
+
+ def get_entity_category(entity_type, categories):
+     for category, types in categories.items():
+         if entity_type in types:
+             return category
+     return None
+
+
+ # ============ Extract entities, their types, and categorize them ===============
+ def event_argument_relation_extraction(document):
+
+     sentences = sentence_tokenizer(document)
+     output_list = []
+     relation = {}
+     triple_id = 0
+     for sentence in sentences:
+         entities = entities_and_types(sentence)
+         entity_identifier = {entity: i for entity, i in zip(entities, range(1, len(entities)+1))}
+
+         event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
+         arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
+
+
+         for i in event_indices:
+             event_entity = list(entities.keys())[i]
+             for j in arg_event_indices:
+                 arg_name = list(entities.keys())[j]
+                 arg_type = entities[arg_name]
+                 category = get_entity_category(arg_type, categories)
+
+                 if category in temp03:
+                     relation_sentence = f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
+                     predicted_relation = pipe(relation_sentence)
+                     score = predicted_relation[0][0]['score']
+                     if score > 0.50:
+                         triple_id += 1
+                         relation = {"TripleID": triple_id, "Subject": {"ID": entity_identifier[event_entity], "Type": entities[event_entity], "Label": event_entity}, "Relation": category, "Object": {"ID": entity_identifier[arg_name], "Type": entities[arg_name], "Label": arg_name}}
+                         output_list.append(relation)
+
+     return output_list
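
End to end, the new module tokenizes a document into sentences, runs NER, pairs each EVENT entity with agent/location/time arguments using the Arabic templates in temp03 (roughly "place of occurrence of", "one of those affected by", "date of occurrence of"), and keeps pairs the classifier scores above 0.50. A hedged usage sketch (assumes the ner and relation_model resources are already downloaded; the input is any Arabic text):

    from sinatools.relations.relation_extractor import event_argument_relation_extraction

    document = "..."  # any Arabic document
    triples = event_argument_relation_extraction(document)
    for t in triples:
        # Each triple pairs an EVENT subject with an agent / location / 'happened at' object.
        print(t["TripleID"], t["Subject"]["Label"], t["Relation"], t["Object"]["Label"])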