SinaTools-0.1.35-py2.py3-none-any.whl → SinaTools-0.1.36-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/METADATA +1 -1
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/RECORD +11 -15
- sinatools/CLI/DataDownload/download_files.py +9 -8
- sinatools/VERSION +1 -1
- sinatools/wsd/disambiguator.py +14 -90
- sinatools/ner/data.py +0 -124
- sinatools/ner/relation_extractor.py +0 -201
- sinatools/utils/implication.py +0 -662
- sinatools/utils/jaccard.py +0 -247
- {SinaTools-0.1.35.data → SinaTools-0.1.36.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/top_level.txt +0 -0
{SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.35
+Version: 0.1.36
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
{SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/RECORD CHANGED

@@ -1,10 +1,10 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.36.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=4WO9ZLWQOVGEf7BUbcCdCnR4_2Fp3iJiMmtiLd4Vzo8,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=EezvbukR3pZ8s6mGZnzTcjsbo3CBDlC0g6KhJWlYp1w,2686
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
 sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
 sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
@@ -77,13 +77,11 @@ sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma
 sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
 sinatools/morphology/morph_analyzer.py,sha256=JOH2UWKNQWo5UzpWNzP9R1D3B3qLSogIiMp8n0N_56o,7177
 sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
-sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
 sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
 sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
 sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
 sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
 sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
-sinatools/ner/relation_extractor.py,sha256=a85xGX6V72fDpJk0GKmmtlWf8S8ezY-2pm5oGc9_ESY,9750
 sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
 sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
 sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
@@ -104,8 +102,6 @@ sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO839
 sinatools/synonyms/synonyms_generator.py,sha256=jRd0D3_kn-jYBaZzqY-7oOy0SFjSJ-mjM7JhsySzX58,9037
 sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
-sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
-sinatools/utils/jaccard.py,sha256=kLIptPNB2VIqnemVve9auyOL1kXHIsCkKCEwxFM8yP4,10114
 sinatools/utils/parser.py,sha256=qvHdln5R5CAv_0UOJWe0mcp8JCsGqgazoeIIkoALH88,6259
 sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
 sinatools/utils/similarity.py,sha256=CgKOJpRAU5UaSjOg-sdZcACCNl9tuKDRwdFAKATCL_w,10762
@@ -115,13 +111,13 @@ sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
 sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
 sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=
+sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+SinaTools-0.1.36.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.36.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.36.dist-info/METADATA,sha256=vukmjuNbUETy8EMIkA64uOOwAS5WO5WuWOOMeBoR6ps,3267
+SinaTools-0.1.36.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.36.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
+SinaTools-0.1.36.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.36.dist-info/RECORD,,
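The sha256 values in RECORD follow the standard wheel convention: the URL-safe base64 encoding of the raw SHA-256 digest, with trailing `=` padding stripped. A minimal sketch for reproducing one of these digests locally (the path below is only an example, not something shipped by SinaTools):

```python
import base64
import hashlib

def record_digest(path):
    """Return a file's sha256 digest in wheel-RECORD form
    (URL-safe base64 without '=' padding)."""
    with open(path, "rb") as fh:
        digest = hashlib.sha256(fh.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Example: an installed sinatools/VERSION should hash to
# 4WO9ZLWQOVGEf7BUbcCdCnR4_2Fp3iJiMmtiLd4Vzo8 if it matches the 0.1.36 RECORD.
# print(record_digest("path/to/site-packages/sinatools/VERSION"))
```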
sinatools/CLI/DataDownload/download_files.py CHANGED

@@ -52,16 +52,17 @@ def main():
     for file in args.files:
         print("file: ", file)
         if file == "wsd":
-
-
+            download_file(urls["morph"])
+            download_file(urls["ner"])
             #download_file(urls["wsd_model"])
-            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
             #download_file(urls["wsd_tokenizer"])
-
-
-
-
-
+            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
+            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
+            download_file(urls["one_gram"])
+            download_file(urls["five_grams"])
+            download_file(urls["four_grams"])
+            download_file(urls["three_grams"])
+            download_file(urls["two_grams"])
         elif file == "synonyms":
             download_file(urls["graph_l2"])
             download_file(urls["graph_l3"])
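The new calls rely on two SinaTools helpers, `download_file` and `download_folder_from_hf`, whose implementations are not part of this diff. As a rough sketch of what fetching one folder from the `SinaLab/ArabGlossBERT` repository could look like using `huggingface_hub` (an assumption for illustration, not the actual SinaTools code):

```python
from huggingface_hub import snapshot_download  # assumed stand-in, not the SinaTools helper

def fetch_hf_folder(repo_id: str, folder: str, local_dir: str = "."):
    """Download only the files under `folder/` from a Hugging Face repo."""
    return snapshot_download(
        repo_id=repo_id,
        allow_patterns=[f"{folder}/*"],
        local_dir=local_dir,
    )

# fetch_hf_folder("SinaLab/ArabGlossBERT", "bert-base-arabertv02")
```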
sinatools/VERSION CHANGED

@@ -1 +1 @@
-0.1.35
+0.1.36
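After upgrading, the installed release can be confirmed from Python without reading the VERSION data file directly:

```python
from importlib.metadata import version

print(version("SinaTools"))  # expected to print 0.1.36 after the upgrade
```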
sinatools/wsd/disambiguator.py CHANGED

@@ -8,10 +8,6 @@ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
 from sinatools.ner.entity_extractor import extract
 from . import glosses_dic
-import time
-#import concurrent
-#import threading
-import multiprocessing


 def distill_entities(entities):
@@ -260,7 +256,7 @@ def find_named_entities(string):
     return found_entities


-def find_glosses_using_ALMA(word
+def find_glosses_using_ALMA(word):

     data = analyze(word, language ='MSA', task ='full', flag="1")
     Diac_lemma = ""
@@ -306,7 +302,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
     return my_json


-def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner
+def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
     output_list = []
     position = 0
     while position < len(input_sentence):
@@ -393,7 +389,7 @@ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemm

         if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
             word = input_sentence[position]
-            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word
+            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
             my_json = {}
             my_json['word'] = word
             my_json['concept_count'] = concept_count
@@ -436,95 +432,26 @@ def disambiguate_glosses_main(word, sentence):
     glosses = word['glosses']
     Diac_lemma = word['Diac_lemma']
     Undiac_lemma = word['Undiac_lemma']
-
-    x = disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
-    end = time.time()
-    print(f"disambiguate time: {end - start}")
-    return x
-
-
-def init_resources():
-    global glosses_dic
-
-
-# Wrapper function for multiprocessing
-def disambiguate_glosses_in_parallel(word_and_sentence):
-    word, sentence = word_and_sentence
-    return disambiguate_glosses_main(word, sentence)
+    return disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)

 def WSD(sentence):
-
+
     input_sentence = simple_word_tokenize(sentence)
-
-    print(f"tokenizer time: {end - start}")
-
-    start = time.time()
+
     five_word_lemma = find_five_word_lemma(input_sentence)
-    end = time.time()
-    print(f"5grams time: {end - start}")

-    start = time.time()
     four_word_lemma = find_four_word_lemma(input_sentence)
-
-    print(f"4grams time: {end - start}")
-
-    start = time.time()
+
     three_word_lemma = find_three_word_lemma(input_sentence)
-
-    print(f"3grams time: {end - start}")
-
-    start = time.time()
+
     two_word_lemma = find_two_word_lemma(input_sentence)
-
-    print(f"2grams time: {end - start}")
-
-    start = time.time()
+
     ner = find_named_entities(" ".join(input_sentence))
-
-
-
-
-
-    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner, glosses_dic_shared)
-    end = time.time()
-    print(f"lookup time: {end - start}")
-
-    # for word in output_list:
-    #     start = time.time()
-    #     results.append(disambiguate_glosses_main(word, sentence))
-    #     end = time.time()
-    #     print(f"disambiguate time: {end - start}")
-    # return results
-
-    # with concurrent.futures.ProcessPoolExecutor() as executor:
-    #     results = list(executor.map(lambda word: disambiguate_glosses_main(word, sentence), output_list))
-    # return results
-
-    # Create and start threads
-    # for word in output_list:
-    #     thread = threading.Thread(target=worker, args=(word, sentence))
-    #     threads.append(thread)
-    #     thread.start()
-    #
-    # for thread in threads:
-    #     thread.join()
-    #
-    # return threading_results
-
-    # Number of CPUs
-    num_cpus = multiprocessing.cpu_count()
-    print("num_cpus : ", num_cpus)
-
-    # Create a manager to hold shared data
-    # with multiprocessing.Manager() as manager:
-    #     glosses_dic_shared = manager.dict(glosses_dic)
-    #     with multiprocessing.Pool(num_cpus) as pool:
-    #         arguments = [(word, sentence) for word in output_list]
-    #         results = pool.starmap(disambiguate_glosses_main, arguments)
-
-    with multiprocessing.Pool(initializer=init_resources) as pool:
-        # Map the list of words to the disambiguation function in parallel
-        results = pool.map(disambiguate_glosses_in_parallel, [(word, sentence) for word in output_list])
+
+    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner)
+    results = []
+    for word in output_list:
+        results.append(disambiguate_glosses_main(word, sentence))
     return results


@@ -570,8 +497,5 @@ def disambiguate(sentence):
         content = ["Input is too long"]
         return content
     else:
-        start = time.time()
         results = WSD(sentence)
-        end = time.time()
-        print(f"WSD total time: {end - start}")
         return results
sinatools/ner/data.py DELETED

@@ -1,124 +0,0 @@
-from torch.utils.data import DataLoader
-from torchtext.vocab import vocab
-from collections import Counter, namedtuple
-import logging
-import re
-import itertools
-from sinatools.ner.helpers import load_object
-from sinatools.ner.datasets import Token
-from sinatools.utils.tokenizers_words import simple_word_tokenize
-
-logger = logging.getLogger(__name__)
-
-
-def conll_to_segments(filename):
-    """
-    Convert CoNLL files to segments. This return list of segments and each segment is
-    a list of tuples (token, tag)
-    :param filename: Path
-    :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
-    """
-    segments, segment = list(), list()
-
-    with open(filename, "r") as fh:
-        for token in fh.read().splitlines():
-            if not token.strip():
-                segments.append(segment)
-                segment = list()
-            else:
-                parts = token.split()
-                token = Token(text=parts[0], gold_tag=parts[1:])
-                segment.append(token)
-
-        segments.append(segment)
-
-    return segments
-
-
-def parse_conll_files(data_paths):
-    """
-    Parse CoNLL formatted files and return list of segments for each file and index
-    the vocabs and tags across all data_paths
-    :param data_paths: tuple(Path) - tuple of filenames
-    :return: tuple( [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i]
-                    [[(token, tag), ...], [(token, tag), ...]], -> segments for data_paths[i+1],
-                    ...
-                  )
-             List of segments for each dataset and each segment has list of (tokens, tags)
-    """
-    vocabs = namedtuple("Vocab", ["tags", "tokens"])
-    datasets, tags, tokens = list(), list(), list()
-
-    for data_path in data_paths:
-        dataset = conll_to_segments(data_path)
-        datasets.append(dataset)
-        tokens += [token.text for segment in dataset for token in segment]
-        tags += [token.gold_tag for segment in dataset for token in segment]
-
-    # Flatten list of tags
-    tags = list(itertools.chain(*tags))
-
-    # Generate vocabs for tags and tokens
-    tag_vocabs = tag_vocab_by_type(tags)
-    tag_vocabs.insert(0, vocab(Counter(tags)))
-    vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
-    return tuple(datasets), vocabs
-
-
-def tag_vocab_by_type(tags):
-    vocabs = list()
-    c = Counter(tags)
-    tag_names = c.keys()
-    tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
-
-    for tag_type in tag_types:
-        r = re.compile(".*-" + tag_type)
-        t = list(filter(r.match, tags)) + ["O"]
-        vocabs.append(vocab(Counter(t), specials=["<pad>"]))
-
-    return vocabs
-
-
-def text2segments(text):
-    """
-    Convert text to a datasets and index the tokens
-    """
-    #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
-    list_of_tokens = simple_word_tokenize(text)
-    dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
-    tokens = [token.text for segment in dataset for token in segment]
-
-    # Generate vocabs for the tokens
-    segment_vocab = vocab(Counter(tokens), specials=["UNK"])
-    return dataset, segment_vocab
-
-
-def get_dataloaders(
-    datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
-):
-    """
-    From the datasets generate the dataloaders
-    :param datasets: list - list of the datasets, list of list of segments and tokens
-    :param batch_size: int
-    :param num_workers: int
-    :param shuffle: boolean - to shuffle the data or not
-    :return: List[torch.utils.data.DataLoader]
-    """
-    dataloaders = list()
-
-    for i, examples in enumerate(datasets):
-        data_config["kwargs"].update({"examples": examples, "vocab": vocab})
-        dataset = load_object("sinatools."+data_config["fn"], data_config["kwargs"])
-
-        dataloader = DataLoader(
-            dataset=dataset,
-            shuffle=shuffle[i],
-            batch_size=batch_size,
-            num_workers=num_workers,
-            collate_fn=dataset.collate_fn,
-        )
-
-        logger.info("%s batches found", len(dataloader))
-        dataloaders.append(dataloader)
-
-    return dataloaders
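`sinatools.ner.data` is removed entirely in 0.1.36, so imports like the ones below only work on 0.1.35; the CoNLL path and the sample text are placeholders:

```python
# SinaTools 0.1.35 only: these helpers no longer exist in 0.1.36.
from sinatools.ner.data import conll_to_segments, text2segments

segments = conll_to_segments("train.conll")              # placeholder path
dataset, segment_vocab = text2segments("جامعة بيرزيت")    # placeholder text
```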
sinatools/ner/relation_extractor.py DELETED

@@ -1,201 +0,0 @@
-import torch
-import json
-from urllib.request import Request, urlopen
-from sinatools.ner.entity_extractor import extract
-from . import pipe
-
-
-# ============================ Extract entities and their types ========================
-def jsons_to_list_of_lists(json_list):
-    return [[d['token'], d['tags']] for d in json_list]
-
-def entities_and_types(sentence):
-    output_list = jsons_to_list_of_lists(extract(sentence))
-    json_short = distill_entities(output_list)
-
-    entities = {}
-    for entity in json_short:
-        name = entity[0]
-        entity_type = entity[1]
-        entities[name] = entity_type
-
-    return entities
-
-def distill_entities(entities):
-    # This is list that we put the output what we need
-    list_output = list()
-
-    # This line go to sort function and save the output to temp_entities
-    temp_entities = sortTags(entities)
-
-    # This list help us to make the output,
-    temp_list = list()
-
-    # initlize the temp_list
-    temp_list.append(["", "", 0, 0])
-    word_position = 0
-
-    # For each entity, convert ibo to distllir list.
-    for entity in temp_entities:
-        # This is counter tag of this entity
-        counter_tag = 0
-        # For each tag
-        for tag in str(entity[1]).split():
-            # If the counter tag greater than or equal to lenght of templist, if yes then we will append the empty value in templist
-            if counter_tag >= len(temp_list):
-                temp_list.append(["", "", 0, 0])
-
-            # If tag equal O and word postion of this tag is not equal zero then it will add all
-            # not empty eliment of temp list in output list
-            if "O" == tag and word_position != 0:
-                for j in range(0, len(temp_list)):
-                    if temp_list[j][1] != "":
-                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
-                    temp_list[j][0] = ""
-                    temp_list[j][1] = ""
-                    temp_list[j][2] = word_position
-                    temp_list[j][3] = word_position
-            # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
-            # of the split its B
-            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
-                # if the temp_list of counter is not empty then it will append in output list and hten it will
-                # initilize by new string and tag in templist of counter
-                if temp_list[counter_tag][1] != "":
-                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
-                temp_list[counter_tag][0] = str(entity[0]) + " "
-                temp_list[counter_tag][1] = str(tag).split("-")[1]
-                temp_list[counter_tag][2] = word_position
-                temp_list[counter_tag][3] = word_position
-
-            # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
-            # of the split its O
-            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
-                # For each of temp_list, check if in this counter tag of templist is same tag with this.tag
-                # then will complete if not it will save in output list and cheak another
-                for j in range(counter_tag,len(temp_list)):
-                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
-                        temp_list[j][0] += str(entity[0]) + " "
-                        temp_list[j][3] += 1
-                        break
-                    else:
-                        if temp_list[j][1] != "":
-                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
-                        temp_list[j][0] = ""
-                        temp_list[j][1] = ""
-                        temp_list[j][2] = word_position
-                        temp_list[j][3] = word_position
-            counter_tag += 1
-        word_position += 1
-    # For each temp_list, at the end of the previous loop, there will be some
-    # values in this list, we should save it to the output list
-    for j in range(0, len(temp_list)):
-        if temp_list[j][1] != "":
-            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
-    return sorted(list_output, key=lambda x: (x[2]))
-
-def sortTags(entities):
-    temp_entities = entities
-    temp_counter = 0
-    # For each entity, this loop will sort each tag of entitiy, first it will check if the
-    # previous tags has same count of this tag, second will sort the tags and check if this tags is correct
-    for entity in temp_entities:
-        tags = entity[1].split()
-        for tag in tags:
-            # if the counter is not 0 then, will complete
-            if temp_counter != 0:
-                # Check if this tag is equal I-, if yes then it will count how many tag in this tags and
-                # count how many tag in previous tags
-                if "I-" == tag[0:2]:
-                    counter_of_this_tag = 0
-                    counter_of_previous_tag = 0
-                    for word in tags:
-                        if tag.split("-")[1] in word:
-                            counter_of_this_tag+=1
-                    for word in temp_entities[temp_counter-1][1].split():
-                        if tag.split("-")[1] in word:
-                            counter_of_previous_tag+=1
-                    # if the counter of previous tag is bigger than counter of this tag, then we
-                    # need to add I-tag in this tags
-                    if counter_of_previous_tag > counter_of_this_tag:
-                        tags.append("I-"+tag.split("-")[1])
-        # Sort the tags
-        tags.sort()
-        # Need to revers the tags because it should begins with I
-        tags.reverse()
-        # If the counter is not 0 then we can complete
-        if temp_counter != 0:
-            this_tags = tags
-            previous_tags = temp_entities[temp_counter - 1][1].split()
-            sorted_tags = list()
-
-            # Check if the this tag is not O and previous tags is not O, then will complete,
-            # if not then it will ignor this tag
-            if "O" not in this_tags and "O" not in previous_tags:
-                index = 0
-                #For each previous tags, need sort this tag by previous tags if its I, B we can ignor
-                for i in previous_tags:
-                    j = 0
-                    while this_tags and j < len(this_tags):
-                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
-                            sorted_tags.insert(index, this_tags.pop(j))
-                            break
-                        elif this_tags[j][0:2] == "B-":
-                            break
-                        j += 1
-                    index += 1
-            sorted_tags += this_tags
-            tags = sorted_tags
-        str_tag = " "
-        str_tag = str_tag.join(tags)
-        str_tag = str_tag.strip()
-        temp_entities[temp_counter][1] = str_tag
-        temp_counter += 1
-    return temp_entities
-
-# ============= Prepare Templates and Catergorize Extracted Entities ================
-temp03={'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
-categories = {
-    'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
-    'location': ['LOC', 'FAC', 'GPE'],
-    'happened at': ['DATE', 'TIME']
-}
-
-def get_entity_category(entity_type, categories):
-    for category, types in categories.items():
-        if entity_type in types:
-            return category
-    return None
-
-
-# ============ Extract entities, their types and categorize them ===============
-def relation_extraction(sentence):
-    #test_sentence="صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
-    entities=entities_and_types(sentence)
-
-    event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
-    arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
-
-    output_list=[]
-
-    for i in event_indices:
-        event_entity=list(entities.keys())[i]
-        for j in arg_event_indices:
-            arg_name= list(entities.keys())[j]
-            arg_type=entities[arg_name]
-            category = get_entity_category(arg_type, categories)
-
-            if category in temp03:
-                relation_sentence=f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
-                predicted_relation=pipe(relation_sentence)
-                score = predicted_relation[0][0]['score']
-                if score > 0.50:
-                    #print(f"Event:{event_entity} Relation:{category} Argument:{arg_name}\n")
-                    #output_list.append([{event_entity} ,{category}, {arg_name}])
-                    output_list.append(f"Event:{event_entity}, Relation:{category}, Argument:{arg_name}")
-
-            else:
-                #print(f"Event:{event_entity} Relation:No relation Argument:{arg_name}\n")
-                #output_list.append([{event_entity} ,'No relation', {arg_name}])
-                output_list.append(f"Event:{event_entity}, Relation:No relation, Argument:{arg_name}")
-
-    return output_list
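Likewise, `relation_extraction` disappears with this file; the snippet below runs only against 0.1.35 (the example sentence is the one in the module's own comment):

```python
# SinaTools 0.1.35 only: sinatools.ner.relation_extractor is removed in 0.1.36.
from sinatools.ner.relation_extractor import relation_extraction

for relation in relation_extraction("صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."):
    print(relation)  # "Event:..., Relation:..., Argument:..." strings
```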