SinaTools 0.1.27__py2.py3-none-any.whl → 0.1.29__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/METADATA +2 -2
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/RECORD +27 -26
- sinatools/CLI/DataDownload/download_files.py +2 -5
- sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
- sinatools/CLI/morphology/morph_analyzer.py +1 -1
- sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
- sinatools/CLI/ner/entity_extractor.py +8 -8
- sinatools/CLI/utils/implication.py +3 -3
- sinatools/VERSION +1 -1
- sinatools/morphology/morph_analyzer.py +44 -45
- sinatools/ner/entity_extractor.py +41 -0
- sinatools/semantic_relatedness/compute_relatedness.py +22 -0
- sinatools/synonyms/synonyms_generator.py +45 -1
- sinatools/utils/jaccard.py +1 -1
- sinatools/utils/parser.py +12 -15
- sinatools/utils/similarity.py +95 -4
- sinatools/utils/text_dublication_detector.py +22 -0
- sinatools/utils/text_transliteration.py +1 -1
- sinatools/utils/tokenizer.py +1 -1
- sinatools/utils/word_compare.py +667 -0
- sinatools/wsd/disambiguator.py +20 -19
- {SinaTools-0.1.27.data → SinaTools-0.1.29.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/top_level.txt +0 -0
sinatools/utils/similarity.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
|
3
3
|
from sinatools.utils.parser import arStrip
|
4
|
-
from sinatools.utils.
|
4
|
+
from sinatools.utils.word_compare import Implication
|
5
5
|
import argparse
|
6
6
|
|
7
7
|
def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
|
@@ -47,7 +47,29 @@ def get_non_preferred_word(word1, word2):
|
|
47
47
|
return "#"
|
48
48
|
|
49
49
|
def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
|
50
|
-
|
50
|
+
"""
|
51
|
+
Computes the intersection of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
list1 (:obj:`list`): The first list.
|
55
|
+
list2 (:obj:`list`): The second list.
|
56
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
57
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
58
|
+
|
59
|
+
Returns:
|
60
|
+
:obj:`list`: The intersection of the two lists, ignores diacritics if flags are true.
|
61
|
+
|
62
|
+
**Example:**
|
63
|
+
|
64
|
+
.. highlight:: python
|
65
|
+
.. code-block:: python
|
66
|
+
|
67
|
+
from sinatools.utils.similarity import get_intersection
|
68
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
69
|
+
list2 = ["كتب","فَعّل"]
|
70
|
+
print(get_intersection(list1, list2, False, True))
|
71
|
+
#output: ["كتب" ,"فعل"]
|
72
|
+
"""
|
51
73
|
list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
|
52
74
|
list1 = [str(i.strip()) for i in list1]
|
53
75
|
|
@@ -80,7 +102,29 @@ def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, i
|
|
80
102
|
|
81
103
|
|
82
104
|
def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
|
83
|
-
|
105
|
+
"""
|
106
|
+
Computes the union of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
list1 (:obj:`list`): The first list.
|
110
|
+
list2 (:obj:`list`): The second list.
|
111
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
112
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
113
|
+
|
114
|
+
Returns:
|
115
|
+
:obj:`list`: The union of the two lists, ignoring diacritics if flags are true.
|
116
|
+
|
117
|
+
**Example:**
|
118
|
+
|
119
|
+
.. highlight:: python
|
120
|
+
.. code-block:: python
|
121
|
+
|
122
|
+
from sinatools.utils.similarity import get_union
|
123
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
124
|
+
list2 = ["كتب","فَعّل"]
|
125
|
+
print(get_union(list1, list2, False, True))
|
126
|
+
#output: ["كتب" ,"فَعل" ,"فَعَلَ"]
|
127
|
+
"""
|
84
128
|
list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
|
85
129
|
|
86
130
|
list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
|
@@ -110,7 +154,30 @@ def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_
|
|
110
154
|
|
111
155
|
|
112
156
|
def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
|
157
|
+
"""
|
158
|
+
Calculates the Jaccard similarity coefficient between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
113
159
|
|
160
|
+
Args:
|
161
|
+
list1 (:obj:`list`): The first list.
|
162
|
+
list2 (:obj:`list`): The second list.
|
163
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
164
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
165
|
+
|
166
|
+
Returns:
|
167
|
+
:obj:`float`: The Jaccard similarity coefficient between the two lists, ignoring diacritics if flags are true.
|
168
|
+
|
169
|
+
**Example:**
|
170
|
+
|
171
|
+
.. highlight:: python
|
172
|
+
.. code-block:: python
|
173
|
+
|
174
|
+
from sinatools.utils.similarity import get_jaccard_similarity
|
175
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
176
|
+
list2 = ["كتب","فَعّل"]
|
177
|
+
print(get_jaccard_similarity(list1, list2, True, True))
|
178
|
+
#output: 0.67
|
179
|
+
"""
|
180
|
+
|
114
181
|
intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
|
115
182
|
|
116
183
|
union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
|
@@ -118,7 +185,31 @@ def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_n
|
|
118
185
|
return float(len(intersection_list)) / float(len(union_list))
|
119
186
|
|
120
187
|
def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
|
121
|
-
|
188
|
+
"""
|
189
|
+
Calculates and returns the Jaccard similarity values (union, intersection, or Jaccard similarity) between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
delimiter (:obj:`str`): The delimiter used to split the input strings.
|
193
|
+
str1 (:obj:`str`): The first input string to compare.
|
194
|
+
str2 (:obj:`str`): The second input string to compare.
|
195
|
+
selection (:obj:`str`) – The desired operation to perform on the two sets of strings. Must be one of intersection, union, jaccardSimilarity, or jaccardAll.
|
196
|
+
ignoreAllDiacriticsButNotShadda (:obj:`bool`) – If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
|
197
|
+
ignoreShaddaDiacritic (:obj:`bool`) – If True, ignore the Shadda diacritic. (Default is True)
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
Three values (Jaccard similarity, union, or intersection) between the two lists of Arabic words depending on the parameter selection.
|
201
|
+
|
202
|
+
**Example:**
|
203
|
+
|
204
|
+
.. highlight:: python
|
205
|
+
.. code-block:: python
|
206
|
+
|
207
|
+
from sinatools.utils.similarity import get_jaccard
|
208
|
+
str1 = "فَعَلَ | فَعل"
|
209
|
+
str2 = "فَعّل"
|
210
|
+
print(get_jaccard("|", "jaccardAll", str1, str2, True, True))
|
211
|
+
#output: ['intersection:', ['فعل'], 'union:', ['فعل', 'فعل'], 'similarity:', 0.5]
|
212
|
+
"""
|
122
213
|
try:
|
123
214
|
list1 = str1.split(delimiter)
|
124
215
|
list2 = str2.split(delimiter)
|
@@ -15,6 +15,28 @@ def validator(sentence, max_tokens=500):
|
|
15
15
|
|
16
16
|
|
17
17
|
def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
|
18
|
+
"""
|
19
|
+
This method is designed to identify duplicate text in a given corpus. It processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold. We used cosine similarity to measure similarity between words and sentences. The method saves the filtered results and the identified duplicates to separate files.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
csv_file (:obj:`str`) – The CSV file contains Arabic text that needs to be cleaned.
|
23
|
+
column_name (:obj:`str`) – This is the name of the column containing the text that needs to be checked for duplicate removal.
|
24
|
+
final_file_name (:obj:`str`) – This is the name of the CSV file that will contain the data after duplicate removal.
|
25
|
+
deleted_file_name (:obj:`str`) – This is the name of the file that will contain all the duplicate records that are deleted.
|
26
|
+
similarity_threshold (:obj:`float`) – This is a floating-point number. The default value is 0.8, indicating the percentage of similarity that the function should use when deleting duplicates from the text column.
|
27
|
+
|
28
|
+
Returns:
|
29
|
+
csv files.
|
30
|
+
|
31
|
+
**Example:**
|
32
|
+
|
33
|
+
.. highlight:: python
|
34
|
+
.. code-block:: python
|
35
|
+
|
36
|
+
from sinatools.utils.text_dublication_detector import removal
|
37
|
+
removal("/path/to/csv/file1", "sentences", "/path/to/csv/file2", "/path/to/csv/deleted", 0.8)
|
38
|
+
"""
|
39
|
+
|
18
40
|
# Read CSV file
|
19
41
|
try:
|
20
42
|
df = pd.read_csv(csv_file)
|
@@ -165,7 +165,7 @@ bw2ar_map = {
|
|
165
165
|
#It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
|
166
166
|
def perform_transliteration(text , schema ):
|
167
167
|
"""
|
168
|
-
This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
|
168
|
+
This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text is based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
|
169
169
|
|
170
170
|
Args:
|
171
171
|
text (:obj:`str`): The input text to be transliterated.
|
sinatools/utils/tokenizer.py
CHANGED
@@ -58,7 +58,7 @@ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclam
|
|
58
58
|
|
59
59
|
def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
|
60
60
|
"""
|
61
|
-
This method receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in
|
61
|
+
This method is designed to tokenize a corpus into words. It receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in one CSV file. The data within files was split into sentences using the sentence_tokenizer module and into words using a word tokenizer. Additionally, it added a set of ids (row_id, docs_sentence_word_id, global_sentence_id, sentence_id, word_position).
|
62
62
|
|
63
63
|
Args:
|
64
64
|
dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.
|