SinaTools 0.1.3__py2.py3-none-any.whl → 0.1.7__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/METADATA +14 -20
- SinaTools-0.1.7.dist-info/RECORD +101 -0
- SinaTools-0.1.7.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.7.dist-info/top_level.txt +1 -0
- {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
- {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
- sinatools/CLI/morphology/morph_analyzer.py +80 -0
- nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
- nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
- {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
- {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
- sinatools/CLI/utils/corpus_tokenizer.py +50 -0
- {nlptools → sinatools}/CLI/utils/implication.py +9 -9
- {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
- sinatools/CLI/utils/remove_latin.py +34 -0
- sinatools/CLI/utils/remove_punctuation.py +42 -0
- {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
- {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
- {nlptools → sinatools}/DataDownload/downloader.py +10 -10
- sinatools/VERSION +1 -0
- {nlptools → sinatools}/__init__.py +1 -1
- {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
- {nlptools → sinatools}/morphology/__init__.py +4 -14
- sinatools/morphology/morph_analyzer.py +172 -0
- sinatools/ner/__init__.py +12 -0
- nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
- {nlptools → sinatools}/salma/__init__.py +2 -2
- {nlptools → sinatools}/salma/settings.py +1 -1
- {nlptools → sinatools}/salma/views.py +12 -12
- {nlptools → sinatools}/salma/wsd.py +2 -2
- {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
- {nlptools → sinatools}/utils/implication.py +10 -10
- {nlptools → sinatools}/utils/jaccard.py +2 -2
- {nlptools → sinatools}/utils/parser.py +18 -21
- {nlptools → sinatools}/utils/text_transliteration.py +1 -1
- nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
- {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
- SinaTools-0.1.3.dist-info/RECORD +0 -122
- SinaTools-0.1.3.dist-info/entry_points.txt +0 -18
- SinaTools-0.1.3.dist-info/top_level.txt +0 -1
- nlptools/CLI/morphology/morph_analyzer.py +0 -91
- nlptools/CLI/utils/corpus_tokenizer.py +0 -74
- nlptools/CLI/utils/latin_remove.py +0 -51
- nlptools/CLI/utils/remove_Punc.py +0 -53
- nlptools/VERSION +0 -1
- nlptools/arabiner/bin/__init__.py +0 -14
- nlptools/arabiner/bin/eval.py +0 -87
- nlptools/arabiner/bin/process.py +0 -140
- nlptools/arabiner/bin/train.py +0 -221
- nlptools/arabiner/data/__init__.py +0 -1
- nlptools/arabiner/data/datasets.py +0 -146
- nlptools/arabiner/data/transforms.py +0 -118
- nlptools/arabiner/nn/BaseModel.py +0 -22
- nlptools/arabiner/nn/BertNestedTagger.py +0 -34
- nlptools/arabiner/nn/BertSeqTagger.py +0 -17
- nlptools/arabiner/nn/__init__.py +0 -3
- nlptools/arabiner/trainers/BaseTrainer.py +0 -117
- nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
- nlptools/arabiner/trainers/BertTrainer.py +0 -163
- nlptools/arabiner/trainers/__init__.py +0 -3
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +0 -124
- nlptools/arabiner/utils/helpers.py +0 -151
- nlptools/arabiner/utils/metrics.py +0 -69
- nlptools/morphology/morph_analyzer.py +0 -170
- nlptools/morphology/settings.py +0 -8
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/sentence_tokenizer.py +0 -53
- {SinaTools-0.1.3.data/data/nlptools → SinaTools-0.1.7.data/data/sinatools}/environment.yml +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/WHEEL +0 -0
- {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
- {nlptools → sinatools}/DataDownload/__init__.py +0 -0
- {nlptools → sinatools}/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
- {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
- {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
- {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {nlptools → sinatools}/arabert/preprocess.py +0 -0
- {nlptools → sinatools}/environment.yml +0 -0
- {nlptools → sinatools}/install_env.py +0 -0
- /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
- {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
- {nlptools → sinatools}/utils/readfile.py +0 -0
- {nlptools → sinatools}/utils/utils.py +0 -0
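
The dominant change in this release is the rename of the top-level package from `nlptools` to `sinatools`, with the `arabiner` NER code consolidated under `sinatools/ner` and several modules relocated (e.g. `charsets.py` and `tokenizers_words.py` moving from `morphology` to `utils`). As a rough illustration of what downstream code has to change, here is a hypothetical before/after migration sketch derived only from the renames listed above; module paths are taken from this diff, not from separate documentation:

```python
# Hypothetical migration sketch based on the file renames in this diff.

# SinaTools 0.1.3 (old import paths):
# from nlptools.utils.parser import arStrip
# from nlptools.utils.implication import Implication
# from nlptools.morphology.tokenizers_words import simple_word_tokenize

# SinaTools 0.1.7 (new import paths):
from sinatools.utils.parser import arStrip
from sinatools.utils.implication import Implication
from sinatools.utils.tokenizers_words import simple_word_tokenize
```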
{nlptools → sinatools}/salma/wsd.py
@@ -1,11 +1,11 @@
-from nlptools.salma import settings
+from sinatools.salma import settings
 import re
 import warnings
 warnings.filterwarnings("ignore")
 import torch
 import numpy as np
 import pandas as pd
-from nlptools.arabert.preprocess import ArabertPreprocessor
+from sinatools.arabert.preprocess import ArabertPreprocessor
 
 def normalizearabert(s):
     model_name = 'aubmindlab/bert-base-arabertv02'
{nlptools/morphology → sinatools/utils}/charsets.py
@@ -1,6 +1,4 @@
-#
-# We acknoledge that this file charsets.py is imported from Camel tools citation. url
-#
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 
 import unicodedata
 
{nlptools → sinatools}/utils/implication.py
@@ -3,7 +3,7 @@
 # The matching between two words is defined as a tuple:
 # <w1, w2, implication direction, distance, conflicts, verdict, preferredWord> .
 
-from nlptools.utils.parser import arStrip
+from sinatools.utils.parser import arStrip
 class Implication:
     """
     The implication class computes whether the two Arabic words are the same or not, regardless of how they are diacritized. The output also contains implication direction, distance, number of conflicts, and other outputs.
@@ -215,7 +215,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             word = Implication.normalize_alef("ًى") # Returns "ىً"
             word = Implication.normalize_alef("ًا") # Returns "اً"
@@ -249,7 +249,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             diacritics = ["َ", "ُ", "ِ", "ّ"]
             has_error = Implication.diacritics_syntax_error_in(diacritics) # Returns False
@@ -288,7 +288,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             diacritic = 0
             is_wrong_end = Implication.wrong_end_diacritic(diacritic) # Returns False
@@ -324,7 +324,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             implication.calculate_words_implication()
@@ -377,7 +377,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             result = implication.equal_words()
@@ -428,7 +428,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             result = implication.calculate_letters_implication()
@@ -508,7 +508,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             word = "مُرَحَّبًا"
             diacritics = Implication.calculate_direction(word)
             print(diacritics)
@@ -600,7 +600,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             word = "مرحبا"
             letters = get_letters_array(word)
             print(letters)
@@ -644,7 +644,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             w1 = "hello"
             w2 = "hell"
             implication = Implication(w1, w2)
{nlptools → sinatools}/utils/jaccard.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
-from nlptools.utils.parser import arStrip
-from nlptools.utils.implication import Implication
+from sinatools.utils.parser import arStrip
+from sinatools.utils.implication import Implication
 import argparse
 
 def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
{nlptools → sinatools}/utils/parser.py
@@ -1,20 +1,19 @@
 import re
 import argparse
 
-def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, alif=True , specialChars=True ):
+def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, alif=True , special_chars=True ):
 
     """
-    This method removes Arabic diacritics, small diacritcs, shaddah, Latin and Arabic digits, and
-    And remove special characters, spaces, underscore and Arabic tatwelah from the input text.
+    This method removes Arabic diacritics, small diacritcs, shaddah, Latin and Arabic digits, unify alif, remove special characters, extra spaces, underscore and Arabic tatwelah from the input text.
 
     Args:
         text (:obj:`str`): Arabic text to be processed.
         diacs (:obj:`bool`): flag to remove Arabic diacretics [ ًٌٍَُِْ] (default is True).
-        smallDiacs (:obj:`bool`): flag to remove small diacretics (default is True).
+        small_diacs (:obj:`bool`): flag to remove small diacretics (default is True).
         shaddah (:obj:`bool`): flag to remove shaddah (default is True).
         digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
         alif (:obj:`bool`): flag to unify alif (default is True).
-        specialChars (:obj:`bool`): flag to remove special characters (default is True).
+        special_chars (:obj:`bool`): flag to remove special characters (default is True).
 
     Returns:
         :obj:`str`: stripped text.
@@ -24,19 +23,17 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
-
-        print(parser.arStrip('2023الجو جميلُ'))
+        from sinatools.utils import parser
+        output = parser.arStrip('2023الجو جميلُ')
+        print(output)
 
-        #
+        # output
         الجو جميل
 
-
-        print(parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True ))
-        #
+        output = parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True )
+        print(output)
+        #output
         الم یان للذین ءامنوا ان تخشع قلوبهم لذكر الله وما نزل من الحق ولا یكونوا كالذین اوتوا الكتٰب من قبل فطال علیهم الامد فقست قلوبهم وكثیر منهم فسقون
-
-
     """
     try:
         if text: # if the input string is not empty do the following
@@ -46,7 +43,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
                 text = re.sub(r'[\u0652]+', '',text) # Remove SUKUN
             if shaddah == True:
                 text = re.sub(r'[\u0651]+', '',text) # Remove shddah
-            if smallDiacs == True:
+            if small_diacs == True:
                 text = re.sub(r'[\u06D6-\u06ED]+', '',text) # Remove all small Quranic annotation signs
             if digit == True:
                 text = re.sub('[0-9]+', ' ',text) # Remove English digits
@@ -57,7 +54,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
                 text = re.sub('أ', 'ا',text);
                 text = re.sub('إ', 'ا',text);
                 text = re.sub('آ', 'ا',text);
-            if specialChars == True:
+            if special_chars == True:
                 text = re.sub('[?؟!@#$%-]+' , '' , text) # Remove some of special chars
 
             text = re.sub('[\\s]+'," ",text) # Remove all spaces
@@ -83,7 +80,7 @@ def remove_punctuation(text):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
+        from sinatools.utils import parser
         return parser.remove_punctuation("te!@#،$%%؟st")
 
     #output
@@ -103,12 +100,12 @@ def remove_punctuation(text):
                     r'[\u061B]+', r'[\u061E]+', r'[\u061F]+', r'[\u0640]+',
                     r'[\u0653]+', r'[\u065C]+', r'[\u066C]+', r'[\u066A]+',
                     r'["}"]+', r'["{"]+']
-        outputString = text
+        output_string = text
         for punctuation in punctuation_marks:
-            outputString = re.sub(punctuation, '', outputString)
+            output_string = re.sub(punctuation, '', output_string)
     except:
         return text
-    return outputString
+    return output_string
 
 def remove_latin(text):
     """
@@ -126,7 +123,7 @@ def remove_latin(text):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
+        from sinatools.utils import parser
         return parser.remove_latin("miojkdujhvaj1546545spkdpoqfoiehwv nWEQFGWERHERTJETAWIKUYFC")
 
     #output
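
The net effect of the `arStrip` changes is a camelCase-to-snake_case rename of the keyword arguments (`smallDiacs` → `small_diacs` and the special-characters flag → `special_chars`) plus a consolidated docstring; the stripping logic itself is unchanged. A sketch of selective stripping with the new names, reusing the docstring's own example input:

```python
# Selective stripping with the 0.1.7 keyword names; the first input and its
# expected output are taken from the arStrip docstring above.
from sinatools.utils import parser

# All flags default to True: digits and diacritics are removed.
print(parser.arStrip('2023الجو جميلُ'))            # الجو جميل

# Keep digits but still strip diacritics and special characters.
print(parser.arStrip('2023الجو جميلُ', digit=False))
```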
{nlptools → sinatools}/utils/text_transliteration.py
@@ -181,7 +181,7 @@ def perform_transliteration(text , schema ):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import text_transliteration
+        from sinatools.utils import text_transliteration
 
         print(text_transliteration.perform_transliteration("مُحَمَدٌ نَـشِيْطٌـ1" , "ar2bw"))
         print(text_transliteration.perform_transliteration("muHamadN" , "bw2ar"))
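
`perform_transliteration` keeps its two-argument form; only the docstring import changed. The schemas exercised in the docstring are `ar2bw` and `bw2ar`, which by common Buckwalter convention denote Arabic-to-Buckwalter and the reverse; the calls below are copied from the docstring:

```python
# Calls copied from the perform_transliteration docstring above; "ar2bw" and
# "bw2ar" are the two schema values it exercises.
from sinatools.utils import text_transliteration

print(text_transliteration.perform_transliteration("مُحَمَدٌ نَـشِيْطٌـ1", "ar2bw"))
print(text_transliteration.perform_transliteration("muHamadN", "bw2ar"))
```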
nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py
@@ -1,7 +1,60 @@
 import os
 import csv
-from nlptools.morphology.tokenizers_words import simple_word_tokenize
-
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+def remove_empty_values(sentences):
+    return [value for value in sentences if value != '']
+
+
+def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
+    """
+    This method tokenizes a text into a set of sentences based on the selected separators, including the dot, new line, question mark, and exclamation mark.
+
+    Args:
+        text (:obj:`str`): Arabic text to be tokenized.
+        dot (:obj:`str`): flag to split text based on Dot (default is True).
+        new_line (:obj:`str`): flag to split text based on new_line (default is True).
+        question_mark (:obj:`str`): flag to split text based on question_mark (default is True).
+        exclamation_mark (:obj:`str`): flag to split text based on exclamation_mark (default is True).
+
+    Returns:
+        :obj:`list`: list of sentences.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils import tokenizer
+        sentences = tokenizer.sentence_tokenizer("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
+        print(sentences)
+
+        #output
+        ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
+    """
+    separators = []
+    split_text = [text]
+    if new_line==True:
+        separators.append('\n')
+    if dot==True:
+        separators.append('.')
+    if question_mark==True:
+        separators.append('?')
+        separators.append('؟')
+    if exclamation_mark==True:
+        separators.append('!')
+
+    for sep in separators:
+        new_split_text = []
+        for part in split_text:
+            tokens = part.split(sep)
+            tokens_with_separator = [token + sep for token in tokens[:-1]]
+            tokens_with_separator.append(tokens[-1].strip())
+            new_split_text.extend(tokens_with_separator)
+        split_text = new_split_text
+
+    split_text = remove_empty_values(split_text)
+    return split_text
 
 def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
     """
@@ -28,8 +81,8 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils.corpus_tokenizer import corpus_tokenizer
-        corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
+        from sinatools.utils import tokenizer
+        output = tokenizer.corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
 
     #output
     # csv file called: ouputFile.csv
@@ -55,7 +108,7 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
             file_path = os.path.join(root, file)
             with open(file_path, 'r', encoding="utf-8") as f:
                 content = f.read()
-                sentences = …
+                sentences = sentence_tokenizer(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
             for sentence_id, sentence in enumerate(sentences, start=1):
                 words = simple_word_tokenize(sentence)
                 global_sentence_id += 1
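
The new `sentence_tokenizer` repeatedly splits on each enabled separator while re-attaching the separator to the sentence it terminates, then drops empty fragments. Its docstring example, restated as a runnable sketch (the expected output is the one the docstring gives):

```python
# Runnable restatement of the sentence_tokenizer docstring example above.
from sinatools.utils import tokenizer

sentences = tokenizer.sentence_tokenizer(
    "مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.",
    dot=True, new_line=True, question_mark=True, exclamation_mark=True)
print(sentences)
# Expected, per the docstring:
# ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
```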
{nlptools/morphology → sinatools/utils}/tokenizers_words.py
@@ -1,11 +1,8 @@
-#
-
-# -*- coding: utf-8 -*-
-
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html].
 
 import re
-from nlptools.morphology.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
-from nlptools.morphology.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
+from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
+from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
 
 
 _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
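
The relocated `tokenizers_words.py` builds its punctuation and letter tables from the shared `charsets` module (now under `sinatools.utils`), in the style of the Camel Tools word tokenizer it acknowledges. A hedged sketch of the exported `simple_word_tokenize`, whose body is not shown in this diff; assuming Camel-Tools-like behavior, punctuation is separated into its own tokens:

```python
# Hedged sketch: simple_word_tokenize's implementation is not shown in this
# diff. Assuming Camel-Tools-like behavior, punctuation becomes its own token.
from sinatools.utils.tokenizers_words import simple_word_tokenize

print(simple_word_tokenize("مرحبا، كيف حالك؟"))
# Expected shape (assumption): ['مرحبا', '،', 'كيف', 'حالك', '؟']
```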
SinaTools-0.1.3.dist-info/RECORD
DELETED
@@ -1,122 +0,0 @@
-SinaTools-0.1.3.data/data/nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
-nlptools/VERSION,sha256=2_CXjsK1h6XWGH_cxBzOn_LA647vrboOtR84QKtu60Y,5
-nlptools/__init__.py,sha256=OoA_p_y2jPjMytcUrG1ED5uJlJemVhSRr9L9Wsym-rQ,134
-nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
-nlptools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
-nlptools/nlptools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-nlptools/CLI/DataDownload/download_files.py,sha256=PMDEPXxZQbrFo-7iyhvrCpzx2RG5T5kPk6NJAwh8RSI,2322
-nlptools/CLI/arabiner/bin/infer.py,sha256=YrNCVro8B3UxpsHjIo_01qiBQURpDNTK7pKTkw1L21Y,4921
-nlptools/CLI/arabiner/bin/infer2.py,sha256=CtR9rwe20ks_qq-l_fQU-ThLqft_1o3Ztmd1my1kHMg,3905
-nlptools/CLI/morphology/ALMA_multi_word.py,sha256=NINts8BtT8BGQPBvs4BJ_y2PsR7czsGPOVAwngaT85A,2644
-nlptools/CLI/morphology/morph_analyzer.py,sha256=39vrFx6ppu7yEITcz8lAJhk3xHweaPWEqL-CcqBM37Q,3565
-nlptools/CLI/salma/salma_tools.py,sha256=7awpCb68QUc3kx-EuwRHxDmItZlX2aSdpukwKF1G3Fo,1999
-nlptools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/CLI/utils/arStrip.py,sha256=dzy16wZfSznkvGHHBn5P21EvyusKB55dqrZ4zbaa41w,3621
-nlptools/CLI/utils/corpus_tokenizer.py,sha256=S0YG8FRS29K1C8eJVEYuWSV1ABS7PKymlNS7KxvYqxI,2817
-nlptools/CLI/utils/implication.py,sha256=hjYTN0oiLf0bz0bRO_GD4rphZkaB3cH770clFFhuevE,3172
-nlptools/CLI/utils/jaccard.py,sha256=a6oc28yMgm7UewO6Lz25A4Yv8QEzVa85XF-QV9uhMwI,4639
-nlptools/CLI/utils/latin_remove.py,sha256=Xw6PB4GtMLLiYK3zTEwdLhBbivMyy1msD5Ab_QdJoQA,1303
-nlptools/CLI/utils/remove_Punc.py,sha256=dvSiSs9UulhGCogBgtpD8fU860BFuMBTnwa8Ek9aPKQ,1393
-nlptools/CLI/utils/sentence_tokenizer.py,sha256=AcJa_yRdlQqKMwVWWKSv1vRO1Yk-NK75-NpalkHqewc,3469
-nlptools/CLI/utils/text_transliteration.py,sha256=blIGB8FeF10iFeXADM-z01XJ4qeB1qgj6S2Xnk9w5fI,2266
-nlptools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/DataDownload/downloader.py,sha256=yONVa99OtPXD5Lewy4Fm3eUiJMpBt492G1JOPh5sXAU,6523
-nlptools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-nlptools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
-nlptools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
-nlptools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
-nlptools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
-nlptools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
-nlptools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-nlptools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
-nlptools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
-nlptools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
-nlptools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
-nlptools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
-nlptools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
-nlptools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
-nlptools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
-nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
-nlptools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
-nlptools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
-nlptools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
-nlptools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
-nlptools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
-nlptools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
-nlptools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
-nlptools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
-nlptools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
-nlptools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
-nlptools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
-nlptools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
-nlptools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
-nlptools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
-nlptools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
-nlptools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
-nlptools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
-nlptools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-nlptools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
-nlptools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
-nlptools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-nlptools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
-nlptools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
-nlptools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
-nlptools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
-nlptools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
-nlptools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
-nlptools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
-nlptools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
-nlptools/arabiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabiner/bin/__init__.py,sha256=d1ToN2uheCCVby3TjiSuD1dqo_pvNIuTgz4COFr2Khs,438
-nlptools/arabiner/bin/eval.py,sha256=ihtjJinY1jXpZXW5bQJzTC5MF6_V3GQ5zHzsc691_HQ,2591
-nlptools/arabiner/bin/infer.py,sha256=EZKeq4zucIE-ooHYnegODNxsRiIY_gY5GvDPChH5WRQ,3237
-nlptools/arabiner/bin/process.py,sha256=4QCZjsmYV5lep6waQE37fs7Fe59_1G5seIJLDkArg4s,4698
-nlptools/arabiner/bin/train.py,sha256=hf6ZRhqMZ7bFealMSusBjtWrbzHGHc5HB2Lh4rp2uQA,6390
-nlptools/arabiner/data/__init__.py,sha256=XPic1bPijmZda_LPFL5J6TOps_IHUTiBDJvMx-iJqKo,61
-nlptools/arabiner/data/datasets.py,sha256=p52Uc8Q2D3EzN1OmoHQcWVsJ2oB3TqgTzAcy1B9fJ68,5068
-nlptools/arabiner/data/transforms.py,sha256=KPCDdjZOEvhMC38eiFwJuiQC84cfDrvC0XM4Ye0o3do,4878
-nlptools/arabiner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
-nlptools/arabiner/nn/BertNestedTagger.py,sha256=7vU2tmDSoqSHn6GvMJmyN0hEMLvCkbr_r-AaiAaYdw8,1223
-nlptools/arabiner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
-nlptools/arabiner/nn/__init__.py,sha256=ZN7Psm83pysUhGI3ZSaJra2aCYBZb9DZ0UX4CiKGc0A,182
-nlptools/arabiner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
-nlptools/arabiner/trainers/BertNestedTrainer.py,sha256=hVqPRdmaHf2iwftseNpgsAfwGkl6eMHJx1rKunQS_vM,8443
-nlptools/arabiner/trainers/BertTrainer.py,sha256=KkcgZXu6kqsrrnfFtiAQ8ucLsrQtDxLRqdbTiTnRWqI,6447
-nlptools/arabiner/trainers/__init__.py,sha256=kt8WqsaOjX0h1JMa-v7Y9ywT5mfwQIsZTyVWnIAWsEQ,200
-nlptools/arabiner/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabiner/utils/data.py,sha256=uuPiu-7v0gccNygZjdTKomJGE7X0H9FC24Y9nHZpf4c,4376
-nlptools/arabiner/utils/helpers.py,sha256=PyOOlx5uabvZVmU3SZtZ3ZLA3pliinJ3JXsvos9SUWU,5032
-nlptools/arabiner/utils/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
-nlptools/morphology/ALMA_multi_word.py,sha256=hlzZCk-uUdZ-GbiPsFxDTvoWoIuVof2Sm7NdaxaFipM,1313
-nlptools/morphology/__init__.py,sha256=z6_RGhiyfNHXNKMmhNSI6ObTLmdjQyP58vsFottI8GA,1706
-nlptools/morphology/charsets.py,sha256=7w9OrbnZTnLU3A9q-SUi9GhUN97qNtbYR5T0Pm72cF8,2784
-nlptools/morphology/morph_analyzer.py,sha256=OmCxm4-fM2qfYzKk8yOd6D_T3RsfzZCcd7Oz2V4Advg,6507
-nlptools/morphology/settings.py,sha256=sEZdnA7MiYXHdxrfHWXop1RcKClVzpOYzZwzHC1PxJ8,144
-nlptools/morphology/tokenizers_words.py,sha256=Smtt_KXifl2wRI464Qn07PtUvOsyGBJjZ7E20gd8zVM,602
-nlptools/salma/__init__.py,sha256=pOauGjD-xrGHw05sNx3EiSFc_wpM3bD1vJxQHoDDXOA,376
-nlptools/salma/settings.py,sha256=fqAQg2b22gorzT9Pf_AEJD9p8AlVUaVyKD3FH8g2yUs,1110
-nlptools/salma/views.py,sha256=EH1vc6P88CeAIzQKt7EU_HTI0uJipv4JdXiAX5NjrJY,18416
-nlptools/salma/wsd.py,sha256=kmP5ZvvVMkxApgk91TAGSBkMJZbPPbS0qoNk8OE37og,4434
-nlptools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/utils/corpus_tokenizer.py,sha256=IDWh87XJaFa7V2P4kIxY4QVywPKhz0fIErc_c0gJGUU,4581
-nlptools/utils/implication.py,sha256=Ro1Vw62oOBzELkX-zpHyieq4v2OsoyFrFeTU7BiK7qc,27794
-nlptools/utils/jaccard.py,sha256=TTC5KTVv6kONw5vZtzxEQvv7QM79BCsD0xcJAY0T5tU,10111
-nlptools/utils/parser.py,sha256=0Yd40CZf4wXso2q-d9LULUNAVUAMdiYMImfcVb6i9qQ,6175
-nlptools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
-nlptools/utils/sentence_tokenizer.py,sha256=3C0Wx1ns8ZHiGwKlUkcti-8zA3fB4ju0fIEtGACM7oU,2162
-nlptools/utils/text_transliteration.py,sha256=zhB3sFXSMtkkdqImRMVg415AAB80DOm9lMFKb2IBynw,8765
-nlptools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
-SinaTools-0.1.3.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
-SinaTools-0.1.3.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
-SinaTools-0.1.3.dist-info/METADATA,sha256=zxuxnKe_i5AAHNC_uPGxpmAzB2T2y01iL-kHIRV5H-o,1527
-SinaTools-0.1.3.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
-SinaTools-0.1.3.dist-info/entry_points.txt,sha256=9-PNkvWGCid8SVN03S2NkJFuxAzvcB22tGpHe-et2q8,951
-SinaTools-0.1.3.dist-info/top_level.txt,sha256=sREDI6iHe4D0BZQmZbZ-LxYIn2cBWUayk9CZwAR9jaE,9
-SinaTools-0.1.3.dist-info/RECORD,,
@@ -1,18 +0,0 @@
|
|
1
|
-
[console_scripts]
|
2
|
-
arabi_ner = nlptools.CLI.arabiner.bin.infer:main
|
3
|
-
arabi_ner2 = nlptools.CLI.arabiner.bin.infer2:main
|
4
|
-
install_env = nlptools.install_env:main
|
5
|
-
sina_alma_multi_word = nlptools.CLI.morphology.ALMA_multi_word:main
|
6
|
-
sina_appdatadir = nlptools.CLI.DataDownload.get_appdatadir:main
|
7
|
-
sina_arStrip = nlptools.CLI.utils.arStrip:main
|
8
|
-
sina_corpus_tokenizer = nlptools.CLI.utils.corpus_tokenizer:main
|
9
|
-
sina_download_files = nlptools.CLI.DataDownload.download_files:main
|
10
|
-
sina_implication = nlptools.CLI.utils.implication:main
|
11
|
-
sina_jaccard_similarity = nlptools.CLI.utils.jaccard:main
|
12
|
-
sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main
|
13
|
-
sina_remove_latin = nlptools.CLI.utils.latin_remove:main
|
14
|
-
sina_remove_punctuation = nlptools.CLI.utils.remove_Punc:main
|
15
|
-
sina_salma = nlptools.CLI.salma.salma_tools:main
|
16
|
-
sina_sentence_tokenize = nlptools.CLI.utils.sentence_tokenizer:main
|
17
|
-
sina_transliterate = nlptools.CLI.utils.text_transliteration:main
|
18
|
-
|
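
Each line of the deleted `entry_points.txt` maps a console command to a module-level `main()` function; the 0.1.7 wheel ships a replacement file (18 lines added, per the file list) whose contents are not shown in this diff. For reference, an illustrative setuptools snippet showing how such an entry is declared at build time, using one of the 0.1.3 entries listed above (this is not taken from SinaTools' actual setup files):

```python
# Illustrative setuptools declaration that would produce one of the 0.1.3
# console_scripts entries above; not copied from SinaTools' real setup files.
from setuptools import setup

setup(
    name="SinaTools",
    entry_points={
        "console_scripts": [
            "sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main",
        ],
    },
)
```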
SinaTools-0.1.3.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-nlptools
nlptools/CLI/morphology/morph_analyzer.py
DELETED
@@ -1,91 +0,0 @@
-"""
-About:
-------
-The sina_morph_analyze tool is designed to provide morphological analysis for Arabic text using the SinaTools' `analyze` utility. Users can specify the language and desired analysis task (e.g., lemmatization, part-of-speech tagging, or a full morphological analysis).
-
-Usage:
-------
-Below is the usage information that can be generated by running sina_morph_analyze --help.
-
-.. code-block:: none
-
-    sina_morph_analyze --text=TEXT [OPTIONS]
-    sina_morph_analyze --file=FILE [OPTIONS]
-
-Options:
---------
-
-.. code-block:: none
-
-    --text TEXT
-        The text that needs to be morphologically analyzed.
-
-    --file FILE
-        File containing the text to be morphologically analyzed
-
-    --language LANGUAGE [default=MSA]
-        Specifies the language for the analysis. The default is MSA (Modern Standard Arabic).
-        Use other codes as appropriate for your requirements.
-
-    --task TASK [default=full]
-        Determines the specific type of morphological analysis to be performed. Available options are:
-        - lemmatizer: Provides lemmatization results.
-        - pos: Provides part-of-speech tagging.
-        - full: Provides a comprehensive morphological analysis.
-        The default is a full morphological analysis.
-
-Examples:
----------
-
-.. code-block:: none
-
-    sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
-    sina_morph_analyze --text "Your Arabic text here" --task lemmatizer
-    sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
-    sina_morph_analyze --file "path/to/your/file.txt" --task lemmatizer
-
-Note:
------
-
-.. code-block:: none
-
-    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
-    - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `analyze` utility.
-    - The analysis can be influenced by the choice of language. Ensure you are using the correct language setting.
-
-"""
-
-import argparse
-from nlptools.morphology.morph_analyzer import analyze
-from nlptools.utils.readfile import read_file
-
-def main():
-    parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
-
-    # Adding arguments for the text, file, language, and task
-    parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
-    parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
-    parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
-    parser.add_argument('--task', type=str, default='full', choices=['lemmatizer', 'pos', 'full'],
-                        help='Task for the result filter [lemmatizer, pos, full] (default: full)')
-
-    args = parser.parse_args()
-
-    if args.text is None and args.file is None:
-        print("Error: Either --text or --file argument must be provided.")
-        return
-
-    # Get the input either from the --text argument or from the file specified in the --file argument
-    input_text = args.text if args.text else " ".join(read_file(args.file))
-
-    # Perform morphological analysis
-    results = analyze(input_text, args.language, args.task)
-
-    # Print the results
-    for result in results:
-        print(result)
-
-if __name__ == '__main__':
-    main()
-#sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
-#sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
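
The file list shows this CLI was replaced by `sinatools/CLI/morphology/morph_analyzer.py` (+80 lines, not shown here). The library call it wrapped remains available under the renamed package; a sketch of invoking it directly, assuming `analyze()` keeps the `(text, language, task)` signature used by the deleted script:

```python
# Sketch of calling the analyzer directly in 0.1.7, assuming analyze() keeps
# the (text, language, task) signature used by the deleted CLI above.
from sinatools.morphology.morph_analyzer import analyze

results = analyze("اللغة العربية جميلة", "MSA", "full")  # illustrative input
for result in results:
    print(result)
```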
nlptools/CLI/utils/corpus_tokenizer.py
DELETED
@@ -1,74 +0,0 @@
-"""
-
-About:
-------
-
-The sina_corpus_tokenizer tool offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
-
-Usage:
-------
-
-Below is the usage information that can be generated by running sina_corpus_tokenizer --help.
-
-.. code-block:: none
-
-    Usage:
-        sina_corpus_tokenizer dir_path output_csv
-
-.. code-block:: none
-
-    Positional Arguments:
-        dir_path
-            The path to the directory containing the text files.
-
-        output_csv
-            The path to the output CSV file.
-
-Examples:
----------
-
-.. code-block:: none
-
-    sina_corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
-
-Note:
------
-
-.. code-block:: none
-
-    - The tool only processes text files (with a .txt extension).
-    - The output CSV will contain the following columns:
-        - 'Row_ID' (a unique identifier for each records in outputfile)
-        - 'Docs_Sentence_Word_ID' (a concatenated identifier comprising directory name, file name, global sentence id, sentence id, and word position).
-        - 'GlobalSentenceID' (Integer, a unique identifier for each sentence in the entire file)
-        - 'SentenceID' (Integer, a unique identifier for each file within the CSV file)
-        - 'Sentence' (Generated text that forms a sentence)
-        - 'Word Position' (Integer, the position of each word within the sentence)
-        - 'Word' (Each row contains a word from the generated sentence).
-    - Ensure that the text files are appropriately encoded in UTF-8 or compatible formats.
-    - The tool uses the `nltk` library for sentence and word tokenization. Make sure to have the library installed in your environment.
-"""
-
-import argparse
-from nlptools.utils.corpus_tokenizer import corpus_tokenizer
-
-# Define the main function that will parse the arguments
-def main():
-    # Create an ArgumentParser object
-    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
-
-    # Add arguments to the parser
-    parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
-    parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
-
-    # Parse the command-line arguments
-    args = parser.parse_args()
-
-    # Call the corpus_tokenizer function with the parsed arguments
-    corpus_tokenizer(args.dir_path, args.output_csv)
-
-# Call the main function when the script is executed
-if __name__ == '__main__':
-    main()
-
-#sina_corpus_tokenizer /path/to/text/files output.csv