SinaTools 0.1.4__py2.py3-none-any.whl → 0.1.8__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/METADATA +10 -10
- SinaTools-0.1.8.dist-info/RECORD +101 -0
- SinaTools-0.1.8.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.8.dist-info/top_level.txt +1 -0
- {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
- {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
- sinatools/CLI/morphology/morph_analyzer.py +80 -0
- nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
- nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
- {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
- {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
- sinatools/CLI/utils/corpus_tokenizer.py +50 -0
- {nlptools → sinatools}/CLI/utils/implication.py +9 -9
- {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
- sinatools/CLI/utils/remove_latin.py +34 -0
- sinatools/CLI/utils/remove_punctuation.py +42 -0
- {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
- {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
- {nlptools → sinatools}/DataDownload/downloader.py +9 -9
- sinatools/VERSION +1 -0
- {nlptools → sinatools}/__init__.py +1 -1
- {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
- {nlptools → sinatools}/morphology/__init__.py +4 -14
- sinatools/morphology/morph_analyzer.py +172 -0
- sinatools/ner/__init__.py +12 -0
- nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
- {nlptools → sinatools}/salma/__init__.py +2 -2
- {nlptools → sinatools}/salma/settings.py +1 -1
- {nlptools → sinatools}/salma/views.py +9 -9
- {nlptools → sinatools}/salma/wsd.py +2 -2
- {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
- {nlptools → sinatools}/utils/implication.py +10 -10
- {nlptools → sinatools}/utils/jaccard.py +2 -2
- {nlptools → sinatools}/utils/parser.py +18 -21
- {nlptools → sinatools}/utils/text_transliteration.py +1 -1
- nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
- {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
- SinaTools-0.1.4.dist-info/RECORD +0 -122
- SinaTools-0.1.4.dist-info/entry_points.txt +0 -18
- SinaTools-0.1.4.dist-info/top_level.txt +0 -1
- nlptools/CLI/morphology/morph_analyzer.py +0 -91
- nlptools/CLI/utils/corpus_tokenizer.py +0 -74
- nlptools/CLI/utils/latin_remove.py +0 -51
- nlptools/CLI/utils/remove_Punc.py +0 -53
- nlptools/VERSION +0 -1
- nlptools/arabiner/bin/__init__.py +0 -14
- nlptools/arabiner/bin/eval.py +0 -87
- nlptools/arabiner/bin/process.py +0 -140
- nlptools/arabiner/bin/train.py +0 -221
- nlptools/arabiner/data/__init__.py +0 -1
- nlptools/arabiner/data/datasets.py +0 -146
- nlptools/arabiner/data/transforms.py +0 -118
- nlptools/arabiner/nn/BaseModel.py +0 -22
- nlptools/arabiner/nn/BertNestedTagger.py +0 -34
- nlptools/arabiner/nn/BertSeqTagger.py +0 -17
- nlptools/arabiner/nn/__init__.py +0 -3
- nlptools/arabiner/trainers/BaseTrainer.py +0 -117
- nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
- nlptools/arabiner/trainers/BertTrainer.py +0 -163
- nlptools/arabiner/trainers/__init__.py +0 -3
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +0 -124
- nlptools/arabiner/utils/helpers.py +0 -151
- nlptools/arabiner/utils/metrics.py +0 -69
- nlptools/morphology/morph_analyzer.py +0 -171
- nlptools/morphology/settings.py +0 -8
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/sentence_tokenizer.py +0 -53
- {SinaTools-0.1.4.data/data/nlptools → SinaTools-0.1.8.data/data/sinatools}/environment.yml +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/WHEEL +0 -0
- {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
- {nlptools → sinatools}/DataDownload/__init__.py +0 -0
- {nlptools → sinatools}/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
- {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
- {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
- {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {nlptools → sinatools}/arabert/preprocess.py +0 -0
- {nlptools → sinatools}/environment.yml +0 -0
- {nlptools → sinatools}/install_env.py +0 -0
- /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
- {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
- {nlptools → sinatools}/utils/readfile.py +0 -0
- {nlptools → sinatools}/utils/utils.py +0 -0
{nlptools → sinatools}/CLI/utils/jaccard.py RENAMED
@@ -1,18 +1,18 @@
 """
 About:
 ------
-The
+The jaccard tool computes the Jaccard similarity between two sets of strings. The Jaccard similarity is the size of the intersection divided by the size of the union of the sample sets. It provides a measure of similarity between two sets.
 
 Usage:
 ------
-Below is the usage information that can be generated by running
+Below is the usage information that can be generated by running jaccard --help.
 
 .. code-block:: none
 
     Usage:
-
+        jaccard --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
-
+        jaccard --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
 .. code-block:: none
 
@@ -39,9 +39,9 @@ Examples:
 
 .. code-block:: none
 
-
+    jaccard --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
-
+    jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
 Note:
 -----
@@ -55,8 +55,8 @@ Note:
 """
 
 import argparse
-from
-from
+from sinatools.utils.jaccard import jaccard
+from sinatools.utils.readfile import read_file
 
 
 def main():
@@ -92,5 +92,5 @@ def main():
 if __name__ == '__main__':
     main()
 
-#
-#
+# jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+# jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
sinatools/CLI/utils/remove_latin.py ADDED
@@ -0,0 +1,34 @@
+"""
+About:
+------
+The remove_latin command removes Latin characters from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running remove_latin --help.
+
+.. code-block:: none
+    remove_latin --text=TEXT
+    remove_latin --file "path/to/your/file.txt"
+
+Examples:
+---------
+.. code-block:: none
+    latin_remove --text "123test"
+    latin_remove --file "path/to/your/file.txt"
+"""
+
+import argparse
+from sinatools.utils.parser import remove_latin
+
+
+def main():
+    parser = argparse.ArgumentParser(description='remove latin characters from the text')
+
+    parser.add_argument('--text', type=str, required=True, help='The input text')
+    args = parser.parse_args()
+    result = remove_latin(args.text)
+
+    print(result)
+if __name__ == '__main__':
+    main()
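For readers tracking the rename, the new CLI is a thin wrapper around a single parser helper. A minimal sketch of calling it directly from Python, assuming remove_latin(text) returns the cleaned string, as the CLI body above implies (the sample input "123test" is taken from the docstring; how digits are treated is not shown in this diff):

    from sinatools.utils.parser import remove_latin

    # Assumption: returns the input with Latin characters deleted,
    # matching what the CLI above prints.
    print(remove_latin("123test"))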
sinatools/CLI/utils/remove_punctuation.py ADDED
@@ -0,0 +1,42 @@
+"""
+About:
+------
+The remove_punctuation command removes punctuation marks from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running remove_punctuation --help.
+
+.. code-block:: none
+
+    Usage:
+        remove_punctuation --text=TEXT
+        remove_punctuation --file "path/to/your/file.txt"
+
+Examples:
+---------
+.. code-block:: none
+
+    remove_punctuation --text "te%s@t...!!?"
+    remove_punctuation --file "path/to/your/file.txt"
+"""
+
+import argparse
+from sinatools.utils.parser import remove_punctuation
+#from sinatools.utils.parser import read_file
+#from sinatools.utils.parser import write_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description='remove punctuation marks from the text')
+
+    parser.add_argument('--text', required=True, help="input text")
+    # parser.add_argument('myFile', type=argparse.FileType('r'), help='Input file csv')
+    args = parser.parse_args()
+    result = remove_punctuation(args.text)
+
+    print(result)
+if __name__ == '__main__':
+    main()
+
+
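The same pattern applies to the punctuation helper. A minimal sketch, assuming remove_punctuation(text) returns the stripped string (the sample string comes from the docstring above; whether % and @ count as punctuation is not shown in this diff):

    from sinatools.utils.parser import remove_punctuation

    # Assumption: punctuation marks are deleted and the remaining
    # characters are returned unchanged.
    print(remove_punctuation("te%s@t...!!?"))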
{nlptools → sinatools}/CLI/utils/sentence_tokenizer.py RENAMED
@@ -2,20 +2,19 @@
 About:
 ------
 
-The
+The sentence_tokenizer command allows you to tokenize text into sentences using the SinaTools utility. It provides
 flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also
 allows tokenization at new lines.
 
 Usage:
 ------
-Below is the usage information that can be generated by running
+Below is the usage information that can be generated by running sentence_tokenizer --help.
 
 .. code-block:: none
 
     Usage:
-
-
-        sina_sentence_tokenize --file=FILE [options]
+        sentence_tokenizer --text=TEXT [options]
+        sentence_tokenizer --file=FILE [options]
 
 .. code-block:: none
 
@@ -38,23 +37,13 @@ Examples:
 
 .. code-block:: none
 
-
-
-    sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
-
-Note:
------
-
-.. code-block:: none
-
-    - The tokenization options allow for a customized experience. You can choose any combination of the options, or even none
-    - of them, to achieve the desired sentence tokenization behavior. If no tokenization options are provided, the tool will
-    - use default settings as implemented in the underlying `sent_tokenize` function of SinaTools.
+    sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
+    sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
 
 """
 import argparse
-from
-from
+from sinatools.utils.tokenizer import sentence_tokenizer
+from sinatools.utils.readfile import read_file
 
 def main():
     parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')
@@ -77,7 +66,7 @@ def main():
     text_content = args.text if args.text else read_file(args.file)
 
     # Perform sentence tokenization
-    sentences =
+    sentences = sentence_tokenizer(" ".join(text_content), dot=args.dot, new_line=args.new_line,
                                    question_mark=args.question_mark, exclamation_mark=args.exclamation_mark)
 
     # Print each sentence in a new line
@@ -86,5 +75,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-#sina_sentence_tokenize --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
-#sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
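The CLI body above fills in the library call that was blank in 0.1.4: sentence_tokenizer now lives in sinatools.utils.tokenizer and takes four boolean split options. A minimal sketch of the direct call, assuming it returns a list of sentence strings (as the CLI's print-per-line loop suggests):

    from sinatools.utils.tokenizer import sentence_tokenizer

    # Split at dots, question marks and exclamation marks; keep new lines intact.
    sentences = sentence_tokenizer("Your text here. Does it work? Yes!",
                                   dot=True, new_line=False,
                                   question_mark=True, exclamation_mark=True)
    for sentence in sentences:
        print(sentence)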
{nlptools → sinatools}/CLI/utils/text_transliteration.py RENAMED
@@ -2,21 +2,21 @@
 About:
 ------
 
-The
+The transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility
 takes in a text and a desired schema, and outputs the transliterated text.
 
 Usage:
 ------
-Below is the usage information that can be generated by running
+Below is the usage information that can be generated by running transliterate --help.
 
 Usage:
 ------
 
 .. code-block:: none
 
-
+    transliterate --text=TEXT --schema=SCHEMA
 
-
+    transliterate --file=FILE --schema=SCHEMA
 
 Options:
 --------
@@ -33,21 +33,14 @@ Examples:
 
 .. code-block:: none
 
-
-
+    transliterate --text "klmp" --schema "bw2ar"
+    transliterate --file "path/to/your/file.txt" --schema "ar2bw"
 
-Note:
------
-
-.. code-block:: none
-
-    For available transliteration schemas and more details, please refer to the SinaTools' documentation or the source code
-    of the function `perform_transliteration`.
 
 """
 import argparse
-from
-from
+from sinatools.utils.text_transliteration import perform_transliteration
+from sinatools.utils.readfile import read_file
 
 def main():
     parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')
@@ -73,5 +66,5 @@ def main():
 if __name__ == '__main__':
     main()
 
-#
-#
+#transliterate --text "example text" --schema "bw2ar"
+#transliterate --file "path/to/your/file.txt" --schema "bw2ar"
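A minimal sketch of the renamed import. The CLI passes a text and a schema, so the call below assumes positional perform_transliteration(text, schema) arguments, with the two schema names shown in the examples ("bw2ar" for Buckwalter to Arabic script, "ar2bw" for the reverse):

    from sinatools.utils.text_transliteration import perform_transliteration

    # Assumption: (text, schema) mirrors the --text/--schema CLI options above.
    print(perform_transliteration("klmp", "bw2ar"))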
{nlptools → sinatools}/DataDownload/downloader.py RENAMED
@@ -35,25 +35,25 @@ def get_appdatadir():
     .. highlight:: python
     .. code-block:: python
 
-        from
+        from sinatools.DataDownload import downloader
 
         path = downloader.get_appdatadir()
 
-        Windows: 'C:/Users/<Username>/AppData/Roaming/
-        MacOS: '/Users/<Username>/Library/Application Support/
-        Linux: '/home/<Username>/.
-        Google Colab: '/content/
+        Windows: 'C:/Users/<Username>/AppData/Roaming/sinatools'
+        MacOS: '/Users/<Username>/Library/Application Support/sinatools'
+        Linux: '/home/<Username>/.sinatools'
+        Google Colab: '/content/sinatools'
 
     """
     home = str(Path.home())
     if 'google.colab' in sys.modules:
-        path = Path('/content/
+        path = Path('/content/sinatools')
     elif sys.platform == 'win32':
-        path = Path(home, 'AppData/Roaming/
+        path = Path(home, 'AppData/Roaming/sinatools')
     elif sys.platform == 'darwin':
-        path = Path(home, 'Library/Application Support/
+        path = Path(home, 'Library/Application Support/sinatools')
     else:
-        path = Path(home, '.
+        path = Path(home, '.sinatools')
 
     if not os.path.exists(path):
         os.makedirs(path)
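The docstring above is self-describing; the resolved directory is where SinaTools stores downloaded resources such as lemmas_dic.pickle (loaded by sinatools/morphology/__init__.py further down in this diff):

    from sinatools.DataDownload import downloader

    # Creates the directory on first call if it does not exist.
    path = downloader.get_appdatadir()
    print(path)  # e.g. /home/<Username>/.sinatools on Linux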
sinatools/VERSION ADDED
@@ -0,0 +1 @@
+0.1.8
{nlptools → sinatools}/morphology/ALMA_multi_word.py RENAMED
@@ -1,13 +1,12 @@
-from
-from nlptools.utils.parser import arStrip
+from sinatools.utils.parser import arStrip
 import json
-
+from . import dictionary
 
 def ALMA_multi_word(multi_word):
     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars
     result_word = []
-    if undiac_multi_word in
-    result_word =
+    if undiac_multi_word in dictionary.keys():
+        result_word = dictionary[undiac_multi_word]
 
     my_json = {}
     glosses_list = []
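After the rename, the function is imported from sinatools.morphology and reads the module-level dictionary instead of the removed settings module. A minimal sketch with a hypothetical multi-word expression; the returned structure (built from my_json and glosses_list) is only partially visible in this diff:

    from sinatools.morphology.ALMA_multi_word import ALMA_multi_word

    # Hypothetical input; the lookup strips diacritics before consulting
    # the dictionary loaded from lemmas_dic.pickle.
    result = ALMA_multi_word("بيت لحم")
    print(result)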
{nlptools → sinatools}/morphology/__init__.py RENAMED
@@ -1,23 +1,13 @@
-from nlptools.morphology import settings
 import pickle
-from
+from sinatools.DataDownload import downloader
 import os
 
-
-#path =downloader.get_appdatadir()
-#file_path = os.path.join(path, filename)
-#with open(file_path, 'rb') as f:
-#    #Load the serialized data from the file
-#    settings.div_dic = pickle.load(f)
-
-
+dictionary = {}
 filename = 'lemmas_dic.pickle'
-path =downloader.get_appdatadir()
+path = downloader.get_appdatadir()
 file_path = os.path.join(path, filename)
 with open(file_path, 'rb') as f:
-
-    settings.div_dic = pickle.load(f)
-
+    dictionary = pickle.load(f)
 
 #filename_five = 'five_grams.pickle'
 #path =downloader.get_appdatadir()
sinatools/morphology/morph_analyzer.py ADDED
@@ -0,0 +1,172 @@
+import re
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+from sinatools.utils.parser import arStrip
+from sinatools.utils.charsets import AR_CHARSET, AR_DIAC_CHARSET
+from sinatools.DataDownload.downloader import get_appdatadir
+from . import dictionary
+
+_IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
+
+def find_solution(token, language, flag):
+    if token in dictionary.keys():
+        resulted_solutions = []
+        solutions = dictionary[token]
+        if flag == '1':
+            solutions = [solutions[0]]
+        for solution in solutions:
+            # token, freq, lemma, lemma_id, root, pos
+            resulted_solutions.append([token, solution[0], solution[1], solution[2], solution[3], solution[4]])
+        return resulted_solutions
+    else:
+        return []
+
+
+
+def analyze(text, language='MSA', task='full', flag="1"):
+    """
+    This method processes an input text and returns the morphological analysis for each token within the text, based on the specified language, task, and flag. As follows:
+    If:
+        The task is lemmatization, the morphological solution includes only the lemma_id, lemma, token, and token frequency.
+        The task is pos, the morphological solution includes only the part-of-speech, token, and token frequency.
+        The task is root, the morphological solution includes only the root, token, and token frequency.
+        The task is full, the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
+
+    Args:
+        text (:obj:`str`): The Arabic text to be morphologically analyzed.
+        language (:obj:`str`): The type of the input text. Currently, only Modern Standard Arabic (MSA) is supported.
+        task (:obj:`str`): The task to filter the results by. Options are [lemmatization, pos, root, full]. The default task if not specified is `full`.
+        flag (:obj:`str`): The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
+
+    Returns:
+        list (:obj:`list`): A list of JSON objects, where each JSON object contains:
+            token: The token from the original text.
+            lemma: The lemma of the token.
+            lemma_id: The id of the lemma.
+            pos: The part-of-speech of the token.
+            root: The root of the token.
+            frequency: The frequency of the token.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.morphology.morph_analyzer import analyze
+
+        # Return the morphological solution for each token in this text
+        # Example: task = full
+        analyze('ذهب الولد الى المدرسة')
+
+        [
+          {
+            "token": "ذهب",
+            "lemma": "ذَهَبَ",
+            "lemma_id": "202001617",
+            "root": "ذ ه ب",
+            "pos": "فعل ماضي",
+            "frequency": "82202"
+          },{
+            "token": "الولد",
+            "lemma": "وَلَدٌ",
+            "lemma_id": "202003092",
+            "root": "و ل د",
+            "pos": "اسم",
+            "frequency": "19066"
+          },{
+            "token": "إلى",
+            "lemma": "إِلَى",
+            "lemma_id": "202000856",
+            "root": "إ ل ى",
+            "pos": "حرف جر",
+            "frequency": "7367507"
+          },{
+            "token": "المدرسة",
+            "lemma": "مَدْرَسَةٌ",
+            "lemma_id": "202002620",
+            "root": "د ر س",
+            "pos": "اسم",
+            "frequency": "145285"
+          }
+        ]
+    """
+
+    output_list = []
+
+    tokens = simple_word_tokenize(text)
+
+    for token in tokens:
+        result_token = []
+        token = arStrip(token, False, True, False, False, False, False)
+        token = re.sub('[ٱ]', 'ﺍ', token)
+        # token, freq, lemma, lemma_id, root, pos
+        solution = [token, 0, token + "_0", 0, token, ""]
+
+        if token.isdigit():
+            solution[5] = "digit"  # pos
+
+        elif not _is_ar(token):
+            solution[5] = "Foreign"  # pos
+
+        else:
+            result_token = find_solution(token, language, flag)
+
+            if result_token == []:
+                token_without_al = re.sub(r'^[ﻝ]', '', re.sub(r'^[ﺍ]', '', token))
+                if len(token_without_al) > 5:
+                    result_token = find_solution(token_without_al, language, flag)
+
+            if result_token == []:
+                # try with replacing ﻩ with ﺓ
+                result_token = find_solution(re.sub(r'[ﻩ]$', 'ﺓ', token), language, flag)
+
+
+            if result_token == []:
+                # try with unified Alef
+                word_with_unify_alef = arStrip(token, False, False, False, False, True, False)  # unify Alef
+                result_token = find_solution(word_with_unify_alef, language, flag)
+
+            if result_token == []:
+                # try with diacritics removed
+                word_undiac = arStrip(token, True, False, True, True, False, False)  # remove diacs, shaddah, digit
+                result_token = find_solution(word_undiac, language, flag)
+
+            if result_token == []:
+                # try with diacritics removed and Alef unified
+                word_undiac = arStrip(token, True, True, True, False, True, False)  # diacs, smallDiacs, shaddah, alif
+                result_token = find_solution(word_undiac, language, flag)
+
+        if result_token != []:
+            output_list += result_token
+        else:
+            output_list += [solution]
+
+    return filter_results(output_list, task)
+
+
+def filter_results(data, task):
+    filtered_data = []
+    # token, freq, lemma, lemma_id, root, pos
+    if task == 'lemmatization':
+        filtered_data = [{'token': item[0], 'lemma': item[2], 'lemma_id': item[3], 'frequency': item[1]} for item in data]
+    elif task == 'pos':
+        filtered_data = [{'token': item[0], 'pos': item[5], 'frequency': item[1]} for item in data]
+    elif task == 'root':
+        filtered_data = [{'token': item[0], 'root': item[4], 'frequency': item[1]} for item in data]
+    else:
+        filtered_data = [{'token': item[0], 'lemma': item[2], 'lemma_id': item[3], 'root': item[4], 'pos': item[5], 'frequency': item[1]} for item in data]
+
+    return filtered_data
+
+
+def _is_ar(word):
+    return _IS_AR_RE.match(word) is not None
+
+
+
+
+
+
+
+
+
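The docstring above demonstrates the default full task; combining it with filter_results, a short sketch of one of the filtered modes:

    from sinatools.morphology.morph_analyzer import analyze

    # task='lemmatization' keeps token, lemma, lemma_id and frequency;
    # flag='*' would return every solution, highest frequency first.
    for item in analyze('ذهب الولد الى المدرسة', task='lemmatization', flag='1'):
        print(item['token'], item['lemma'], item['frequency'])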
sinatools/ner/__init__.py ADDED
@@ -0,0 +1,12 @@
+from sinatools.DataDownload import downloader
+import os
+from sinatools.ner.utils.helpers import load_checkpoint
+
+tagger = None
+tag_vocab = None
+train_config = None
+
+filename = 'Wj27012000.tar'
+path = downloader.get_appdatadir()
+model_path = os.path.join(path, filename)
+tagger, tag_vocab, train_config = load_checkpoint(model_path)
nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py RENAMED
@@ -1,9 +1,10 @@
 import os
 from collections import namedtuple
-from
-from
-from
-import
+from sinatools.ner.utils.helpers import load_checkpoint
+from sinatools.ner.utils.data import get_dataloaders, text2segments
+from sinatools.DataDownload import downloader
+from . import tag_vocab, train_config, tagger
+
 def ner(text, batch_size=32):
     """
     This method takes a text as input, and a batch size, then performs named entity recognition (NER) on the input text and returns a list of tagged mentions.
@@ -20,7 +21,7 @@ def ner(text, batch_size=32):
     .. highlight:: python
     .. code-block:: python
 
-        from
+        from sinatools.arabiner.bin import infer
         infer.ner('ذهب محمد الى جامعة بيرزيت')
 
         #the output
@@ -42,19 +43,19 @@ def ner(text, batch_size=32):
     dataset, token_vocab = text2segments(text)
 
     vocabs = namedtuple("Vocab", ["tags", "tokens"])
-    vocab = vocabs(tokens=token_vocab, tags=
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
 
     # From the datasets generate the dataloaders
     dataloader = get_dataloaders(
        (dataset,),
        vocab,
-
+       train_config.data_config,
        batch_size=batch_size,
        shuffle=(False,),
    )[0]
 
     # Perform inference on the text and get back the tagged segments
-    segments =
+    segments = tagger.infer(dataloader)
     segments_lists = []
     # Print results
     for segment in segments:
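Note that the docstring example still imports from the removed sinatools.arabiner.bin path. Under the layout introduced by this release, the call would presumably be as follows (an assumption based on the file rename above, not on documented usage):

    from sinatools.ner.entity_extractor import ner

    # Importing sinatools.ner loads the Wj27012000.tar checkpoint once;
    # ner() then tags the text in batches.
    mentions = ner('ذهب محمد الى جامعة بيرزيت', batch_size=32)
    print(mentions)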
{nlptools → sinatools}/salma/views.py RENAMED
@@ -1,12 +1,12 @@
 import json
-from
-from
-from
-from
-from
-from
-from
-#from
+from sinatools.salma import settings
+from sinatools.salma.wsd import normalizearabert
+from sinatools.salma.wsd import GlossPredictor
+from sinatools.utils.parser import arStrip
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
+from sinatools.morphology.morph_analyzer import analyze
+#from sinatools.ner.entity_extractor import ner
 
 def delete_form_list(position, word_lemma):
     #"""
@@ -424,7 +424,7 @@ def SALMA(sentence):
     .. highlight:: python
     .. code-block:: python
 
-        from
+        from sinatools.salma.views import SALMA
         JSON = SALMA("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
         print(JSON["resp"])
 
{nlptools → sinatools}/salma/wsd.py RENAMED
@@ -1,11 +1,11 @@
-from
+from sinatools.salma import settings
 import re
 import warnings
 warnings.filterwarnings("ignore")
 import torch
 import numpy as np
 import pandas as pd
-from
+from sinatools.arabert.preprocess import ArabertPreprocessor
 
 def normalizearabert(s):
     model_name = 'aubmindlab/bert-base-arabertv02'
{nlptools/morphology → sinatools/utils}/charsets.py RENAMED
@@ -1,6 +1,4 @@
-#
-# We acknoledge that this file charsets.py is imported from Camel tools citation. url
-#
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 
 import unicodedata
 