SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
@@ -0,0 +1,81 @@
+import os
+import csv
+from nlptools.utils.sentence_tokenizer import sent_tokenize
+from nlptools.morphology.tokenizers_words import simple_word_tokenize
+import pandas as pd
+
+"""
+CSV NER Tagging Tool
+
+Usage:
+------
+Run the script with the following command:
+
+arabi_ner2 input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
+"""
+
+import argparse
+import pandas as pd
+from nlptools.utils.sentence_tokenizer import sent_tokenize
+from nlptools.morphology.tokenizers_words import simple_word_tokenize
+from nlptools.arabiner.bin.infer import ner
+
+def infer(sentence):
+    output = ner(sentence)
+    return [word[1] for word in output]
+
+
+def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row_id, global_sentence_id):
+    print(input_csv, output_csv, text_column, additional_columns)
+    row_id = row_id - 1
+    global_sentence_id = global_sentence_id - 1
+    fieldnames = ['Row_ID', 'Docs_Sentence_Word_ID', 'Global Sentence ID', 'Sentence ID', 'Sentence', 'Word Position', 'Word', 'Ner tags']
+    for additional_column in additional_columns:
+        fieldnames.append(additional_column)
+
+    with open(output_csv, 'w', newline='', encoding="utf-8") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+
+        df = pd.read_csv(input_csv)
+        for index, row in df.iterrows():
+            sentences = sent_tokenize(row[text_column], dot=True, new_line=True, question_mark=False, exclamation_mark=False)
+            for sentence_id, sentence in enumerate(sentences, start=1):
+                words = simple_word_tokenize(sentence)
+                global_sentence_id += 1
+
+                tags = infer(sentence)
+                for word_position, word in enumerate(words, start=1):
+                    row_id += 1
+                    doc_sentence_filename = input_csv.split(".csv")[0]
+                    docs_sentence_word_id = f"{doc_sentence_filename}_{global_sentence_id}_{sentence_id}_{word_position}"
+                    output_dic = {'Row_ID': row_id, 'Docs_Sentence_Word_ID': docs_sentence_word_id, 'Global Sentence ID': global_sentence_id, 'Sentence ID': sentence_id,
+                                  'Sentence': sentence, 'Word Position': word_position, 'Word': word, 'Ner tags': tags[word_position-1]}
+                    for additional_column in additional_columns:
+                        output_dic[additional_column] = row[additional_column]
+
+                    writer.writerow(output_dic)
+
+def main():
+    parser = argparse.ArgumentParser(description="CSV NER Tagging Tool")
+    parser.add_argument("--input_csv", help="Path to the input CSV file")
+    parser.add_argument("--text_column", required=True,
+                        help="Column index in the CSV file to apply NER tagging")
+    parser.add_argument("--additional_columns", nargs='*', default=[],
+                        help="Additional column indexes to retain in the output seperated by , ")
+    parser.add_argument("--output_csv", default="output.csv",
+                        help="Path to the output CSV file")
+    parser.add_argument("--row_id", default="1",
+                        help="Row id to starts with")
+    parser.add_argument("--global_sentence_id", default="1",
+                        help="global_sentence_id to starts with")
+
+    args = parser.parse_args()
+    corpus_tokenizer(args.input_csv, args.output_csv, args.text_column, args.additional_columns, int(args.row_id), int(args.global_sentence_id))
+
+
+if __name__ == "__main__":
+    main()
+
+
+
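For readers skimming the hunk above: the inner loop writes one CSV row per token, keyed by a composite identifier built from the input file name, the global sentence counter, the per-document sentence id, and the word position. A minimal self-contained sketch of that row layout follows; the file name, sentence, and tags in it are hypothetical placeholders, not real model output.

# Sketch of the row layout produced by corpus_tokenizer() above.
# "articles.csv", the sentence, and the tags are placeholders, not real NER output.
input_csv = "articles.csv"
global_sentence_id, sentence_id = 1, 1
sentence = "word1 word2"
words = sentence.split()
tags = ["B-PERS", "O"]   # stand-ins for whatever ner() would return
doc = input_csv.split(".csv")[0]
for word_position, word in enumerate(words, start=1):
    print({
        'Row_ID': word_position,
        'Docs_Sentence_Word_ID': f"{doc}_{global_sentence_id}_{sentence_id}_{word_position}",
        'Global Sentence ID': global_sentence_id,
        'Sentence ID': sentence_id,
        'Sentence': sentence,
        'Word Position': word_position,
        'Word': word,
        'Ner tags': tags[word_position - 1],
    })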
@@ -0,0 +1,75 @@
+"""
+About:
+------
+The sina_alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_alma_multi_word --help.
+
+.. code-block:: none
+
+    sina_alma_multi_word --multi_word=MULTI_WORD_TEXT
+    sina_alma_multi_word --file
+
+Options:
+--------
+
+.. code-block:: none
+
+    --multi_word MULTI_WORD_TEXT
+        The multi-word Arabic text that needs to be analyzed.
+    --file
+        File containing the multi-word text to be analyzed
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_alma_multi_word --multi_word "Your multi-word text here"
+    sina_alma_multi_word --file "path/to/your/file.txt"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+    - The tool returns results in JSON format with proper indentation for better readability.
+    - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `ALMA_multi_word` utility.
+    - The tool is specifically designed for multi-word input. For single-word morphological analysis, other specific utilities/tools might be more appropriate.
+
+"""
+
+import argparse
+from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
+import json
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
+
+    # Adding arguments for the multi-word input or file containing the multi-word input
+    parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
+    parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')
+
+    args = parser.parse_args()
+
+    if args.multi_word is None and args.file is None:
+        print("Error: Either --multi_word or --file argument must be provided.")
+        return
+
+    # Get the input either from the --multi_word argument or from the file specified in the --file argument
+    multi_word_text = args.multi_word if args.multi_word else " ".join(read_file(args.file))
+
+    # Perform multi-word analysis
+    results = ALMA_multi_word(multi_word_text)
+
+    # Print the results in JSON format
+    print(json.dumps(results, ensure_ascii=False, indent=4))
+
+if __name__ == '__main__':
+    main()
+#sina_alma_multi_word --multi_word "Your multi-word text here"
+#sina_alma_multi_word --file "path/to/your/file.txt"
@@ -0,0 +1,91 @@
+"""
+About:
+------
+The sina_morph_analyze tool is designed to provide morphological analysis for Arabic text using the SinaTools' `analyze` utility. Users can specify the language and desired analysis task (e.g., lemmatization, part-of-speech tagging, or a full morphological analysis).
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_morph_analyze --help.
+
+.. code-block:: none
+
+    sina_morph_analyze --text=TEXT [OPTIONS]
+    sina_morph_analyze --file=FILE [OPTIONS]
+
+Options:
+--------
+
+.. code-block:: none
+
+    --text TEXT
+        The text that needs to be morphologically analyzed.
+
+    --file FILE
+        File containing the text to be morphologically analyzed
+
+    --language LANGUAGE [default=MSA]
+        Specifies the language for the analysis. The default is MSA (Modern Standard Arabic).
+        Use other codes as appropriate for your requirements.
+
+    --task TASK [default=full]
+        Determines the specific type of morphological analysis to be performed. Available options are:
+        - lemmatizer: Provides lemmatization results.
+        - pos: Provides part-of-speech tagging.
+        - full: Provides a comprehensive morphological analysis.
+        The default is a full morphological analysis.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
+    sina_morph_analyze --text "Your Arabic text here" --task lemmatizer
+    sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
+    sina_morph_analyze --file "path/to/your/file.txt" --task lemmatizer
+
+Note:
+-----
+
+.. code-block:: none
+
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+    - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `analyze` utility.
+    - The analysis can be influenced by the choice of language. Ensure you are using the correct language setting.
+
+"""
+
+import argparse
+from nlptools.morphology.morph_analyzer import analyze
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
+
+    # Adding arguments for the text, file, language, and task
+    parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
+    parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
+    parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
+    parser.add_argument('--task', type=str, default='full', choices=['lemmatizer', 'pos', 'full'],
+                        help='Task for the result filter [lemmatizer, pos, full] (default: full)')
+
+    args = parser.parse_args()
+
+    if args.text is None and args.file is None:
+        print("Error: Either --text or --file argument must be provided.")
+        return
+
+    # Get the input either from the --text argument or from the file specified in the --file argument
+    input_text = args.text if args.text else " ".join(read_file(args.file))
+
+    # Perform morphological analysis
+    results = analyze(input_text, args.language, args.task)
+
+    # Print the results
+    for result in results:
+        print(result)
+
+if __name__ == '__main__':
+    main()
+#sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
+#sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
@@ -0,0 +1,68 @@
+"""
+SALMA CLI
+
+About:
+------
+The SALMA command line interface (CLI) is a tool designed to utilize the SALMA function for processing Arabic sentences. This CLI allows users to input an Arabic sentence and receive a structured response that includes the processing result of the SALMA function.
+
+Usage:
+------
+Below is the usage information that can be generated by running the command with the --help option.
+
+.. code-block:: none
+
+    sina_salma --text=TEXT
+    sina_salma --file=INPUT_FILE
+
+Options:
+--------
+.. code-block:: none
+
+    --text
+        The Arabic sentence to be processed by the SALMA function.
+    --file
+        The text file to be processed by the SALMA function.
+
+Examples:
+---------
+.. code-block:: none
+
+    sina_salma --text "your Arabic sentence here"
+    sina_salma --file "path/to/your/file.txt"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The input sentence should be provided in Arabic.
+    - It is recommended that the length of the input sentence does not exceed 500 characters to ensure optimal performance and accurate results.
+
+"""
+
+import argparse
+import json
+from nlptools.salma.views import SALMA
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
+
+    parser.add_argument('--text', type=str, help='Input sentence to process')
+    parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process')
+
+    args = parser.parse_args()
+
+    if args.text is None and args.file is None:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    text_content = args.text if args.text else " ".join(read_file(args.file))
+    result = SALMA(text_content)
+    print(json.dumps(result, ensure_ascii=False, indent=4))
+
+if __name__ == "__main__":
+    main()
+
+#sina_salma --text "your Arabic sentence here"
+#sina_salma --file "path/to/your/file.txt"
File without changes
@@ -0,0 +1,99 @@
+"""
+
+About:
+------
+The sina_arStrip tool offers functionality to strip various elements from Arabic text using the SinaTools' `arStrip` utility. It provides flexibility to selectively strip diacritics, small diacritics, shaddah, digits, alif, and special characters.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_arStrip --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_arStrip --text=TEXT [OPTIONS]
+        sina_arStrip --file "path/to/your/file.txt" [OPTIONS]
+
+.. code-block:: none
+
+    Options:
+      --text TEXT
+            The Arabic text that needs to be stripped.
+
+      --file FILE
+            File containing text to be stripped.
+
+      --diacs BOOL [default=True]
+            Indicates whether to strip diacritics.
+
+      --smallDiacs BOOL [default=True]
+            Indicates whether to strip small diacritics.
+
+      --shaddah BOOL [default=True]
+            Indicates whether to strip shaddah.
+
+      --digit BOOL [default=True]
+            Indicates whether to strip digits.
+
+      --alif BOOL [default=True]
+            Indicates whether to strip alif.
+
+      --specialChars BOOL [default=True]
+            Indicates whether to strip special characters.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
+
+    sina_arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
+
+Note:
+-----
+
+.. code-block:: none
+
+    - This tool is specific to Arabic text, as it focuses on Arabic linguistic elements.
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+    - Stripping certain elements might change the meaning or readability of the text. Use it judiciously.
+
+"""
+
+import argparse
+from nlptools.utils.parser import arStrip
+from nlptools.utils.readfile import read_file
+
+def main():
+    parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
+
+    parser.add_argument('--text', type=str, help='Text to be stripped')
+    parser.add_argument('--file', type=str, help='File containing text to be stripped')
+    parser.add_argument('--diacs', type=bool, default=True, help='Whether to strip diacritics')
+    parser.add_argument('--smallDiacs', type=bool, default=True, help='Whether to strip small diacritics')
+    parser.add_argument('--shaddah', type=bool, default=True, help='Whether to strip shaddah')
+    parser.add_argument('--digit', type=bool, default=True, help='Whether to strip digits')
+    parser.add_argument('--alif', type=bool, default=True, help='Whether to strip alif')
+    parser.add_argument('--specialChars', type=bool, default=True, help='Whether to strip special characters')
+
+    args = parser.parse_args()
+
+    if args.file:
+        text_content = read_file(args.file)
+    elif args.text:
+        text_content = args.text
+    else:
+        print("Either --text or --file argument must be provided.")
+        return
+
+    stripped_text = arStrip(text_content, diacs=args.diacs, smallDiacs=args.smallDiacs,
+                            shaddah=args.shaddah, digit=args.digit, alif=args.alif, specialChars=args.specialChars)
+
+    print(stripped_text)
+
+if __name__ == '__main__':
+    main()
+
+#sina_arStrip --text "example text" --diacs=True
+#sina_arStrip --file "path/to/your/file.txt" --diacs=True
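A general argparse detail worth keeping in mind when reading the boolean options above (standard Python behaviour, not anything specific to SinaTools): `type=bool` simply applies `bool()` to the raw string, so any non-empty value, including the literal "False" shown in the usage examples, evaluates to True. A minimal demonstration with a throwaway parser:

# bool("False") is True, so type=bool cannot turn a flag off from the command line.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--diacs', type=bool, default=True)
print(parser.parse_args(['--diacs=False']).diacs)   # prints True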
@@ -0,0 +1,74 @@
+"""
+
+About:
+------
+
+The sina_corpus_tokenizer tool offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
+
+Usage:
+-------
+
+Below is the usage information that can be generated by running sina_corpus_tokenizer --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_corpus_tokenizer dir_path output_csv
+
+.. code-block:: none
+
+    Positional Arguments:
+      dir_path
+            The path to the directory containing the text files.
+
+      output_csv
+            The path to the output CSV file.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The tool only processes text files (with a .txt extension).
+    - The output CSV will contain the following columns:
+        - 'Row_ID' (a unique identifier for each records in outputfile)
+        - 'Docs_Sentence_Word_ID' (a concatenated identifier comprising directory name, file name, global sentence id, sentence id, and word position).
+        - 'GlobalSentenceID' (Integer, a unique identifier for each sentence in the entire file)
+        - 'SentenceID' (Integer, a unique identifier for each file within the CSV file)
+        - 'Sentence' (Generated text that forms a sentence)
+        - 'Word Position' (Integer, the position of each word within the sentence)
+        - 'Word' (Each row contains a word from the generated sentence).
+    - Ensure that the text files are appropriately encoded in UTF-8 or compatible formats.
+    - The tool uses the `nltk` library for sentence and word tokenization. Make sure to have the library installed in your environment.
+"""
+
+import argparse
+from nlptools.utils.corpus_tokenizer import corpus_tokenizer
+
+# Define the main function that will parse the arguments
+def main():
+    # Create an ArgumentParser object
+    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
+
+    # Add arguments to the parser
+    parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
+    parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
+
+    # Parse the command-line arguments
+    args = parser.parse_args()
+
+    # Call the corpus_tokenizer function with the parsed arguments
+    corpus_tokenizer(args.dir_path, args.output_csv)
+
+# Call the main function when the script is executed
+if __name__ == '__main__':
+    main()
+
+#sina_corpus_tokenizer /path/to/text/files output.csv
@@ -0,0 +1,92 @@
+"""
+About:
+------
+The sina_implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_implication --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_implication --inputWord1=WORD1 --inputWord2=WORD2
+
+        sina_implication --inputFile1=File1 --inputFile2=File2
+
+.. code-block:: none
+
+    Options:
+      --inputWord1 WORD1
+            First input word.
+
+      --inputWord2 WORD2
+            Second input word.
+
+      --file1 FILE1
+            File containing the words to evaluate the implication.
+
+      --file2 FILE2
+            File containing the words to evaluate the implication.
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_implication --inputWord1 "word1" --inputWord2 "word2"
+
+    sina_implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The results are based on the underlying logic and data sets present in the `Implication` class of SinaTools.
+    - The tool compares the implication between two words, and the relationship might vary based on linguistic nuances.
+
+"""
+import argparse
+from nlptools.utils.implication import Implication
+
+def read_file(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        word = file.readline().strip()
+        if word:
+            return word
+        else:
+            raise ValueError(f"File {file_path} must contain at least one word.")
+
+def main():
+    parser = argparse.ArgumentParser(description='Evaluate Implication between two words using SinaTools')
+
+    # Adding optional arguments for the two input words and the files
+    parser.add_argument('--inputWord1', type=str, help='First input word')
+    parser.add_argument('--inputWord2', type=str, help='Second input word')
+    parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
+    parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')
+
+    args = parser.parse_args()
+
+    if args.file1 and args.file2:
+        word1 = read_file(args.file1)
+        word2 = read_file(args.file2)
+    elif args.inputWord1 and args.inputWord2:
+        word1, word2 = args.inputWord1, args.inputWord2
+    else:
+        print("Either --file1 and --file2 arguments or both --inputWord1 and --inputWord2 arguments must be provided.")
+        return
+
+    # Instantiate the Implication class
+    implication_obj = Implication(word1, word2)
+
+    # For this example, assuming there is a method `get_result()` in the Implication class.
+    result = implication_obj.get_result()
+    print(result)
+
+if __name__ == '__main__':
+    main()
+# sina_implication --inputWord1 "word1" --inputWord2 "word2"
+# sina_implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
+
+
@@ -0,0 +1,96 @@
+"""
+About:
+------
+The sina_jaccard tool computes the Jaccard similarity between two sets of strings. The Jaccard similarity is the size of the intersection divided by the size of the union of the sample sets. It provides a measure of similarity between two sets.
+
+Usage:
+------
+Below is the usage information that can be generated by running sina_jaccard --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_jaccard --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+
+        sina_jaccard --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+
+.. code-block:: none
+
+    Options:
+      --list1 WORD1 WORD2 ...
+            First list of strings (delimiter-separated).
+      --list2 WORD1 WORD2 ...
+            Second list of strings (delimiter-separated).
+      --file1
+            First file containing the first set of words
+      --file2
+            Second file containing the second set of words
+      --delimiter
+            Denote the bounds between regions in a text
+      --selection
+            Selecting the Jaccard function type, which can be one of the following options: 'jaccardAll', 'intersection', 'union', or 'similarity'.
+      --ignoreAllDiacriticsButNotShadda
+            If this option is selected, the comparison will be between two lists after ignoring all diacritics from the lists but keeping the shadda.
+      --ignoreShaddaDiacritic
+            If this option is selected, the comparison will be between two lists after ignoring diacritics (shadda) from lists of strings.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_jaccard --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+
+    sina_jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The Jaccard similarity ranges from 0 to 1. A value of 1 indicates that the sets are identical, while a value of 0 indicates no similarity between the sets.
+    - Diacritics refer to the Arabic Diacritics (like fatha, damma, kasra, etc.) and shadda.
+    - The two normalization options can be used individually or together. However, the combination will result in both rules being applied, and thus,
+
+"""
+
+import argparse
+from nlptools.utils.jaccard import jaccard
+from nlptools.utils.readfile import read_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Compute Jaccard similarity between two sets of strings')
+
+    # Adding optional arguments for the two sets and the files
+    parser.add_argument('--delimiter', type=str, help='denote the bounds between regions in a text')
+    parser.add_argument('--list1', type=str, help='First string (delimiter-separated)')
+    parser.add_argument('--list2', type=str, help='Second string (delimiter-separated)')
+    parser.add_argument('--file1', type=str, help='File containing the first set of words')
+    parser.add_argument('--file2', type=str, help='File containing the second set of words')
+    parser.add_argument('--selection', type=str, help='selecting jaccard function type')
+    parser.add_argument('--ignoreAllDiacriticsButNotShadda', action='store_true', help='Ignore all diacritics but not shadda')
+    parser.add_argument('--ignoreShaddaDiacritic', action='store_true', help='Ignore shadda diacritic')
+
+
+    args = parser.parse_args()
+
+    if args.file1 and args.file2:
+        set1 = " ".join(read_file(args.file1))
+        set2 = " ".join(read_file(args.file2))
+    elif args.list1 is not None and args.list2 is not None:
+        set1 = args.list1
+        set2 = args.list2
+    else:
+        print("Either --file1 and --file2 arguments or both --set1 and --set2 arguments must be provided.")
+        return
+
+    similarity = jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
+
+    print("Jaccard Result:", similarity)
+
+if __name__ == '__main__':
+    main()
+
+# sina_jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+# sina_jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
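The docstring above defines Jaccard similarity as the size of the intersection divided by the size of the union. A small self-contained illustration of that formula on plain Python sets, independent of the `jaccard` utility's actual signature and normalization options, is:

# Generic Jaccard similarity over two token collections:
# |A intersect B| / |A union B|, as described in the docstring above.
def jaccard_similarity(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a or b) else 0.0

print(jaccard_similarity(["w1", "w2", "w3"], ["w2", "w3", "w4"]))   # 0.5 (2 shared of 4 total)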