SinaTools 0.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
@@ -0,0 +1,81 @@
+ """
+ CSV NER Tagging Tool
+
+ Usage:
+ ------
+ Run the script with the following command:
+
+     arabi_ner2 --input_csv input.csv --text_column "TextColumn1" --additional_columns "Column3" "Column4" --output_csv output.csv
+ """
+
+ import csv
+ import argparse
+ import pandas as pd
+ from nlptools.utils.sentence_tokenizer import sent_tokenize
+ from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ from nlptools.arabiner.bin.infer import ner
+
+
+ def infer(sentence):
+     # Run the NER model and keep only the tag of each (word, tag) pair
+     output = ner(sentence)
+     return [word[1] for word in output]
+
+
+ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row_id, global_sentence_id):
+     # Counters are decremented so the first emitted row uses the requested start values
+     row_id = row_id - 1
+     global_sentence_id = global_sentence_id - 1
+     fieldnames = ['Row_ID', 'Docs_Sentence_Word_ID', 'Global Sentence ID', 'Sentence ID', 'Sentence', 'Word Position', 'Word', 'Ner tags']
+     for additional_column in additional_columns:
+         fieldnames.append(additional_column)
+
+     with open(output_csv, 'w', newline='', encoding="utf-8") as csvfile:
+         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+         writer.writeheader()
+
+         df = pd.read_csv(input_csv)
+         for index, row in df.iterrows():
+             sentences = sent_tokenize(row[text_column], dot=True, new_line=True, question_mark=False, exclamation_mark=False)
+             for sentence_id, sentence in enumerate(sentences, start=1):
+                 words = simple_word_tokenize(sentence)
+                 global_sentence_id += 1
+
+                 # Assumes the ner() tokenization aligns with simple_word_tokenize output
+                 tags = infer(sentence)
+                 for word_position, word in enumerate(words, start=1):
+                     row_id += 1
+                     doc_sentence_filename = input_csv.split(".csv")[0]
+                     docs_sentence_word_id = f"{doc_sentence_filename}_{global_sentence_id}_{sentence_id}_{word_position}"
+                     output_dic = {'Row_ID': row_id, 'Docs_Sentence_Word_ID': docs_sentence_word_id,
+                                   'Global Sentence ID': global_sentence_id, 'Sentence ID': sentence_id,
+                                   'Sentence': sentence, 'Word Position': word_position, 'Word': word,
+                                   'Ner tags': tags[word_position - 1]}
+                     for additional_column in additional_columns:
+                         output_dic[additional_column] = row[additional_column]
+
+                     writer.writerow(output_dic)
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="CSV NER Tagging Tool")
+     parser.add_argument("--input_csv", required=True, help="Path to the input CSV file")
+     parser.add_argument("--text_column", required=True,
+                         help="Name of the CSV column to apply NER tagging to")
+     parser.add_argument("--additional_columns", nargs='*', default=[],
+                         help="Additional column names to retain in the output, separated by spaces")
+     parser.add_argument("--output_csv", default="output.csv",
+                         help="Path to the output CSV file")
+     parser.add_argument("--row_id", type=int, default=1,
+                         help="Row ID to start from")
+     parser.add_argument("--global_sentence_id", type=int, default=1,
+                         help="Global sentence ID to start from")
+
+     args = parser.parse_args()
+     corpus_tokenizer(args.input_csv, args.output_csv, args.text_column, args.additional_columns, args.row_id, args.global_sentence_id)
+
+
+ if __name__ == "__main__":
+     main()
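For illustration, the Docs_Sentence_Word_ID field above is just the input file's base name concatenated with the running counters. A minimal sketch of the scheme, mirroring the f-string in corpus_tokenizer, with hypothetical values (file "news.csv", 5th sentence overall, 2nd sentence in the current cell, 3rd word):

    # Hypothetical values; mirrors the ID construction in corpus_tokenizer above
    doc_sentence_filename = "news.csv".split(".csv")[0]         # "news"
    docs_sentence_word_id = f"{doc_sentence_filename}_{5}_{2}_{3}"
    print(docs_sentence_word_id)                                # news_5_2_3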
@@ -0,0 +1,75 @@
+ """
+ About:
+ ------
+ The sina_alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_alma_multi_word --help.
+
+ .. code-block:: none
+
+     sina_alma_multi_word --multi_word=MULTI_WORD_TEXT
+     sina_alma_multi_word --file=INPUT_FILE
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --multi_word MULTI_WORD_TEXT
+         The multi-word Arabic text that needs to be analyzed.
+     --file FILE
+         File containing the multi-word text to be analyzed.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_alma_multi_word --multi_word "Your multi-word text here"
+     sina_alma_multi_word --file "path/to/your/file.txt"
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - Ensure that the text input is encoded in UTF-8 or a compatible format.
+     - The tool returns results in JSON format, indented for readability.
+     - The quality and accuracy of the analysis depend on the underlying capabilities of SinaTools' `ALMA_multi_word` utility.
+     - The tool is specifically designed for multi-word input. For single-word morphological analysis, other utilities may be more appropriate.
+
+ """
+
+ import argparse
+ import json
+ from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
+ from nlptools.utils.readfile import read_file
+
+ def main():
+     parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
+
+     # Accept the multi-word input directly or via a file
+     parser.add_argument('--multi_word', type=str, help='Multi-word text to be analyzed')
+     parser.add_argument('--file', type=str, help='File containing the multi-word text to be analyzed')
+
+     args = parser.parse_args()
+
+     if args.multi_word is None and args.file is None:
+         print("Error: Either --multi_word or --file argument must be provided.")
+         return
+
+     # Get the input either from the --multi_word argument or from the file specified in the --file argument
+     multi_word_text = args.multi_word if args.multi_word else " ".join(read_file(args.file))
+
+     # Perform multi-word analysis
+     results = ALMA_multi_word(multi_word_text)
+
+     # Print the results in JSON format
+     print(json.dumps(results, ensure_ascii=False, indent=4))
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_alma_multi_word --multi_word "Your multi-word text here"
+ # sina_alma_multi_word --file "path/to/your/file.txt"
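The analysis is also available programmatically; a minimal sketch mirroring the CLI's own call (the structure of the returned analysis is not shown in this diff, so the snippet simply serializes whatever `ALMA_multi_word` returns):

    import json
    from nlptools.morphology.ALMA_multi_word import ALMA_multi_word

    results = ALMA_multi_word("Your multi-word text here")  # any multi-word Arabic string
    print(json.dumps(results, ensure_ascii=False, indent=4))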
@@ -0,0 +1,91 @@
+ """
+ About:
+ ------
+ The sina_morph_analyze tool provides morphological analysis for Arabic text using SinaTools' `analyze` utility. Users can specify the language and the desired analysis task (e.g., lemmatization, part-of-speech tagging, or a full morphological analysis).
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_morph_analyze --help.
+
+ .. code-block:: none
+
+     sina_morph_analyze --text=TEXT [OPTIONS]
+     sina_morph_analyze --file=FILE [OPTIONS]
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --text TEXT
+         The text that needs to be morphologically analyzed.
+
+     --file FILE
+         File containing the text to be morphologically analyzed.
+
+     --language LANGUAGE [default=MSA]
+         Specifies the language for the analysis. The default is MSA (Modern Standard Arabic).
+         Use other codes as appropriate for your requirements.
+
+     --task TASK [default=full]
+         Determines the specific type of morphological analysis to be performed. Available options are:
+             - lemmatizer: Provides lemmatization results.
+             - pos: Provides part-of-speech tagging.
+             - full: Provides a comprehensive morphological analysis.
+         The default is a full morphological analysis.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
+     sina_morph_analyze --text "Your Arabic text here" --task lemmatizer
+     sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
+     sina_morph_analyze --file "path/to/your/file.txt" --task lemmatizer
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - Ensure that the text input is encoded in UTF-8 or a compatible format.
+     - The quality and accuracy of the analysis depend on the underlying capabilities of SinaTools' `analyze` utility.
+     - The analysis can be influenced by the choice of language. Ensure you are using the correct language setting.
+
+ """
+
+ import argparse
+ from nlptools.morphology.morph_analyzer import analyze
+ from nlptools.utils.readfile import read_file
+
+ def main():
+     parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
+
+     # Arguments for the text, file, language, and task
+     parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
+     parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
+     parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
+     parser.add_argument('--task', type=str, default='full', choices=['lemmatizer', 'pos', 'full'],
+                         help='Task to filter the results by [lemmatizer, pos, full] (default: full)')
+
+     args = parser.parse_args()
+
+     if args.text is None and args.file is None:
+         print("Error: Either --text or --file argument must be provided.")
+         return
+
+     # Get the input either from the --text argument or from the file specified in the --file argument
+     input_text = args.text if args.text else " ".join(read_file(args.file))
+
+     # Perform morphological analysis
+     results = analyze(input_text, args.language, args.task)
+
+     # Print the results
+     for result in results:
+         print(result)
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
+ # sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
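The same analysis can be invoked from Python; a minimal sketch using the argument order from the CLI above ('MSA' and 'full' are the CLI defaults):

    from nlptools.morphology.morph_analyzer import analyze

    # task may be 'lemmatizer', 'pos', or 'full', matching the CLI's --task choices
    for result in analyze("Your Arabic text here", "MSA", "full"):
        print(result)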
@@ -0,0 +1,68 @@
+ """
+ SALMA CLI
+
+ About:
+ ------
+ The SALMA command line interface (CLI) is a tool designed to utilize the SALMA function for processing Arabic sentences. This CLI allows users to input an Arabic sentence and receive a structured response that includes the processing result of the SALMA function.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running the command with the --help option.
+
+ .. code-block:: none
+
+     sina_salma --text=TEXT
+     sina_salma --file=INPUT_FILE
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --text TEXT
+         The Arabic sentence to be processed by the SALMA function.
+     --file FILE
+         The text file to be processed by the SALMA function.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_salma --text "your Arabic sentence here"
+     sina_salma --file "path/to/your/file.txt"
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - The input sentence should be provided in Arabic.
+     - It is recommended that the length of the input sentence does not exceed 500 characters to ensure optimal performance and accurate results.
+
+ """
+
+ import argparse
+ import json
+ from nlptools.salma.views import SALMA
+ from nlptools.utils.readfile import read_file
+
+ def main():
+     parser = argparse.ArgumentParser(description='SALMA CLI for processing Arabic sentences using SinaTools')
+
+     parser.add_argument('--text', type=str, help='Input sentence to process')
+     parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process')
+
+     args = parser.parse_args()
+
+     if args.text is None and args.file is None:
+         print("Either --text or --file argument must be provided.")
+         return
+
+     text_content = args.text if args.text else " ".join(read_file(args.file))
+     result = SALMA(text_content)
+     print(json.dumps(result, ensure_ascii=False, indent=4))
+
+ if __name__ == "__main__":
+     main()
+
+ # sina_salma --text "your Arabic sentence here"
+ # sina_salma --file "path/to/your/file.txt"
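For programmatic use, a minimal sketch mirroring the CLI (the shape of the returned structure is not shown in this diff; the snippet serializes whatever `SALMA` returns):

    import json
    from nlptools.salma.views import SALMA

    result = SALMA("your Arabic sentence here")  # keep input under ~500 characters, per the note above
    print(json.dumps(result, ensure_ascii=False, indent=4))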
File without changes
@@ -0,0 +1,99 @@
+ """
+ About:
+ ------
+ The sina_arStrip tool offers functionality to strip various elements from Arabic text using SinaTools' `arStrip` utility. It provides flexibility to selectively strip diacritics, small diacritics, shaddah, digits, alif, and special characters.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_arStrip --help.
+
+ .. code-block:: none
+
+     sina_arStrip --text=TEXT [OPTIONS]
+     sina_arStrip --file "path/to/your/file.txt" [OPTIONS]
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --text TEXT
+         The Arabic text that needs to be stripped.
+
+     --file FILE
+         File containing text to be stripped.
+
+     --diacs BOOL [default=True]
+         Indicates whether to strip diacritics.
+
+     --smallDiacs BOOL [default=True]
+         Indicates whether to strip small diacritics.
+
+     --shaddah BOOL [default=True]
+         Indicates whether to strip shaddah.
+
+     --digit BOOL [default=True]
+         Indicates whether to strip digits.
+
+     --alif BOOL [default=True]
+         Indicates whether to strip alif.
+
+     --specialChars BOOL [default=True]
+         Indicates whether to strip special characters.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
+
+     sina_arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - This tool is specific to Arabic text, as it focuses on Arabic linguistic elements.
+     - Ensure that the text input is encoded in UTF-8 or a compatible format.
+     - Stripping certain elements might change the meaning or readability of the text. Use it judiciously.
+
+ """
+
+ import argparse
+ from nlptools.utils.parser import arStrip
+ from nlptools.utils.readfile import read_file
+
+ def str2bool(value):
+     # argparse's type=bool would treat any non-empty string (including "False") as True,
+     # so boolean flag values are parsed explicitly instead.
+     if value.lower() in ('true', '1', 'yes'):
+         return True
+     if value.lower() in ('false', '0', 'no'):
+         return False
+     raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")
+
+ def main():
+     parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
+
+     parser.add_argument('--text', type=str, help='Text to be stripped')
+     parser.add_argument('--file', type=str, help='File containing text to be stripped')
+     parser.add_argument('--diacs', type=str2bool, default=True, help='Whether to strip diacritics')
+     parser.add_argument('--smallDiacs', type=str2bool, default=True, help='Whether to strip small diacritics')
+     parser.add_argument('--shaddah', type=str2bool, default=True, help='Whether to strip shaddah')
+     parser.add_argument('--digit', type=str2bool, default=True, help='Whether to strip digits')
+     parser.add_argument('--alif', type=str2bool, default=True, help='Whether to strip alif')
+     parser.add_argument('--specialChars', type=str2bool, default=True, help='Whether to strip special characters')
+
+     args = parser.parse_args()
+
+     if args.file:
+         # read_file returns the file's lines; join them as the other CLIs in this package do
+         text_content = " ".join(read_file(args.file))
+     elif args.text:
+         text_content = args.text
+     else:
+         print("Either --text or --file argument must be provided.")
+         return
+
+     stripped_text = arStrip(text_content, diacs=args.diacs, smallDiacs=args.smallDiacs,
+                             shaddah=args.shaddah, digit=args.digit, alif=args.alif, specialChars=args.specialChars)
+
+     print(stripped_text)
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_arStrip --text "example text" --diacs=True
+ # sina_arStrip --file "path/to/your/file.txt" --diacs=True
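The `str2bool` helper above works around a standard argparse pitfall: `type=bool` applies Python's truthiness to the raw argument string, so any non-empty value, including "False", would parse as True. A two-line illustration:

    print(bool("False"))  # True: any non-empty string is truthy
    print(bool(""))       # False: only the empty string is falsy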
@@ -0,0 +1,74 @@
+ """
+ About:
+ ------
+ The sina_corpus_tokenizer tool offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches a specified directory for text files, tokenizes their content, and writes the results, including various metadata, to a specified CSV file.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_corpus_tokenizer --help.
+
+ .. code-block:: none
+
+     sina_corpus_tokenizer --dir_path DIR_PATH --output_csv OUTPUT_CSV
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --dir_path DIR_PATH
+         The path to the directory containing the text files.
+
+     --output_csv OUTPUT_CSV
+         The path to the output CSV file.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - The tool only processes text files (with a .txt extension).
+     - The output CSV will contain the following columns:
+         - 'Row_ID' (a unique identifier for each record in the output file)
+         - 'Docs_Sentence_Word_ID' (a concatenated identifier comprising directory name, file name, global sentence ID, sentence ID, and word position)
+         - 'GlobalSentenceID' (integer, a unique identifier for each sentence across the entire output)
+         - 'SentenceID' (integer, a sentence identifier within each file)
+         - 'Sentence' (the text of the tokenized sentence)
+         - 'Word Position' (integer, the position of each word within the sentence)
+         - 'Word' (each row contains one word of the sentence)
+     - Ensure that the text files are encoded in UTF-8 or a compatible format.
+     - The tool uses the `nltk` library for sentence and word tokenization. Make sure the library is installed in your environment.
+ """
+
+ import argparse
+ from nlptools.utils.corpus_tokenizer import corpus_tokenizer
+
+ def main():
+     parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
+
+     # Both paths are needed for the tool to run
+     parser.add_argument('--dir_path', type=str, required=True, help='The path to the directory containing the text files.')
+     parser.add_argument('--output_csv', type=str, required=True, help='The path to the output CSV file.')
+
+     args = parser.parse_args()
+
+     # Call the corpus_tokenizer function with the parsed arguments
+     corpus_tokenizer(args.dir_path, args.output_csv)
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_corpus_tokenizer --dir_path /path/to/text/files --output_csv output.csv
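The underlying function can also be called directly; a minimal sketch mirroring the CLI call above:

    from nlptools.utils.corpus_tokenizer import corpus_tokenizer

    # Walks the directory for .txt files and writes the tokenized corpus as CSV
    corpus_tokenizer("/path/to/text/directory/of/files", "outputFile.csv")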
@@ -0,0 +1,92 @@
+ """
+ About:
+ ------
+ The sina_implication tool evaluates the implication between two words using the functionality provided by the `Implication` class of SinaTools. It can be used to determine the relationship between two words and whether one implies the other.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_implication --help.
+
+ .. code-block:: none
+
+     sina_implication --inputWord1=WORD1 --inputWord2=WORD2
+     sina_implication --file1=FILE1 --file2=FILE2
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --inputWord1 WORD1
+         First input word.
+
+     --inputWord2 WORD2
+         Second input word.
+
+     --file1 FILE1
+         File containing the first word to evaluate.
+
+     --file2 FILE2
+         File containing the second word to evaluate.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_implication --inputWord1 "word1" --inputWord2 "word2"
+     sina_implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - The results are based on the underlying logic and data sets present in the `Implication` class of SinaTools.
+     - The tool compares the implication between two words, and the relationship might vary based on linguistic nuances.
+
+ """
+
+ import argparse
+ from nlptools.utils.implication import Implication
+
+ def read_file(file_path):
+     # Read the first line of the file; it must contain the word to compare
+     with open(file_path, 'r', encoding='utf-8') as file:
+         word = file.readline().strip()
+     if word:
+         return word
+     raise ValueError(f"File {file_path} must contain at least one word.")
+
+ def main():
+     parser = argparse.ArgumentParser(description='Evaluate implication between two words using SinaTools')
+
+     # Optional arguments for the two input words or the files containing them
+     parser.add_argument('--inputWord1', type=str, help='First input word')
+     parser.add_argument('--inputWord2', type=str, help='Second input word')
+     parser.add_argument('--file1', type=str, help='File containing the first word to evaluate implication')
+     parser.add_argument('--file2', type=str, help='File containing the second word to evaluate implication')
+
+     args = parser.parse_args()
+
+     if args.file1 and args.file2:
+         word1 = read_file(args.file1)
+         word2 = read_file(args.file2)
+     elif args.inputWord1 and args.inputWord2:
+         word1, word2 = args.inputWord1, args.inputWord2
+     else:
+         print("Either both --file1 and --file2, or both --inputWord1 and --inputWord2, must be provided.")
+         return
+
+     # Instantiate the Implication class
+     implication_obj = Implication(word1, word2)
+
+     # Assuming the Implication class exposes a `get_result()` method
+     result = implication_obj.get_result()
+     print(result)
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_implication --inputWord1 "word1" --inputWord2 "word2"
+ # sina_implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
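A minimal programmatic sketch mirroring the CLI (as the script itself notes, it assumes `Implication` exposes a `get_result()` method):

    from nlptools.utils.implication import Implication

    implication_obj = Implication("word1", "word2")  # hypothetical word pair
    print(implication_obj.get_result())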
@@ -0,0 +1,96 @@
+ """
+ About:
+ ------
+ The sina_jaccard tool computes the Jaccard similarity between two sets of strings. The Jaccard similarity is the size of the intersection divided by the size of the union of the sample sets, and provides a measure of similarity between two sets.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running sina_jaccard --help.
+
+ .. code-block:: none
+
+     sina_jaccard --list1="WORD1,WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+     sina_jaccard --file1=FILE1 --file2=FILE2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --list1 "WORD1,WORD2,..."
+         First list of strings (delimiter-separated).
+     --list2 "WORD1,WORD2,..."
+         Second list of strings (delimiter-separated).
+     --file1 FILE1
+         File containing the first set of words.
+     --file2 FILE2
+         File containing the second set of words.
+     --delimiter DELIMITER
+         Denotes the bounds between regions in the text.
+     --selection SELECTION
+         Selects the Jaccard function type, one of: 'jaccardAll', 'intersection', 'union', or 'similarity'.
+     --ignoreAllDiacriticsButNotShadda
+         If this option is selected, the two lists are compared after removing all diacritics except the shadda.
+     --ignoreShaddaDiacritic
+         If this option is selected, the two lists are compared after removing the shadda diacritic.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_jaccard --list1 "word1,word2" --list2 "word1,word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+     sina_jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - The Jaccard similarity ranges from 0 to 1. A value of 1 indicates that the sets are identical, while a value of 0 indicates no similarity between the sets.
+     - Diacritics refer to the Arabic diacritics (fatha, damma, kasra, etc.) and shadda.
+     - The two normalization options can be used individually or together. However, combining them applies both rules, and thus all diacritics, including the shadda, are ignored.
+
+ """
+
+ import argparse
+ from nlptools.utils.jaccard import jaccard
+ from nlptools.utils.readfile import read_file
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='Compute Jaccard similarity between two sets of strings')
+
+     # Optional arguments for the two lists or the files containing them
+     parser.add_argument('--delimiter', type=str, help='Denotes the bounds between regions in the text')
+     parser.add_argument('--list1', type=str, help='First string (delimiter-separated)')
+     parser.add_argument('--list2', type=str, help='Second string (delimiter-separated)')
+     parser.add_argument('--file1', type=str, help='File containing the first set of words')
+     parser.add_argument('--file2', type=str, help='File containing the second set of words')
+     parser.add_argument('--selection', type=str, help='Jaccard function type to apply')
+     parser.add_argument('--ignoreAllDiacriticsButNotShadda', action='store_true', help='Ignore all diacritics but not shadda')
+     parser.add_argument('--ignoreShaddaDiacritic', action='store_true', help='Ignore shadda diacritic')
+
+     args = parser.parse_args()
+
+     if args.file1 and args.file2:
+         set1 = " ".join(read_file(args.file1))
+         set2 = " ".join(read_file(args.file2))
+     elif args.list1 is not None and args.list2 is not None:
+         set1 = args.list1
+         set2 = args.list2
+     else:
+         print("Either both --file1 and --file2, or both --list1 and --list2, must be provided.")
+         return
+
+     similarity = jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
+
+     print("Jaccard Result:", similarity)
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_jaccard --list1 "word1,word2" --list2 "word1,word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+ # sina_jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+ # sina_jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic