SinaTools 0.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
@@ -0,0 +1,247 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from nlptools.utils.parser import arStrip
4
+ from nlptools.utils.implication import Implication
5
+ import argparse
6
+
7
def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
    """
    Normalize an Arabic word by stripping diacritics and/or the shadda mark.

    Args:
        word (:obj:`str`): The input text.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): When True, strip all diacritics except shadda (default is True).
        ignore_shadda_diacritic (:obj:`bool`): When True, strip the shadda diacritic (default is True).

    Returns:
        :obj:`str`: Normalized Arabic word.
    """
    # Each entry pairs a caller flag with the positional argument tuple that
    # arStrip expects for that particular stripping pass.
    stripping_passes = (
        (ignore_all_diacritics_but_not_shadda, (True, True, False, False, False, False)),
        (ignore_shadda_diacritic, (False, False, True, False, False, False)),
    )
    for enabled, strip_args in stripping_passes:
        if enabled:
            word = arStrip(word, *strip_args)
    return word
26
+
27
+
28
def get_preferred_word(word1, word2):
    """
    Return the preferred word among two given words based on their implication.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The preferred word among the two given words.
        (Implicitly None for any implication direction other than 0-3.)
    """
    direction = Implication(word1, word2).get_direction()

    if direction in (0, 2):
        return word1
    if direction == 1:
        return word2
    if direction == 3:
        # Prefer word1 only when it ends with a fatha or damma diacritic.
        return word1 if word1.endswith(("َ", "ُ")) else word2
54
+
55
+
56
def get_non_preferred_word(word1, word2):
    """
    Return the non-preferred word between the two input words.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The non-preferred word. If there is no non-preferred word, '#' is returned.
    """
    implication = Implication(word1, word2)
    # Distances of 15 or more mean the words are too far apart to compare.
    if implication.get_distance() >= 15:
        return "#"

    direction = implication.get_direction()
    if direction in (0, 1):
        return word1
    if direction == 2:
        return word2
    if direction == 3:
        # word1 is non-preferred unless it ends with a fatha or damma.
        return word2 if word1.endswith(("َ", "ُ")) else word1
    return "#"
81
+ #@TBD
82
def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
    """
    Get the intersection of two lists after normalization and ignoring diacritics based on input flags.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional): A flag to ignore all diacritics except for the shadda. Defaults to False.
        ignore_shadda_diacritic (:obj:`bool`, optional): A flag to ignore the shadda diacritic. Defaults to False.

    Returns:
        :obj:`list`: The intersection of the two lists after normalization and ignoring diacritics.

    """

    # Remove all None and empty values from first list, then trim whitespace
    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
    list1 = [str(i.strip()) for i in list1]

    # Remove all None and empty values from second list, then trim whitespace
    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
    list2 = [str(i.strip()) for i in list2]

    interection_list = []

    # Pairwise-compare every normalized word from list1 against every
    # normalized word from list2; a pair counts as "common" when the
    # Implication direction is non-negative and the distance is under 15
    # (presumably the match criterion of the Implication API — verify there).
    for list1_word in list1:
        for list2_word in list2:
            word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
            word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

            implication = Implication(word1, word2)
            if implication.get_direction() >= 0 and implication.get_distance() < 15:
                interection_list.append(get_preferred_word(word1, word2))

    # Deduplicate: for every pair of matching entries drop the non-preferred
    # variant. NOTE(review): the list is mutated with remove() while both
    # index loops advance unconditionally, so elements that shift left after
    # a removal may be skipped — preserved as-is to keep original behavior.
    i = 0
    while i < len(interection_list):
        j = i + 1
        while j < len(interection_list):
            non_preferred_word = get_non_preferred_word(interection_list[i], interection_list[j])
            if non_preferred_word != "#":
                interection_list.remove(non_preferred_word)
            j += 1
        i += 1

    return interection_list
128
+
129
+
130
+
131
def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
    """
    Finds the union of two lists by removing duplicates and normalizing words.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): Whether to ignore all diacritics except shadda or not.
        ignore_shadda_diacritic (:obj:`bool`): Whether to ignore shadda diacritic or not.
    Returns:
        :obj:`list`: The union of the two lists after removing duplicates and normalizing words.
    """

    # Drop None and empty values from both input lists before normalizing.
    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]

    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]

    union_list = []

    # Collect the normalized form of every word from both lists.
    for list1_word in list1:
        word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        union_list.append(word1)

    for list2_word in list2:
        word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        union_list.append(word2)

    # Deduplicate: for every pair of matching entries drop the non-preferred
    # variant. NOTE(review): the list is mutated with remove() while both
    # index loops advance unconditionally, so elements that shift left after
    # a removal may be skipped — preserved as-is to keep original behavior.
    i = 0
    while i < len(union_list):
        j = i + 1
        while j < len(union_list):
            non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
            if (non_preferred_word != "#"):
                union_list.remove(non_preferred_word)
            j = j + 1
        i = i + 1

    return union_list
169
+
170
+
171
+
172
def jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
    """
    Calculates the Jaccard similarity coefficient between two lists.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): A flag indicating whether to ignore all diacritics except for shadda.
        ignore_shadda_diacritic (:obj:`bool`): A flag indicating whether to ignore the shadda diacritic.

    Returns:
        :obj:`float`: The Jaccard similarity coefficient between the two lists
        (|intersection| / |union|). Returns 0.0 when the union is empty, i.e.
        when both inputs contain no usable values.
    """
    # Find the intersection between the two (normalized) word lists
    intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

    # Find the union between the two (normalized) word lists
    union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

    # Guard against ZeroDivisionError when both lists normalize to empty:
    # an empty union means there is nothing in common, so return 0.0.
    if not union_list:
        return 0.0

    # Jaccard coefficient = |intersection| / |union|
    return float(len(intersection_list)) / float(len(union_list))
193
+
194
+
195
+
196
+
197
def jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
    """
    Compute the Jaccard similarity, union, or intersection of two sets of strings.

    Args:
        delimiter (:obj:`str`): The delimiter used to split the input strings.
        str1 (:obj:`str`): The first input string to compare.
        str2 (:obj:`str`): The second input string to compare.
        selection (:obj:`str`): The desired operation to perform on the two sets of strings.
            Must be one of *intersection*, *union*, *jaccardSimilarity*, or *jaccardAll*.
        ignoreAllDiacriticsButNotShadda (:obj:`bool`): If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
        ignoreShaddaDiacritic (:obj:`bool`): If True, ignore the Shadda diacritic. (Default is True)

    Returns:
        The Jaccard similarity, union, or intersection of the two sets of strings,
        depending on the value of the `selection` argument.

    Note:
        - If `selection` is *jaccardAll*, a list of the intersection, union, and Jaccard similarity
          of the two sets of strings is returned.
        - If an error occurs, the method returns the string "An error has occurred".
        Online tool: https://sina.birzeit.edu/resources/jaccardFunction.html
    """
    try:
        list1 = str1.split(delimiter)
        list2 = str2.split(delimiter)
        flags = (ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)

        if selection == "intersection":
            return get_intersection(list1, list2, *flags)
        if selection == "union":
            return get_union(list1, list2, *flags)
        if selection == "jaccardSimilarity":
            return jaccard_similarity(list1, list2, *flags)
        if selection == "jaccardAll":
            # Evaluate in the same order as the individual operations above.
            return [
                "intersection:", get_intersection(list1, list2, *flags),
                "union:", get_union(list1, list2, *flags),
                "similarity:", jaccard_similarity(list1, list2, *flags),
            ]
        return 'Invalid selection option'

    except AttributeError as ae:
        print(f"Attribute error occurred: {str(ae)}")
        return 'Invalid input type'
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return 'An error has occurred'
@@ -0,0 +1,147 @@
1
+ import re
2
+ import argparse
3
+
4
def arStrip(text, diacs=True, smallDiacs=True, shaddah=True, digit=True, alif=True, specialChars=True):
    """
    Strip Arabic diacritics, small Quranic annotation signs, shaddah, Latin and
    Arabic digits, and selected special characters from *text*, optionally
    unifying the alif variants. Spaces are always collapsed and underscores and
    the Arabic tatweel are always removed.

    Args:
        text (:obj:`str`): Arabic text to be processed.
        diacs (:obj:`bool`): flag to remove Arabic diacritics [ ًٌٍَُِْ] (default is True).
        smallDiacs (:obj:`bool`): flag to remove small Quranic diacritics (default is True).
        shaddah (:obj:`bool`): flag to remove shaddah (default is True).
        digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
        alif (:obj:`bool`): flag to unify alif variants (default is True).
        specialChars (:obj:`bool`): flag to remove special characters (default is True).

    Returns:
        :obj:`str`: stripped text. On any error the input is returned unchanged.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        processed_text = parser.arStrip('2023الجو جميلُ')
        print(processed_text)

        # output
        الجو جميل
    """
    try:
        if not text:
            return text
        # Flags are compared with == True to mirror the original strict check.
        if diacs == True:
            text = re.sub(r'[\u064B-\u0650]+', '', text)  # Arabic diacritics [ ًٌٍَُِْ]
            text = re.sub(r'[\u0652]+', '', text)          # sukun
        if shaddah == True:
            text = re.sub(r'[\u0651]+', '', text)
        if smallDiacs == True:
            text = re.sub(r'[\u06D6-\u06ED]+', '', text)   # small Quranic annotation signs
        if digit == True:
            text = re.sub('[0-9]+', ' ', text)             # Latin digits
            text = re.sub('[٠-٩]+', ' ', text)             # Arabic digits
        if alif == True:
            # Unify all hamza/wasla alif variants to the bare alif.
            for alif_variant in ('ٱ', 'أ', 'إ', 'آ'):
                text = re.sub(alif_variant, 'ا', text)
        if specialChars == True:
            text = re.sub('[?؟!@#$%-]+', '', text)

        text = re.sub('[\\s]+', " ", text)  # collapse whitespace runs
        text = text.replace("_", '')        # remove underscore
        text = text.replace("ـ", '')        # remove Arabic tatweel
        text = text.strip()
    except:
        # Best-effort: on any failure return the input unchanged.
        return text
    return text
70
+
71
def remove_punctuation(text):
    """
    Removes punctuation marks (Latin and Arabic) from the text.

    Args:
        text (:obj:`str`): The input text.

    Returns:
        :obj:`str`: The output text without punctuation marks. Empty/None
        input is returned unchanged, as is the input on any processing error.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        return parser.remove_punctuation("te!@#،$%%؟st")

        # output
        test
    """
    # Initialize before the try so empty input falls through safely
    # (previously `outputString` was unbound for empty text -> NameError).
    outputString = text
    try:
        if text:
            # One regex character class per punctuation range to delete.
            # The bogus pattern r'[U+060C]+' was removed: as a character
            # class it deleted the literal characters U, +, 0, 6 and C;
            # the intended Arabic comma is covered by r'[\u060C]+' below.
            punctuation_marks = [r'[\u0021-\u002F]+', r'[\u003A-\u0040]+',
                                 r'[\u005B-\u0060]+', r'[\u007B-\u007E]+', r'[\u060C]+',
                                 r'[\u061B]+', r'[\u061E]+', r'[\u061F]+', r'[\u0640]+',
                                 r'[\u0653]+', r'[\u065C]+', r'[\u066C]+', r'[\u066A]+',
                                 r'["}"]+', r'["{"]+']
            for punctuation in punctuation_marks:
                outputString = re.sub(punctuation, '', outputString)
    except Exception:
        # Best-effort: on any failure return the input unchanged.
        return text
    return outputString
112
+
113
def remove_latin(text):
    """
    Replace every run of Latin letters in the input text with a single space.

    Args:
        text (:obj:`str`): The input text.

    Returns:
        :obj:`str`: The text without Latin characters.

    Note:
        If an error occurs during processing (or the input is empty/None),
        the original text is returned.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        return parser.remove_latin("miojkdujhvaj1546545spkdpoqfoiehwv nWEQFGWERHERTJETAWIKUYFC")

        # output
        1546545
    """
    try:
        if not text:
            return text
        return re.sub('[a-zA-Z]+', ' ', text)
    except:
        # Best-effort: on any failure return the input unchanged.
        return text
147
+
@@ -0,0 +1,3 @@
1
def read_file(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace stripped."""
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines
@@ -0,0 +1,53 @@
1
def remove_empty_values(sentences):
    """Drop empty-string entries from the list, keeping everything else."""
    kept = []
    for sentence in sentences:
        if sentence != '':
            kept.append(sentence)
    return kept
3
+
4
+
5
def sent_tokenize(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
    """
    Tokenize a text into a list of sentences based on the selected separators:
    dot, new line, question mark(s), and exclamation mark. Each separator is
    kept attached to the end of its sentence.

    Args:
        text (:obj:`str`): Arabic text to be tokenized.
        dot (:obj:`bool`): flag to split text on dots (default is True).
        new_line (:obj:`bool`): flag to split text on new lines (default is True).
        question_mark (:obj:`bool`): flag to split text on question marks, Latin and Arabic (default is True).
        exclamation_mark (:obj:`bool`): flag to split text on exclamation marks (default is True).

    Returns:
        :obj:`list`: list of sentences, each stripped of surrounding whitespace.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import sentence_tokenizer
        sentences = sentence_tokenizer.sent_tokenize("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
        print(sentences)

        #output
        ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
    """
    separators = []
    if new_line:
        separators.append('\n')
    if dot:
        separators.append('.')
    if question_mark:
        separators.extend(['?', '؟'])
    if exclamation_mark:
        separators.append('!')

    split_text = [text]
    for sep in separators:
        pieces = []
        for part in split_text:
            tokens = part.split(sep)
            # Re-attach the separator to each sentence and strip surrounding
            # whitespace. Previously only the final token was stripped, so
            # sentences after the first kept a leading space, contradicting
            # the documented example output.
            pieces.extend(token.strip() + sep for token in tokens[:-1])
            pieces.append(tokens[-1].strip())
        split_text = pieces

    # Drop empty entries produced by trailing separators.
    return [sentence for sentence in split_text if sentence != '']
@@ -0,0 +1,232 @@
1
+ from collections import deque
2
+
3
+ # This is a mapping dictionery of Arabic letters to BW
4
+ # dictionery contains: key -> Unicode to Arabic, value -> BW chars
5
+ # It includes all BW mappings in addition to other special characters
6
+ # that are used in the SAMA database but not part of the BW character set such as numbers and Qur'anic diacritics
7
# Mapping of Arabic (Unicode) characters to Buckwalter (BW) transliteration.
# Includes all standard BW mappings plus extra characters used in the SAMA
# database but not part of the BW character set, such as digits and Qur'anic
# annotation signs.
# FIX: several keys previously carried a stray trailing space
# ('\u06DC ', '\u06E3 ', '\u06E5 ', '\u06E8 ', '\u06ED ') so single-character
# lookups could never match them; a duplicate '\u06EA' entry was also removed.
ar2bw_map = {
    '\u0621': "'",   # ء
    '\u0622': '|',   # آ
    '\u0623': '>',   # أ
    '\u0624': '&',   # ؤ
    '\u0625': '<',   # إ
    '\u0626': '}',   # ئ
    '\u0627': 'A',   # ا
    '\u0628': 'b',   # ب
    '\u0629': 'p',   # ة
    '\u062A': 't',   # ت
    '\u062B': 'v',   # ث
    '\u062C': 'j',   # ج
    '\u062D': 'H',   # ح
    '\u062E': 'x',   # خ
    '\u062F': 'd',   # د
    '\u0630': '*',   # ذ
    '\u0631': 'r',   # ر
    '\u0632': 'z',   # ز
    '\u0633': 's',   # س
    '\u0634': '$',   # ش
    '\u0635': 'S',   # ص
    '\u0636': 'D',   # ض
    '\u0637': 'T',   # ط
    '\u0638': 'Z',   # ظ
    '\u0639': 'E',   # ع
    '\u063A': 'g',   # غ
    '\u0020': ' ',   # space maps to space
    '\u0640': '_',   # ـ tatweel
    '\u0641': 'f',   # ف
    '\u0642': 'q',   # ق
    '\u0643': 'k',   # ك
    '\u0644': 'l',   # ل
    '\u0645': 'm',   # م
    '\u0646': 'n',   # ن
    '\u0647': 'h',   # ه
    '\u0648': 'w',   # و
    '\u0649': 'Y',   # ى
    '\u064A': 'y',   # ي
    '\u064B': 'F',   # TANWEEN FATH ً
    '\u064C': 'N',   # TANWEEN DHAM ٌ
    '\u064D': 'K',   # TANWEEN KASR ٍ
    '\u064E': 'a',   # FATHA َ
    '\u064F': 'u',   # DHAMMA ُ
    '\u0650': 'i',   # KASRA ِ
    '\u0651': '~',   # SHADDAH ّ
    '\u0652': 'o',   # SUKUN ْ
    '\u0670': '`',   # SHORT ALEF (dagger alif) ٰ
    '\u0671': '{',   # ALIF WITH HAMZA QATE'/WASEL MARK ٱ
    '\u067E': 'P',   # پ
    '\u0686': 'J',   # چ
    '\u06A4': 'V',   # ڤ
    '\u06AF': 'G',   # گ
    '\u0654': '#',   # hamza above ٔ, not BW standard
    '\u06DC': ':',   # small seen ۜ, not BW standard
    '\u06E0': '"',   # small circle ۟, not BW standard
    '\u06E2': '[',   # small meem ۢ, not BW standard
    '\u06E3': ';',   # small seen below letter ۣ, not BW standard
    '\u06E5': ',',   # small waw ۥ, not BW standard
    '\u06E6': '.',   # small yeh ۦ, not BW standard
    '\u06E8': '!',   # small noon ۨ, not BW standard
    '\u06EA': '-',   # Qur'anic small circle below ۪, not BW standard
    '\u06EC': '%',   # small solid circle ۬, not BW standard
    '\u06ED': ']',   # small low meem ۭ, not BW standard
    '0': '0',
    '1': '1',
    '2': '2',
    '3': '3',
    '4': '4',
    '5': '5',
    '6': '6',
    '7': '7',
    '8': '8',
    '9': '9'
    }
83
+
84
+ # This is a mapping dictionery of BW letters to Arabic
85
+ # It includes all Arabic mappings in addition to other special characters
86
+ # that are used in the SAMA database but not part of the Arabic character set such as numbers and Qur'anic diacritics
87
# Mapping of Buckwalter (BW) transliteration characters to Arabic (Unicode).
# Includes all standard Arabic mappings plus extra characters used in the SAMA
# database but not part of the Arabic character set, such as digits and
# Qur'anic annotation signs.
# FIX: several values previously carried a stray trailing space
# ('\u06DC ', '\u06E3 ', '\u06E5 ', '\u06E8 ', '\u06ED ') which injected
# spurious spaces into transliterated output; a duplicate '-' entry was also
# removed.
bw2ar_map = {
    "'": '\u0621',   # ء
    '|': '\u0622',   # آ
    '>': '\u0623',   # أ
    '&': '\u0624',   # ؤ
    '<': '\u0625',   # إ
    '}': '\u0626',   # ئ
    'A': '\u0627',   # ا
    'b': '\u0628',   # ب
    'p': '\u0629',   # ة
    't': '\u062A',   # ت
    'v': '\u062B',   # ث
    'j': '\u062C',   # ج
    'H': '\u062D',   # ح
    'x': '\u062E',   # خ
    'd': '\u062F',   # د
    '*': '\u0630',   # ذ
    'r': '\u0631',   # ر
    'z': '\u0632',   # ز
    's': '\u0633',   # س
    '$': '\u0634',   # ش
    'S': '\u0635',   # ص
    'D': '\u0636',   # ض
    'T': '\u0637',   # ط
    'Z': '\u0638',   # ظ
    'E': '\u0639',   # ع
    'g': '\u063A',   # غ
    ' ': '\u0020',   # space maps to space
    '_': ' ',        # ـ tatweel rendered as a space — this is temporary
    # '_': '\u0640', # ـ (original tatweel mapping, kept for reference)
    'f': '\u0641',   # ف
    'q': '\u0642',   # ق
    'k': '\u0643',   # ك
    'l': '\u0644',   # ل
    'm': '\u0645',   # م
    'n': '\u0646',   # ن
    'h': '\u0647',   # ه
    'w': '\u0648',   # و
    'Y': '\u0649',   # ى
    'y': '\u064A',   # ي
    'F': '\u064B',   # TANWEEN FATH ً
    'N': '\u064C',   # TANWEEN DHAM ٌ
    'K': '\u064D',   # TANWEEN KASR ٍ
    'a': '\u064E',   # FATHA َ
    'u': '\u064F',   # DHAMMA ُ
    'i': '\u0650',   # KASRA ِ
    '~': '\u0651',   # SHADDAH ّ
    'o': '\u0652',   # SUKUN ْ
    '`': '\u0670',   # SHORT ALEF (dagger alif) ٰ
    '{': '\u0671',   # ALIF WITH HAMZA QATE'/WASEL MARK ٱ
    'P': '\u067E',   # پ
    'J': '\u0686',   # چ
    'V': '\u06A4',   # ڤ
    'G': '\u06AF',   # گ
    '#': '\u0654',   # hamza above ٔ, not BW standard
    ':': '\u06DC',   # small seen ۜ, not BW standard
    '"': '\u06E0',   # small circle ۟, not BW standard
    '[': '\u06E2',   # small meem ۢ, not BW standard
    ';': '\u06E3',   # small seen below letter ۣ, not BW standard
    ',': '\u06E5',   # small waw ۥ, not BW standard
    '.': '\u06E6',   # small yeh ۦ, not BW standard
    '!': '\u06E8',   # small noon ۨ, not BW standard
    '-': '\u06EA',   # Qur'anic small circle below ۪, not BW standard
    '%': '\u06EC',   # small solid circle ۬, not BW standard
    ']': '\u06ED',   # small low meem ۭ, not BW standard
    '0': '0',
    '1': '1',
    '2': '2',
    '3': '3',
    '4': '4',
    '5': '5',
    '6': '6',
    '7': '7',
    '8': '8',
    '9': '9'
    }
164
+ # A transliterate Function to transliterate Arabic letters and vice versa
165
+ #It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
166
def perform_transliteration(text , schema ):
    """
    Transliterate text between Arabic script and Buckwalter (BW) encoding.

    Args:
        text (:obj:`str`): The input text to be transliterated.
        schema (:obj:`str`): The transliteration schema to be used. Should be either `bw2ar` to transliterate Buckwalter-encoded Arabic text to Arabic script, or `ar2bw` to transliterate Arabic script to Buckwalter-encoded Arabic text.

    Returns:
        :obj:`tuple`: A tuple of two values:
            - The transliterated text based on the given schema. Characters
              with no mapping are copied through unchanged.
            - A boolean flag indicating whether all characters in the input
              text were successfully transliterated.

    Raises:
        ValueError: If `schema` is neither 'bw2ar' nor 'ar2bw'.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import text_transliteration

        print(text_transliteration.perform_transliteration("muHamadN" , "bw2ar"))

        #output
        ('مُحَمَدٌ', True)
    """
    # Select the character map once; the two directions previously duplicated
    # an identical transliteration loop, now shared below.
    if schema == "bw2ar":
        mapping = bw2ar_map
    elif schema == "ar2bw":
        mapping = ar2bw_map
    else:
        raise ValueError("Schema must be either 'bw2ar' or 'ar2bw'.")

    transliterated_text = deque()
    is_all_mapped = True
    for char in text:
        # Look up each character in the mapping; fall back to the original
        # character (and clear the flag) when it has no mapping.
        mapped_char = mapping.get(char)
        if mapped_char is None:
            is_all_mapped = False
            transliterated_text.append(char)
        else:
            transliterated_text.append(mapped_char)

    return ''.join(transliterated_text) , is_all_mapped
@@ -0,0 +1,2 @@
1
def hello():
    """Return the greeting string 'hello'."""
    greeting = 'hello'
    return greeting