SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
@@ -0,0 +1,247 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
from nlptools.utils.parser import arStrip
|
4
|
+
from nlptools.utils.implication import Implication
|
5
|
+
import argparse
|
6
|
+
|
7
|
+
def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
    """
    Normalize an Arabic word by stripping diacritics and/or the shadda mark.

    Args:
        word (:obj:`str`): The input word.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): When True, strip all
            diacritics except shadda (default is True).
        ignore_shadda_diacritic (:obj:`bool`): When True, strip the shadda
            diacritic as well (default is True).

    Returns:
        :obj:`str`: The normalized Arabic word.
    """
    normalized = word

    # First pass: drop the short-vowel diacritics but keep shadda.
    if ignore_all_diacritics_but_not_shadda:
        normalized = arStrip(normalized, True, True, False, False, False, False)

    # Second pass: drop shadda itself when requested.
    if ignore_shadda_diacritic:
        normalized = arStrip(normalized, False, False, True, False, False, False)

    return normalized
|
26
|
+
|
27
|
+
|
28
|
+
def get_preferred_word(word1, word2):
    """
    Return the preferred word among two given words based on their implication.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The preferred word of the two. (As in the original
        implementation, an implication direction outside 0-3 implicitly
        returns None.)
    """
    direction = Implication(word1, word2).get_direction()

    if direction == 0 or direction == 2:
        return word1
    if direction == 1:
        return word2
    if direction == 3:
        # Prefer word1 only when it ends with a fatha or damma vowel sign.
        return word1 if word1.endswith(("َ", "ُ")) else word2
|
54
|
+
|
55
|
+
|
56
|
+
def get_non_preferred_word(word1, word2):
    """
    Return the non-preferred word between the two input words.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The non-preferred word, or "#" when there is none
        (distance >= 15 or an unrecognized implication direction).
    """
    implication = Implication(word1, word2)

    # Only words that are close enough (distance < 15) have a non-preferred one.
    if implication.get_distance() < 15:
        direction = implication.get_direction()
        if direction in (0, 1):
            return word1
        if direction == 2:
            return word2
        if direction == 3:
            # The word lacking a final fatha/damma is the non-preferred one.
            return word2 if word1.endswith(("َ", "ُ")) else word1

    return "#"
|
81
|
+
#@TBD
|
82
|
+
def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
    """
    Get the intersection of two lists after normalization and ignoring diacritics based on input flags.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional): A flag to ignore all diacritics except for the shadda. Defaults to False.
        ignore_shadda_diacritic (:obj:`bool`, optional): A flag to ignore the shadda diacritic. Defaults to False.

    Returns:
        :obj:`list`: The intersection of the two lists after normalization and ignoring diacritics.
    """

    # Drop None, single-space and empty entries from the first list, then
    # stringify and trim the survivors.
    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
    list1 = [str(i.strip()) for i in list1]

    # Same cleanup for the second list.
    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
    list2 = [str(i.strip()) for i in list2]

    interection_list = []

    # Pairwise compare every word of list1 against every word of list2; a pair
    # counts as "common" when its implication direction is non-negative and the
    # distance is below 15, in which case the preferred form is collected.
    for list1_word in list1:
        for list2_word in list2:
            word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
            word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

            implication = Implication(word1, word2)
            if implication.get_direction() >= 0 and implication.get_distance() < 15:
                interection_list.append(get_preferred_word(word1, word2))

    # Deduplicate: drop the non-preferred member of every remaining pair.
    # NOTE(review): this removes from the list while index-scanning it —
    # list.remove() deletes the first matching occurrence and j still
    # advances, so the element shifted into position j is skipped. Confirm
    # whether this partial deduplication is intended before restructuring.
    i = 0
    while i < len(interection_list):
        j = i + 1
        while j < len(interection_list):
            non_preferred_word = get_non_preferred_word(interection_list[i], interection_list[j])
            if non_preferred_word != "#":
                interection_list.remove(non_preferred_word)
            j += 1
        i += 1

    return interection_list
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
    """
    Finds the union of two lists by removing duplicates and normalizing words.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): Whether to ignore all diacritics except shadda or not.
        ignore_shadda_diacritic (:obj:`bool`): Whether to ignore shadda diacritic or not.
    Returns:
        :obj:`list`: The union of the two lists after removing duplicates and normalizing words.
    """

    # Drop None, single-space and empty entries from both lists.
    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]

    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]

    union_list = []

    # Collect the normalized form of every word from both lists.
    for list1_word in list1:
        word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        union_list.append(word1)

    for list2_word in list2:
        word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        union_list.append(word2)

    # Deduplicate: drop the non-preferred member of every remaining pair.
    # NOTE(review): same caveat as get_intersection — list.remove() while
    # index-scanning deletes the first matching occurrence and j still
    # advances, so the element shifted into position j is skipped. Confirm
    # the intended semantics before restructuring.
    i = 0
    while i < len(union_list):
        j = i + 1
        while j < len(union_list):
            non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
            if (non_preferred_word != "#"):
                union_list.remove(non_preferred_word)
            j = j + 1
        i = i + 1

    return union_list
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
def jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
    """
    Calculate the Jaccard similarity coefficient between two lists.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): A flag indicating whether to ignore all diacritics except for shadda.
        ignore_shadda_diacritic (:obj:`bool`): A flag indicating whether to ignore the shadda diacritic.

    Returns:
        :obj:`float`: The Jaccard similarity coefficient (|intersection| / |union|)
        between the two lists. Returns 0.0 when the union is empty (e.g. both
        inputs contain only None/empty entries) instead of raising
        ZeroDivisionError.
    """
    # Find the intersection between the two word lists.
    intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

    # Find the union between the two word lists.
    union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

    # Guard: an empty union would previously raise ZeroDivisionError; two
    # effectively-empty inputs are defined here as having similarity 0.0.
    if not union_list:
        return 0.0

    # Jaccard coefficient: |intersection| / |union|.
    return float(len(intersection_list)) / float(len(union_list))
|
193
|
+
|
194
|
+
|
195
|
+
|
196
|
+
|
197
|
+
def jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
    """
    Compute the Jaccard similarity, union, or intersection of two sets of strings.

    Args:
        delimiter (:obj:`str`): The delimiter used to split the input strings.
        str1 (:obj:`str`): The first input string to compare.
        str2 (:obj:`str`): The second input string to compare.
        selection (:obj:`str`): The desired operation to perform on the two sets of strings.
            Must be one of *intersection*, *union*, *jaccardSimilarity*, or *jaccardAll*.
        ignoreAllDiacriticsButNotShadda (:obj:`bool`): If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
        ignoreShaddaDiacritic (:obj:`bool`): If True, ignore the Shadda diacritic. (Default is True)

    Returns:
        The Jaccard similarity, union, or intersection of the two sets of strings,
        depending on the value of the `selection` argument.

    Note:
        - If `selection` is *jaccardAll*, a list of the intersection, union, and Jaccard similarity
          of the two sets of strings is returned.
        - If an error occurs, the method returns the string "An error has occurred".
        Online tool: https://sina.birzeit.edu/resources/jaccardFunction.html
    """
    try:
        tokens1 = str1.split(delimiter)
        tokens2 = str2.split(delimiter)

        if selection == "intersection":
            return get_intersection(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)

        if selection == "union":
            return get_union(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)

        if selection == "jaccardSimilarity":
            return jaccard_similarity(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)

        if selection == "jaccardAll":
            # Bundle all three results in a single labelled list.
            intersection = get_intersection(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
            union = get_union(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
            similarity = jaccard_similarity(tokens1, tokens2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
            return ["intersection:", intersection, "union:", union, "similarity:", similarity]

        return 'Invalid selection option'

    except AttributeError as ae:
        # Typically raised when str1/str2 are not strings (no .split).
        print(f"Attribute error occurred: {str(ae)}")
        return 'Invalid input type'
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return 'An error has occurred'
|
nlptools/utils/parser.py
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
import re
|
2
|
+
import argparse
|
3
|
+
|
4
|
+
def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, alif=True , specialChars=True ):
    """
    Strip Arabic text: remove diacritics, small Qur'anic signs, shaddah,
    Latin/Arabic digits and selected special characters, unify alif forms,
    collapse whitespace, and drop underscores and the Arabic tatweel.

    Args:
        text (:obj:`str`): Arabic text to be processed.
        diacs (:obj:`bool`): remove Arabic diacritics [ ًٌٍَُِْ] including sukun (default is True).
        smallDiacs (:obj:`bool`): remove small Qur'anic annotation signs (default is True).
        shaddah (:obj:`bool`): remove shaddah (default is True).
        digit (:obj:`bool`): replace Latin and Arabic digits with a space (default is True).
        alif (:obj:`bool`): unify alif variants (ٱ أ إ آ) to bare alif (default is True).
        specialChars (:obj:`bool`): remove some special characters (default is True).

    Returns:
        :obj:`str`: stripped text. On any processing error the text is
        returned as-is (best-effort behaviour).

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        processed_text = parser.arStrip('2023الجو جميلُ')
        print(processed_text)

        #output
        الجو جميل
    """
    try:
        if text:  # Empty/None input is returned untouched.
            # Collect the character-class removals selected by the flags.
            removal_patterns = []
            if diacs == True:
                removal_patterns.append(r'[\u064B-\u0650]+')  # Arabic diacritics [ ًٌٍَُِ]
                removal_patterns.append(r'[\u0652]+')          # sukun
            if shaddah == True:
                removal_patterns.append(r'[\u0651]+')          # shaddah
            if smallDiacs == True:
                removal_patterns.append(r'[\u06D6-\u06ED]+')   # small Qur'anic signs
            for pattern in removal_patterns:
                text = re.sub(pattern, '', text)

            if digit == True:
                # Digits become a space so adjacent words are not glued together.
                text = re.sub('[0-9]+', ' ', text)   # Latin digits
                text = re.sub('[٠-٩]+', ' ', text)   # Arabic digits

            if alif == True:
                # Unify alif-with-hamza/wasla variants to the bare alif.
                for alif_variant in ('ٱ', 'أ', 'إ', 'آ'):
                    text = re.sub(alif_variant, 'ا', text)

            if specialChars == True:
                text = re.sub('[?؟!@#$%-]+', '', text)  # selected special chars

            # Always: collapse whitespace, drop underscore and tatweel, trim.
            text = re.sub('[\\s]+', " ", text)
            text = text.replace("_", '')
            text = text.replace("ـ", '')
            text = text.strip()
    except:
        return text
    return text
|
70
|
+
|
71
|
+
def remove_punctuation(text):
    """
    Removes punctuation marks from the text.

    Args:
        text (:obj:`str`): The input text.

    Returns:
        :obj:`str`: The output text without punctuation marks. On any
        processing error, the original text is returned unchanged.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        return parser.remove_punctuation("te!@#،$%%؟st")

        #output
        test
    """
    result = text
    try:
        if text:
            # ASCII punctuation ranges plus Arabic punctuation and a few
            # Qur'anic/ornamental marks. The former pattern r'[U+060C]+' was a
            # bug: inside a character class it matched the literal characters
            # 'U', '+', '0', '6', 'C' (stripping them from the text); the
            # Arabic comma is already covered by r'[\u060C]+' below.
            punctuation_marks = [r'[\u0021-\u002F]+', r'[\u003A-\u0040]+',
                                 r'[\u005B-\u0060]+', r'[\u007B-\u007E]+', r'[\u060C]+',
                                 r'[\u061B]+', r'[\u061E]+', r'[\u061F]+', r'[\u0640]+',
                                 r'[\u0653]+', r'[\u065C]+', r'[\u066C]+', r'[\u066A]+',
                                 r'["}"]+', r'["{"]+']
            for punctuation in punctuation_marks:
                result = re.sub(punctuation, '', result)
    except:
        # Best-effort: fall back to the untouched input on any failure.
        return text
    return result
|
112
|
+
|
113
|
+
def remove_latin(text):
    """
    Remove all Latin letters from the input text, replacing each run of
    consecutive Latin letters with a single space.

    Args:
        text (:obj:`str`): The input text.

    Returns:
        :obj:`str`: The text without Latin characters.
    Note:
        If an error occurs during processing, the original text is returned.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import parser
        output = parser.remove_latin("أصل المسمى «تخطيط موارد المؤسسة» هو تعريب لمسمى التخطيط باللغة الإنجليزية Enterprise Resource Planning")
        print(output)

        #output
        أصل المسمى «تخطيط موارد المؤسسة» هو تعريب لمسمى التخطيط باللغة الإنجليزية
    """
    try:
        if text:
            # Each maximal [a-zA-Z] run collapses to one space (not removed
            # outright), so surrounding words stay separated.
            return re.sub('[a-zA-Z]+', ' ', text)
    except:
        pass
    return text
|
147
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
def remove_empty_values(sentences):
    """Drop empty-string entries from *sentences*, preserving order.

    Only exact empty strings ('') are removed; whitespace-only entries are kept.
    """
    filtered = []
    for sentence in sentences:
        if sentence != '':
            filtered.append(sentence)
    return filtered
|
3
|
+
|
4
|
+
|
5
|
+
def sent_tokenize(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
    """
    Tokenize a text into sentences based on the selected separators: dot,
    new line, question marks (Latin and Arabic) and exclamation mark.

    Args:
        text (:obj:`str`): Arabic text to be tokenized.
        dot (:obj:`bool`): split text on '.' (default is True).
        new_line (:obj:`bool`): split text on newline (default is True).
        question_mark (:obj:`bool`): split text on '?' and '؟' (default is True).
        exclamation_mark (:obj:`bool`): split text on '!' (default is True).

    Returns:
        :obj:`list`: list of sentences; each separator stays attached to its
        sentence, and empty entries are dropped.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import sentence_tokenizer
        sentences = sentence_tokenizer.sent_tokenize("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
        print(sentences)

        #output
        ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
    """
    enabled_separators = []
    if new_line == True:
        enabled_separators.append('\n')
    if dot == True:
        enabled_separators.append('.')
    if question_mark == True:
        enabled_separators.extend(['?', '؟'])
    if exclamation_mark == True:
        enabled_separators.append('!')

    pieces = [text]
    for separator in enabled_separators:
        next_pieces = []
        for piece in pieces:
            fragments = piece.split(separator)
            # Re-attach the separator to every fragment except the last;
            # the trailing fragment is whitespace-trimmed instead.
            next_pieces.extend(fragment + separator for fragment in fragments[:-1])
            next_pieces.append(fragments[-1].strip())
        pieces = next_pieces

    # Drop the empty entries produced by trailing separators.
    return [piece for piece in pieces if piece != '']
|
@@ -0,0 +1,232 @@
|
|
1
|
+
from collections import deque
|
2
|
+
|
3
|
+
# This is a mapping dictionery of Arabic letters to BW
|
4
|
+
# dictionery contains: key -> Unicode to Arabic, value -> BW chars
|
5
|
+
# It includes all BW mappings in addition to other special characters
|
6
|
+
# that are used in the SAMA database but not part of the BW character set such as numbers and Qur'anic diacritics
|
7
|
+
# Mapping of Arabic (Unicode) characters to Buckwalter (BW) transliteration.
# key -> Arabic character, value -> BW character.
# It includes all BW mappings plus other special characters used in the SAMA
# database that are not part of the BW set, such as digits and Qur'anic signs.
# Fix: several keys previously carried a trailing space ('\u06DC ', '\u06E3 ',
# '\u06E5 ', '\u06E8 ', '\u06ED '); since transliteration looks characters up
# one at a time, those two-character keys could never match. The trailing
# spaces are removed, and a duplicate '\u06EA' entry is dropped.
ar2bw_map = {
    '\u0621' : '\'' , # ء
    '\u0622' : '|' ,  # آ
    '\u0623' : '>' ,  # أ
    '\u0624' : '&' ,  # ؤ
    '\u0625' : '<' ,  # إ
    '\u0626' : '}' ,  # ئ
    '\u0627' : 'A' ,  # ا
    '\u0628' : 'b' ,  # ب
    '\u0629' : 'p' ,  # ة
    '\u062A' : 't' ,  # ت
    '\u062B' : 'v' ,  # ث
    '\u062C' : 'j' ,  # ج
    '\u062D' : 'H' ,  # ح
    '\u062E' : 'x' ,  # خ
    '\u062F' : 'd' ,  # د
    '\u0630' : '*' ,  # ذ
    '\u0631' : 'r' ,  # ر
    '\u0632' : 'z' ,  # ز
    '\u0633' : 's' ,  # س
    '\u0634' : '$' ,  # ش
    '\u0635' : 'S' ,  # ص
    '\u0636' : 'D' ,  # ض
    '\u0637' : 'T' ,  # ط
    '\u0638' : 'Z' ,  # ظ
    '\u0639' : 'E' ,  # ع
    '\u063A' : 'g' ,  # غ
    '\u0020' : ' ' ,  # space maps to space
    '\u0640' : '_' ,  # ـ (tatweel)
    '\u0641' : 'f' ,  # ف
    '\u0642' : 'q' ,  # ق
    '\u0643' : 'k' ,  # ك
    '\u0644' : 'l' ,  # ل
    '\u0645' : 'm' ,  # م
    '\u0646' : 'n' ,  # ن
    '\u0647' : 'h' ,  # ه
    '\u0648' : 'w' ,  # و
    '\u0649' : 'Y' ,  # ى
    '\u064A' : 'y' ,  # ي
    '\u064B' : 'F' ,  # TANWEEN FATH ً
    '\u064C' : 'N' ,  # TANWEEN DHAM ٌ
    '\u064D' : 'K' ,  # TANWEEN KASR ٍ
    '\u064E' : 'a' ,  # FATHA َ
    '\u064F' : 'u' ,  # DHAMMA ُ
    '\u0650' : 'i' ,  # KASRA ِ
    '\u0651' : '~' ,  # SHADDAH ّ
    '\u0652' : 'o' ,  # SUKUN ْ
    '\u0670' : '`' ,  # SHORT ALEF (dagger alif) ٰ
    '\u0671' : '{' ,  # ALEF WASLA ٱ
    '\u067E' : 'P' ,  # پ
    '\u0686' : 'J' ,  # چ
    '\u06A4' : 'V' ,  # ڤ
    '\u06AF' : 'G' ,  # گ
    '\u06EA' : '-' ,  # small low stop ۪ — not BW standard
    '\u0654' : '#' ,  # hamza above ٔ — not BW standard
    '\u06DC' : ':' ,  # small high seen ۜ — not BW standard
    '\u06E0' : '"' ,  # small high rounded zero ۟ — not BW standard
    '\u06E2' : '[' ,  # small high meem ۢ — not BW standard
    '\u06E3' : ';' ,  # small low seen ۣ — not BW standard
    '\u06E5' : ',' ,  # small waw ۥ — not BW standard
    '\u06E6' : '.' ,  # small yeh ۦ — not BW standard
    '\u06E8' : '!' ,  # small high noon ۨ — not BW standard
    '\u06EC' : '%' ,  # rounded high stop (small solid circle) ۬ — not BW standard
    '\u06ED' : ']' ,  # small low meem ۭ — not BW standard
    '0' : '0' ,
    '1' : '1' ,
    '2' : '2' ,
    '3' : '3' ,
    '4' : '4' ,
    '5' : '5' ,
    '6' : '6' ,
    '7' : '7' ,
    '8' : '8' ,
    '9' : '9'
}
|
83
|
+
|
84
|
+
# This is a mapping dictionery of BW letters to Arabic
|
85
|
+
# It includes all Arabic mappings in addition to other special characters
|
86
|
+
# that are used in the SAMA database but not part of the Arabic character set such as numbers and Qur'anic diacritics
|
87
|
+
# Mapping of Buckwalter (BW) characters to Arabic (Unicode) characters.
# It includes all Arabic mappings plus other special characters used in the
# SAMA database that are not part of the Arabic set, such as digits and
# Qur'anic signs.
# Fix: several values previously carried a trailing space ('\u06DC ', etc.),
# which made the transliteration emit a spurious space after those characters.
# The trailing spaces are removed, and a duplicate '-' entry is dropped.
bw2ar_map = {
    '\'' : '\u0621' , # ء
    '|' : '\u0622' ,  # آ
    '>' : '\u0623' ,  # أ
    '&' : '\u0624' ,  # ؤ
    '<' : '\u0625' ,  # إ
    '}' : '\u0626' ,  # ئ
    'A' : '\u0627' ,  # ا
    'b' : '\u0628' ,  # ب
    'p' : '\u0629' ,  # ة
    't' : '\u062A' ,  # ت
    'v' : '\u062B' ,  # ث
    'j' : '\u062C' ,  # ج
    'H' : '\u062D' ,  # ح
    'x' : '\u062E' ,  # خ
    'd' : '\u062F' ,  # د
    '*' : '\u0630' ,  # ذ
    'r' : '\u0631' ,  # ر
    'z' : '\u0632' ,  # ز
    's' : '\u0633' ,  # س
    '$' : '\u0634' ,  # ش
    'S' : '\u0635' ,  # ص
    'D' : '\u0636' ,  # ض
    'T' : '\u0637' ,  # ط
    'Z' : '\u0638' ,  # ظ
    'E' : '\u0639' ,  # ع
    'g' : '\u063A' ,  # غ
    ' ' : '\u0020' ,  # space maps to space
    '_' : ' ' ,       # ـ this is temporary (intended: '\u0640' tatweel)
    # '_' : '\u0640' , # ـ
    'f' : '\u0641' ,  # ف
    'q' : '\u0642' ,  # ق
    'k' : '\u0643' ,  # ك
    'l' : '\u0644' ,  # ل
    'm' : '\u0645' ,  # م
    'n' : '\u0646' ,  # ن
    'h' : '\u0647' ,  # ه
    'w' : '\u0648' ,  # و
    'Y' : '\u0649' ,  # ى
    'y' : '\u064A' ,  # ي
    'F' : '\u064B' ,  # TANWEEN FATH ً
    'N' : '\u064C' ,  # TANWEEN DHAM ٌ
    'K' : '\u064D' ,  # TANWEEN KASR ٍ
    'a' : '\u064E' ,  # FATHA َ
    'u' : '\u064F' ,  # DHAMMA ُ
    'i' : '\u0650' ,  # KASRA ِ
    '~' : '\u0651' ,  # SHADDAH ّ
    'o' : '\u0652' ,  # SUKUN ْ
    '`' : '\u0670' ,  # SHORT ALEF (dagger alif) ٰ
    '{' : '\u0671' ,  # ALEF WASLA ٱ
    'P' : '\u067E' ,  # پ
    'J' : '\u0686' ,  # چ
    'V' : '\u06A4' ,  # ڤ
    'G' : '\u06AF' ,  # گ
    '-' : '\u06EA' ,  # small low stop ۪ — not BW standard
    '#' : '\u0654' ,  # HAMZA ABOVE ٔ — not BW standard
    ':' : '\u06DC' ,  # small high seen ۜ — not BW standard
    '"' : '\u06E0' ,  # small high rounded zero ۟ — not BW standard
    '[' : '\u06E2' ,  # small high meem ۢ — not BW standard
    ';' : '\u06E3' ,  # small low seen ۣ — not BW standard
    ',' : '\u06E5' ,  # small waw ۥ — not BW standard
    '.' : '\u06E6' ,  # small yeh ۦ — not BW standard
    '!' : '\u06E8' ,  # small high noon ۨ — not BW standard
    '%' : '\u06EC' ,  # rounded high stop (small solid circle) ۬ — not BW standard
    ']' : '\u06ED' ,  # small low meem ۭ — not BW standard
    '0' : '0' ,
    '1' : '1' ,
    '2' : '2' ,
    '3' : '3' ,
    '4' : '4' ,
    '5' : '5' ,
    '6' : '6' ,
    '7' : '7' ,
    '8' : '8' ,
    '9' : '9'
}
|
164
|
+
# A transliterate Function to transliterate Arabic letters and vice versa
|
165
|
+
#It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
|
166
|
+
def perform_transliteration(text , schema ):
    """
    Transliterate *text* according to *schema* and report coverage.

    Args:
        text (:obj:`str`): The input text to be transliterated.
        schema (:obj:`str`): The transliteration schema to be used. Should be either
            `bw2ar` to transliterate Buckwalter-encoded Arabic text to Arabic script,
            or `ar2bw` to transliterate Arabic script to Buckwalter-encoded Arabic text.

    Returns:
        :obj:`tuple`: A tuple of two values:
            - The transliterated text based on the given schema.
            - A boolean flag indicating whether every character in the input
              text was found in the mapping (unmapped characters are copied
              through unchanged and the flag becomes False).

    Raises:
        ValueError: If *schema* is neither 'bw2ar' nor 'ar2bw'.

    **Example:**

    .. highlight:: python
    .. code-block:: python

        from nlptools.utils import text_transliteration

        print(text_transliteration.perform_transliteration("muHamadN" , "bw2ar"))

        #output
        ('مُحَمَدٌ', True)
    """
    # Select the lookup table once; both directions share the same loop.
    if schema == "bw2ar":
        mapping = bw2ar_map
    elif schema == "ar2bw":
        mapping = ar2bw_map
    else:
        raise ValueError("Schema must be either 'bw2ar' or 'ar2bw'.")

    output_chars = []
    is_all_mapped = True

    for char in text:
        # Look each character up in the table; fall back to the character
        # itself (and clear the flag) when it has no mapping.
        mapped_char = mapping.get(char)
        if mapped_char is None:
            is_all_mapped = False
            output_chars.append(char)
        else:
            output_chars.append(mapped_char)

    return ''.join(output_chars) , is_all_mapped
|
nlptools/utils/utils.py
ADDED