SinaTools 0.1.35__py2.py3-none-any.whl → 0.1.36__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,247 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- from sinatools.utils.parser import arStrip
4
- from sinatools.utils.word_compare import Implication
5
- import argparse
6
-
7
def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
    """
    Strip diacritics and/or the shadda from an Arabic word.

    The two flags are independent: each enabled flag triggers one ``arStrip``
    pass over the word, the diacritics pass first and the shadda pass second.
    With both flags off the word is returned unchanged.

    Args:
        word (:obj:`str`): The input text.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): When true, run the pass that removes all diacritics except the shadda (default is True).
        ignore_shadda_diacritic (:obj:`bool`): When true, run the pass that removes the shadda (default is True).

    Returns:
        :obj:`str`: The normalized Arabic word.
    """
    # Each entry pairs a flag with the six positional switches handed to
    # arStrip for that pass. The meaning of every individual switch is defined
    # by arStrip itself (not visible here); the values are kept exactly as is.
    strip_passes = (
        (ignore_all_diacritics_but_not_shadda, (True, True, False, False, False, False)),
        (ignore_shadda_diacritic, (False, False, True, False, False, False)),
    )

    normalized = word
    for enabled, switches in strip_passes:
        if enabled:
            normalized = arStrip(normalized, *switches)
    return normalized
26
-
27
-
28
def get_preferred_word(word1, word2):
    """
    Pick the preferred word of a pair from the direction of their implication.

    The direction is read from ``Implication(word1, word2).get_direction()``:

    - ``0`` or ``2``: ``word1`` is preferred.
    - ``1``: ``word2`` is preferred.
    - ``3``: ``word1`` is preferred only when it ends with one of the two
      diacritic marks checked below; otherwise ``word2`` is preferred.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The preferred word, or ``None`` when the direction is none
        of the values listed above.
    """
    direction = Implication(word1, word2).get_direction()

    if direction == 1:
        return word2

    if direction == 0 or direction == 2:
        return word1

    if direction == 3:
        # Tie-break on the final character of word1.
        trailing_marks = ("َ", "ُ")
        return word1 if word1.endswith(trailing_marks) else word2

    # Any other direction value yields no preference.
    return None
54
-
55
-
56
def get_non_preferred_word(word1, word2):
    """
    Pick the word of a pair that should be discarded, if any.

    A word is only reported when the ``Implication`` distance between the two
    words is below 15. In that case the direction decides:

    - ``0`` or ``1``: ``word1`` is the non-preferred word.
    - ``2``: ``word2`` is the non-preferred word.
    - ``3``: ``word2`` is non-preferred when ``word1`` ends with one of the
      two diacritic marks checked below; otherwise ``word1`` is.

    Args:
        word1 (:obj:`str`): The first word.
        word2 (:obj:`str`): The second word.

    Returns:
        :obj:`str`: The non-preferred word, or ``'#'`` when neither word is
        reported (distance not below 15, or an unlisted direction).
    """
    relation = Implication(word1, word2)

    # Words that are too far apart are unrelated: nothing to discard.
    if not (relation.get_distance() < 15):
        return "#"

    direction = relation.get_direction()

    if direction in (0, 1):
        return word1

    if direction == 2:
        return word2

    if direction == 3:
        # Tie-break on the final character of word1.
        trailing_marks = ("َ", "ُ")
        return word2 if word1.endswith(trailing_marks) else word1

    return "#"
81
- #@TBD
82
def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
    """
    Return the words two lists have in common, one preferred form per match.

    Steps:

    1. Entries are converted to ``str`` and stripped of surrounding
       whitespace; ``None`` and blank (empty or whitespace-only) entries are
       dropped.
    2. Every remaining word is normalized once with ``normalize_word``.
    3. Each pair (one word from each list) whose ``Implication`` reports a
       direction ``>= 0`` and a distance below 15 contributes the word chosen
       by ``get_preferred_word``.
    4. The collected words are compared pairwise and every word reported by
       ``get_non_preferred_word`` is removed.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional): A flag to ignore all diacritics except for the shadda. Defaults to False.
        ignore_shadda_diacritic (:obj:`bool`, optional): A flag to ignore the shadda diacritic. Defaults to False.

    Returns:
        :obj:`list`: The intersection of the two lists after normalization.
    """
    # Strip first, then drop empties, so whitespace-only entries of any length
    # are discarded instead of turning into empty words.
    cleaned1 = [text for text in (str(item).strip() for item in list1 if item is not None) if text]
    cleaned2 = [text for text in (str(item).strip() for item in list2 if item is not None) if text]

    # Normalize each word once instead of once per compared pair.
    normalized1 = [
        normalize_word(text, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        for text in cleaned1
    ]
    normalized2 = [
        normalize_word(text, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        for text in cleaned2
    ]

    intersection_list = []
    for word1 in normalized1:
        for word2 in normalized2:
            implication = Implication(word1, word2)
            if implication.get_direction() >= 0 and implication.get_distance() < 15:
                intersection_list.append(get_preferred_word(word1, word2))

    # Pairwise clean-up. Deleting by index and NOT advancing past the vacated
    # slot guarantees that the element sliding into that slot is still
    # compared; advancing unconditionally would skip it and leave duplicates.
    i = 0
    while i < len(intersection_list):
        j = i + 1
        while j < len(intersection_list):
            non_preferred_word = get_non_preferred_word(intersection_list[i], intersection_list[j])
            if non_preferred_word == "#":
                j += 1
            elif non_preferred_word == intersection_list[j]:
                # Drop the later word; position j now holds the next candidate.
                del intersection_list[j]
            else:
                # Drop the earlier word; position i holds a new word, so its
                # comparisons start over.
                del intersection_list[i]
                j = i + 1
        i += 1

    return intersection_list
128
-
129
-
130
-
131
def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
    """
    Return the union of two word lists after normalization and de-duplication.

    Steps:

    1. Entries are converted to ``str`` and stripped of surrounding
       whitespace; ``None`` and blank (empty or whitespace-only) entries are
       dropped. Stripping matches what ``get_intersection`` does, so both
       halves of a Jaccard ratio see the same tokens.
    2. Every remaining word from both lists is normalized with
       ``normalize_word`` and collected, first list first.
    3. The collected words are compared pairwise and every word reported by
       ``get_non_preferred_word`` is removed.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): Whether to ignore all diacritics except shadda or not.
        ignore_shadda_diacritic (:obj:`bool`): Whether to ignore shadda diacritic or not.

    Returns:
        :obj:`list`: The union of the two lists after removing duplicates and normalizing words.
    """
    # Strip first, then drop empties, so whitespace-only entries are discarded.
    cleaned1 = [text for text in (str(item).strip() for item in list1 if item is not None) if text]
    cleaned2 = [text for text in (str(item).strip() for item in list2 if item is not None) if text]

    union_list = [
        normalize_word(text, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
        for text in cleaned1 + cleaned2
    ]

    # Pairwise clean-up. Deleting by index and NOT advancing past the vacated
    # slot guarantees that the element sliding into that slot is still
    # compared; advancing unconditionally would skip it and leave duplicates.
    i = 0
    while i < len(union_list):
        j = i + 1
        while j < len(union_list):
            non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
            if non_preferred_word == "#":
                j += 1
            elif non_preferred_word == union_list[j]:
                # Drop the later word; position j now holds the next candidate.
                del union_list[j]
            else:
                # Drop the earlier word; position i holds a new word, so its
                # comparisons start over.
                del union_list[i]
                j = i + 1
        i += 1

    return union_list
169
-
170
-
171
-
172
def jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
    """
    Calculate the Jaccard similarity coefficient between two word lists.

    The coefficient is the size of ``get_intersection(...)`` divided by the
    size of ``get_union(...)``, both computed with the same diacritic flags.

    Args:
        list1 (:obj:`list`): The first list.
        list2 (:obj:`list`): The second list.
        ignore_all_diacritics_but_not_shadda (:obj:`bool`): A flag indicating whether to ignore all diacritics except for shadda.
        ignore_shadda_diacritic (:obj:`bool`): A flag indicating whether to ignore the shadda diacritic.

    Returns:
        :obj:`float`: The Jaccard similarity coefficient between the two
        lists, or ``0.0`` when the union is empty (no usable words in either
        list).
    """
    intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
    union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)

    # With nothing in the union the ratio is undefined; report "no similarity"
    # rather than letting the division raise ZeroDivisionError.
    if not union_list:
        return 0.0

    return float(len(intersection_list)) / float(len(union_list))
193
-
194
-
195
-
196
-
197
def jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
    """
    Compute the intersection, union, and/or Jaccard similarity of two delimited strings.

    Both input strings are split on ``delimiter`` and the resulting word lists
    are passed to ``get_intersection``, ``get_union`` or ``jaccard_similarity``
    according to ``selection``.

    Args:
        delimiter (:obj:`str`): The delimiter used to split the input strings.
        str1 (:obj:`str`): The first input string to compare.
        str2 (:obj:`str`): The second input string to compare.
        selection (:obj:`str`): The operation to perform. Must be one of
            *intersection*, *union*, *jaccardSimilarity*, or *jaccardAll*.
        ignoreAllDiacriticsButNotShadda (:obj:`bool`): If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
        ignoreShaddaDiacritic (:obj:`bool`): If True, ignore the Shadda diacritic. (Default is True)

    Returns:
        Depending on ``selection``:

        - *intersection*: the list returned by ``get_intersection``.
        - *union*: the list returned by ``get_union``.
        - *jaccardSimilarity*: the value returned by ``jaccard_similarity``.
        - *jaccardAll*: the list
          ``["intersection:", <intersection>, "union:", <union>, "similarity:", <similarity>]``.
        - anything else: the string ``'Invalid selection option'``.

    Note:
        - No exception escapes this function. An ``AttributeError`` (for
          example a non-string ``str1``/``str2``) is printed and reported as
          ``'Invalid input type'``; any other exception is printed and
          reported as ``'An error has occurred'``.
        Online tool: https://sina.birzeit.edu/resources/jaccardFunction.html
    """
    try:
        words1 = str1.split(delimiter)
        words2 = str2.split(delimiter)
        flags = (ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)

        if selection == "intersection":
            return get_intersection(words1, words2, *flags)

        if selection == "union":
            return get_union(words1, words2, *flags)

        if selection == "jaccardSimilarity":
            return jaccard_similarity(words1, words2, *flags)

        if selection == "jaccardAll":
            common_words = get_intersection(words1, words2, *flags)
            all_words = get_union(words1, words2, *flags)
            score = jaccard_similarity(words1, words2, *flags)
            return ["intersection:", common_words, "union:", all_words, "similarity:", score]

        return 'Invalid selection option'

    except AttributeError as attr_error:
        print(f"Attribute error occurred: {str(attr_error)}")
        return 'Invalid input type'
    except Exception as error:
        print(f"Error occurred: {str(error)}")
        return 'An error has occurred'