SinaTools 0.1.35__py2.py3-none-any.whl → 0.1.36__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/METADATA +1 -1
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/RECORD +11 -15
- sinatools/CLI/DataDownload/download_files.py +9 -8
- sinatools/VERSION +1 -1
- sinatools/wsd/disambiguator.py +14 -90
- sinatools/ner/data.py +0 -124
- sinatools/ner/relation_extractor.py +0 -201
- sinatools/utils/implication.py +0 -662
- sinatools/utils/jaccard.py +0 -247
- {SinaTools-0.1.35.data → SinaTools-0.1.36.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.35.dist-info → SinaTools-0.1.36.dist-info}/top_level.txt +0 -0
sinatools/utils/implication.py
DELETED
@@ -1,662 +0,0 @@
|
|
1
|
-
|
2
|
-
# The Imply algorithm takes two words as input and produces the matching tuple defined by (Words Matching).
|
3
|
-
# The matching between two words is defined as a tuple:
|
4
|
-
# <w1, w2, implication direction, distance, conflicts, verdict, preferredWord> .
|
5
|
-
|
6
|
-
from sinatools.utils.parser import arStrip
|
7
|
-
class Implication:
    """Compare two Arabic words regardless of how fully each is diacritized.

    Constructing an instance runs the comparison immediately. The outcome is
    exposed through the attributes (and matching ``get_*`` accessors)
    ``verdict``, ``direction``, ``distance`` and ``conflicts``.

    Direction codes written by this class:

    * ``3``  -- both words carry exactly the same diacritics
    * ``2``  -- compatible, word2 implies word1
    * ``1``  -- compatible, word1 implies word2
    * ``0``  -- compatible, the words imply each other
    * ``-1`` -- incompatible diacritics
    * ``-2`` -- set when :meth:`equal_words` reports that the letters differ
    * ``-3`` -- set for an empty input, malformed diacritics, or diacritic
      arrays that cannot be aligned

    Args:
        inputWord1 (:obj:`str`): first word to compare.
        inputWord2 (:obj:`str`): second word to compare.
    """

    # Diacritic Pair Distance Map.
    # Rows/columns are the integer codes produced by get_diacritics_array:
    # 0 no diacritic, 1 SUKUN, 2 FATHA, 3 KASRA, 4 DAMMA, 5 FATHATAN,
    # 6 KASRATAN, 7 DAMMATAN, 8 SHADDA, 9-11 SHADDA with FATHA/KASRA/DAMMA,
    # 12-15 the form of a word-initial alif (bare, hamza above, hamza below,
    # madda).
    distanceTable = [
        [0, 0, 1, 1, 1, 1, 1, 1, 15, 16, 16, 16, 0, 0, 0, 0],
        [0, 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 0, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 0, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 0, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 101, 0, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 101, 101, 0, 101, 101, 101, 101, 0, 0, 0, 0],
        [15, 101, 101, 101, 101, 101, 101, 101, 0, 1, 1, 1, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 0, 101, 101, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 101, 0, 101, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 101, 101, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 100, 100],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 100, 0, 100],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 100, 100, 0],
    ]

    # Implication direction Map (same indexing as distanceTable).
    # Cell values: 3 identical, 1 word1 implies word2, 2 word2 implies word1,
    # -1 incompatible, 0 for the alif-form rows/columns versus the others.
    directionTable = [
        [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 3, 1, 1, 1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, -1, 3, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, 3, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, -1, -1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -1, 3, -1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -1, -1, 3],
    ]

    # Class-level defaults, kept so that attribute reads on the class keep
    # working. The mutable ones are replaced per instance in __init__.
    word1, word2 = "", ""        # the two words being compared
    conflictFlags = [False for i in range(5)]  # index = direction value + 1
    verdict = "null"             # "Compatible" or "Incompatible" once computed
    word1Undiac = ""             # word1 after arStrip
    word2Undiac = ""             # word2 after arStrip
    word1Diacritics = []         # integer diacritic codes of word1
    word2Diacritics = []         # integer diacritic codes of word2
    direction = -2147483648      # arbitrary low sentinel: not computed yet
    distance = -2147483648       # arbitrary low sentinel: not computed yet
    conflicts = -2147483648      # arbitrary low sentinel: not computed yet
    lettersDirection = []        # per-letter implication direction

    # Ordered rewrite rules applied by get_diacritics_array. The order is
    # significant: the single-character codes must exist before the
    # two-character combinations are collapsed.
    _DIACRITIC_REWRITES = (
        (" ", ""),           # space
        ("\u0652", "1"),     # SUKUN
        ("\u064e", "2"),     # FATHA
        ("\u0650", "3"),     # KASRA
        ("\u064f", "4"),     # DAMMA
        ("\u064b", "5"),     # FATHATAN
        ("\u064d", "6"),     # KASRATAN
        ("\u064c", "7"),     # DAMMATAN
        ("\u0651", "8"),     # SHADDA
        ("11", "100"),       # SUKUN with SUKUN
        ("12", "100"),       # SUKUN with FATHA
        ("13", "100"),       # SUKUN with KASRA
        ("14", "100"),       # SUKUN with DAMMA
        ("15", "100"),       # SUKUN with FATHATAN
        ("82", "9"),         # SHADDA with FATHA
        ("83", "10"),        # SHADDA with KASRA
        ("84", "11"),        # SHADDA with DAMMA
    )

    # Code inserted after a word-initial alif, keyed by the alif form.
    _INITIAL_ALIF_CODES = {
        "\u0627": "12",      # bare alif
        "\u0623": "13",      # alif with hamza above
        "\u0625": "14",      # alif with hamza below
        "\u0622": "15",      # alif with madda
    }

    def __init__(self, inputWord1, inputWord2):
        # Give every instance its own mutable state before any early return,
        # so later writes can never leak into the shared class-level lists.
        self.conflictFlags = [False] * 5
        self.lettersDirection = []
        self.word1Diacritics = []
        self.word2Diacritics = []

        # Exactly one of the two inputs is empty: nothing to compare.
        if bool(inputWord1) != bool(inputWord2):
            self._mark_different_letters()
            return

        self.word1 = Implication.normalize_alef(inputWord1)  # unify alif
        self.word2 = Implication.normalize_alef(inputWord2)  # unify alif

        # Identical after alif normalization: trivially compatible.
        if self.word1 == self.word2:
            self._mark_identical()
            return

        # Build the diacritics array of each word. A word that cannot be
        # encoded (for example one that starts with a diacritic) is reported
        # as incompatible instead of propagating the error.
        try:
            self.word1Diacritics = Implication.get_diacritics_array(self.word1)
            self.word2Diacritics = Implication.get_diacritics_array(self.word2)
        except Exception:
            self._mark_different_letters()
            return

        # One slot per diacritic position plus slot 0 for the first letter.
        self.lettersDirection = [0] * (len(self.word1Diacritics) + 1)

        # Malformed diacritics in either word make the pair incompatible.
        if (Implication.diacritics_syntax_error_in(self.word1Diacritics)
                or Implication.diacritics_syntax_error_in(self.word2Diacritics)):
            self._mark_different_letters()
            return

        # NOTE(review): arStrip is called with diacs=False and shaddah=False;
        # confirm against arStrip's documentation that this yields the
        # undiacritized form these attribute names promise.
        self.word1Undiac = arStrip(self.word1, diacs=False, shaddah=False)
        self.word2Undiac = arStrip(self.word2, diacs=False, shaddah=False)

        # Both words are the same single letter: compatible whatever the
        # diacritics on that letter are.
        if (len(self.word1Undiac) == 1 and len(self.word2Undiac) == 1
                and self.word1Undiac == self.word2Undiac):
            self._mark_identical()
        else:
            self.lettersDirection[0] = 3
            self.calculate_words_implication()

    def _mark_different_letters(self):
        """Record the 'cannot be compared' outcome (direction -3)."""
        self.verdict = "Incompatible"
        self.direction = -3  # the two words have different letters
        self.distance = 3000
        self.conflicts = 0

    def _mark_identical(self):
        """Record the 'exactly the same' outcome (direction 3)."""
        self.verdict = "Compatible"
        self.direction = 3  # both words have exactly the same diacritics
        self.distance = 0
        self.conflicts = 0

    def get_non_preferred_word(self, word1, word2):
        """
        Return the word that :meth:`get_preferred_word` does not select.

        Args:
            word1 (:obj:`str`): The first word (``None`` is treated as empty).
            word2 (:obj:`str`): The second word (``None`` is treated as empty).

        Returns:
            :obj:`str`: ``word2`` when ``word1`` is the preferred word,
            otherwise ``word1``. When one word is empty the other one is
            returned; ``None`` when both are empty. Words are returned
            stripped of surrounding whitespace.
        """
        # Normalize None before stripping so the emptiness checks are real.
        word1 = word1.strip() if word1 is not None else ""
        word2 = word2.strip() if word2 is not None else ""
        if not word1:
            return word2 if word2 else None
        if not word2:
            return word1
        preferred = self.get_preferred_word(word1, word2)
        return word2 if word1 == preferred else word1

    def get_preferred_word(self, word1, word2):
        """
        Return the preferred word out of two given words.

        The words are compared with a fresh :class:`Implication`. They are
        treated as the same word only when the distance is below 15 (the same
        threshold :meth:`get_result` uses); the direction then picks the word.

        Args:
            word1 (:obj:`str`): The first word (``None`` is treated as empty).
            word2 (:obj:`str`): The second word (``None`` is treated as empty).

        Returns:
            :obj:`str`: ``word1`` for direction 0 or 2, ``word2`` for
            direction 1; for direction 3, ``word1`` if it ends with FATHA or
            DAMMA and ``word2`` otherwise. An empty string when the words are
            not close enough. When one word is empty the other one is
            returned; ``None`` when both are empty.
        """
        word1 = word1.strip() if word1 is not None else ""
        word2 = word2.strip() if word2 is not None else ""
        if not word1:
            return word2 if word2 else None
        if not word2:
            return word1

        implication = Implication(word1, word2)
        if implication.get_distance() < 15:
            direction = implication.get_direction()
            if direction in (0, 2):
                return word1
            if direction == 1:
                return word2
            if direction == 3:
                # FATHA (U+064E) or DAMMA (U+064F) at the end of word1
                if word1.endswith(("\u064e", "\u064f")):
                    return word1
                return word2
        return ""

    @staticmethod
    def normalize_alef(word):
        """
        Normalize the position of a final tanween and a leading alif wasla.

        * A word ending in FATHATAN followed by alif maqsura (U+0649) or by
          alif (U+0627) has the two characters swapped, so the tanween comes
          after the letter.
        * A leading alif wasla (U+0671) is replaced by a plain alif (U+0627).

        Args:
            word (:obj:`str`): The input word to be normalized.

        Returns:
            :obj:`str`: The normalized word.
        """
        fathatan = "\u064b"
        for letter in ("\u0649", "\u0627"):  # alif maqsura, alif
            if word.endswith(fathatan + letter):
                word = word[:len(word) - 2] + letter + fathatan
        if word.startswith("\u0671"):  # alif wasla
            word = "\u0627" + word[1:]
        return word

    @staticmethod
    def diacritics_syntax_error_in(diacriticsArray):
        """
        Report whether a diacritics array contains an invalid code.

        The last element is checked with :meth:`wrong_end_diacritic` and every
        other element with :meth:`wrong_middle_iacritic`.

        Args:
            diacriticsArray (:obj:`list`): Integer diacritic codes as produced
                by :meth:`get_diacritics_array`.

        Returns:
            :obj:`bool`: True if an invalid code is found, False otherwise.
            False is also returned when the array cannot be inspected (for
            example when it is empty or holds non-numeric items).

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            Implication.diacritics_syntax_error_in([2, 2, 0])  # Returns False
            Implication.diacritics_syntax_error_in([5, 2, 0])  # Returns True
        """
        try:
            if Implication.wrong_end_diacritic(diacriticsArray[len(diacriticsArray) - 1]):
                return True
            for i in range(0, len(diacriticsArray) - 1):
                if Implication.wrong_middle_iacritic(diacriticsArray[i]):
                    return True
            return False
        except Exception:
            # Best effort by design: an array that cannot be inspected is
            # treated as having no syntax error.
            return False

    @staticmethod
    def wrong_end_diacritic(diac):
        """
        Report whether ``diac`` is invalid as the code of a word's last letter.

        Valid codes are 0-11 (0 no diacritic, 1 SUKUN, 2 FATHA, 3 KASRA,
        4 DAMMA, 5 FATHATAN, 6 KASRATAN, 7 DAMMATAN, 8 SHADDA, 9-11 SHADDA
        with FATHA/KASRA/DAMMA) and 85-87 (SHADDA with FATHATAN, KASRATAN,
        DAMMATAN).

        Args:
            diac (:obj:`int`): The diacritic code to be checked.

        Returns:
            :obj:`bool`: False for 0-11 and 85-87, True for anything else.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            Implication.wrong_end_diacritic(0)   # Returns False
            Implication.wrong_end_diacritic(85)  # Returns False
            Implication.wrong_end_diacritic(12)  # Returns True
        """
        return not (0 <= diac <= 11 or 85 <= diac <= 87)

    @staticmethod
    def wrong_middle_iacritic(diac):
        """
        Report whether ``diac`` is invalid on a letter that is not the last.

        Valid codes are 0-4 and 8-15; the tanween codes 5-7 and anything
        outside 0-15 are rejected.

        Args:
            diac (:obj:`int`): The diacritic code to be checked.

        Returns:
            :obj:`bool`: False for 0-4 and 8-15, True for anything else.
        """
        return not (0 <= diac <= 4 or 8 <= diac <= 15)

    # Correctly spelled alias; the misspelled name is kept for existing callers.
    wrong_middle_diacritic = wrong_middle_iacritic

    def calculate_words_implication(self):
        """
        Compute the implication between the two words.

        Updates ``verdict``, ``direction``, ``distance`` and ``conflicts``.
        It is called by the constructor; note that it calls
        :meth:`equal_words`, which consumes the first letter of
        ``word1Undiac`` and ``word2Undiac``.

        Returns:
            None
        """
        self.verdict = "Incompatible"
        self.direction = -2
        self.distance = 1000

        if not self.equal_words():
            # The letters differ.
            if not self.word1Undiac and not self.word2Undiac:
                if self.word1 == self.word2:
                    self.conflicts = 0
                    self.distance = 0
                    self.direction = 3
                else:
                    self.conflicts = 1
                    self.distance = 1000
                    self.direction = -2
            else:
                self.conflicts = max(len(self.word1Undiac), len(self.word2Undiac))
        elif self.calculate_letters_implication():
            self.direction = self.calculate_direction()
            if self.direction == -1:
                self.distance = 101
            else:
                self.verdict = "Compatible"
        else:
            # The diacritic arrays could not be aligned.
            self.direction = -3  # the two words have different letters
            self.distance = 3000
            self.conflicts = 0

    def equal_words(self):
        """
        Check whether the two stripped words have the same letters.

        Side effect: the first letter is removed from ``word1Undiac`` and
        ``word2Undiac``. A first-letter difference between a bare alif and an
        alif with hamza/madda is accepted; it is recorded in
        ``lettersDirection[0]`` and ``conflictFlags``.

        Returns:
            :obj:`bool`: True if the words are equal, False otherwise.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            implication = Implication(word1, word2)
            result = implication.equal_words()
            if result:
                print("The words are equal")
            else:
                print("The words are not equal")
        """
        first1 = self.word1Undiac[0:1]
        first2 = self.word2Undiac[0:1]
        self.word1Undiac = self.word1Undiac[1:]
        self.word2Undiac = self.word2Undiac[1:]

        # Everything after the first letter must match exactly.
        if self.word1Undiac != self.word2Undiac:
            return False
        if first1 == first2:
            return True

        bare_alif = "\u0627"
        marked_alifs = ("\u0622", "\u0623", "\u0625")  # madda, hamza above, hamza below
        if first1 == bare_alif and first2 in marked_alifs:
            self.lettersDirection[0] = 1  # w1 implies w2
            self.conflictFlags[2] = True
            return True
        if first1 in marked_alifs and first2 == bare_alif:
            self.lettersDirection[0] = 2  # w2 implies w1
            self.conflictFlags[3] = True
            return True
        return False

    def calculate_letters_implication(self):
        """
        Compare the two diacritics arrays position by position.

        Updates ``lettersDirection``, ``conflictFlags`` and ``distance`` from
        ``directionTable`` and ``distanceTable``. Every position except the
        last is always compared; the last one only contributes when either
        word has a SHADDA (code 8) there. Expects ``lettersDirection`` to hold
        one more slot than ``word1Diacritics``, as the constructor arranges.

        Returns:
            :obj:`bool`: True when the comparison was carried out, False when
            the arrays are empty or differ in length and therefore cannot be
            aligned.
        """
        first = self.word1Diacritics
        second = self.word2Diacritics
        if not first or len(first) != len(second):
            return False

        self.distance = 0
        for i in range(len(first) - 1):
            diac1 = first[i]
            diac2 = second[i]
            step = self.directionTable[diac1][diac2]
            self.lettersDirection[i + 1] = step
            self.conflictFlags[step + 1] = True  # direction -1..3 -> flag 0..4
            self.distance = self.distance + self.distanceTable[diac1][diac2]

        last1 = int(first[-1])   # last letter diacritic of word1
        last2 = int(second[-1])  # last letter diacritic of word2
        if last1 == 8 or last2 == 8:  # 8: expresses the presence of shaddah
            step = self.directionTable[last1][last2]
            self.lettersDirection[len(self.lettersDirection) - 1] = step
            self.conflictFlags[step + 1] = True
            self.distance = self.distance + self.distanceTable[last1][last2]
        return True

    def calculate_direction(self):
        """
        Derive the overall direction from ``conflictFlags``.

        Also resets ``conflicts`` to 0.

        Returns:
            :obj:`int`: The direction of compatibility:
                -1: Incompatible-diacritics
                0: Compatible-imply each other
                1: Compatible-w1 implies w2
                2: Compatible-w2 implies w1
                3: Compatible-exactly equal
                -2147483648: Default value for an invalid direction
        """
        self.conflicts = 0
        flags = self.conflictFlags
        if flags[0]:
            return -1  # Incompatible-diacritics
        if flags[2] and flags[3]:
            return 0  # Compatible-imply each other
        if flags[2]:
            return 1  # Compatible-w1 implies w2
        if flags[3]:
            return 2  # Compatible-w2 implies w1
        if flags[4]:
            return 3  # Compatible-exactly equal
        return -2147483648

    @staticmethod
    def get_diacritics_array(word):
        """
        Convert the diacritics of a word into a list of integer codes.

        One code is produced per letter, in order: 0 no diacritic, 1 SUKUN,
        2 FATHA, 3 KASRA, 4 DAMMA, 5 FATHATAN, 6 KASRATAN, 7 DAMMATAN,
        8 SHADDA, 9-11 SHADDA with FATHA/KASRA/DAMMA, 100 SUKUN followed by
        another diacritic. A word-initial alif additionally contributes a
        leading code 12-15 naming its form. The code of the last letter is
        reduced to 8 when it carries a SHADDA and to 0 when it is one of the
        codes 1-7.

        Args:
            word (:obj:`str`): The word with diacritics.

        Returns:
            :obj:`list`: The array of diacritics converted to digits.

        Raises:
            ValueError: If the first character of the word is a digit or a
                diacritic.
            IndexError: If the word is empty once spaces are removed.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            # kaf+FATHA, ta+FATHA, ba+FATHA
            word = "\u0643\u064e\u062a\u064e\u0628\u064e"
            print(Implication.get_diacritics_array(word))
            Output: [2, 2, 0]
        """
        # Replace diacritics by digits, then collapse the combinations.
        for old, new in Implication._DIACRITIC_REWRITES:
            word = word.replace(old, new)

        # Standardize a word-initial alif: keep a bare alif and record which
        # form it had as a pseudo-diacritic code followed by a separator.
        alif_code = Implication._INITIAL_ALIF_CODES.get(word[0:1])
        if alif_code is not None:
            word = "\u0627" + alif_code + "," + word[1:]

        if word[0:1].isdigit():  # a word should not begin with a diacritic
            raise ValueError("Sorry, First char is digit")

        # Every character that is not a digit becomes a "," separator.
        word = "".join(ch if ch.isdigit() or ch == "," else "," for ch in word)

        # A trailing separator means the last letter has no diacritic; double
        # it so the loop below gives that letter an explicit 0.
        if word[len(word) - 1] == ",":
            word = word + ","
        while ",," in word:
            word = word.replace(",,", ",0,")  # No-DIACRITIC

        word = word[1:]  # drop the separator that stands for the first letter
        diacritics = word.split(",")
        if "" in diacritics:  # remove the empty item left by a trailing ","
            diacritics.remove("")

        last = diacritics[len(diacritics) - 1]  # last letter diacritic
        if last in ("8", "9", "10", "11", "85", "86", "87"):
            # any SHADDA combination at the end counts as a plain SHADDA
            diacritics[len(diacritics) - 1] = "8"
        elif last in ("1", "2", "3", "4", "5", "6", "7"):
            # the remaining final diacritics are ignored
            diacritics[len(diacritics) - 1] = "0"

        return [int(code) for code in diacritics]

    @staticmethod
    def get_letters_array(word):
        """
        Return the characters of a word after it has been through arStrip.

        Args:
            word (:obj:`str`): The word from which to extract the letters.

        Returns:
            :obj:`list`: The array of letters.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            letters = Implication.get_letters_array(word)
            print(letters)
        """
        # NOTE(review): same arStrip flags as in __init__ -- confirm they
        # remove the diacritics as intended.
        word = arStrip(word, diacs=False, shaddah=False)
        return list(word)

    def get_verdict(self):
        """Return the verdict: "Compatible" or "Incompatible"."""
        return self.verdict

    def get_direction(self):
        """Return the implication direction code (see the class docstring)."""
        return self.direction

    def get_distance(self):
        """Return the accumulated diacritic distance."""
        return self.distance

    def get_conflicts(self):
        """Return the value of the ``conflicts`` attribute."""
        return self.conflicts

    def get_word1(self):
        """Return the first word as stored by the constructor."""
        return self.word1

    def get_word2(self):
        """Return the second word as stored by the constructor."""
        return self.word2

    def get_result(self):
        """
        Summarize the comparison as "Same" or "Different".

        The result is also stored in ``self.result``.

        Returns:
            :obj:`str`: "Same" when the direction is non-negative and the
            distance is below 15, otherwise "Different".

        **Example:**

        .. highlight:: python
        .. code-block:: python

            from sinatools.utils.implication import Implication

            implication = Implication("hello", "hello")
            print(implication.get_result())
            Output: "Same"
        """
        if self.get_direction() >= 0 and self.get_distance() < 15:
            self.result = "Same"
        else:
            self.result = "Different"
        return self.result

    def toString(self):
        """Return word1, word2, verdict, direction, distance and conflicts, tab-separated."""
        return "\t".join((
            self.word1,
            self.word2,
            str(self.verdict),
            str(self.direction),
            str(self.distance),
            str(self.conflicts),
        ))
|