SinaTools 0.1.26__py2.py3-none-any.whl → 0.1.28__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. SinaTools-0.1.28.dist-info/METADATA +64 -0
  2. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/RECORD +33 -30
  3. sinatools/CLI/DataDownload/download_files.py +5 -8
  4. sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
  5. sinatools/CLI/morphology/morph_analyzer.py +1 -1
  6. sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
  7. sinatools/CLI/ner/entity_extractor.py +8 -8
  8. sinatools/CLI/utils/implication.py +3 -3
  9. sinatools/CLI/utils/jaccard.py +2 -2
  10. sinatools/DataDownload/downloader.py +2 -2
  11. sinatools/VERSION +1 -1
  12. sinatools/morphology/morph_analyzer.py +44 -45
  13. sinatools/ner/__init__.py +6 -1
  14. sinatools/ner/entity_extractor.py +42 -1
  15. sinatools/ner/relation_extractor.py +201 -0
  16. sinatools/semantic_relatedness/compute_relatedness.py +22 -0
  17. sinatools/synonyms/__init__.py +2 -2
  18. sinatools/synonyms/synonyms_generator.py +45 -1
  19. sinatools/utils/jaccard.py +1 -1
  20. sinatools/utils/parser.py +12 -15
  21. sinatools/utils/similarity.py +240 -0
  22. sinatools/utils/text_dublication_detector.py +22 -0
  23. sinatools/utils/text_transliteration.py +1 -1
  24. sinatools/utils/tokenizer.py +1 -1
  25. sinatools/utils/word_compare.py +667 -0
  26. sinatools/wsd/__init__.py +1 -1
  27. sinatools/wsd/disambiguator.py +20 -19
  28. SinaTools-0.1.26.dist-info/METADATA +0 -34
  29. {SinaTools-0.1.26.data → SinaTools-0.1.28.data}/data/sinatools/environment.yml +0 -0
  30. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/AUTHORS.rst +0 -0
  31. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/LICENSE +0 -0
  32. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/WHEEL +0 -0
  33. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/entry_points.txt +0 -0
  34. {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/top_level.txt +0 -0
@@ -58,7 +58,7 @@ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclam
58
58
 
59
59
  def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
60
60
  """
61
- This method receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in a CSV file.
61
+ This method tokenizes a corpus into words. It receives a directory and tokenizes all files within the input directory, as well as all files within its subdirectories. The results are stored in a single CSV file. The text of each file is split into sentences using the sentence_tokenizer module and then into words using a word tokenizer. Additionally, a set of ids (row_id, docs_sentence_word_id, global_sentence_id, sentence_id, word_position) is added to each row.
62
62
 
63
63
  Args:
64
64
  dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.
@@ -0,0 +1,667 @@
1
+
2
+ # The Imply algorithm takes two words as input and produces the matching tuple defined by (Words Matching).
3
+ # The matching between two words is defined as a tuple:
4
+ # <w1, w2, implication direction, distance, conflicts, verdict, preferredWord> .
5
+
6
+ from sinatools.utils.parser import arStrip
7
class Implication:
    """
    Compare two Arabic words and decide whether their diacritization is compatible.

    Two words are compatible when they have the same letters and no pair of
    diacritics on the same letter contradicts the other. For example (فَعل) and
    (فَعَل) are compatible: the first implies the second because it carries
    fewer diacritics. The comparison yields:

    * ``verdict``   -- "Compatible" or "Incompatible" (``"null"`` before comparison).
    * ``direction`` -- -3: one word is empty, diacritics could not be parsed or
      are malformed; -2: the bare letters differ; -1: conflicting diacritics;
      0: the words imply each other; 1: word1 implies word2; 2: word2 implies
      word1; 3: exactly equal.
    * ``distance``  -- sum of the per-letter entries of ``distanceTable``
      (3000 / 1000 / 101 are used for the -3 / -2 / -1 directions).
    * ``conflicts`` -- number of conflicting letters (only meaningful when the
      bare letters differ).

    Diacritic codes used as indexes into the two tables::

        0 none       1 sukun     2 fatha          3 kasra
        4 damma      5 fathatan  6 kasratan       7 dammatan
        8 shadda     9 shadda+fatha  10 shadda+kasra  11 shadda+damma
        12 initial bare alef         13 initial alef with hamza above
        14 initial alef with hamza below  15 initial alef with madda

    Args:
        inputWord1 (:obj:`str`): first word.
        inputWord2 (:obj:`str`): second word.
    """

    # Diacritic pair distance map, indexed [word1 code][word2 code].
    distanceTable = [
        [0, 0, 1, 1, 1, 1, 1, 1, 15, 16, 16, 16, 0, 0, 0, 0],
        [0, 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 0, 101, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 0, 101, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 0, 101, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 0, 101, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 101, 0, 101, 101, 101, 101, 101, 0, 0, 0, 0],
        [1, 101, 101, 101, 101, 101, 101, 0, 101, 101, 101, 101, 0, 0, 0, 0],
        [15, 101, 101, 101, 101, 101, 101, 101, 0, 1, 1, 1, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 0, 101, 101, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 101, 0, 101, 0, 0, 0, 0],
        [16, 101, 101, 101, 101, 101, 101, 101, 1, 101, 101, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 100, 100],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 100, 0, 100],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 100, 100, 0],
    ]

    # Implication direction map, indexed [word1 code][word2 code].
    # -1 conflict, 0 not comparable, 1 w1 implies w2, 2 w2 implies w1, 3 equal.
    directionTable = [
        [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 3, 1, 1, 1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, -1, 3, -1, 0, 0, 0, 0],
        [2, -1, -1, -1, -1, -1, -1, -1, 2, -1, -1, 3, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, -1, -1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -1, 3, -1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, -1, -1, 3],
    ]

    # Arabic code points are written as escapes so that the invisible
    # combining marks can be reviewed reliably.
    _ALEF = "\u0627"                                  # bare alef
    _HAMZATED_ALEFS = ("\u0622", "\u0623", "\u0625")  # madda, hamza above, hamza below
    _FATHA = "\u064E"
    _DAMMA = "\u064F"

    # Ordered text substitutions applied by get_diacritics_array. The order
    # matters: single marks become digits first, then digit pairs are merged.
    _MARK_REPLACEMENTS = (
        (" ", ""),
        ("\u0652", "1"),   # SUKUN
        ("\u064E", "2"),   # FATHA
        ("\u0650", "3"),   # KASRA
        ("\u064F", "4"),   # DAMMA
        ("\u064B", "5"),   # FATHATAN
        ("\u064D", "6"),   # KASRATAN
        ("\u064C", "7"),   # DAMMATAN
        ("\u0651", "8"),   # SHADDA
        ("11", "100"),     # SUKUN with SUKUN    (100 is rejected as a syntax error)
        ("12", "100"),     # SUKUN with FATHA
        ("13", "100"),     # SUKUN with KASRA
        ("14", "100"),     # SUKUN with DAMMA
        ("15", "100"),     # SUKUN with FATHATAN
        ("82", "9"),       # SHADDA with FATHA
        ("83", "10"),      # SHADDA with KASRA
        ("84", "11"),      # SHADDA with DAMMA
    )

    # Code inserted after an initial alef so its form takes part in the comparison.
    _INITIAL_ALEF_CODES = {
        "\u0627": "12",    # bare alef
        "\u0623": "13",    # alef with hamza above
        "\u0625": "14",    # alef with hamza below
        "\u0622": "15",    # alef with madda
    }

    # Class-level defaults, kept for backward compatibility. The mutable ones
    # are replaced by per-instance objects in __init__.
    word1, word2 = "", ""                     # the two (alef-normalized) words
    conflictFlags = [False for i in range(5)]  # flag i is set when direction i-1 was seen
    verdict = "null"                          # "Compatible" or "Incompatible"
    word1Undiac = ""                          # word1 without diacritics
    word2Undiac = ""                          # word2 without diacritics
    word1Diacritics = []                      # diacritic codes of word1
    word2Diacritics = []                      # diacritic codes of word2
    direction = -2147483648                   # arbitrary "not computed" marker
    distance = -2147483648                    # arbitrary "not computed" marker
    conflicts = -2147483648                   # arbitrary "not computed" marker
    lettersDirection = []                     # implication direction per letter

    def __init__(self, inputWord1, inputWord2):
        """Compare ``inputWord1`` with ``inputWord2`` and store the outcome on the instance."""
        # Give every instance its own mutable state instead of sharing the
        # class-level lists.
        self.conflictFlags = [False] * 5
        self.lettersDirection = []
        self.word1Diacritics = []
        self.word2Diacritics = []

        # Exactly one of the two words is empty: nothing to compare.
        if bool(inputWord1) != bool(inputWord2):
            self._mark_unparsable()
            return

        self.word1 = Implication.normalize_alef(inputWord1 or "")
        self.word2 = Implication.normalize_alef(inputWord2 or "")

        if self.word1 == self.word2:
            self._set_outcome("Compatible", 3, 0, 0)
            return

        try:
            self.word1Diacritics = Implication.get_diacritics_array(self.word1)
            self.word2Diacritics = Implication.get_diacritics_array(self.word2)
        except Exception:
            # A word that cannot be converted to diacritic codes is incomparable.
            self._mark_unparsable()
            return

        # Slot 0 describes the first letter, slot i + 1 the i-th diacritic.
        self.lettersDirection = [0] * (len(self.word1Diacritics) + 1)

        if (Implication.diacritics_syntax_error_in(self.word1Diacritics)
                or Implication.diacritics_syntax_error_in(self.word2Diacritics)):
            self._mark_unparsable()
            return

        # NOTE(review): arStrip is project-local; this code relies on the call
        # returning the bare letters of the word -- confirm that
        # diacs=False / shaddah=False is the intended flag combination.
        self.word1Undiac = arStrip(self.word1, diacs=False, shaddah=False)
        self.word2Undiac = arStrip(self.word2, diacs=False, shaddah=False)

        if (len(self.word1Undiac) == 1 and len(self.word2Undiac) == 1
                and self.word1Undiac == self.word2Undiac):
            # The same single letter is compatible whatever diacritic it carries.
            self._set_outcome("Compatible", 3, 0, 0)
        else:
            self.lettersDirection[0] = 3
            self.calculate_words_implication()

    def _set_outcome(self, verdict, direction, distance, conflicts):
        """Store the four result attributes in one place."""
        self.verdict = verdict
        self.direction = direction
        self.distance = distance
        self.conflicts = conflicts

    def _mark_unparsable(self):
        """Record the result used when the words cannot be compared at all."""
        self._set_outcome("Incompatible", -3, 3000, 0)

    def get_non_preferred_word(self, word1, word2):
        """
        Return the word that :meth:`get_preferred_word` does not select.

        Args:
            word1 (:obj:`str`): The first word.
            word2 (:obj:`str`): The second word.

        Returns:
            :obj:`str`: The non-preferred word. When only one word is non-empty
            that word is returned; ``None`` when both are empty or ``None``.
        """
        word1 = word1.strip() if word1 else word1
        word2 = word2.strip() if word2 else word2
        if not word1:
            return word2 if word2 else None
        if not word2:
            return word1
        preferred = self.get_preferred_word(word1, word2)
        return word2 if word1 == preferred else word1

    def get_preferred_word(self, word1, word2):
        """
        Return the preferred (more fully diacritized) of two compatible words.

        Args:
            word1 (:obj:`str`): The first word.
            word2 (:obj:`str`): The second word.

        Returns:
            :obj:`str`: The preferred word; ``""`` when the words are not
            compatible (negative direction or distance of 15 or more); the only
            non-empty word when the other one is empty; ``None`` when both are
            empty or ``None``.
        """
        word1 = word1.strip() if word1 else word1
        word2 = word2.strip() if word2 else word2
        if not word1:
            return word2 if word2 else None
        if not word2:
            return word1

        implication = Implication(word1, word2)
        direction = implication.get_direction()
        # Same acceptance threshold as get_verdict().
        if implication.get_distance() >= 15:
            return ""
        if direction in (0, 2):
            # The words imply each other, or word2 implies word1.
            return word1
        if direction == 1:
            # word1 implies word2, so word2 carries more information.
            return word2
        if direction == 3:
            # Equal words: keep word1 only when it ends with a fatha or damma.
            if word1.endswith((Implication._FATHA, Implication._DAMMA)):
                return word1
            return word2
        return ""

    @staticmethod
    def normalize_alef(word):
        """
        Normalize the alef spelling at the edges of ``word``.

        A final fathatan written before an alef or alef maksura is moved after
        the letter, and an initial alef wasla (U+0671) becomes a bare alef.

        Args:
            word (:obj:`str`): The input word to be normalized.

        Returns:
            :obj:`str`: The normalized word.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            Implication.normalize_alef("\u0671\u0644")  # returns "\u0627\u0644"
        """
        # Tanween placed before the final letter is moved after it, because
        # both spellings denote the same word.
        if word.endswith("\u064B\u0649"):
            word = word[:-2] + "\u0649\u064B"
        if word.endswith("\u064B\u0627"):
            word = word[:-2] + "\u0627\u064B"
        # Replace an initial alef wasla with a bare alef.
        if word.startswith("\u0671"):
            word = Implication._ALEF + word[1:]
        return word

    @staticmethod
    def diacritics_syntax_error_in(diacriticsArray):
        """
        Tell whether a list of diacritic codes contains an invalid code.

        Args:
            diacriticsArray (:obj:`list`): Integer diacritic codes, one per
                letter, as produced by :meth:`get_diacritics_array`.

        Returns:
            :obj:`bool`: True if the last code is not a valid final diacritic or
            any other code is not a valid mid-word diacritic. False otherwise,
            including when the list is empty or cannot be inspected.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            Implication.diacritics_syntax_error_in([2, 2, 0])  # returns False
            Implication.diacritics_syntax_error_in([5, 2, 0])  # returns True (tanween mid-word)
        """
        try:
            if Implication.wrong_end_diacritic(diacriticsArray[-1]):
                return True
            return any(Implication.wrong_middle_iacritic(code)
                       for code in diacriticsArray[:-1])
        except (IndexError, TypeError):
            # Empty or non-numeric input is deliberately reported as "no error".
            return False

    @staticmethod
    def wrong_end_diacritic(diac):
        """
        Tell whether ``diac`` is an invalid code for the last letter of a word.

        Args:
            diac (:obj:`int`): The diacritic code to be checked.

        Returns:
            :obj:`bool`: False for codes 0-11 and for 85-87 (shadda followed by
            fathatan, kasratan or dammatan); True for every other value.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            Implication.wrong_end_diacritic(0)   # returns False
            Implication.wrong_end_diacritic(85)  # returns False
            Implication.wrong_end_diacritic(12)  # returns True
        """
        return not (0 <= diac <= 11 or 85 <= diac <= 87)

    @staticmethod
    def wrong_middle_iacritic(diac):
        """
        Tell whether ``diac`` is an invalid code for a non-final letter.

        Args:
            diac (:obj:`int`): The diacritic code to be checked.

        Returns:
            :obj:`bool`: False for codes 0-4 and 8-15; True for every other
            value (the tanween codes 5-7 and anything above 15).
        """
        return not (0 <= diac <= 4 or 8 <= diac <= 15)

    # Correctly spelled alias; the historical name is kept for existing callers.
    wrong_middle_diacritic = wrong_middle_iacritic

    def calculate_words_implication(self):
        """
        Compute verdict, direction, distance and conflicts for the two words.

        Expects ``word1Undiac``, ``word2Undiac``, the two diacritic arrays and
        ``lettersDirection`` to be populated, as done by ``__init__``.

        Returns:
            None
        """
        self.verdict = "Incompatible"
        self.direction = -2
        self.distance = 1000

        if not self.equal_words():
            # equal_words() has removed the first letter from both bare words.
            if len(self.word1Undiac) == 0 and len(self.word2Undiac) == 0:
                if self.word1 == self.word2:
                    self.conflicts = 0
                    self.distance = 0
                    self.direction = 3
                else:
                    self.conflicts = 1
                    self.distance = 1000
                    self.direction = -2
            else:
                self.conflicts = max(len(self.word1Undiac), len(self.word2Undiac))
        elif self.calculate_letters_implication():
            self.direction = self.calculate_direction()
            if self.direction == -1:
                self.distance = 101
            else:
                self.verdict = "Compatible"
        else:
            # The diacritic arrays could not be aligned letter by letter.
            self.direction = -3
            self.distance = 3000
            self.conflicts = 0

    def equal_words(self):
        """
        Tell whether the two bare words have the same letters.

        An initial bare alef is accepted as matching an initial alef with
        hamza or madda; in that case the direction of the first letter and the
        matching conflict flag are recorded.

        Side effect: the first letter is removed from ``word1Undiac`` and
        ``word2Undiac``.

        Returns:
            :obj:`bool`: True if the words are equal, False otherwise.
        """
        first1 = self.word1Undiac[0:1]
        first2 = self.word2Undiac[0:1]
        self.word1Undiac = self.word1Undiac[1:]
        self.word2Undiac = self.word2Undiac[1:]

        if self.word1Undiac != self.word2Undiac:
            return False
        if first1 == first2:
            return True

        if first1 == Implication._ALEF and first2 in Implication._HAMZATED_ALEFS:
            self.lettersDirection[0] = 1  # w1 implies w2
            self.conflictFlags[2] = True
            return True
        if first1 in Implication._HAMZATED_ALEFS and first2 == Implication._ALEF:
            self.lettersDirection[0] = 2  # w2 implies w1
            self.conflictFlags[3] = True
            return True
        return False

    def _record_pair(self, slot, code1, code2):
        """Accumulate direction, conflict flag and distance for one diacritic pair."""
        pair_direction = self.directionTable[code1][code2]
        self.lettersDirection[slot] = pair_direction
        # Directions range from -1 to 3, hence the +1 offset into the flags.
        self.conflictFlags[pair_direction + 1] = True
        self.distance = self.distance + self.distanceTable[code1][code2]

    def calculate_letters_implication(self):
        """
        Compare the two diacritic arrays letter by letter.

        Updates ``lettersDirection``, ``conflictFlags`` and ``distance`` from
        ``directionTable`` and ``distanceTable``. The last letter only
        contributes when one of the words carries a shadda (code 8) on it.

        Returns:
            :obj:`bool`: True when the comparison was carried out; False when
            the arrays are empty or differ in length and cannot be aligned.
        """
        self.distance = 0
        if (not self.word1Diacritics
                or len(self.word1Diacritics) != len(self.word2Diacritics)):
            return False

        last = len(self.word1Diacritics) - 1
        for i in range(last):
            self._record_pair(i + 1, self.word1Diacritics[i], self.word2Diacritics[i])

        code1 = int(self.word1Diacritics[last])
        code2 = int(self.word2Diacritics[last])
        if code1 == 8 or code2 == 8:
            self._record_pair(len(self.lettersDirection) - 1, code1, code2)
        return True

    def calculate_direction(self):
        """
        Derive the overall direction from the conflict flags and reset ``conflicts`` to 0.

        Returns:
            :obj:`int`: The direction of compatibility:
                -1: Incompatible-diacritics
                0: Compatible-imply each other
                1: Compatible-w1 implies w2
                2: Compatible-w2 implies w1
                3: Compatible-exactly equal
                -2147483648: no flag was set
        """
        self.conflicts = 0
        if self.conflictFlags[0]:
            return -1
        w1_implies_w2 = self.conflictFlags[2]
        w2_implies_w1 = self.conflictFlags[3]
        if w1_implies_w2 and w2_implies_w1:
            return 0
        if w1_implies_w2:
            return 1
        if w2_implies_w1:
            return 2
        if self.conflictFlags[4]:
            return 3
        return -2147483648

    @staticmethod
    def get_diacritics_array(word):
        """
        Convert the diacritics of ``word`` into a list of integer codes.

        The list has one entry per letter (0 when the letter has no diacritic).
        A word starting with an alef gets one extra leading entry (12-15)
        encoding the alef form. The entry of the last letter is reduced to 8
        when it carries a shadda and to 0 when it carries any other single
        diacritic.

        Args:
            word (:obj:`str`): The word with diacritics.

        Returns:
            :obj:`list`: The diacritic codes as integers.

        Raises:
            ValueError: If the word starts with a digit or a diacritic.
            IndexError: If the word is empty once spaces are removed.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            Implication.get_diacritics_array("\u0641\u064E\u0639\u064E\u0644")  # returns [2, 2, 0]
            Implication.get_diacritics_array("\u0627\u0643\u0644")              # returns [12, 0, 0, 0]
        """
        # Replace diacritics by digits (order-sensitive, see _MARK_REPLACEMENTS).
        for old, new in Implication._MARK_REPLACEMENTS:
            word = word.replace(old, new)

        # Encode the form of an initial alef and normalize it to a bare alef.
        alef_code = Implication._INITIAL_ALEF_CODES.get(word[0:1])
        if alef_code is not None:
            word = Implication._ALEF + alef_code + "," + word[1:]

        if word[0:1].isdigit():
            # A word must not begin with a diacritic.
            raise ValueError("Sorry, First char is digit")

        # Every letter becomes a separator; only the digit codes remain.
        word = "".join(ch if (ch.isdigit() or ch == ",") else "," for ch in word)
        # A final letter without a diacritic needs an explicit empty slot.
        if word[-1:] == ",":
            word = word + ","
        while ",," in word:
            word = word.replace(",,", ",0,")  # NO-DIACRITIC

        # Drop the separator produced by the first letter.
        word = word[1:]
        diacritics = word.split(",")
        if "" in diacritics:
            diacritics.remove("")

        last = diacritics[-1]
        if last in ("8", "9", "10", "11", "85", "86", "87"):
            # Any shadda combination on the last letter counts as a plain shadda.
            diacritics[-1] = "8"
        elif last in ("1", "2", "3", "4", "5", "6", "7"):
            # The case ending of the last letter is ignored.
            diacritics[-1] = "0"

        return [int(code) for code in diacritics]

    @staticmethod
    def get_letters_array(word):
        """
        Return the letters of ``word`` as a list.

        Args:
            word (:obj:`str`): The word from which to extract the letters.

        Returns:
            :obj:`list`: The characters of the string returned by ``arStrip``.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            letters = Implication.get_letters_array(word)
        """
        # NOTE(review): relies on the project-local arStrip; see __init__.
        word = arStrip(word, diacs=False, shaddah=False)
        return list(word)

    def get_direction(self):
        """Return the implication direction code."""
        return self.direction

    def get_distance(self):
        """Return the diacritic distance."""
        return self.distance

    def get_conflicts(self):
        """Return the number of conflicts."""
        return self.conflicts

    def get_word1(self):
        """Return the first word as stored by the constructor."""
        return self.word1

    def get_word2(self):
        """Return the second word as stored by the constructor."""
        return self.word2

    def get_verdict(self):
        """
        Return the result of the comparison between the two words.

        Returns:
            :obj:`str`: "Same" when the direction is non-negative and the
            distance is below 15, otherwise "Different". The value is also
            stored in ``self.result``.

        **Example:**

        .. highlight:: python
        .. code-block:: python

            implication = Implication(w1, w2)
            result = implication.get_verdict()
        """
        if self.get_direction() >= 0 and self.get_distance() < 15:
            self.result = "Same"
        else:
            self.result = "Different"
        return self.result

    def toString(self):
        """Return word1, word2, verdict, direction, distance and conflicts, tab-separated."""
        return "\t".join([
            self.word1,
            self.word2,
            str(self.verdict),
            str(self.direction),
            str(self.distance),
            str(self.conflicts),
        ])
sinatools/wsd/__init__.py CHANGED
@@ -4,7 +4,7 @@ from sinatools.DataDownload import downloader
4
4
  import os
5
5
 
6
6
  glosses_dic = {}
7
- filename = 'glosses_dic.pickle'
7
+ filename = 'one_gram.pickle'
8
8
  path =downloader.get_appdatadir()
9
9
  file_path = os.path.join(path, filename)
10
10
  with open(file_path, 'rb') as f: