SinaTools-0.1.11-py2.py3-none-any.whl → SinaTools-0.1.12-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/METADATA +2 -3
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/RECORD +47 -26
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/entry_points.txt +7 -3
- sinatools/CLI/DataDownload/download_files.py +0 -10
- sinatools/CLI/ner/corpus_entity_extractor.py +6 -6
- sinatools/CLI/ner/entity_extractor.py +18 -42
- sinatools/CLI/utils/arStrip.py +8 -8
- sinatools/CLI/utils/implication.py +0 -8
- sinatools/CLI/utils/jaccard.py +5 -14
- sinatools/CLI/utils/remove_latin.py +2 -2
- sinatools/CLI/utils/text_dublication_detector.py +25 -0
- sinatools/VERSION +1 -1
- sinatools/morphology/ALMA_multi_word.py +14 -16
- sinatools/morphology/__init__.py +32 -31
- sinatools/ner/__init__.py +28 -2
- sinatools/ner/data/__init__.py +1 -0
- sinatools/ner/data/datasets.py +146 -0
- sinatools/ner/data/transforms.py +118 -0
- sinatools/ner/data.py +124 -0
- sinatools/ner/data_format.py +124 -0
- sinatools/ner/datasets.py +146 -0
- sinatools/ner/entity_extractor.py +34 -54
- sinatools/ner/helpers.py +86 -0
- sinatools/ner/metrics.py +69 -0
- sinatools/ner/nn/BaseModel.py +22 -0
- sinatools/ner/nn/BertNestedTagger.py +34 -0
- sinatools/ner/nn/BertSeqTagger.py +17 -0
- sinatools/ner/nn/__init__.py +3 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -0
- sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- sinatools/ner/trainers/BertTrainer.py +163 -0
- sinatools/ner/trainers/__init__.py +3 -0
- sinatools/ner/transforms.py +119 -0
- sinatools/semantic_relatedness/__init__.py +20 -0
- sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- sinatools/synonyms/__init__.py +18 -0
- sinatools/synonyms/synonyms_generator.py +192 -0
- sinatools/utils/text_dublication_detector.py +110 -0
- sinatools/wsd/__init__.py +11 -0
- sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
- sinatools/{salma → wsd}/wsd.py +1 -1
- sinatools/CLI/salma/salma_tools.py +0 -68
- sinatools/salma/__init__.py +0 -12
- sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/top_level.txt +0 -0
- /sinatools/{salma → wsd}/settings.py +0 -0
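The substantive change in this release is the move of the word sense disambiguation (SALMA) code from `sinatools.salma` to `sinatools.wsd`, with the public entry point renamed from `SALMA` to `disambiguate`. A minimal usage sketch, inferred from the renamed paths and the docstring in the diff below (confirm the exact signature against the released 0.1.12 package):

```python
# Sketch of the 0.1.12 call path, based on the renamed module shown in this diff.
# The previous entry point lived in sinatools.salma.views (removed in 0.1.12).
from sinatools.wsd.disambiguator import disambiguate

# Per the diff, inputs longer than 500 characters return ["Input is too long"].
result = disambiguate("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
print(result)  # a list of dicts with Concept_id, word, Undiac_lemma and Diac_lemma
```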
sinatools/{salma/views.py → wsd/disambiguator.py} RENAMED

@@ -1,57 +1,136 @@
 import json
-from sinatools.
-from sinatools.
-from sinatools.
+from sinatools.wsd import settings
+from sinatools.wsd.wsd import normalizearabert
+from sinatools.wsd.wsd import GlossPredictor
 from sinatools.utils.parser import arStrip
 from sinatools.utils.tokenizers_words import simple_word_tokenize
 from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
-
+from sinatools.ner.entity_extractor import extract
+
+
+def distill_entities(entities):
+    list_output = list()
+
+    temp_entities = sortTags(entities)
+
+    temp_list = list()
+
+    temp_list.append(["", "", 0, 0])
+    word_position = 0
+
+    for entity in temp_entities:
+        counter_tag = 0
+        for tag in str(entity[1]).split():
+            if counter_tag >= len(temp_list):
+                temp_list.append(["", "", 0, 0])
+
+            if "O" == tag and word_position != 0:
+                for j in range(0, len(temp_list)):
+                    if temp_list[j][1] != "":
+                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                    temp_list[j][0] = ""
+                    temp_list[j][1] = ""
+                    temp_list[j][2] = word_position
+                    temp_list[j][3] = word_position
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                if temp_list[counter_tag][1] != "":
+                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                temp_list[counter_tag][0] = str(entity[0]) + " "
+                temp_list[counter_tag][1] = str(tag).split("-")[1]
+                temp_list[counter_tag][2] = word_position
+                temp_list[counter_tag][3] = word_position
+
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                for j in range(counter_tag,len(temp_list)):
+                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                        temp_list[j][0] += str(entity[0]) + " "
+                        temp_list[j][3] += 1
+                        break
+                    else:
+                        if temp_list[j][1] != "":
+                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                        temp_list[j][0] = ""
+                        temp_list[j][1] = ""
+                        temp_list[j][2] = word_position
+                        temp_list[j][3] = word_position
+            counter_tag += 1
+        word_position += 1
+    for j in range(0, len(temp_list)):
+        if temp_list[j][1] != "":
+            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+    return sorted(list_output, key=lambda x: (x[2]))
+
+
+def sortTags(entities):
+    temp_entities = entities
+    temp_counter = 0
+    for entity in temp_entities:
+        tags = entity[1].split()
+        for tag in tags:
+            if temp_counter != 0:
+                if "I-" == tag[0:2]:
+                    counter_of_this_tag = 0
+                    counter_of_previous_tag = 0
+                    for word in tags:
+                        if tag.split("-")[1] in word:
+                            counter_of_this_tag+=1
+                    for word in temp_entities[temp_counter-1][1].split():
+                        if tag.split("-")[1] in word:
+                            counter_of_previous_tag+=1
+                    if counter_of_previous_tag > counter_of_this_tag:
+                        tags.append("I-"+tag.split("-")[1])
+        tags.sort()
+        tags.reverse()
+        if temp_counter != 0:
+            this_tags = tags
+            previous_tags = temp_entities[temp_counter - 1][1].split()
+            sorted_tags = list()
+
+            if "O" not in this_tags and "O" not in previous_tags:
+                index = 0
+                for i in previous_tags:
+                    j = 0
+                    while this_tags and j < len(this_tags):
+                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                            sorted_tags.insert(index, this_tags.pop(j))
+                            break
+                        elif this_tags[j][0:2] == "B-":
+                            break
+                        j += 1
+                    index += 1
+                sorted_tags += this_tags
+                tags = sorted_tags
+        str_tag = " "
+        str_tag = str_tag.join(tags)
+        str_tag = str_tag.strip()
+        temp_entities[temp_counter][1] = str_tag
+        temp_counter += 1
+    return temp_entities
 
 def delete_form_list(position, word_lemma):
-    #"""
-    #Remove specific elements from the word_lemma list based on the given position.
-    #
-    #Parameters:
-    #position (int): The current position in the input sentence.
-    #word_lemma (list): List of word lemma details.
-    #
-    #Returns:
-    #list: Updated word_lemma list with the specific elements removed.
-    #list: The list of removed elements.
-    #int: The new position in the input sentence.
-    #"""
     tmp_word_lemma = []
     output = []
     for wordLemma in word_lemma:
-        if position == int(wordLemma[2]):
+        if position == int(wordLemma[2]):
             word = wordLemma[0]
             gloss = wordLemma[1]
             position = int(wordLemma[3])
             concept_count = int(wordLemma[4])
             undiac_multi_word_lemma = wordLemma[5]
             multi_word_lemma = wordLemma[6]
-            output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])
+            output.append([word, gloss, concept_count, undiac_multi_word_lemma, multi_word_lemma])
         elif position < int(wordLemma[2]):
             tmp_word_lemma.append(wordLemma)
     return tmp_word_lemma, output, position
 
 def find_two_word_lemma(input_sentence):
-    #"""
-    #Find two-word lemmas in the input sentence using the ALMA_multi_word function.
-    #
-    #Parameters:
-    #input_sentence (list): Tokenized input sentence.
-    #
-    #Returns:
-    #list: List of details of found two-word lemmas.
-    #"""
     i = 0
     output = []
     length = len(input_sentence)
     while i < length - 1:
         two_grams = input_sentence[i] +" "+ input_sentence[i + 1]
-        data = ALMA_multi_word(two_grams)
+        data = ALMA_multi_word(two_grams, 2)
         try :
             glosses_list = []
             concept_count = 0
@@ -62,12 +141,10 @@ def find_two_word_lemma(input_sentence):
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
 
-            # found two_grams
-            #found_2Word_lemma = [two_grams,data[0]['glosses'], i, i + 1,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_2Word_lemma)
             i = i + 1
-        except:
+        except:
             i = i + 1
     return output
 
@@ -78,7 +155,7 @@ def find_three_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 2:
         three_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2]
-        data = ALMA_multi_word(three_grams)
+        data = ALMA_multi_word(three_grams, 3)
         try:
             glosses_list = []
             concept_count = 0
@@ -89,7 +166,6 @@ def find_three_word_lemma(input_sentence):
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
 
-            #found_3Word_lemma = [three_grams, data[0]['glosses'], i, i + 2,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_3Word_lemma)
             i = i + 1
@@ -103,7 +179,7 @@ def find_four_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 3:
         four_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3]
-        data = ALMA_multi_word(four_grams)
+        data = ALMA_multi_word(four_grams, 4)
         try:
             glosses_list = []
             concept_count = 0
@@ -113,7 +189,6 @@ def find_four_word_lemma(input_sentence):
                 value = settings.glosses_dic[lemma_id]
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
-            #found_4Word_lemma = [four_grams, data[0]['glosses'], i, i + 3,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_4Word_lemma)
             i = i + 1
@@ -128,7 +203,7 @@ def find_five_word_lemma(input_sentence):
     length = len(input_sentence)
     while i < length - 4:
         five_grams = input_sentence[i] +" "+ input_sentence[i + 1] + " "+ input_sentence[i + 2] + " "+ input_sentence[i + 3] + " "+ input_sentence[i + 4]
-        data = ALMA_multi_word(five_grams)
+        data = ALMA_multi_word(five_grams, 5)
         try:
             glosses_list = []
             concept_count = 0
@@ -138,7 +213,6 @@ def find_five_word_lemma(input_sentence):
                 value = settings.glosses_dic[lemma_id]
                 glosses_list.append(json.loads(value[1]))
                 concept_count = concept_count + value[0]
-            #found_5Word_lemma = [five_grams, data[0]['glosses'], i, i + 4,data[0]['concept_count'], data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_5Word_lemma)
             i = i + 1
@@ -146,19 +220,16 @@ def find_five_word_lemma(input_sentence):
         i = i + 1
     return output
 
+def jsons_to_list_of_lists(json_list):
+    return [[d['token'], d['tags']] for d in json_list]
+
 def find_named_entities(string):
-    #"""
-    # Find named entities in the input string using a NER tool.
-    #
-    # Parameters:
-    # string (str): Input string.
-    #
-    # Returns:
-    # list: List of details of found named entities.
-    # """
     found_entities = []
-
-
+
+    ner_entites = extract(string)
+    list_of_entites = jsons_to_list_of_lists(ner_entites)
+    entites = distill_entities(list_of_entites)
+
     tag_gloss = {
         "PERS": "اسم شخص",
         "ORG": "اسم مؤسسة",
@@ -196,54 +267,26 @@ def find_named_entities(string):
 
 def find_glosses_using_ALMA(word):
 
-    data = analyze(word)
+    data = analyze(word, language ='MSA', task ='lemmatization', flag="1")
     Diac_lemma = ""
     pos = ""
     Undiac_lemma = ""
     glosses = []
-    Diac_lemma = data[0][
-    pos = data[0][
+    Diac_lemma = data[0]["lemma"]
+    pos = data[0]["pos"]
     Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False) # Remove diacs , smallDiacs , shaddah , digit , alif , specialChars
-    #"""
-    # Find glosses for the given word using the ALMA tool.
-    #
-    # Parameters:
-    # word (str): Input word.
-    #
-    # Returns:
-    # tuple: Details of the word including glosses, lemmas, and POS.
-    # """
     ids = []
     glosses_list = []
    	concept_count = 0
-
-
-
-
-
-
-    value = settings.glosses_dic[lemma_id]
-    glosses_list.append(json.loads(value[1]))
-    concept_count = concept_count + value[0]
-
-    #glosses = data[0][4]
-    #concept_count = data[0][3]
+    lemma_id = data[0]["lemma_id"]
+    if lemma_id in settings.glosses_dic.keys():
+        value = settings.glosses_dic[lemma_id]
+        glosses_list.append(json.loads(value[1]))
+        concept_count = concept_count + value[0]
+
     return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses
 
 def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
-    #"""
-    # Disambiguate glosses using the SALMA tool.
-    #
-    # Parameters:
-    # glosses (list): List of glosses.
-    # Diac_lemma (str): Diacritic lemma of the word.
-    # Undiac_lemma (str): Undiacritic lemma of the word.
-    # word (str): The word being analyzed.
-    # sentence (str): The sentence containing the word.
-    #
-    # Returns:
-    # dict: Disambiguated gloss details.
-    # """
     word = normalizearabert(word)
     glosses_dictionary = {}
     if glosses != None:
@@ -253,7 +296,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
 
             my_json = {}
             my_json['Concept_id'] = concept_id
-
+            # my_json['Gloss'] = gloss
             my_json['word'] = word
             my_json['Undiac_lemma'] = Undiac_lemma
             my_json['Diac_lemma'] = Diac_lemma
@@ -409,7 +452,7 @@ def WSD(sentence):
     return results
 
 
-def 
+def disambiguate(sentence):
     """
     This method disambiguate words within a sentence.
 
@@ -424,15 +467,14 @@ def SALMA(sentence):
     .. highlight:: python
     .. code-block:: python
 
-        from sinatools.
-
-        print(
+        from sinatools.wsd.disambiguator import disambiguate
+        result = disambiguate("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
+        print(result)
 
     #output
     [
     {
     "Concept_id": "303019218",
-    "Gloss": "ذهَب إلى عملِه:- قصَده، توجَّه إليه \"ذهَب إلى الجامعة/ بيروت - اذهَب إلى أبيك والتمس منه الصفح - ذهَب إلى قول فلان أخذ به - <اذْهَبْ إِلَى فِرْعَوْنَ إِنَّهُ طَغَى> طه/ 24 \". ذهَب رأسًا إليه",
     "word": "ذهبت",
     "Undiac_lemma": "ذهب",
     "Diac_lemma": "ذَهَبَ۪ 1"
@@ -444,7 +486,6 @@ def SALMA(sentence):
     },
     {
     "word": "جامعة بيرزيت",
-    "Gloss": جامعة فلسطينية تقع في بلدة بيرزيت، قرب مدينة رام الله، ويعود تاريخها إلى عام 1924 عندما تأسست كمدرسة ابتدائية ثم أصبحت جامعة عام 1975,
     "Concept_id": "334000099",
     "Diac_lemma": جامِعَة بيرزَيت,
     "Undiac_lemma": "جامعة بيرزيت"
@@ -452,8 +493,8 @@ def SALMA(sentence):
     ]
     """
     if len(sentence) > 500:
-        content =
+        content = ["Input is too long"]
         return content
     else:
         results = WSD(sentence)
-        return
+        return results
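In the new `find_named_entities` path above, the output of `extract(string)` is converted by `jsons_to_list_of_lists` into `[[token, tags], ...]` pairs and then merged by `distill_entities` into spans of the form `[text, tag, start_word, end_word]`. A hand-traced illustration of the flat, single-tag-per-token case (the tokens and tags below are made-up inputs, not taken from the package's own tests):

```python
# Illustration of distill_entities on the [[token, tags], ...] shape produced by
# jsons_to_list_of_lists in the diff above. Hand-traced, not run against the
# released 0.1.12 package.
from sinatools.wsd.disambiguator import distill_entities

entities = [
    ["ذهبت", "O"],
    ["إلى", "O"],
    ["جامعة", "B-ORG"],
    ["بيرزيت", "I-ORG"],
]

spans = distill_entities(entities)
# Expected: [["جامعة بيرزيت", "ORG", 2, 3]]
# The B-ORG/I-ORG run is merged into one span covering word positions 2 to 3.
print(spans)
```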
sinatools/{salma → wsd}/wsd.py RENAMED

sinatools/CLI/salma/salma_tools.py DELETED

@@ -1,68 +0,0 @@
-"""
-SALMA CLI
-
-About:
-------
-The SALMA command line interface (CLI) is a tool designed to utilize the SALMA function for processing Arabic sentences. This CLI allows users to input an Arabic sentence and receive a structured response that includes the processing result of the SALMA function.
-
-Usage:
-------
-Below is the usage information that can be generated by running the command with the --help option.
-
-.. code-block:: none
-
-    salma --text=TEXT
-    salma --file=INPUT_FILE
-
-Options:
---------
-.. code-block:: none
-
-    --text
-        The Arabic sentence to be processed by the SALMA function.
-    --file
-        The text file to be processed by the SALMA function.
-
-Examples:
----------
-.. code-block:: none
-
-    salma --text "your Arabic sentence here"
-    salma --file "path/to/your/file.txt"
-
-Note:
------
-
-.. code-block:: none
-
-    - The input sentence should be provided in Arabic.
-    - It is recommended that the length of the input sentence does not exceed 500 characters to ensure optimal performance and accurate results.
-
-"""
-
-import argparse
-import json
-from sinatools.salma.views import SALMA
-from sinatools.utils.readfile import read_file
-
-def main():
-    parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
-
-    parser.add_argument('--text', type=str, help='Input sentence to process')
-    parser.add_argument('--file', type=str, help='File containing the Arabic sentence to process')
-
-    args = parser.parse_args()
-
-    if args.text is None and args.file is None:
-        print("Either --text or --file argument must be provided.")
-        return
-
-    text_content = args.text if args.text else " ".join(read_file(args.file))
-    result = SALMA(text_content)
-    print(json.dumps(result, ensure_ascii=False, indent=4))
-
-if __name__ == "__main__":
-    main()
-
-#salma --text "your Arabic sentence here"
-#salma --file "path/to/your/file.txt"
sinatools/salma/__init__.py DELETED

@@ -1,12 +0,0 @@
-from sinatools.salma import settings
-import pickle
-from sinatools.DataDownload import downloader
-import os
-
-#filename = 'glosses_dic.pickle'
-#path =downloader.get_appdatadir()
-#file_path = os.path.join(path, filename)
-#with open(file_path, 'rb') as f:
-#    #Load the serialized data from the file
-#    settings.glosses_dic = pickle.load(f)
-settings.glosses_dic = {}
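The deleted `sinatools/salma/__init__.py` left `settings.glosses_dic` as an empty dict and kept the pickle-loading code commented out; per the file list, WSD package initialisation now lives in `sinatools/wsd/__init__.py` (+11 lines), which this diff does not show. For reference, a sketch of that loader reconstructed from the commented-out lines in the deleted file (the pickle filename and the `wsd` import path are assumptions):

```python
# Reconstructed from the commented-out loader in the deleted salma/__init__.py.
# Not the actual 0.1.12 sinatools/wsd/__init__.py, which is not shown in this diff.
import os
import pickle

from sinatools.DataDownload import downloader
from sinatools.wsd import settings

filename = 'glosses_dic.pickle'        # filename used in the old commented-out code
path = downloader.get_appdatadir()     # SinaTools data directory
file_path = os.path.join(path, filename)

with open(file_path, 'rb') as f:
    # glosses_dic maps lemma_id -> (concept_count, glosses_json), per the lookups
    # in find_glosses_using_ALMA above.
    settings.glosses_dic = pickle.load(f)
```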
sinatools/utils/utils.py DELETED

Files without content changes: {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml, the AUTHORS.rst, LICENSE, WHEEL and top_level.txt dist-info files, and /sinatools/{salma → wsd}/settings.py (all listed above with +0 -0).