SinaTools 0.1.33__py2.py3-none-any.whl → 0.1.34__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/METADATA +1 -1
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/RECORD +13 -13
- sinatools/CLI/DataDownload/download_files.py +11 -9
- sinatools/DataDownload/downloader.py +34 -3
- sinatools/VERSION +1 -1
- sinatools/relations/relation_extractor.py +2 -2
- sinatools/wsd/disambiguator.py +90 -14
- {SinaTools-0.1.33.data → SinaTools-0.1.34.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/top_level.txt +0 -0
{SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.33
+Version: 0.1.34
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
```
{SinaTools-0.1.33.dist-info → SinaTools-0.1.34.dist-info}/RECORD CHANGED
```diff
@@ -1,10 +1,10 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.34.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=hygBh9__JFOajJA1gAKoJF_AUzBnbP5eCrBYLp3dwDI,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=u_DFXbHcIU_4Ub5Y0cL9_p1hL8h6LLWPemn9Al-XFgc,2603
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
 sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
 sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
@@ -20,7 +20,7 @@ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1
 sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
 sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
 sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/DataDownload/downloader.py,sha256=
+sinatools/DataDownload/downloader.py,sha256=VdUNgSqMKz1J-DuQD_eS1U2KWqEpy94WlSJ0pPODLig,7833
 sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
 sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
 sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -97,7 +97,7 @@ sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh
 sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
 sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
 sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
-sinatools/relations/relation_extractor.py,sha256=
+sinatools/relations/relation_extractor.py,sha256=UuDlaaR0ch9BFv4sBF1tr7P-P9xq8oRZF41tAze6_ok,9751
 sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
 sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
 sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
@@ -115,13 +115,13 @@ sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
 sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
 sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=
+sinatools/wsd/disambiguator.py,sha256=9ottQn_WwOFX5Trr0Rpg66-Jpaln5yJduFqP6cdOOBA,22616
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+SinaTools-0.1.34.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.34.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.34.dist-info/METADATA,sha256=lzqCZL8XdEQ2ZqcXH5WsoUmLBwv9TklIItPwCB0MqKc,3267
+SinaTools-0.1.34.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.34.dist-info/entry_points.txt,sha256=-YGM-r0_UtNPnI0C4UcK1ptrpwFZpUhxdy2qHkehNCo,1303
+SinaTools-0.1.34.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.34.dist-info/RECORD,,
```
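Each RECORD line has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (the wheel RECORD convention). A quick sketch for recomputing one entry's digest:

```python
# Sketch: recompute a wheel RECORD digest (urlsafe base64 of SHA-256, "=" padding stripped).
import base64
import hashlib

def record_digest(path):
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# record_digest("sinatools/VERSION") should match the hash in the RECORD line above.
```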
sinatools/CLI/DataDownload/download_files.py CHANGED
```diff
@@ -34,6 +34,7 @@ import argparse
 from sinatools.DataDownload.downloader import download_file
 from sinatools.DataDownload.downloader import download_files
 from sinatools.DataDownload.downloader import get_appdatadir
+from sinatools.DataDownload.downloader import download_folder_from_hf
 from sinatools.DataDownload.downloader import urls
 
 
@@ -51,15 +52,16 @@ def main():
     for file in args.files:
         print("file: ", file)
         if file == "wsd":
-            download_file(urls["morph"])
-            download_file(urls["ner"])
-            download_file(urls["wsd_model"])
-
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
-            download_file(urls["
+            #download_file(urls["morph"])
+            #download_file(urls["ner"])
+            #download_file(urls["wsd_model"])
+            download_folder_from_hf("SinaLab/ArabGlossBERT", "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
+            #download_file(urls["wsd_tokenizer"])
+            #download_file(urls["one_gram"])
+            #download_file(urls["five_grams"])
+            #download_file(urls["four_grams"])
+            #download_file(urls["three_grams"])
+            #download_file(urls["two_grams"])
         elif file == "synonyms":
             download_file(urls["graph_l2"])
             download_file(urls["graph_l3"])
```
sinatools/DataDownload/downloader.py CHANGED
```diff
@@ -8,8 +8,8 @@ import tarfile
 urls = {
     'morph': 'https://sina.birzeit.edu/lemmas_dic.pickle',
     'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
-    'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
-    'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
+    # 'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
+    # 'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
     'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
     'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
     'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
@@ -184,4 +184,35 @@ def download_files():
     None
     """
     for url in urls.values():
-        download_file(url)
+        download_file(url)
+
+
+def download_folder_from_hf(repo_url, folder_name):
+
+    # Hugging Face API to fetch files from the repository
+    api_url = f"https://huggingface.co/api/models/{repo_url}/tree/main/{folder_name}"
+
+    # Make the request to get the folder structure
+    response = requests.get(api_url)
+    if response.status_code != 200:
+        print(f"Failed to fetch folder contents. Status code: {response.status_code}")
+        return
+
+    folder_content = response.json()
+
+    # Download each file in the folder
+    for file_info in folder_content:
+        file_name = file_info["path"]
+        file_url = f"https://huggingface.co/{repo_url}/resolve/main/{file_name}"
+
+        # Download the file and save it to the output directory
+        file_response = requests.get(file_url)
+        if file_response.status_code == 200:
+            # Create any necessary directories
+            output_file_path = os.path.join(get_appdatadir(), file_name)
+            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+            with open(output_file_path, 'wb') as f:
+                f.write(file_response.content)
+            print(f"Downloaded: {file_name}")
+        else:
+            print(f"Failed to download {file_name}. Status code: {file_response.status_code}")
```
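The new `download_folder_from_hf` reads each file fully into memory via `response.content` before writing it out, which is expensive for large model weights. A hedged sketch of a streaming variant against the same Hugging Face endpoints (`dest_dir` and the 1 MiB `chunk_size` are illustrative choices, not part of SinaTools):

```python
# Sketch only: stream each file to disk instead of buffering it in memory.
import os
import requests

def download_folder_from_hf_streaming(repo_id, folder_name, dest_dir):
    # List the folder via the same Hugging Face tree API used above
    api_url = f"https://huggingface.co/api/models/{repo_id}/tree/main/{folder_name}"
    listing = requests.get(api_url, timeout=30)
    listing.raise_for_status()

    for file_info in listing.json():
        file_name = file_info["path"]
        file_url = f"https://huggingface.co/{repo_id}/resolve/main/{file_name}"
        out_path = os.path.join(dest_dir, file_name)
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # stream=True keeps multi-GB checkpoints out of RAM
        with requests.get(file_url, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(out_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
```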
sinatools/VERSION CHANGED
```diff
@@ -1 +1 @@
-0.1.33
+0.1.34
```
sinatools/relations/relation_extractor.py CHANGED
```diff
@@ -193,7 +193,7 @@ def event_argument_relation_extraction(documnet):
             score = predicted_relation[0][0]['score']
             if score > 0.50:
                 triple_id+=1
-                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,}}
+                relation={"TripleID":triple_id,"Subject":{"ID":entity_identifier[event_entity],"Type": entities[event_entity], "Label":event_entity}, "Relation": category, "Object":{"ID":entity_identifier[arg_name],"Type": entities[arg_name], "Label":arg_name,},"confidence": f"{score: .2f}"}
                 output_list.append(relation)
 
-    return output_list
+    return output_list
```
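The only functional change here is the added `"confidence"` key. Note that `f"{score: .2f}"` uses the space flag, so positive scores are rendered with a leading space:

```python
# Illustration of the new "confidence" formatting; 0.7342 is a made-up score.
score = 0.7342
print(f"{score: .2f}")  # " 0.73" (space flag pads non-negative numbers)
print(f"{score:.2f}")   # "0.73"
```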
sinatools/wsd/disambiguator.py CHANGED
```diff
@@ -8,6 +8,10 @@ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
 from sinatools.ner.entity_extractor import extract
 from . import glosses_dic
+import time
+#import concurrent
+#import threading
+import multiprocessing
 
 
 def distill_entities(entities):
@@ -256,7 +260,7 @@ def find_named_entities(string):
     return found_entities
 
 
-def find_glosses_using_ALMA(word):
+def find_glosses_using_ALMA(word, glosses_dic):
 
     data = analyze(word, language ='MSA', task ='full', flag="1")
     Diac_lemma = ""
@@ -302,7 +306,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
     return my_json
 
 
-def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
+def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner, glosses_dic):
     output_list = []
     position = 0
     while position < len(input_sentence):
@@ -389,7 +393,7 @@ def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemm
 
         if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
             word = input_sentence[position]
-            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word)
+            word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses = find_glosses_using_ALMA(word, glosses_dic)
             my_json = {}
             my_json['word'] = word
             my_json['concept_count'] = concept_count
@@ -432,26 +436,95 @@ def disambiguate_glosses_main(word, sentence):
     glosses = word['glosses']
     Diac_lemma = word['Diac_lemma']
     Undiac_lemma = word['Undiac_lemma']
-
+    start = time.time()
+    x = disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, input_word, sentence)
+    end = time.time()
+    print(f"disambiguate time: {end - start}")
+    return x
+
+
+def init_resources():
+    global glosses_dic
+
+
+# Wrapper function for multiprocessing
+def disambiguate_glosses_in_parallel(word_and_sentence):
+    word, sentence = word_and_sentence
+    return disambiguate_glosses_main(word, sentence)
 
 def WSD(sentence):
-
+    start = time.time()
     input_sentence = simple_word_tokenize(sentence)
-
+    end = time.time()
+    print(f"tokenizer time: {end - start}")
+
+    start = time.time()
     five_word_lemma = find_five_word_lemma(input_sentence)
+    end = time.time()
+    print(f"5grams time: {end - start}")
 
+    start = time.time()
     four_word_lemma = find_four_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"4grams time: {end - start}")
+
+    start = time.time()
     three_word_lemma = find_three_word_lemma(input_sentence)
-
+    end = time.time()
+    print(f"3grams time: {end - start}")
+
+    start = time.time()
     two_word_lemma = find_two_word_lemma(input_sentence)
-
-
+    end = time.time()
+    print(f"2grams time: {end - start}")
 
-
-
-
-
+    start = time.time()
+    ner = find_named_entities(" ".join(input_sentence))
+    end = time.time()
+    print(f"ner time: {end - start}")
+
+
+    start = time.time()
+    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner, glosses_dic_shared)
+    end = time.time()
+    print(f"lookup time: {end - start}")
+
+    # for word in output_list:
+    #     start = time.time()
+    #     results.append(disambiguate_glosses_main(word, sentence))
+    #     end = time.time()
+    #     print(f"disambiguate time: {end - start}")
+    # return results
+
+    # with concurrent.futures.ProcessPoolExecutor() as executor:
+    #     results = list(executor.map(lambda word: disambiguate_glosses_main(word, sentence), output_list))
+    # return results
+
+    # Create and start threads
+    # for word in output_list:
+    #     thread = threading.Thread(target=worker, args=(word, sentence))
+    #     threads.append(thread)
+    #     thread.start()
+    #
+    # for thread in threads:
+    #     thread.join()
+    #
+    # return threading_results
+
+    # Number of CPUs
+    num_cpus = multiprocessing.cpu_count()
+    print("num_cpus : ", num_cpus)
+
+    # Create a manager to hold shared data
+    # with multiprocessing.Manager() as manager:
+    #     glosses_dic_shared = manager.dict(glosses_dic)
+    #     with multiprocessing.Pool(num_cpus) as pool:
+    #         arguments = [(word, sentence) for word in output_list]
+    #         results = pool.starmap(disambiguate_glosses_main, arguments)
+
+    with multiprocessing.Pool(initializer=init_resources) as pool:
+        # Map the list of words to the disambiguation function in parallel
+        results = pool.map(disambiguate_glosses_in_parallel, [(word, sentence) for word in output_list])
     return results
 
 
@@ -497,5 +570,8 @@ def disambiguate(sentence):
         content = ["Input is too long"]
         return content
     else:
+        start = time.time()
        results = WSD(sentence)
+        end = time.time()
+        print(f"WSD total time: {end - start}")
        return results
```
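The parallel path in `WSD` uses `multiprocessing.Pool`'s `initializer` hook so each worker process sets up heavy resources once rather than receiving them pickled with every task. A minimal, self-contained sketch of that pattern (the dictionary and task here are illustrative, not SinaTools code):

```python
import multiprocessing

_glosses = None  # worker-local global, populated once per process

def init_worker():
    # Runs once in each worker process: the place to load large read-only resources.
    global _glosses
    _glosses = {"bank": ["river bank", "financial institution"]}  # stand-in for a big dict

def lookup(word):
    # Tasks read the worker-local resource instead of having it shipped per call.
    return word, _glosses.get(word, [])

if __name__ == "__main__":
    with multiprocessing.Pool(initializer=init_worker) as pool:
        print(pool.map(lookup, ["bank", "tree"]))
```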