SinaTools 0.1.29__py2.py3-none-any.whl → 0.1.31__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: SinaTools
- Version: 0.1.29
+ Version: 0.1.31
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
  Home-page: https://github.com/SinaLab/sinatools
  License: MIT license
@@ -1,5 +1,5 @@
- SinaTools-0.1.29.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
- sinatools/VERSION,sha256=3gAvtibHsL3Zih60tzJshU5QcbL40f0qBmAAXPGrB-Q,6
+ SinaTools-0.1.31.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/VERSION,sha256=0G-86l6j71-98w4IH9k4eO_HB6ywVt1xyOn6MUmJ0i4,6
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
@@ -20,7 +20,7 @@ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1
  sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
  sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
  sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- sinatools/DataDownload/downloader.py,sha256=6xH55WlDhgtImPRFQ0AaeDFJjL8OMNU29x61PL8mZ2w,6468
+ sinatools/DataDownload/downloader.py,sha256=3UkRRH4TLbut10V1BgWO3EqJQaHVBqr6pAj7Fn4AQZ8,6511
  sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
  sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
  sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -75,8 +75,8 @@ sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKwe
  sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
  sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
  sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
- sinatools/morphology/morph_analyzer.py,sha256=3B-ewxFg_If83oYlk1bDdVS1clb-mgyAF4WgAMqcAVI,7009
- sinatools/ner/__init__.py,sha256=CLPaqUcvPGAA4lU-6hjAqjNfKJ5WtwRfsma6QkYZHEk,1379
+ sinatools/morphology/morph_analyzer.py,sha256=XrLkFqI89GmQuRyZB5X7GNIpfedfGNnQwHzrz5bDu5A,7190
+ sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
  sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
  sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
  sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
@@ -96,6 +96,8 @@ sinatools/ner/trainers/BaseTrainer.py,sha256=Ifz4SeTxJwVn1_uWZ3I9KbcSo2hLPN3ojsI
  sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
  sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
  sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
+ sinatools/relations/__init__.py,sha256=cYjsP2mlTYvAwVIEFtgA6i9gLUSkGVOuDggMs7TvG5k,272
+ sinatools/relations/relation_extractor.py,sha256=gADRNy0LZvJ021UVgSuV4DfHodRJ8bM7FeCkdV4DeeY,9719
  sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
  sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
  sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
@@ -116,10 +118,10 @@ sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
  sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
  sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
- SinaTools-0.1.29.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
- SinaTools-0.1.29.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
- SinaTools-0.1.29.dist-info/METADATA,sha256=IorbBd2klVKi0amBxKMKIEgyJHjRxcpJNxAQBgyNn04,3267
- SinaTools-0.1.29.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
- SinaTools-0.1.29.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
- SinaTools-0.1.29.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
- SinaTools-0.1.29.dist-info/RECORD,,
+ SinaTools-0.1.31.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.31.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.31.dist-info/METADATA,sha256=oT_7vNHXOs1oX4m07uSty-cASC1eBGYEDCfKT4W1mio,3267
+ SinaTools-0.1.31.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.31.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+ SinaTools-0.1.31.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+ SinaTools-0.1.31.dist-info/RECORD,,
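
The sha256= values in RECORD follow the wheel convention (PEP 376/427): the urlsafe-base64 encoding of the file's SHA-256 digest with trailing '=' padding stripped, followed by the file size in bytes. A minimal sketch for checking an entry:

    import base64
    import hashlib

    def record_hash(path):
        # Wheel RECORD-style hash: urlsafe base64 of the SHA-256 digest, no '=' padding.
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # For the extracted 0.1.31 wheel, record_hash("sinatools/VERSION") should match the
    # RECORD entry above (size 6, i.e. the 6 bytes "0.1.31").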
sinatools/DataDownload/downloader.py CHANGED
@@ -10,13 +10,14 @@ urls = {
  'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
  'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
  'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
- 'glosses_dic': 'https://sina.birzeit.edu/glosses_dic.pickle',
+ 'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
  'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
  'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
  'three_grams':'https://sina.birzeit.edu/three_grams.pickle',
  'two_grams':'https://sina.birzeit.edu/two_grams.pickle',
- 'synonyms_level2':'https://sina.birzeit.edu/graph_l2.pkl',
- 'synonyms_level3':'https://sina.birzeit.edu/graph_l3.pkl'
+ 'graph_l2':'https://sina.birzeit.edu/graph_l2.pkl',
+ 'graph_l3':'https://sina.birzeit.edu/graph_l3.pkl',
+ 'relation':'https://sina.birzeit.edu/relation_model.zip'
  }
 
  def get_appdatadir():
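
downloader.py keys every downloadable resource by name; 0.1.31 replaces the glosses_dic entry with one_gram, renames the synonyms graph keys to match their file names, and adds the relation_model archive used by the new relations package. As a hedged sketch (the helper below is illustrative, not SinaTools API), fetching one resource into the data directory could look like:

    import os
    import urllib.request

    from sinatools.DataDownload.downloader import get_appdatadir, urls

    def fetch_resource(key):
        # Hypothetical helper: download one of the URLs above into the SinaTools data dir.
        dest = os.path.join(get_appdatadir(), os.path.basename(urls[key]))
        if not os.path.exists(dest):
            urllib.request.urlretrieve(urls[key], dest)
        return dest

    # e.g. fetch_resource('relation') -> .../relation_model.zip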
sinatools/VERSION CHANGED
@@ -1 +1 @@
- 0.1.29
+ 0.1.31
@@ -3,6 +3,7 @@ from sinatools.utils.tokenizers_words import simple_word_tokenize
  from sinatools.utils.parser import arStrip
  from sinatools.utils.charsets import AR_CHARSET, AR_DIAC_CHARSET
  from sinatools.DataDownload.downloader import get_appdatadir
+ from sinatools.morphology.morph_analyzer import remove_punctuation
  from . import dictionary
 
  _IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
@@ -98,13 +99,16 @@ def analyze(text, language ='MSA', task ='full', flag="1"):
  token = arStrip(token , False , True , False , False , False , False)
  token = re.sub('[ٱ]','ﺍ',token)
  # token, freq, lemma, lemma_id, root, pos
- solution = [token, 0, token+"_0", 0, token, ""]
+ solution = [token, 0, token, 0, token, ""]
 
  if token.isdigit():
- solution[5] = "digit" #pos
+ solution[5] = "رقم" #pos
+
+ elif remove_punctuation(token).strip() == "":
+ solution[5] = "علامة ترقيم" #pos
 
  elif not _is_ar(token):
- solution[5] = "Foreign" #pos
+ solution[5] = "أجنبي" #pos
 
  else:
  result_token = find_solution(token,language,flag)
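
The analyze() changes swap the English placeholder POS labels for Arabic ones ("رقم" = number, "أجنبي" = foreign) and add a branch that labels punctuation-only tokens "علامة ترقيم" (punctuation mark) via the new remove_punctuation import. A self-contained sketch of that fallback, with remove_punctuation stubbed (the stub and its punctuation set are assumptions, not the SinaTools implementation):

    import string

    AR_PUNCT = "،؛؟«»"  # assumed Arabic punctuation; the real set lives in remove_punctuation

    def remove_punctuation(text):
        # Stand-in for sinatools.morphology.morph_analyzer.remove_punctuation
        return "".join(ch for ch in text if ch not in string.punctuation + AR_PUNCT)

    def fallback_pos(token, is_arabic):
        # Mirrors the new branches in analyze().
        if token.isdigit():
            return "رقم"           # number
        if remove_punctuation(token).strip() == "":
            return "علامة ترقيم"   # punctuation mark
        if not is_arabic:
            return "أجنبي"         # foreign
        return ""                  # otherwise analyze() consults the dictionary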
sinatools/ner/__init__.py CHANGED
@@ -7,8 +7,6 @@ import torch
  import pickle
  import json
  from argparse import Namespace
- from transformers import pipeline
- #from transformers import AutoModelForSequenceClassification
 
  tagger = None
  tag_vocab = None
@@ -38,6 +36,3 @@ if torch.cuda.is_available():
  train_config.trainer_config["kwargs"]["model"] = model
  tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
  tagger.load(os.path.join(model_path,"checkpoints"))
-
- pipe = pipeline("sentiment-analysis", model= os.path.join(path, "best_model"), return_all_scores =True, max_length=128, truncation=True)
- #pipe = AutoModelForSequenceClassification.from_pretrained(os.path.join(path, "best_model"))
sinatools/relations/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from sinatools.DataDownload import downloader
+ import os
+ from transformers import pipeline
+
+ path = downloader.get_appdatadir()
+
+ pipe = pipeline("sentiment-analysis", model=os.path.join(path, "relation_model"), return_all_scores=True, max_length=128, truncation=True)
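
The classifier pipeline moves out of sinatools/ner/__init__.py (where it loaded "best_model") into this new relations package, which loads the downloaded relation_model instead. relation_extractor.py consumes it via `from . import pipe`. With return_all_scores=True, each call yields one list of {'label', 'score'} dicts per input; a hedged usage sketch (label names depend on the relation_model and are not documented here):

    from sinatools.relations import pipe

    relation_sentence = "[CLS] ... [SEP] ... ..."  # template built by relation_extractor
    scores = pipe(relation_sentence)
    best = max(scores[0], key=lambda s: s["score"])  # scores[0]: all class scores for the one input
    print(best["label"], best["score"])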
sinatools/relations/relation_extractor.py ADDED
@@ -0,0 +1,199 @@
+ from urllib.request import Request, urlopen
+ from sinatools.ner.entity_extractor import extract
+ from sinatools.utils.tokenizer import sentence_tokenizer
+ from . import pipe
+
+ # ============================ Extract entities and their types ========================
+ def jsons_to_list_of_lists(json_list):
+     return [[d['token'], d['tags']] for d in json_list]
+
+ def entities_and_types(sentence):
+     output_list = jsons_to_list_of_lists(extract(sentence))
+     json_short = distill_entities(output_list)
+
+     entities = {}
+     for entity in json_short:
+         name = entity[0]
+         entity_type = entity[1]
+         entities[name] = entity_type
+
+     return entities
+
+ def distill_entities(entities):
+     # The list that collects the distilled output
+     list_output = list()
+
+     # Sort the tags and keep the result in temp_entities
+     temp_entities = sortTags(entities)
+
+     # A working list used to build the output
+     temp_list = list()
+
+     # Initialize temp_list
+     temp_list.append(["", "", 0, 0])
+     word_position = 0
+
+     # For each entity, convert its IOB tags into a distilled list.
+     for entity in temp_entities:
+         # Tag counter for this entity
+         counter_tag = 0
+         # For each tag
+         for tag in str(entity[1]).split():
+             # If the tag counter reaches the length of temp_list, append an empty slot
+             if counter_tag >= len(temp_list):
+                 temp_list.append(["", "", 0, 0])
+
+             # If the tag is O and the word position is not zero, move every
+             # non-empty element of temp_list into the output list
+             if "O" == tag and word_position != 0:
+                 for j in range(0, len(temp_list)):
+                     if temp_list[j][1] != "":
+                         list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                     temp_list[j][0] = ""
+                     temp_list[j][1] = ""
+                     temp_list[j][2] = word_position
+                     temp_list[j][3] = word_position
+             # If the tag is not O, splits on '-' into exactly two parts,
+             # and the first part of the split is B
+             elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                 # If the slot at counter_tag is not empty, flush it to the output list,
+                 # then reinitialize it with the new string and tag
+                 if temp_list[counter_tag][1] != "":
+                     list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                 temp_list[counter_tag][0] = str(entity[0]) + " "
+                 temp_list[counter_tag][1] = str(tag).split("-")[1]
+                 temp_list[counter_tag][2] = word_position
+                 temp_list[counter_tag][3] = word_position
+
+             # If the tag is not O, splits on '-' into exactly two parts,
+             # and the first part of the split is I
+             elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                 # For each slot of temp_list, check whether it carries the same tag as this one;
+                 # if so extend it, otherwise flush it to the output list and check the next slot
+                 for j in range(counter_tag, len(temp_list)):
+                     if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                         temp_list[j][0] += str(entity[0]) + " "
+                         temp_list[j][3] += 1
+                         break
+                     else:
+                         if temp_list[j][1] != "":
+                             list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                         temp_list[j][0] = ""
+                         temp_list[j][1] = ""
+                         temp_list[j][2] = word_position
+                         temp_list[j][3] = word_position
+             counter_tag += 1
+         word_position += 1
+     # At the end of the loop above some values may remain in temp_list;
+     # save them to the output list
+     for j in range(0, len(temp_list)):
+         if temp_list[j][1] != "":
+             list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+     return sorted(list_output, key=lambda x: (x[2]))
+
+ def sortTags(entities):
+     temp_entities = entities
+     temp_counter = 0
+     # For each entity, sort its tags: first check whether the previous tags
+     # have the same count as this tag, then sort the tags and verify they are correct
+     for entity in temp_entities:
+         tags = entity[1].split()
+         for tag in tags:
+             # If the counter is not 0, continue
+             if temp_counter != 0:
+                 # If this tag starts with I-, count how many matching tags appear
+                 # in this tag list and how many appear in the previous tag list
+                 if "I-" == tag[0:2]:
+                     counter_of_this_tag = 0
+                     counter_of_previous_tag = 0
+                     for word in tags:
+                         if tag.split("-")[1] in word:
+                             counter_of_this_tag += 1
+                     for word in temp_entities[temp_counter-1][1].split():
+                         if tag.split("-")[1] in word:
+                             counter_of_previous_tag += 1
+                     # If the previous tag count is larger than this tag count,
+                     # add an I- tag to this tag list
+                     if counter_of_previous_tag > counter_of_this_tag:
+                         tags.append("I-"+tag.split("-")[1])
+         # Sort the tags
+         tags.sort()
+         # Reverse the tags because they should begin with I
+         tags.reverse()
+         # If the counter is not 0 we can continue
+         if temp_counter != 0:
+             this_tags = tags
+             previous_tags = temp_entities[temp_counter - 1][1].split()
+             sorted_tags = list()
+
+             # If neither this tag list nor the previous one contains O, continue;
+             # otherwise ignore this tag
+             if "O" not in this_tags and "O" not in previous_tags:
+                 index = 0
+                 # For each previous tag, order this tag list by the previous tags; if it is I or B we can skip it
+                 for i in previous_tags:
+                     j = 0
+                     while this_tags and j < len(this_tags):
+                         if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                             sorted_tags.insert(index, this_tags.pop(j))
+                             break
+                         elif this_tags[j][0:2] == "B-":
+                             break
+                         j += 1
+                     index += 1
+                 sorted_tags += this_tags
+                 tags = sorted_tags
+         str_tag = " "
+         str_tag = str_tag.join(tags)
+         str_tag = str_tag.strip()
+         temp_entities[temp_counter][1] = str_tag
+         temp_counter += 1
+     return temp_entities
+
+ # ============= Prepare templates and categorize extracted entities ================
+ temp03 = {'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
+ categories = {
+     'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
+     'location': ['LOC', 'FAC', 'GPE'],
+     'happened at': ['DATE', 'TIME']
+ }
+
+ def get_entity_category(entity_type, categories):
+     for category, types in categories.items():
+         if entity_type in types:
+             return category
+     return None
+
+
+ # ============ Extract entities, their types, and categorize them ===============
+ def event_argument_relation_extraction(document):
+
+     sentences = sentence_tokenizer(document)
+     output_list = []
+     relation = {}
+     triple_id = 0
+     for sentence in sentences:
+         entities = entities_and_types(sentence)
+         entity_identifier = {entity: i for entity, i in zip(entities, range(1, len(entities)+1))}
+
+         event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
+         arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
+
+
+         for i in event_indices:
+             event_entity = list(entities.keys())[i]
+             for j in arg_event_indices:
+                 arg_name = list(entities.keys())[j]
+                 arg_type = entities[arg_name]
+                 category = get_entity_category(arg_type, categories)
+
+                 if category in temp03:
+                     relation_sentence = f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
+                     predicted_relation = pipe(relation_sentence)
+                     score = predicted_relation[0][0]['score']
+                     if score > 0.50:
+                         triple_id += 1
+                         relation = {"TripleID": triple_id, "Subject": {"ID": entity_identifier[event_entity], "Type": entities[event_entity], "Label": event_entity}, "Relation": category, "Object": {"ID": entity_identifier[arg_name], "Type": entities[arg_name], "Label": arg_name}}
+                         output_list.append(relation)
+
+     return output_list
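
End to end, the new module tokenizes a document into sentences, runs NER, pairs each EVENT entity with agent/location/time arguments using the Arabic templates in temp03 (roughly "place of occurrence of", "one of those affected by", "date of occurrence of"), and keeps pairs the classifier scores above 0.50. A hedged usage sketch (assumes the ner and relation_model resources are already downloaded; the input is any Arabic text):

    from sinatools.relations.relation_extractor import event_argument_relation_extraction

    document = "..."  # any Arabic document
    triples = event_argument_relation_extraction(document)
    for t in triples:
        # Each triple pairs an EVENT subject with an agent / location / 'happened at' object.
        print(t["TripleID"], t["Subject"]["Label"], t["Relation"], t["Object"]["Label"])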