ddi-fw 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +12 -0
- ddi_fw/datasets/core.py +416 -0
- ddi_fw/datasets/db_utils.py +204 -0
- ddi_fw/datasets/embedding_generator.py +66 -0
- ddi_fw/datasets/embedding_generator_new.py +105 -0
- ddi_fw/datasets/feature_vector_generation.py +100 -0
- ddi_fw/datasets/idf_helper.py +71 -0
- ddi_fw/drugbank/__init__.py +2 -0
- ddi_fw/drugbank/drugbank_parser.py +154 -0
- ddi_fw/drugbank/drugbank_processor.py +343 -0
- ddi_fw/drugbank/drugbank_processor_org.py +272 -0
- ddi_fw/drugbank/event_extractor.py +127 -0
- ddi_fw/experiments/__init__.py +2 -0
- ddi_fw/experiments/custom_torch_model.py +66 -0
- ddi_fw/experiments/evaluation_helper.py +232 -0
- ddi_fw/experiments/tensorflow_helper.py +296 -0
- ddi_fw/experiments/test.py +59 -0
- ddi_fw/ner/__init__.py +1 -0
- ddi_fw/ner/mmlrestclient.py +155 -0
- ddi_fw/ner/ner.py +340 -0
- ddi_fw/utils/__init__.py +3 -0
- ddi_fw/utils/enums.py +23 -0
- ddi_fw/utils/utils.py +103 -0
- ddi_fw/utils/zip_helper.py +66 -0
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/METADATA +1 -1
- ddi_fw-0.0.2.dist-info/RECORD +28 -0
- ddi_fw-0.0.2.dist-info/top_level.txt +5 -0
- ddi_fw-0.0.1.dist-info/RECORD +0 -4
- ddi_fw-0.0.1.dist-info/top_level.txt +0 -1
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/WHEEL +0 -0
ddi_fw/ner/ner.py
ADDED
@@ -0,0 +1,340 @@
+from collections import defaultdict
+import glob
+import json
+from pathlib import Path
+import pathlib
+from time import sleep
+import pandas as pd
+
+from tqdm import tqdm
+import os
+import requests
+# from mmlrestclient as metamapliteclient
+from enum import Enum
+from utils import create_folder_if_not_exists
+
+
+# data = '''
+# Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.
+# [L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.
+# [L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate
+# on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609]
+
+# Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT),
+# an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539]
+# HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4.
+# This activates endothelial cells and platelets and enhances the formation of thrombi.
+# [A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]'''
+
+# response = requests.post(url, data=data)
+
+# print(response.content)
+
+HERE = pathlib.Path(__file__).resolve().parent
+
+
+class CTakesNER:
+    def __init__(self, drugs_df=None, api_url='http://localhost:8080/ctakes-web-rest/service/analyze?pipeline=Default',
+                 output_path='ner-output/ctakes', ids=[],
+                 columns=[]):
+        self.drugs_df = drugs_df
+        self.api_url = api_url
+        self.columns = columns
+        self.ids = ids
+        self.output_path = output_path
+
+    def run(self,
+            run_for=[]):
+
+        for column in self.columns:
+            if not os.path.exists(self.output_path+"/"+column):
+                os.makedirs(self.output_path+"/"+column)
+        for column in self.columns:
+            column_output_path = self.output_path+'/'+column
+            if not column in run_for:
+                continue
+            # not include
+            if self.ids:
+                self.drugs_df = self.drugs_df[~self.drugs_df['drugbank_id'].isin(
+                    self.ids)]
+            for index, row in self.drugs_df.iterrows():
+                drugbank_id = row['drugbank_id']
+                data = row[column]
+                if data is None or pd.isna(data) or (type(data) == str and len(data.strip()) == 0):  # or len(data) == 0:
+                    with open(f'{column_output_path}/{drugbank_id}.json', 'w', encoding='utf-8') as f:
+                        json.dump([], f, ensure_ascii=False, indent=4)
+                    continue
+                data = data.encode()
+                response = requests.post(self.api_url, data=data)
+
+                with open(f'{column_output_path}/{drugbank_id}.json', 'w', encoding='utf-8') as f:
+                    try:
+                        obj = json.loads(response.text)
+                        json.dump(obj, f, ensure_ascii=False, indent=4)
+                    except:
+                        # print(f'{drugbank_id} is not parsable')
+                        json.dump([], f, ensure_ascii=False, indent=4)
+                        continue
+
+                # if index % 10 == 0:
+                #     sleep(10)
+
+    def load(self, filename=None, group=True):
+        file_path = filename if filename else HERE.joinpath('output/ctakes/ctakes_ner.pkl')
+        df = pd.read_pickle(file_path)
+
+        if group:
+            keys = list(df.columns.values)
+
+            df['tui'] = [[]] * df.shape[0]
+            df['cui'] = [[]] * df.shape[0]
+            df['entities'] = [[]] * df.shape[0]
+
+            tui_columns = [key for key in keys if key.startswith('tui')]
+            cui_columns = [key for key in keys if key.startswith('cui')]
+            entities_columns = [key for key in keys if key.startswith('entities')]
+            # solve this with a single assignment
+            df['tui'] = df[tui_columns].values.tolist()
+            df['tui'] = df['tui'].apply(lambda items: {i for item in items for i in item})
+
+            df['cui'] = df[cui_columns].values.tolist()
+            df['cui'] = df['cui'].apply(lambda items: {i for item in items for i in item})
+
+            df['entities'] = df[entities_columns].values.tolist()
+            df['entities'] = df['entities'].apply(lambda items: {i for item in items for i in item})
+
+        return df
+
+    def create_dataframe(self, override=False):  # dataframe_columns=[]
+        filename = 'ctakes_ner.pkl'
+        if not override and os.path.exists(self.output_path+"/" + filename):
+            return self.load(self.output_path+"/" + filename)
+
+        create_folder_if_not_exists(self.output_path+"/" + filename)
+        dict_of_dict = defaultdict(dict)
+        for column in self.columns:
+            all_json_files = f'{self.output_path}/{column}/'+'*.json*'
+            for filepath in tqdm(glob.glob(all_json_files)):
+                with open(filepath, 'r', encoding="utf8") as f:
+                    file_name = Path(f.name).stem
+                    t = dict_of_dict[file_name]
+                    data = json.load(f)
+                    entities = []
+                    cuis = []
+                    tuis = []
+                    if data is None or len(data) == 0:
+                        t['drugbank_id'] = file_name
+                        t[f'cui_{column}'] = []
+                        t[f'tui_{column}'] = []
+                        t[f'entities_{column}'] = []
+                        dict_of_dict[file_name] = t
+                        continue
+                    for key, value in data.items():
+                        entities = [v['text'] for v in value]
+                        cuis = [attr['cui']
+                                for v in value for attr in v['conceptAttributes']]
+                        tuis = [attr['tui']
+                                for v in value for attr in v['conceptAttributes']]
+                        # codingScheme
+
+                    if 'drugbank_id' not in t:
+                        t['drugbank_id'] = file_name
+                    t[f'cui_{column}'] = cuis
+                    t[f'tui_{column}'] = tuis
+                    t[f'entities_{column}'] = entities
+                    dict_of_dict[file_name] = t
+
+        df = pd.DataFrame(dict_of_dict.values(),
+                          # orient='index',
+                          # columns=columns
+                          )
+        df.to_pickle(self.output_path+"/" + filename)
+        # dataframe_columns.insert(0, 'drugbank_id')
+
+        # new_columns = {columns[i]: dataframe_columns[i]
+        #                for i in range(len(columns))}
+        # df.rename(columns=new_columns, inplace=True)
+        return df
+
+
+# no module named 'mmlrestclient'
+# class MMSLiteNER:
+
+#     # https://ii.nlm.nih.gov/metamaplite/js/formControls.js
+
+#     class Groups(Enum):
+
+#         activities_group = ['acty', 'bhvr', 'dora',
+#                             'evnt', 'gora', 'inbe', 'mcha', 'ocac', 'socb']
+
+#         anatomy_group = ['anst', 'blor', 'bpoc', 'bsoj', 'bdsu',
+#                          'bdsy', 'cell', 'celc', 'emst', 'ffas', 'tisu']
+
+#         checmicals_and_drugs_group = ['aapp', 'antb', 'bacs', 'bodm', 'carb', 'chem', 'chvf', 'chvs',
+#                                       'clnd', 'eico', 'elii', 'enzy', 'hops', 'horm', 'imft',
+#                                       'irda', 'inch', 'lipd', 'nsba', 'nnon', 'orch', 'opco',
+#                                       'phsu', 'rcpt', 'strd', 'vita']
+
+#         concept_and_ideas_group = ['clas', 'cnce', 'ftcn', 'grpa', 'idcn', 'inpr', 'lang',
+#                                    'qlco', 'rnlw', 'spco', 'tmco']
+
+#         devices_group = ['drdd', 'medd', 'resd']
+
+#         disorders_group = ['acab', 'anab', 'bact', 'comd', 'cgab', 'dsyn',
+#                            'emod', 'fndg', 'inpo', 'mobd', 'patf', 'sosy']
+
+#         # abbreviated disorders group, finding and congenital abnormality removed
+#         disorders_abbrev_group = ['acab', 'anab', 'bact', 'cgab', 'dsyn',
+#                                   'emod', 'inpo', 'mobd', 'patf', 'sosy']
+
+#         genes_and_molecular_sequences = [
+#             'amas', 'crbs', 'gngm', 'mosq', 'nusq']
+
+#         geographic_areas = ['geoa']
+
+#         living_being = ['aggp', 'amph', 'anim', 'arch', 'bact', 'bird', 'euka', 'fish',
+#                         'fngs', 'grup', 'humn', 'mamm', 'orgm', 'podg',
+#                         'plnt', 'popg', 'prog', 'rept', 'vtbt', 'virs']
+
+#         objects = ['enty', 'food', 'mnob', 'sbst']
+
+#         occupations = ['bmod', 'ocdi']
+
+#         organizations = ['hcro', 'orgt', 'pros', 'shro']
+
+#         phenomena = ['eehu' 'hcpp', 'lbtr', 'npop', 'phpr']
+
+#         physiology = ['celf', 'clna', 'clnd']
+
+#         procedures = ['diap', 'edac', 'hlca', 'lbpr', 'mbrt', 'resa', 'topp']
+
+#     def __init__(self, drugs_df, input_path='drugbank/output', output_path='ner-output/metamaplite', ids=[],
+#                  columns=[],
+#                  included_groups: Groups = [],
+#                  excluded_groups: Groups = [],
+#                  ):
+
+#         self.drugs_df = drugs_df
+#         self.columns = columns
+#         self.ids = ids
+#         self.output_path = output_path
+#         self.included_groups = set()
+#         for i, g in enumerate(included_groups):
+#             for v in g.value:
+#                 self.included_groups.add(v)
+
+#         self.excluded_groups = set()
+#         for i, g in enumerate(excluded_groups):
+#             for v in g.value:
+#                 self.excluded_groups.add(v)
+
+#         for column in columns:
+#             if not os.path.exists(output_path+"/"+column):
+#                 os.makedirs(output_path+"/"+column)
+
+#     def run_ner(self):
+#         # url = 'https://ii-public1.nlm.nih.gov/metamaplite/rest/annotate'
+#         base_url = 'https://ii.nlm.nih.gov/metamaplite/rest/annotate'
+#         acceptfmt = 'text/plain'
+#         for column in self.columns:
+#             column_output_path = self.output_path+'/'+column
+
+#             if self.ids:
+#                 self.drugs_df = self.drugs_df[~self.drugs_df['drugbank_id'].isin(
+#                     self.ids)]
+#             for index, row in self.drugs_df.iterrows():
+#                 drugbank_id = row['drugbank_id']
+#                 input_text = row[column]
+#                 params = [('inputtext', input_text), ('docformat', 'freetext'),
+#                           ('resultformat', 'json'), ('sourceString', 'all'),
+#                           ('semanticTypeString', 'all')]
+#                 resp = metamapliteclient.handle_request(
+#                     base_url, acceptfmt, params)
+
+#                 with open(f'{column_output_path}/{drugbank_id}.json', 'w', encoding='utf-8') as f:
+#                     obj = json.loads(resp.text)
+#                     json.dump(obj, f, ensure_ascii=False, indent=4)
+
+#                 if index % 10 == 0:
+#                     sleep(10)
+
+#     def __dict_of_semantic_types__(self, path):
+#         m = dict()
+#         with open(path, 'r', encoding='utf-8') as f:
+#             data = f.read()
+#             rows = data.split("\n")
+#             for row in rows:
+#                 if row != "":
+#                     arr = row.split("|")
+#                     m[arr[0]] = arr[1]
+#         return m
+
+#     def load(self, semantic_type_path: str, dataframe_columns=[]):
+#         semantic_type_dict = self.__dict_of_semantic_types__(
+#             semantic_type_path)
+
+#         cui_dict = defaultdict(dict)
+#         tui_dict = defaultdict(dict)
+#         for column in self.columns:
+#             all_json_files = f'{self.output_path}/{column}/'+'*.json*'
+#             for filepath in tqdm(glob.glob(all_json_files)):
+#                 with open(filepath, 'r', encoding="utf8") as f:
+#                     file_name = Path(f.name).stem
+#                     data = json.load(f)
+#                     filtered_obj = [o for o in data if len(o['evlist']) == 1]
+#                     # filtered_obj = [o for o in data if len(o['evlist']) == 1 and set(
+#                     #     checmicals_and_drugs_group).intersection(set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+
+#                     if self.included_groups:
+#                         evaluation = [o['evlist'][0]['conceptinfo'] for o in filtered_obj if len(o['evlist']) == 1
+#                                       and
+#                                       self.included_groups.intersection(
+#                                           set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+#                         # cuis = [o['evlist'][0]['conceptinfo']['cui'] for o in filtered_obj if len(o['evlist']) == 1
+#                         #         and
+#                         #         self.included_groups.intersection(
+#                         #             set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+#                     elif self.excluded_groups:
+#                         evaluation = cuis = [o['evlist'][0]['conceptinfo'] for o in filtered_obj if len(o['evlist']) == 1
+#                                              and
+#                                              not self.excluded_groups.intersection(
+#                                                  set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+#                         # cuis = [o['evlist'][0]['conceptinfo']['cui'] for o in filtered_obj if len(o['evlist']) == 1
+#                         #         and
+#                         #         not self.excluded_groups.intersection(
+#                         #             set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+#                     else:
+#                         evaluation = [o['evlist'][0]['conceptinfo']
+#                                       for o in filtered_obj if len(o['evlist']) == 1]
+#                         # cuis = [o['evlist'][0]['conceptinfo']['cui']
+#                         #         for o in filtered_obj if len(o['evlist']) == 1]
+
+#                     # cuis = [o['evlist'][0]['conceptinfo']['cui'] for o in filtered_obj if len(o['evlist']) == 1 and set(
+#                     #     checmicals_and_drugs_group).intersection(set(o['evlist'][0]['conceptinfo']['semantictypes']))]
+#                     cuis = [ev['cui'] for ev in evaluation]
+#                     semantic_types = [ev['semantictypes'] for ev in evaluation]
+#                     tuis = [semantic_type_dict[s]
+#                             for semantic_type in semantic_types for s in semantic_type]
+
+#                     d = cui_dict[file_name]
+#                     d['drugbank_id'] = file_name
+#                     d[column] = set(cuis)
+
+#                     t = tui_dict[file_name]
+#                     t['drugbank_id'] = file_name
+#                     t[column] = set(tuis)
+#                     tui_dict[file_name] = t
+
+#         columns = self.columns
+#         columns.insert(0, 'drugbank_id')
+#         df = pd.DataFrame(tui_dict.values(),
+#                           # orient='index',
+#                           # columns=columns
+#                           )
+
+#         dataframe_columns.insert(0, 'drugbank_id')
+
+#         new_columns = {columns[i]: dataframe_columns[i]
+#                        for i in range(len(columns))}
+#         df.rename(columns=new_columns, inplace=True)
+#         return df
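A minimal usage sketch of CTakesNER, assuming a cTAKES REST service is listening at the default api_url and that drugs_df is a caller-supplied DataFrame with a drugbank_id column plus one column per text field (both the service and the frame are assumptions, not shipped with the package). Note that ner.py imports create_folder_if_not_exists from a top-level utils module, so it only resolves if ddi_fw/utils is on sys.path:

import pandas as pd

# hypothetical input frame; real data would come from the DrugBank parser
drugs_df = pd.DataFrame({
    'drugbank_id': ['DB00001'],
    'description': ['Lepirudin is a recombinant hirudin ...'],
})

ner = CTakesNER(drugs_df=drugs_df, output_path='ner-output/ctakes',
                columns=['description'])
ner.run(run_for=['description'])  # writes ner-output/ctakes/description/DB00001.json

create_dataframe() would then gather those JSON files into one pickled frame, though as written it passes the full pickle path to create_folder_if_not_exists, which creates a directory where the pickle should be written and makes the subsequent df.to_pickle fail — a call site to watch.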
ddi_fw/utils/__init__.py
ADDED
ddi_fw/utils/enums.py
ADDED
@@ -0,0 +1,23 @@
+from enum import Enum
+
+
+class UMLSCodeTypes(Enum):
+    TUI = 'tui',
+    CUI = 'cui',
+    ENTITIES = 'entities',
+
+
+class DrugBankTextDataTypes(Enum):
+    DESCRIPTION = 'description',
+    INDICATION = 'indication',
+    SYNTHESIS_REFERENCE = 'synthesis_reference',
+    PHARMACODYNAMICS = 'pharmacodynamics',
+    MECHANISM_OF_ACTION = 'mechanism_of_action',
+    TOXICITY = 'toxicity',
+    METABOLISM = 'metabolism',
+    ABSORPTION = 'absorption',
+    HALF_LIFE = 'half_life',
+    PROTEIN_BINDING = 'protein_binding',
+    ROUTE_OF_ELIMINATION = 'route_of_elimination',
+    VOLUME_OF_DISTRIBUTION = 'volume_of_distribution',
+    CLEARANCE = 'clearance',
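One detail worth noting: the trailing comma after each member makes every enum value a one-element tuple rather than a bare string, so consumers have to unpack it. A quick check:

from ddi_fw.utils.enums import UMLSCodeTypes

print(UMLSCodeTypes.TUI.value)     # ('tui',) -- a tuple, not the string 'tui'
print(UMLSCodeTypes.TUI.value[0])  # 'tui'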
ddi_fw/utils/utils.py
ADDED
@@ -0,0 +1,103 @@
+import gzip
+import json
+import os
+
+from datetime import datetime, timezone
+
+from matplotlib import pyplot as plt
+
+
+def create_folder_if_not_exists(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+
+def utc_time_as_string():
+    utc_datetime = datetime.now(timezone.utc)
+
+    return datetime.strftime(utc_datetime, "%Y-%m-%dT%H:%M:%S.%f")[:-3]
+
+def utc_time_as_string_simple_format():
+    utc_datetime = datetime.now(timezone.utc)
+
+    return datetime.strftime(utc_datetime, '%Y%m%d')
+
+# https://gist.github.com/LouisAmon/4bd79b8ab80d3851601f3f9016300ac4
+
+
+def compress_data(data):
+    # Convert to JSON
+    # json_data = json.dumps(data, indent=2)
+    json_data = json.dumps(data, separators=(',', ":"))
+    # Convert to bytes
+    encoded = json_data.encode('UTF-8')
+    # Compress
+    compressed = gzip.compress(encoded)
+    return compressed
+
+
+def compress_and_save_data(data, path, file_name):
+    compressed = compress_data(data)
+    create_folder_if_not_exists(path)
+    with gzip.open(path+f'/{file_name}', 'wb') as f:
+        f.write(compressed)
+
+def decompress(gzip_file):
+    with gzip.open(gzip_file, 'r') as fin:  # 4. gzip
+        json_bytes = fin.read()  # 3. bytes (i.e. UTF-8)
+        json_bytes = gzip.decompress(json_bytes)
+        json_str = json_bytes.decode('UTF-8')  # 2. string (i.e. JSON)
+        data = json.loads(json_str)
+        return data
+
+
+if __name__ == "__main__":
+    # json_file = f'C:\\Users\\kivanc\\Downloads\\metrics.json'
+    # file_data = open(json_file, "r", 1).read()
+    # a = json.loads(file_data)  # store in json structure
+    # # a = {'key1':1, 'key2':2}
+    # compressed = compress_data(a)
+    # with gzip.open('deneme.gzip', 'wb') as f:
+    #     f.write(compressed)
+
+    # with gzip.open('deneme.gzip', 'r') as fin:  # 4. gzip
+    #     json_bytes = fin.read()  # 3. bytes (i.e. UTF-8)
+    #     json_bytes = gzip.decompress(json_bytes)
+    #     json_str = json_bytes.decode('UTF-8')  # 2. string (i.e. JSON)
+    #     data = json.loads(json_str)
+    #     print(data)
+
+    gzip_file = f'C:\\Users\\kivanc\\Downloads\\metrics (2).gzip'
+    stored_file = f'C:\\Users\\kivanc\\Downloads\\save.png'
+    metrics = decompress(gzip_file)
+    # print(metrics)
+
+    # Plot Precision-Recall curves for each class and micro-average
+    fig = plt.figure()
+    plt.step(metrics['recall']['micro_event'], metrics['precision']['micro_event'],
+             color='b', alpha=0.2, where='post')
+    plt.fill_between(
+        metrics['recall']["micro_event"], metrics['precision']["micro_event"], step='post', alpha=0.2, color='b')
+
+    # for i in range(65):
+    #     plt.step(metrics['recall'][str(i)], metrics['precision'][str(i)], where='post',
+    #              label='Class {0} (AUC={1:0.2f})'.format(i, metrics['roc_aupr'][str(i)]))
+
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.ylim([0.0, 1.05])
+    plt.xlim([0.0, 1.0])
+    plt.title(
+        'Micro-average Precision-Recall curve: AUC={0:0.2f}'.format(metrics['roc_aupr']["micro"]))
+    plt.legend(loc='best')
+    plt.savefig(stored_file)
+    # plt.show()
+
+    import plotly.express as px
+    import pandas as pd
+    df = pd.DataFrame(dict(
+        r=[1, 5, 2, 2, 3],
+        theta=['processing cost', 'mechanical properties', 'chemical stability',
+               'thermal stability', 'device integration']))
+    fig = px.line_polar(df, r='r', theta='theta', line_close=True)
+    fig.show()
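A small round-trip sketch of the gzip helpers above (path and payload are illustrative). Note that the payload ends up compressed twice — once by compress_data and once more by gzip.open(..., 'wb') — and decompress undoes both layers symmetrically:

from ddi_fw.utils.utils import compress_and_save_data, decompress

data = {'model': 'mlp', 'accuracy': 0.91}  # hypothetical metrics payload
compress_and_save_data(data, path='output', file_name='metrics.gzip')
restored = decompress('output/metrics.gzip')
assert restored == data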
ddi_fw/utils/zip_helper.py
ADDED
@@ -0,0 +1,66 @@
+import zipfile as z
+import os
+from os.path import basename
+from collections import defaultdict
+import math
+
+
+class ZipHelper:
+    def __init__(self):
+        pass
+
+    def zip_single_file(self, name, file_path, output_path):
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+        with z.ZipFile(f'{output_path}/{name}.zip', 'w', compression=z.ZIP_LZMA, compresslevel=z.ZIP_LZMA) as zipObj:
+            zipObj.write(file_path, basename(file_path))
+
+    def zip(self, zip_prefix, input_path, output_path, chunk_size):
+        files_paths = [input_path+'/' + p for p in os.listdir(input_path)]
+        count_of_chunks = math.ceil(len(files_paths) / chunk_size)
+        zero_padding_length = len(str(int(count_of_chunks))) + 2
+
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+
+        part = 1
+        i = 0
+        zip_dict = defaultdict(list)
+        for filePath in files_paths:
+            padded_part = f'{part}'.zfill(zero_padding_length)
+            key = f'{zip_prefix}.{padded_part}'
+            zip_dict[key].append(filePath)
+            i += 1
+            if i % chunk_size == 0:
+                i = 0
+                part += 1
+
+        for key, value in zip_dict.items():
+            with z.ZipFile(f'{output_path}/{key}.zip', 'w', compression=z.ZIP_LZMA, compresslevel=z.ZIP_LZMA) as zipObj:
+                for file_path in value:
+                    zipObj.write(file_path, basename(file_path))
+
+    def extract(self, input_path, output_path):
+        files_paths = [input_path+'/' + p for p in os.listdir(input_path)]
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+        for file_path in files_paths:
+            if file_path.endswith('zip'):
+                with z.ZipFile(file_path, 'r') as z1:
+                    z1.extractall(path=output_path)
+                    print(f'{file_path} has been extracted')
+
+
+# if __name__ == "__main__":
+#     helper = ZipHelper()
+#     helper.zip(zip_prefix='drugs', input_path='drugbank/drugs',
+#                output_path='drugbank/drugs-zips', chunk_size=1000)
+#     helper.extract(input_path='drugbank/drugs-zips',
+#                    output_path='drugbank/drugs-extracted')
+#     path = ''
+#     import pandas as pd
+#     d = {'col1': [1, 2], 'col2': [3, 4]}
+#     df = pd.DataFrame(data=d)
+#     df.to_pickle('test/dataframe.pickle')
+#     helper.zip_single_file(file_path='test/dataframe.pickle',output_path='test/output', name='zip')
+#     helper.extract(input_path='test/output', output_path='test/output')
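Mirroring the commented-out __main__ above, a chunked zip-and-extract run (paths are illustrative). One quirk: compresslevel=z.ZIP_LZMA passes the constant 14 as a compression level, but zipfile ignores compresslevel for ZIP_LZMA, so the argument is inert:

from ddi_fw.utils.zip_helper import ZipHelper

helper = ZipHelper()
helper.zip(zip_prefix='drugs', input_path='drugbank/drugs',
           output_path='drugbank/drugs-zips', chunk_size=1000)  # e.g. drugs.001.zip, drugs.002.zip, ...
helper.extract(input_path='drugbank/drugs-zips',
               output_path='drugbank/drugs-extracted')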
ddi_fw-0.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,28 @@
+ddi_fw/datasets/__init__.py,sha256=WmupqKInz9XMorCAUFS_iUZoSB56xasTrC8eb0UlCVk,540
+ddi_fw/datasets/core.py,sha256=O4lbhaiGkvbiGHS_Eubh6RqktBoGHjQnfSSrakdkfUU,18397
+ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
+ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
+ddi_fw/datasets/embedding_generator_new.py,sha256=GExjmBysPWkmFxTZQPs2yEmDdFllZ-qC9lhZeRQAfbQ,4320
+ddi_fw/datasets/feature_vector_generation.py,sha256=dxTHvp6uTkao9PdThs116Q3bWw_WTo9T8WigVL4G01s,3245
+ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
+ddi_fw/drugbank/__init__.py,sha256=4_eKdZsnXUSJyr-TZpHwIn13JC6PqS5imeLJJbgt2-A,94
+ddi_fw/drugbank/drugbank_parser.py,sha256=-ktqXKX-o68VShmfYkxI-5WMzwNQQiw_h5CF1fnwWQU,5450
+ddi_fw/drugbank/drugbank_processor.py,sha256=q9J4piFicDu7U5gjZztOVcSLkjjpylyPdM_qeHSne8w,17671
+ddi_fw/drugbank/drugbank_processor_org.py,sha256=hKyeunj6bNFHD9SmWM-a-eOp_NRyD0RIPoV8tzkbGAk,13308
+ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
+ddi_fw/experiments/__init__.py,sha256=UJwd2i3QcuaI1YjC_2yGCiLuEMTT5Yo7rDFxw89chIw,108
+ddi_fw/experiments/custom_torch_model.py,sha256=iQ_R_EApzD2JCcASN8cie6D21oh7VCxaOQ45_dkiGwc,2576
+ddi_fw/experiments/evaluation_helper.py,sha256=pY69cezV3WzrXw1bduIwRJfah1w3wXJ2YyTNim1J7ko,9349
+ddi_fw/experiments/tensorflow_helper.py,sha256=xYWCWa_ixSHnQFlDJ6mM0TY_fJCvV_FuFi0oUszLfwo,12634
+ddi_fw/experiments/test.py,sha256=rf7UB2SUZR2-UL_IVOm8_8NOY9__2dVGlUbct5tqf-0,1981
+ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
+ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
+ddi_fw/ner/ner.py,sha256=9-77KkeikXbuHoOBFTrTT5MMKxpJkh6HwNVGbl9ttIQ,15856
+ddi_fw/utils/__init__.py,sha256=nhNU_sEp55xsZ5VtvhozjKg6r4GWP6SJI13v8F_jbCg,217
+ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/utils.py,sha256=Na6Y8mY-CFbQjrgd9xC8agcrjVvTj_7KIXqFm1H_3qU,3549
+ddi_fw/utils/zip_helper.py,sha256=DjtwcGBoYw8zOP-Ye5OxzeR1OgN3WfNkVx85nb0wbJA,2635
+ddi_fw-0.0.2.dist-info/METADATA,sha256=hGGcfch-mbi1irldvZm8nr9ukLgwGDmgWAK75zOYwSU,391
+ddi_fw-0.0.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+ddi_fw-0.0.2.dist-info/top_level.txt,sha256=Lfsqipq5Jm60ALnmFA_cdNfpVfzBJlKM0GiQ_sB8KGE,75
+ddi_fw-0.0.2.dist-info/RECORD,,
ddi_fw-0.0.1.dist-info/RECORD
DELETED
@@ -1,4 +0,0 @@
-ddi_fw-0.0.1.dist-info/METADATA,sha256=G-yvgvniNCT677dkbA9kmW6mgFh43bIb71E6zxhqnmA,391
-ddi_fw-0.0.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-ddi_fw-0.0.1.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
-ddi_fw-0.0.1.dist-info/RECORD,,
ddi_fw-0.0.1.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-
{ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/WHEEL
File without changes