ddi-fw 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +12 -0
- ddi_fw/datasets/core.py +416 -0
- ddi_fw/datasets/db_utils.py +204 -0
- ddi_fw/datasets/embedding_generator.py +66 -0
- ddi_fw/datasets/embedding_generator_new.py +105 -0
- ddi_fw/datasets/feature_vector_generation.py +100 -0
- ddi_fw/datasets/idf_helper.py +71 -0
- ddi_fw/drugbank/__init__.py +2 -0
- ddi_fw/drugbank/drugbank_parser.py +154 -0
- ddi_fw/drugbank/drugbank_processor.py +343 -0
- ddi_fw/drugbank/drugbank_processor_org.py +272 -0
- ddi_fw/drugbank/event_extractor.py +127 -0
- ddi_fw/experiments/__init__.py +2 -0
- ddi_fw/experiments/custom_torch_model.py +66 -0
- ddi_fw/experiments/evaluation_helper.py +232 -0
- ddi_fw/experiments/tensorflow_helper.py +296 -0
- ddi_fw/experiments/test.py +59 -0
- ddi_fw/ner/__init__.py +1 -0
- ddi_fw/ner/mmlrestclient.py +155 -0
- ddi_fw/ner/ner.py +340 -0
- ddi_fw/utils/__init__.py +3 -0
- ddi_fw/utils/enums.py +23 -0
- ddi_fw/utils/utils.py +103 -0
- ddi_fw/utils/zip_helper.py +66 -0
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/METADATA +1 -1
- ddi_fw-0.0.2.dist-info/RECORD +28 -0
- ddi_fw-0.0.2.dist-info/top_level.txt +5 -0
- ddi_fw-0.0.1.dist-info/RECORD +0 -4
- ddi_fw-0.0.1.dist-info/top_level.txt +0 -1
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/WHEEL +0 -0
ddi_fw/drugbank/drugbank_processor.py
@@ -0,0 +1,343 @@
+import sqlite3
+import pandas as pd
+import os
+import json
+import glob
+from tqdm import tqdm
+
+import csv
+
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import numpy as np
+from drugbank.event_extractor import EventExtractor
+from ner.ner import CTakesNER
+
+from utils import ZipHelper
+# from event_extractor import EventExtractor
+
+
+def multiline_to_singleline(multiline):
+    if multiline is None:
+        return ""
+    return " ".join(line.strip() for line in multiline.splitlines())
+
+# targets -> target -> polypeptide
+# enzymes -> enzyme -> polypeptide
+# pathways from KEGG, KEGG ID is obtained from DrugBank
+# https://www.genome.jp/dbget-bin/www_bget?drug:D03136
+# https://www.kegg.jp/entry/D03136
+
+
+class DrugBankProcessor():
+
+    def mask_interaction(self, drug_1, drug_2, interaction):
+        return interaction.replace(
+            drug_1, "DRUG").replace(drug_2, "DRUG")
+
+    def extract_zip_files(self, input_path='zips', output_path='drugs', override=False):
+        if override:
+            zip_helper = ZipHelper()
+            zip_helper.extract(input_path=input_path, output_path=output_path)
+
+    def get_external_identifiers(self,input_path='drugs'):
+        external_identifier_list = []
+        all_json_files = input_path+'/*.json*'
+
+        for filepath in tqdm(glob.glob(all_json_files)):
+            with open(filepath, 'r', encoding="utf8") as f:
+
+                data = json.load(f)
+                drug_1 = data['name']
+                drug_1_id = [d['value']
+                             for d in data['drugbank-id'] if d['primary'] == True][0]
+                external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
+                external_identifiers_dict = {}
+                external_identifiers_dict['name'] = drug_1
+                external_identifiers_dict['drugbank_id'] = drug_1_id
+                if external_identifiers is not None:
+                    for p in external_identifiers['external-identifier']:
+                        external_identifiers_dict[p['resource'].lower().replace(" ","_")] = p['identifier']
+                # external_identifiers_dict = dict(
+                #     [(p['resource'].lower().replace(" ","_"), p['identifier']) for p in external_identifiers['external-identifier']])
+                # external_identifiers_dict['name'] = drug_1
+                # external_identifiers_dict['drugbank_id'] = drug_1_id
+                external_identifier_list.append(external_identifiers_dict)
+        return external_identifier_list
+
+
+    def process(self, input_path='drugs', output_path='output', save_as_sql=True, db_path = r"./drugbank.db", zip_outputs=True):
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+        ner_df =CTakesNER().load()
+        drugs_pickle_path = output_path+'/drugs.pkl'
+        drugs_csv_path = output_path+'/drugs.gzip'
+        ddi_pickle_path = output_path + '/ddi.pkl'
+        ddi_csv_path = output_path + '/ddi.gzip'
+
+        if not os.path.exists(drugs_pickle_path) or not os.path.exists(ddi_pickle_path):
+            drug_rows = []
+            all_ddis = []
+            external_identifier_list = []
+            all_json_files = input_path+'/*.json*'
+
+            for filepath in tqdm(glob.glob(all_json_files)):
+                with open(filepath, 'r', encoding="utf8") as f:
+
+                    data = json.load(f)
+
+                    # if data['drug-interactions'] is None:
+                    if False:
+                        continue
+                    else:
+                        drug_1 = data['name']
+                        drug_1_id = [d['value']
+                                     for d in data['drugbank-id'] if d['primary'] == True][0]
+                        description = multiline_to_singleline(
+                            data['description'])
+                        if data['drug-interactions'] is not None:
+                            drug_interactions = [
+                                interaction for interaction in data['drug-interactions']['drug-interaction']]
+                            ddis = [(drug_1, interaction['name'], interaction['description'])
+                                    for interaction in data['drug-interactions']['drug-interaction']]
+
+                            ddi_dict = [{
+                                'drug_1_id': drug_1_id,
+                                'drug_1': drug_1,
+                                'drug_2_id': interaction['drugbank-id']['value'],
+                                'drug_2': interaction['name'],
+                                'interaction': interaction['description'],
+                                'masked_interaction': self.mask_interaction(drug_1, interaction['name'], interaction['description'])}
+                                for interaction in data['drug-interactions']['drug-interaction']]
+                            all_ddis.extend(ddi_dict)
+
+                        synthesis_reference = data['synthesis-reference']
+                        indication = multiline_to_singleline(
+                            data['indication'])
+                        pharmacodynamics = multiline_to_singleline(
+                            data['pharmacodynamics'])
+                        mechanism_of_action = multiline_to_singleline(
+                            data['mechanism-of-action'])
+                        toxicity = multiline_to_singleline(data['toxicity'])
+                        metabolism = multiline_to_singleline(
+                            data['metabolism'])
+                        absorption = multiline_to_singleline(
+                            data['absorption'])
+                        half_life = multiline_to_singleline(data['half-life'])
+                        protein_binding = multiline_to_singleline(
+                            data['protein-binding'])
+                        route_of_elimination = multiline_to_singleline(
+                            data['route-of-elimination'])
+                        volume_of_distribution = multiline_to_singleline(
+                            data['volume-of-distribution'])
+                        clearance = multiline_to_singleline(data['clearance'])
+
+                        food_interactions = data['food-interactions']
+                        sequences = data['sequences'] if "sequences" in data else None
+
+                        external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
+                        experimental_properties = data['experimental-properties'] if "experimental-properties" in data else None
+                        calculated_properties = data['calculated-properties'] if "calculated-properties" in data else None
+
+                        enzymes_polypeptides = None
+                        targets_polypeptides = None
+                        pathways = None
+
+                        # targets = data['targets'] if "targets" in data else None
+                        if data['targets'] is not None:
+                            # targets_polypeptides = [p['id'] for d in data['targets']['target'] for p in d['polypeptide'] if 'polypeptide' in d ]
+                            targets_polypeptides = [
+                                p['id'] for d in data['targets']['target'] if 'polypeptide' in d for p in d['polypeptide']]
+
+                        if data['enzymes'] is not None:
+                            # enzymes_polypeptides = [p['id'] for d in data['enzymes']['enzyme'] for p in d['polypeptide'] if 'polypeptide' in d]
+                            enzymes_polypeptides = [
+                                p['id'] for d in data['enzymes']['enzyme'] if 'polypeptide' in d for p in d['polypeptide']]
+
+                        if data['pathways'] is not None:
+                            pathways = [
+                                d['smpdb-id'] for d in data['pathways']['pathway']]
+
+
+                        if external_identifiers is not None:
+                            external_identifiers_dict = dict(
+                                [(p['resource'], p['identifier']) for p in external_identifiers['external-identifier']])
+                            external_identifiers_dict['drugbank_id'] = drug_1_id
+                            external_identifier_list.append(external_identifiers_dict)
+                        # add note column
+                        smiles = None
+                        morgan_hashed = None
+                        if calculated_properties is not None:
+                            calculated_properties_dict = dict(
+                                [(p['kind'], p['value']) for p in calculated_properties['property']])
+                            smiles = calculated_properties_dict['SMILES'] if 'SMILES' in calculated_properties_dict else None
+                            if smiles is not None:
+                                try:
+                                    mol = Chem.MolFromSmiles(smiles)
+                                    morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(
+                                        mol, 2, nBits=881).ToList()
+                                except:
+                                    print("An exception occurred")
+                        if morgan_hashed is None:
+                            morgan_hashed = np.zeros(881).tolist()
+
+                        # TODO cui, tui, entities other types of texts, test it
+                        tuis_description = ner_df[ner_df['drugbank_id'] == drug_1_id]['tui_description'].values
+                        if len(tuis_description) > 0:
+                            tuis_description = tuis_description[0]
+                        else:
+                            tuis_description = None
+
+                        cuis_description = ner_df[ner_df['drugbank_id'] == drug_1_id]['cui_description'].values
+                        if len(cuis_description) > 0:
+                            cuis_description = cuis_description[0]
+                        else:
+                            cuis_description = None
+
+                        entities_description = ner_df[ner_df['drugbank_id'] == drug_1_id]['entities_description'].values
+                        if len(entities_description) > 0:
+                            entities_description = entities_description[0]
+                        else:
+                            entities_description = None
+
+                        # k = [p[k] for p in calculated_properties['property'] for k in p.keys() if k =='SMILES']
+                        # external_identifiers['external-identifier']
+                        # experimental_properties['property']
+
+                        # list to single line reminder
+                        row = {'drugbank_id': drug_1_id,
+                               'name': drug_1,
+                               'description': description,
+                               'synthesis_reference': synthesis_reference,
+                               'indication': indication,
+                               'pharmacodynamics': pharmacodynamics,
+                               'mechanism_of_action': mechanism_of_action,
+                               'toxicity': toxicity,
+                               'metabolism': metabolism,
+                               'absorption': absorption,
+                               'half_life': half_life,
+                               'protein_binding': protein_binding,
+                               'route_of_elimination': route_of_elimination,
+                               'volume_of_distribution': volume_of_distribution,
+                               'clearance': clearance,
+                               'smiles': smiles,
+                               'smiles_morgan_fingerprint': ','.join(map(str, morgan_hashed)),
+                               'enzymes_polypeptides': '|'.join(enzymes_polypeptides) if enzymes_polypeptides is not None else None,
+                               'targets_polypeptides': '|'.join(targets_polypeptides) if targets_polypeptides is not None else None,
+                               'pathways': '|'.join(pathways) if pathways is not None else None,
+                               'tuis_description':'|'.join(tuis_description) if tuis_description is not None else None,
+                               'cuis_description':'|'.join(cuis_description) if cuis_description is not None else None,
+                               'entities_description':'|'.join(entities_description) if entities_description is not None else None
+                               # 'external_identifiers': external_identifiers_dict
+                               }
+                        drug_rows.append(row)
+
+                        # if len(drug_rows) == 10:
+                        #     break
+            # print(smiles_count)
+            print(f"Size of drugs {len(drug_rows)}")
+            print(f"Size of DDIs {len(all_ddis)}")
+            np.set_printoptions(threshold=np.inf)
+
+            # drug_names = [row['name'] for row in drug_rows]
+            drug_names = ['DRUG']
+            event_extractor = EventExtractor(drug_names)
+
+            replace_dict = {'MYO-029': 'Stamulumab'}
+            for ddi in tqdm(all_ddis):
+                for key, value in replace_dict.items():
+                    ddi['masked_interaction'] = ddi['masked_interaction'].replace(
+                        key, value)
+                # interaction = ddi['interaction']
+                # mechanism, action, drugA, drugB = event_extractor.extract(interaction)
+                # ddi['mechanism'] = mechanism
+                # ddi['action'] = action
+
+            self.drugs_df = pd.DataFrame(drug_rows)
+            # self.drugs_df.to_pickle(drugs_pickle_path)
+            # self.drugs_df.to_csv(
+            #     drugs_csv_path, index=False, compression='gzip')
+
+            # print('mechanism_action calculation')
+            self.ddis_df = pd.DataFrame(all_ddis)
+
+            count = [0]
+
+            def fnc2(interaction, count):
+                count[0] = count[0] + 1
+                if count[0] % 1000 == 0:
+                    print(f'{count[0]}/{len(all_ddis)}')
+                mechanism, action, drugA, drugB = event_extractor.extract(
+                    interaction)
+                return mechanism+'__' + action
+
+            # self.ddis_df['mechanism_action'] = self.ddis_df['interaction'].apply(lambda x: fnc2(x))
+            # tqdm.pandas()
+            self.ddis_df['mechanism_action'] = self.ddis_df['masked_interaction'].apply(
+                fnc2, args=(count,))
+
+            # self.ddis_df.to_csv(ddi_csv_path, index=False, compression='gzip')
+            # self.ddis_df.to_pickle(ddi_pickle_path)
+            zip_helper = ZipHelper()
+
+            if save_as_sql:
+                conn = sqlite3.connect(db_path)
+                # self.drugs_df.to_sql('_Drugs', conn, if_exists='replace', index=True)
+                # self.ddis_df.to_sql('_Interactions', conn, if_exists='replace', index=True)
+                ext_id_df= pd.DataFrame.from_records(external_identifier_list)
+                ext_id_df.to_sql('_ExternalIdentifiers', conn, if_exists='replace', index=True)
+
+                zip_helper.zip_single_file(
+                    file_path=db_path, output_path=output_path+'/zips', name='db')
+                conn.close()
+
+            if zip_outputs:
+                zip_helper.zip_single_file(
+                    file_path=drugs_pickle_path, output_path=output_path+'/zips', name='drugs-pickle')
+                zip_helper.zip_single_file(
+                    file_path=ddi_pickle_path, output_path=output_path+'/zips', name='ddi-pickle')
+
+        else:
+            print('Output path has processed data, load function is called')
+            self.load(output_path)
+
+    def load(self, path):
+        drugs_pickle_path = path+'/drugs.pkl'
+        ddi_pickle_path = path+'/ddi.pkl'
+        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
+            self.drugs_df = pd.read_pickle(drugs_pickle_path)
+            self.ddis_df = pd.read_pickle(ddi_pickle_path)
+        else:
+            print('One of given paths could not found')
+
+    def load_from_csv(self, path):
+        drugs_csv_path = path+'/drugs.gzip'
+        ddi_csv_path = path+'/ddi.gzip'
+        if os.path.exists(drugs_csv_path) and os.path.exists(ddi_csv_path):
+            self.drugs_df = pd.read_csv(drugs_csv_path, compression='gzip')
+            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
+        else:
+            print('One of given paths could not found')
+
+    def load2(self, path):
+        drugs_pickle_path = path+'/drugs.pkl'
+        ddi_csv_path = path+'/ddi.gzip'
+        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_csv_path):
+            self.drugs_df = pd.read_pickle(drugs_pickle_path)
+            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
+        else:
+            print('One of given paths could not found')
+
+    def drugs_as_dataframe(self):
+        return self.drugs_df
+
+    def filtered_drugs_as_dataframe(self, drug_ids):
+        return self.drugs_df[self.drugs_df['drugbank_id'].isin(drug_ids)]
+
+    def ddis_as_dataframe(self):
+        return self.ddis_df
+
+    def filtered_ddis(self, drugs):
+        ddis_df = self.ddis_df.copy()
+        return ddis_df[(ddis_df['drug_1'] in drugs) & (
+            ddis_df['drug_2'] in drugs)]
ddi_fw/drugbank/drugbank_processor_org.py
@@ -0,0 +1,272 @@
+import pandas as pd
+import os
+import json
+import glob
+from tqdm import tqdm
+
+import csv
+
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import numpy as np
+from drugbank.event_extractor import EventExtractor
+
+from zip_helper import ZipHelper
+# from event_extractor import EventExtractor
+
+
+def multiline_to_singleline(multiline):
+    if multiline is None:
+        return ""
+    return " ".join(line.strip() for line in multiline.splitlines())
+
+# targets -> target -> polypeptide
+# enzymes -> enzyme -> polypeptide
+# pathways from KEGG, KEGG ID is obtained from DrugBank
+# https://www.genome.jp/dbget-bin/www_bget?drug:D03136
+# https://www.kegg.jp/entry/D03136
+
+
+class DrugBankProcessor():
+
+    def mask_interaction(self, drug_1, drug_2, interaction):
+        return interaction.replace(
+            drug_1, "DRUG").replace(drug_2, "DRUG")
+
+    def extract_zip_files(self, input_path='zips', output_path='drugs', override=False):
+        if override:
+            zip_helper = ZipHelper()
+            zip_helper.extract(input_path=input_path, output_path=output_path)
+
+    def process(self, input_path='drugs', output_path='output', zip_outputs=True):
+        if not os.path.exists(output_path):
+            os.makedirs(output_path)
+
+        drugs_pickle_path = output_path+'/drugs.pkl'
+        drugs_csv_path = output_path+'/drugs.gzip'
+        ddi_pickle_path = output_path + '/ddi.pkl'
+        ddi_csv_path = output_path + '/ddi.gzip'
+
+        if not os.path.exists(drugs_pickle_path) or not os.path.exists(ddi_pickle_path):
+            drug_rows = []
+            all_ddis = []
+            all_json_files = input_path+'/*.json*'
+
+            for filepath in tqdm(glob.glob(all_json_files)):
+                with open(filepath, 'r', encoding="utf8") as f:
+
+                    data = json.load(f)
+
+                    # if data['drug-interactions'] is None:
+                    if False:
+                        continue
+                    else:
+                        drug_1 = data['name']
+                        drug_1_id = [d['value']
+                                     for d in data['drugbank-id'] if d['primary'] == True][0]
+                        description = multiline_to_singleline(
+                            data['description'])
+                        if data['drug-interactions'] is not None:
+                            drug_interactions = [
+                                interaction for interaction in data['drug-interactions']['drug-interaction']]
+                            ddis = [(drug_1, interaction['name'], interaction['description'])
+                                    for interaction in data['drug-interactions']['drug-interaction']]
+
+                            ddi_dict = [{
+                                'drug_1_id': drug_1_id,
+                                'drug_1': drug_1,
+                                'drug_2_id': interaction['drugbank-id']['value'],
+                                'drug_2': interaction['name'],
+                                'interaction': interaction['description'],
+                                'masked_interaction': self.mask_interaction(drug_1, interaction['name'], interaction['description'])}
+                                for interaction in data['drug-interactions']['drug-interaction']]
+                            all_ddis.extend(ddi_dict)
+
+                        synthesis_reference = data['synthesis-reference']
+                        indication = multiline_to_singleline(
+                            data['indication'])
+                        pharmacodynamics = multiline_to_singleline(
+                            data['pharmacodynamics'])
+                        mechanism_of_action = multiline_to_singleline(
+                            data['mechanism-of-action'])
+                        toxicity = multiline_to_singleline(data['toxicity'])
+                        metabolism = multiline_to_singleline(
+                            data['metabolism'])
+                        absorption = multiline_to_singleline(
+                            data['absorption'])
+                        half_life = multiline_to_singleline(data['half-life'])
+                        protein_binding = multiline_to_singleline(
+                            data['protein-binding'])
+                        route_of_elimination = multiline_to_singleline(
+                            data['route-of-elimination'])
+                        volume_of_distribution = multiline_to_singleline(
+                            data['volume-of-distribution'])
+                        clearance = multiline_to_singleline(data['clearance'])
+
+                        food_interactions = data['food-interactions']
+                        sequences = data['sequences'] if "sequences" in data else None
+
+                        external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
+                        experimental_properties = data['experimental-properties'] if "experimental-properties" in data else None
+                        calculated_properties = data['calculated-properties'] if "calculated-properties" in data else None
+
+                        enzymes_polypeptides = None
+                        targets_polypeptides = None
+
+                        # targets = data['targets'] if "targets" in data else None
+                        if data['targets'] is not None:
+                            # targets_polypeptides = [p['id'] for d in data['targets']['target'] for p in d['polypeptide'] if 'polypeptide' in d ]
+                            targets_polypeptides = [
+                                p['id'] for d in data['targets']['target'] if 'polypeptide' in d for p in d['polypeptide']]
+
+                        if data['enzymes'] is not None:
+                            # enzymes_polypeptides = [p['id'] for d in data['enzymes']['enzyme'] for p in d['polypeptide'] if 'polypeptide' in d]
+                            enzymes_polypeptides = [
+                                p['id'] for d in data['enzymes']['enzyme'] if 'polypeptide' in d for p in d['polypeptide']]
+
+                        if external_identifiers is not None:
+                            external_identifiers_dict = dict(
+                                [(p['resource'], p['identifier']) for p in external_identifiers['external-identifier']])
+
+                        # add note column
+                        smiles = None
+                        morgan_hashed = None
+                        if calculated_properties is not None:
+                            calculated_properties_dict = dict(
+                                [(p['kind'], p['value']) for p in calculated_properties['property']])
+                            smiles = calculated_properties_dict['SMILES'] if 'SMILES' in calculated_properties_dict else None
+                            if smiles is not None:
+                                try:
+                                    mol = Chem.MolFromSmiles(smiles)
+                                    morgan_hashed = AllChem.GetMorganFingerprintAsBitVect(
+                                        mol, 2, nBits=881).ToList()
+                                except:
+                                    print("An exception occurred")
+                        if morgan_hashed is None:
+                            morgan_hashed = np.zeros(881)
+
+                        # k = [p[k] for p in calculated_properties['property'] for k in p.keys() if k =='SMILES']
+                        # external_identifiers['external-identifier']
+                        # experimental_properties['property']
+
+                        row = {'drugbank_id': drug_1_id,
+                               'name': drug_1,
+                               'description': description,
+                               'synthesis_reference': synthesis_reference,
+                               'indication': indication,
+                               'pharmacodynamics': pharmacodynamics,
+                               'mechanism_of_action': mechanism_of_action,
+                               'toxicity': toxicity,
+                               'metabolism': metabolism,
+                               'absorption': absorption,
+                               'half_life': half_life,
+                               'protein_binding': protein_binding,
+                               'route_of_elimination': route_of_elimination,
+                               'volume_of_distribution': volume_of_distribution,
+                               'clearance': clearance,
+                               'smiles': smiles,
+                               'smiles_morgan_fingerprint': morgan_hashed,
+                               'enzymes_polypeptides': enzymes_polypeptides,
+                               'targets_polypeptides': targets_polypeptides,
+                               'external_identifiers': external_identifiers_dict
+                               }
+                        drug_rows.append(row)
+
+                        # if len(drug_rows) == 10:
+                        #     break
+            # print(smiles_count)
+            print(f"Size of drugs {len(drug_rows)}")
+            print(f"Size of DDIs {len(all_ddis)}")
+            np.set_printoptions(threshold=np.inf)
+
+            # drug_names = [row['name'] for row in drug_rows]
+            drug_names = ['DRUG']
+            event_extractor = EventExtractor(drug_names)
+
+            replace_dict = {'MYO-029': 'Stamulumab'}
+            for ddi in tqdm(all_ddis):
+                for key, value in replace_dict.items():
+                    ddi['masked_interaction'] = ddi['masked_interaction'].replace(
+                        key, value)
+                # interaction = ddi['interaction']
+                # mechanism, action, drugA, drugB = event_extractor.extract(interaction)
+                # ddi['mechanism'] = mechanism
+                # ddi['action'] = action
+
+            self.drugs_df = pd.DataFrame(drug_rows)
+            self.drugs_df.to_pickle(drugs_pickle_path)
+            self.drugs_df.to_csv(
+                drugs_csv_path, index=False, compression='gzip')
+
+            # print('mechanism_action calculation')
+            self.ddis_df = pd.DataFrame(all_ddis)
+
+            count = [0]
+
+            def fnc2(interaction, count):
+                count[0] = count[0] + 1
+                if count[0] % 1000 == 0:
+                    print(f'{count[0]}/{len(all_ddis)}')
+                mechanism, action, drugA, drugB = event_extractor.extract(
+                    interaction)
+                return mechanism+'__' + action
+
+            # self.ddis_df['mechanism_action'] = self.ddis_df['interaction'].apply(lambda x: fnc2(x))
+            # tqdm.pandas()
+            self.ddis_df['mechanism_action'] = self.ddis_df['masked_interaction'].apply(
+                fnc2, args=(count,))
+
+            self.ddis_df.to_csv(ddi_csv_path, index=False, compression='gzip')
+            self.ddis_df.to_pickle(ddi_pickle_path)
+
+            if zip_outputs:
+                zip_helper = ZipHelper()
+                zip_helper.zip_single_file(
+                    file_path=drugs_pickle_path, output_path=output_path+'/zips', name='drugs-pickle')
+                zip_helper.zip_single_file(
+                    file_path=ddi_pickle_path, output_path=output_path+'/zips', name='ddi-pickle')
+
+        else:
+            print('Output path has processed data, load function is called')
+            self.load(output_path)
+
+    def load(self, path):
+        drugs_pickle_path = path+'/drugs.pkl'
+        ddi_pickle_path = path+'/ddi.pkl'
+        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
+            self.drugs_df = pd.read_pickle(drugs_pickle_path)
+            self.ddis_df = pd.read_pickle(ddi_pickle_path)
+        else:
+            print('One of given paths could not found')
+
+    def load_from_csv(self, path):
+        drugs_csv_path = path+'/drugs.gzip'
+        ddi_csv_path = path+'/ddi.gzip'
+        if os.path.exists(drugs_csv_path) and os.path.exists(ddi_csv_path):
+            self.drugs_df = pd.read_csv(drugs_csv_path, compression='gzip')
+            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
+        else:
+            print('One of given paths could not found')
+
+    def load2(self, path):
+        drugs_pickle_path = path+'/drugs.pkl'
+        ddi_csv_path = path+'/ddi.gzip'
+        if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_csv_path):
+            self.drugs_df = pd.read_pickle(drugs_pickle_path)
+            self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
+        else:
+            print('One of given paths could not found')
+
+    def drugs_as_dataframe(self):
+        return self.drugs_df
+
+    def filtered_drugs_as_dataframe(self, drug_ids):
+        return self.drugs_df[self.drugs_df['drugbank_id'].isin(drug_ids)]
+
+    def ddis_as_dataframe(self):
+        return self.ddis_df
+
+    def filtered_ddis(self, drugs):
+        ddis_df = self.ddis_df.copy()
+        return ddis_df[(ddis_df['drug_1'] in drugs) & (
+            ddis_df['drug_2'] in drugs)]
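For orientation, the sketch below shows how the new `DrugBankProcessor` from `ddi_fw/drugbank/drugbank_processor.py` (added in 0.0.2) might be driven end to end. It is only an illustration assembled from the method signatures and default arguments visible in the diff above; the import path, the directory layout, and the availability of a cTAKES NER result set for `CTakesNER().load()` are assumptions, not documented behavior of the package.

```python
# Hypothetical usage sketch, not part of the diff. Assumes the DrugBank JSON
# archives have already been downloaded and that a cTAKES NER result set is
# available, since process() calls CTakesNER().load() internally.
from ddi_fw.drugbank.drugbank_processor import DrugBankProcessor  # import path assumed from the wheel layout

processor = DrugBankProcessor()

# Unpack the downloaded DrugBank JSON archives into ./drugs (defaults from the diff).
processor.extract_zip_files(input_path='zips', output_path='drugs', override=True)

# Parse every drug JSON, build the drug and DDI DataFrames, write the external
# identifiers to SQLite, and zip the outputs (all defaults shown in the diff).
processor.process(input_path='drugs', output_path='output',
                  save_as_sql=True, db_path=r'./drugbank.db', zip_outputs=True)

drugs_df = processor.drugs_as_dataframe()   # one row per drug
ddis_df = processor.ddis_as_dataframe()     # one row per drug-drug interaction
```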