ddi-fw 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
1
+ import sqlite3
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import glob
6
+ from tqdm import tqdm
7
+
8
+ import csv
9
+
10
+ from rdkit import Chem
11
+ from rdkit.Chem import AllChem
12
+ import numpy as np
13
+ from drugbank.event_extractor import EventExtractor
14
+ from ner.ner import CTakesNER
15
+
16
+ from utils import ZipHelper
17
+ # from event_extractor import EventExtractor
18
+
19
+
20
def multiline_to_singleline(multiline):
    """Collapse a multi-line string into a single line.

    Every line is stripped of leading/trailing whitespace and the lines are
    joined with one space. ``None`` is mapped to the empty string.
    """
    if multiline is None:
        return ""
    return " ".join(map(str.strip, multiline.splitlines()))
24
+
25
+ # targets -> target -> polypeptide
26
+ # enzymes -> enzyme -> polypeptide
27
+ # pathways from KEGG, KEGG ID is obtained from DrugBank
28
+ # https://www.genome.jp/dbget-bin/www_bget?drug:D03136
29
+ # https://www.kegg.jp/entry/D03136
30
+
31
+
32
+ class DrugBankProcessor():
33
+
34
def mask_interaction(self, drug_1, drug_2, interaction):
    """Return *interaction* with both drug names replaced by ``"DRUG"``.

    ``drug_1`` is substituted first; ``drug_2`` is then substituted in the
    already-masked text.
    """
    masked = interaction.replace(drug_1, "DRUG")
    masked = masked.replace(drug_2, "DRUG")
    return masked
37
+
38
def extract_zip_files(self, input_path='zips', output_path='drugs', override=False):
    """Extract the archives under *input_path* into *output_path*.

    Nothing happens unless *override* is truthy; extraction itself is
    delegated to ``ZipHelper.extract``.
    """
    if not override:
        return
    ZipHelper().extract(input_path=input_path, output_path=output_path)
42
+
43
# Collect one dict of identifiers per drug JSON file found under `input_path`
# (glob pattern `<input_path>/*.json*`) and return them as a list.
# Each dict always carries 'name' and 'drugbank_id'; when the document has an
# 'external-identifiers' entry, one extra key per resource is added, with the
# resource name lower-cased and spaces replaced by underscores.
+ def get_external_identifiers(self,input_path='drugs'):
44
+ external_identifier_list = []
45
+ all_json_files = input_path+'/*.json*'
46
+
47
+ for filepath in tqdm(glob.glob(all_json_files)):
48
+ with open(filepath, 'r', encoding="utf8") as f:
49
+
50
+ data = json.load(f)
51
+ drug_1 = data['name']
# Take the value of the first 'drugbank-id' entry flagged as primary;
# the trailing [0] raises IndexError when no entry is flagged.
52
+ drug_1_id = [d['value']
53
+ for d in data['drugbank-id'] if d['primary'] == True][0]
54
+ external_identifiers = data['external-identifiers'] if "external-identifiers" in data else None
55
+ external_identifiers_dict = {}
56
+ external_identifiers_dict['name'] = drug_1
57
+ external_identifiers_dict['drugbank_id'] = drug_1_id
58
+ if external_identifiers is not None:
59
+ for p in external_identifiers['external-identifier']:
60
+ external_identifiers_dict[p['resource'].lower().replace(" ","_")] = p['identifier']
61
+ # external_identifiers_dict = dict(
62
+ # [(p['resource'].lower().replace(" ","_"), p['identifier']) for p in external_identifiers['external-identifier']])
63
+ # external_identifiers_dict['name'] = drug_1
64
+ # external_identifiers_dict['drugbank_id'] = drug_1_id
# NOTE(review): indentation is lost in this rendering, so it is not visible
# whether the append below runs for every drug or only when external
# identifiers are present - confirm against the packaged source.
65
+ external_identifier_list.append(external_identifiers_dict)
66
+ return external_identifier_list
67
+
68
+
69
def process(self, input_path='drugs', output_path='output', save_as_sql=True, db_path=r"./drugbank.db", zip_outputs=True):
    """Build the drug and drug-drug-interaction (DDI) tables from drug JSON files.

    When both ``drugs.pkl`` and ``ddi.pkl`` already exist under *output_path*
    the method only calls ``self.load(output_path)``. Otherwise every file
    matching ``<input_path>/*.json*`` is parsed and ``self.drugs_df`` and
    ``self.ddis_df`` are populated.

    Args:
        input_path: Directory holding one JSON document per drug.
        output_path: Output directory; created when it does not exist.
        save_as_sql: When true, write the collected external identifiers to
            the ``_ExternalIdentifiers`` table of the SQLite file *db_path*
            and zip that file into ``<output_path>/zips``.
        db_path: Path of the SQLite database file.
        zip_outputs: When true, zip the drug and DDI pickle paths into
            ``<output_path>/zips``.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    ner_df = CTakesNER().load()
    drugs_pickle_path = output_path+'/drugs.pkl'
    ddi_pickle_path = output_path + '/ddi.pkl'

    if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
        print('Output path has processed data, load function is called')
        self.load(output_path)
        return

    drug_rows = []
    all_ddis = []
    external_identifier_list = []

    for filepath in tqdm(glob.glob(input_path+'/*.json*')):
        with open(filepath, 'r', encoding="utf8") as f:
            data = json.load(f)

        drug_1 = data['name']
        # Value of the first 'drugbank-id' entry flagged as primary.
        drug_1_id = [d['value']
                     for d in data['drugbank-id'] if d['primary'] == True][0]

        if data['drug-interactions'] is not None:
            all_ddis.extend([{
                'drug_1_id': drug_1_id,
                'drug_1': drug_1,
                'drug_2_id': interaction['drugbank-id']['value'],
                'drug_2': interaction['name'],
                'interaction': interaction['description'],
                'masked_interaction': self.mask_interaction(drug_1, interaction['name'], interaction['description'])}
                for interaction in data['drug-interactions']['drug-interaction']])

        pathways = None
        if data['pathways'] is not None:
            pathways = [d['smpdb-id'] for d in data['pathways']['pathway']]

        external_identifiers = data.get('external-identifiers')
        if external_identifiers is not None:
            external_identifiers_dict = {
                p['resource']: p['identifier']
                for p in external_identifiers['external-identifier']}
            external_identifiers_dict['drugbank_id'] = drug_1_id
            external_identifier_list.append(external_identifiers_dict)

        smiles = None
        calculated_properties = data.get('calculated-properties')
        if calculated_properties is not None:
            calculated_properties_dict = {
                p['kind']: p['value'] for p in calculated_properties['property']}
            smiles = calculated_properties_dict.get('SMILES')
        morgan_hashed = self._morgan_fingerprint_bits(smiles)

        # TODO cui, tui, entities other types of texts, test it
        # Filter the NER frame once per drug instead of once per column.
        ner_rows = ner_df[ner_df['drugbank_id'] == drug_1_id]

        drug_rows.append({
            'drugbank_id': drug_1_id,
            'name': drug_1,
            'description': multiline_to_singleline(data['description']),
            'synthesis_reference': data['synthesis-reference'],
            'indication': multiline_to_singleline(data['indication']),
            'pharmacodynamics': multiline_to_singleline(data['pharmacodynamics']),
            'mechanism_of_action': multiline_to_singleline(data['mechanism-of-action']),
            'toxicity': multiline_to_singleline(data['toxicity']),
            'metabolism': multiline_to_singleline(data['metabolism']),
            'absorption': multiline_to_singleline(data['absorption']),
            'half_life': multiline_to_singleline(data['half-life']),
            'protein_binding': multiline_to_singleline(data['protein-binding']),
            'route_of_elimination': multiline_to_singleline(data['route-of-elimination']),
            'volume_of_distribution': multiline_to_singleline(data['volume-of-distribution']),
            'clearance': multiline_to_singleline(data['clearance']),
            'smiles': smiles,
            'smiles_morgan_fingerprint': ','.join(map(str, morgan_hashed)),
            'enzymes_polypeptides': self._pipe_join(
                self._polypeptide_ids(data['enzymes'], 'enzyme')),
            'targets_polypeptides': self._pipe_join(
                self._polypeptide_ids(data['targets'], 'target')),
            'pathways': self._pipe_join(pathways),
            'tuis_description': self._pipe_join(
                self._first_or_none(ner_rows['tui_description'].values)),
            'cuis_description': self._pipe_join(
                self._first_or_none(ner_rows['cui_description'].values)),
            'entities_description': self._pipe_join(
                self._first_or_none(ner_rows['entities_description'].values)),
        })

    print(f"Size of drugs {len(drug_rows)}")
    print(f"Size of DDIs {len(all_ddis)}")
    np.set_printoptions(threshold=np.inf)

    # The masked texts only ever contain the placeholder name.
    event_extractor = EventExtractor(['DRUG'])

    replace_dict = {'MYO-029': 'Stamulumab'}
    for ddi in tqdm(all_ddis):
        for key, value in replace_dict.items():
            ddi['masked_interaction'] = ddi['masked_interaction'].replace(
                key, value)

    self.drugs_df = pd.DataFrame(drug_rows)
    # Explicit columns keep the frame usable when no interaction was found
    # (an empty list would otherwise yield a frame without any column).
    self.ddis_df = pd.DataFrame(all_ddis, columns=[
        'drug_1_id', 'drug_1', 'drug_2_id', 'drug_2',
        'interaction', 'masked_interaction'])

    total = len(all_ddis)
    processed = 0

    def to_mechanism_action(interaction):
        # Print progress every 1000 interactions.
        nonlocal processed
        processed += 1
        if processed % 1000 == 0:
            print(f'{processed}/{total}')
        mechanism, action, _drug_a, _drug_b = event_extractor.extract(
            interaction)
        return mechanism+'__' + action

    self.ddis_df['mechanism_action'] = self.ddis_df['masked_interaction'].apply(
        to_mechanism_action)

    zip_helper = ZipHelper()

    if save_as_sql:
        ext_id_df = pd.DataFrame.from_records(external_identifier_list)
        conn = sqlite3.connect(db_path)
        try:
            ext_id_df.to_sql('_ExternalIdentifiers', conn,
                             if_exists='replace', index=True)
        finally:
            # Close even when the write fails, and before the file is zipped.
            conn.close()
        zip_helper.zip_single_file(
            file_path=db_path, output_path=output_path+'/zips', name='db')

    if zip_outputs:
        # NOTE(review): this method does not write drugs.pkl / ddi.pkl itself,
        # so these paths may not exist at this point - confirm intended flow.
        zip_helper.zip_single_file(
            file_path=drugs_pickle_path, output_path=output_path+'/zips', name='drugs-pickle')
        zip_helper.zip_single_file(
            file_path=ddi_pickle_path, output_path=output_path+'/zips', name='ddi-pickle')

def _polypeptide_ids(self, section, item_key):
    """Return the polypeptide ids under ``section[item_key]``, or None when *section* is None."""
    if section is None:
        return None
    return [p['id'] for d in section[item_key] if 'polypeptide' in d
            for p in d['polypeptide']]

def _morgan_fingerprint_bits(self, smiles, n_bits=881):
    """Return the radius-2 Morgan fingerprint of *smiles* as a list of bits.

    A list of *n_bits* zeros is returned when *smiles* is None, cannot be
    parsed, or fingerprinting fails.
    """
    if smiles is not None:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                print("An exception occurred")
            else:
                return AllChem.GetMorganFingerprintAsBitVect(
                    mol, 2, nBits=n_bits).ToList()
        except Exception:
            print("An exception occurred")
    return np.zeros(n_bits).tolist()

def _first_or_none(self, values):
    """Return the first element of *values*, or None when it is empty."""
    return values[0] if len(values) > 0 else None

def _pipe_join(self, items):
    """Join *items* with ``'|'``; None is passed through unchanged."""
    return '|'.join(items) if items is not None else None
303
+
304
+ def load(self, path):
305
+ drugs_pickle_path = path+'/drugs.pkl'
306
+ ddi_pickle_path = path+'/ddi.pkl'
307
+ if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
308
+ self.drugs_df = pd.read_pickle(drugs_pickle_path)
309
+ self.ddis_df = pd.read_pickle(ddi_pickle_path)
310
+ else:
311
+ print('One of given paths could not found')
312
+
313
+ def load_from_csv(self, path):
314
+ drugs_csv_path = path+'/drugs.gzip'
315
+ ddi_csv_path = path+'/ddi.gzip'
316
+ if os.path.exists(drugs_csv_path) and os.path.exists(ddi_csv_path):
317
+ self.drugs_df = pd.read_csv(drugs_csv_path, compression='gzip')
318
+ self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
319
+ else:
320
+ print('One of given paths could not found')
321
+
322
+ def load2(self, path):
323
+ drugs_pickle_path = path+'/drugs.pkl'
324
+ ddi_csv_path = path+'/ddi.gzip'
325
+ if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_csv_path):
326
+ self.drugs_df = pd.read_pickle(drugs_pickle_path)
327
+ self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
328
+ else:
329
+ print('One of given paths could not found')
330
+
331
+ def drugs_as_dataframe(self):
332
+ return self.drugs_df
333
+
334
+ def filtered_drugs_as_dataframe(self, drug_ids):
335
+ return self.drugs_df[self.drugs_df['drugbank_id'].isin(drug_ids)]
336
+
337
+ def ddis_as_dataframe(self):
338
+ return self.ddis_df
339
+
340
+ def filtered_ddis(self, drugs):
341
+ ddis_df = self.ddis_df.copy()
342
+ return ddis_df[(ddis_df['drug_1'] in drugs) & (
343
+ ddis_df['drug_2'] in drugs)]
@@ -0,0 +1,272 @@
1
+ import pandas as pd
2
+ import os
3
+ import json
4
+ import glob
5
+ from tqdm import tqdm
6
+
7
+ import csv
8
+
9
+ from rdkit import Chem
10
+ from rdkit.Chem import AllChem
11
+ import numpy as np
12
+ from drugbank.event_extractor import EventExtractor
13
+
14
+ from zip_helper import ZipHelper
15
+ # from event_extractor import EventExtractor
16
+
17
+
18
def multiline_to_singleline(multiline):
    """Collapse a multi-line string into a single line.

    Every line is stripped of leading/trailing whitespace and the lines are
    joined with one space. ``None`` is mapped to the empty string.
    """
    if multiline is None:
        return ""
    return " ".join(map(str.strip, multiline.splitlines()))
22
+
23
+ # targets -> target -> polypeptide
24
+ # enzymes -> enzyme -> polypeptide
25
+ # pathways from KEGG, KEGG ID is obtained from DrugBank
26
+ # https://www.genome.jp/dbget-bin/www_bget?drug:D03136
27
+ # https://www.kegg.jp/entry/D03136
28
+
29
+
30
+ class DrugBankProcessor():
31
+
32
def mask_interaction(self, drug_1, drug_2, interaction):
    """Return *interaction* with both drug names replaced by ``"DRUG"``.

    ``drug_1`` is substituted first; ``drug_2`` is then substituted in the
    already-masked text.
    """
    masked = interaction.replace(drug_1, "DRUG")
    masked = masked.replace(drug_2, "DRUG")
    return masked
35
+
36
def extract_zip_files(self, input_path='zips', output_path='drugs', override=False):
    """Extract the archives under *input_path* into *output_path*.

    Nothing happens unless *override* is truthy; extraction itself is
    delegated to ``ZipHelper.extract``.
    """
    if not override:
        return
    ZipHelper().extract(input_path=input_path, output_path=output_path)
40
+
41
def process(self, input_path='drugs', output_path='output', zip_outputs=True):
    """Build and persist the drug and drug-drug-interaction (DDI) tables.

    When both ``drugs.pkl`` and ``ddi.pkl`` already exist under *output_path*
    the method only calls ``self.load(output_path)``. Otherwise every file
    matching ``<input_path>/*.json*`` is parsed, ``self.drugs_df`` and
    ``self.ddis_df`` are populated, and both are written as pickle and as
    gzip-compressed CSV under *output_path*.

    Args:
        input_path: Directory holding one JSON document per drug.
        output_path: Output directory; created when it does not exist.
        zip_outputs: When true, zip the two pickle files into
            ``<output_path>/zips``.
    """
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    drugs_pickle_path = output_path+'/drugs.pkl'
    drugs_csv_path = output_path+'/drugs.gzip'
    ddi_pickle_path = output_path + '/ddi.pkl'
    ddi_csv_path = output_path + '/ddi.gzip'

    if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
        print('Output path has processed data, load function is called')
        self.load(output_path)
        return

    drug_rows = []
    all_ddis = []

    for filepath in tqdm(glob.glob(input_path+'/*.json*')):
        with open(filepath, 'r', encoding="utf8") as f:
            data = json.load(f)

        drug_1 = data['name']
        # Value of the first 'drugbank-id' entry flagged as primary.
        drug_1_id = [d['value']
                     for d in data['drugbank-id'] if d['primary'] == True][0]

        if data['drug-interactions'] is not None:
            all_ddis.extend([{
                'drug_1_id': drug_1_id,
                'drug_1': drug_1,
                'drug_2_id': interaction['drugbank-id']['value'],
                'drug_2': interaction['name'],
                'interaction': interaction['description'],
                'masked_interaction': self.mask_interaction(drug_1, interaction['name'], interaction['description'])}
                for interaction in data['drug-interactions']['drug-interaction']])

        # Reset per drug: previously a drug without external identifiers
        # either raised NameError or silently reused the previous drug's dict.
        external_identifiers_dict = None
        external_identifiers = data.get('external-identifiers')
        if external_identifiers is not None:
            external_identifiers_dict = {
                p['resource']: p['identifier']
                for p in external_identifiers['external-identifier']}

        smiles = None
        calculated_properties = data.get('calculated-properties')
        if calculated_properties is not None:
            calculated_properties_dict = {
                p['kind']: p['value'] for p in calculated_properties['property']}
            smiles = calculated_properties_dict.get('SMILES')

        drug_rows.append({
            'drugbank_id': drug_1_id,
            'name': drug_1,
            'description': multiline_to_singleline(data['description']),
            'synthesis_reference': data['synthesis-reference'],
            'indication': multiline_to_singleline(data['indication']),
            'pharmacodynamics': multiline_to_singleline(data['pharmacodynamics']),
            'mechanism_of_action': multiline_to_singleline(data['mechanism-of-action']),
            'toxicity': multiline_to_singleline(data['toxicity']),
            'metabolism': multiline_to_singleline(data['metabolism']),
            'absorption': multiline_to_singleline(data['absorption']),
            'half_life': multiline_to_singleline(data['half-life']),
            'protein_binding': multiline_to_singleline(data['protein-binding']),
            'route_of_elimination': multiline_to_singleline(data['route-of-elimination']),
            'volume_of_distribution': multiline_to_singleline(data['volume-of-distribution']),
            'clearance': multiline_to_singleline(data['clearance']),
            'smiles': smiles,
            'smiles_morgan_fingerprint': self._morgan_fingerprint_bits(smiles),
            'enzymes_polypeptides': self._polypeptide_ids(data['enzymes'], 'enzyme'),
            'targets_polypeptides': self._polypeptide_ids(data['targets'], 'target'),
            'external_identifiers': external_identifiers_dict,
        })

    print(f"Size of drugs {len(drug_rows)}")
    print(f"Size of DDIs {len(all_ddis)}")
    np.set_printoptions(threshold=np.inf)

    # The masked texts only ever contain the placeholder name.
    event_extractor = EventExtractor(['DRUG'])

    replace_dict = {'MYO-029': 'Stamulumab'}
    for ddi in tqdm(all_ddis):
        for key, value in replace_dict.items():
            ddi['masked_interaction'] = ddi['masked_interaction'].replace(
                key, value)

    self.drugs_df = pd.DataFrame(drug_rows)
    self.drugs_df.to_pickle(drugs_pickle_path)
    self.drugs_df.to_csv(
        drugs_csv_path, index=False, compression='gzip')

    # Explicit columns keep the frame usable when no interaction was found
    # (an empty list would otherwise yield a frame without any column).
    self.ddis_df = pd.DataFrame(all_ddis, columns=[
        'drug_1_id', 'drug_1', 'drug_2_id', 'drug_2',
        'interaction', 'masked_interaction'])

    total = len(all_ddis)
    processed = 0

    def to_mechanism_action(interaction):
        # Print progress every 1000 interactions.
        nonlocal processed
        processed += 1
        if processed % 1000 == 0:
            print(f'{processed}/{total}')
        mechanism, action, _drug_a, _drug_b = event_extractor.extract(
            interaction)
        return mechanism+'__' + action

    self.ddis_df['mechanism_action'] = self.ddis_df['masked_interaction'].apply(
        to_mechanism_action)

    self.ddis_df.to_csv(ddi_csv_path, index=False, compression='gzip')
    self.ddis_df.to_pickle(ddi_pickle_path)

    if zip_outputs:
        zip_helper = ZipHelper()
        zip_helper.zip_single_file(
            file_path=drugs_pickle_path, output_path=output_path+'/zips', name='drugs-pickle')
        zip_helper.zip_single_file(
            file_path=ddi_pickle_path, output_path=output_path+'/zips', name='ddi-pickle')

def _polypeptide_ids(self, section, item_key):
    """Return the polypeptide ids under ``section[item_key]``, or None when *section* is None."""
    if section is None:
        return None
    return [p['id'] for d in section[item_key] if 'polypeptide' in d
            for p in d['polypeptide']]

def _morgan_fingerprint_bits(self, smiles, n_bits=881):
    """Return the radius-2 Morgan fingerprint of *smiles* as a list of bits.

    A zero array of length *n_bits* is returned when *smiles* is None, cannot
    be parsed, or fingerprinting fails.
    """
    if smiles is not None:
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                print("An exception occurred")
            else:
                return AllChem.GetMorganFingerprintAsBitVect(
                    mol, 2, nBits=n_bits).ToList()
        except Exception:
            print("An exception occurred")
    # Kept as an ndarray (not a list) to leave the persisted fallback value
    # unchanged.
    return np.zeros(n_bits)
232
+
233
+ def load(self, path):
234
+ drugs_pickle_path = path+'/drugs.pkl'
235
+ ddi_pickle_path = path+'/ddi.pkl'
236
+ if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_pickle_path):
237
+ self.drugs_df = pd.read_pickle(drugs_pickle_path)
238
+ self.ddis_df = pd.read_pickle(ddi_pickle_path)
239
+ else:
240
+ print('One of given paths could not found')
241
+
242
+ def load_from_csv(self, path):
243
+ drugs_csv_path = path+'/drugs.gzip'
244
+ ddi_csv_path = path+'/ddi.gzip'
245
+ if os.path.exists(drugs_csv_path) and os.path.exists(ddi_csv_path):
246
+ self.drugs_df = pd.read_csv(drugs_csv_path, compression='gzip')
247
+ self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
248
+ else:
249
+ print('One of given paths could not found')
250
+
251
+ def load2(self, path):
252
+ drugs_pickle_path = path+'/drugs.pkl'
253
+ ddi_csv_path = path+'/ddi.gzip'
254
+ if os.path.exists(drugs_pickle_path) and os.path.exists(ddi_csv_path):
255
+ self.drugs_df = pd.read_pickle(drugs_pickle_path)
256
+ self.ddis_df = pd.read_csv(ddi_csv_path, compression='gzip')
257
+ else:
258
+ print('One of given paths could not found')
259
+
260
+ def drugs_as_dataframe(self):
261
+ return self.drugs_df
262
+
263
+ def filtered_drugs_as_dataframe(self, drug_ids):
264
+ return self.drugs_df[self.drugs_df['drugbank_id'].isin(drug_ids)]
265
+
266
+ def ddis_as_dataframe(self):
267
+ return self.ddis_df
268
+
269
+ def filtered_ddis(self, drugs):
270
+ ddis_df = self.ddis_df.copy()
271
+ return ddis_df[(ddis_df['drug_1'] in drugs) & (
272
+ ddis_df['drug_2'] in drugs)]