ddi-fw 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/PKG-INFO +1 -1
  2. ddi_fw-0.0.3/ddi_fw/datasets/__init__.py +12 -0
  3. ddi_fw-0.0.3/ddi_fw/datasets/core.py +416 -0
  4. ddi_fw-0.0.3/ddi_fw/datasets/db_utils.py +204 -0
  5. ddi_fw-0.0.3/ddi_fw/datasets/embedding_generator.py +66 -0
  6. ddi_fw-0.0.3/ddi_fw/datasets/embedding_generator_new.py +105 -0
  7. ddi_fw-0.0.3/ddi_fw/datasets/feature_vector_generation.py +100 -0
  8. ddi_fw-0.0.3/ddi_fw/datasets/idf_helper.py +71 -0
  9. ddi_fw-0.0.3/ddi_fw/drugbank/__init__.py +2 -0
  10. ddi_fw-0.0.3/ddi_fw/drugbank/drugbank_parser.py +154 -0
  11. ddi_fw-0.0.3/ddi_fw/drugbank/drugbank_processor.py +343 -0
  12. ddi_fw-0.0.3/ddi_fw/drugbank/drugbank_processor_org.py +272 -0
  13. ddi_fw-0.0.3/ddi_fw/drugbank/event_extractor.py +127 -0
  14. ddi_fw-0.0.3/ddi_fw/experiments/__init__.py +2 -0
  15. ddi_fw-0.0.3/ddi_fw/experiments/custom_torch_model.py +66 -0
  16. ddi_fw-0.0.3/ddi_fw/experiments/evaluation_helper.py +232 -0
  17. ddi_fw-0.0.3/ddi_fw/experiments/tensorflow_helper.py +296 -0
  18. ddi_fw-0.0.3/ddi_fw/experiments/test.py +59 -0
  19. ddi_fw-0.0.3/ddi_fw/ner/__init__.py +1 -0
  20. ddi_fw-0.0.3/ddi_fw/ner/mmlrestclient.py +155 -0
  21. ddi_fw-0.0.3/ddi_fw/ner/ner.py +340 -0
  22. ddi_fw-0.0.3/ddi_fw/utils/__init__.py +3 -0
  23. ddi_fw-0.0.3/ddi_fw/utils/enums.py +23 -0
  24. ddi_fw-0.0.3/ddi_fw/utils/utils.py +103 -0
  25. ddi_fw-0.0.3/ddi_fw/utils/zip_helper.py +66 -0
  26. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/ddi_fw.egg-info/PKG-INFO +1 -1
  27. ddi_fw-0.0.3/ddi_fw.egg-info/SOURCES.txt +30 -0
  28. ddi_fw-0.0.3/ddi_fw.egg-info/top_level.txt +5 -0
  29. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/setup.py +3 -2
  30. ddi_fw-0.0.1/ddi_fw.egg-info/SOURCES.txt +0 -6
  31. ddi_fw-0.0.1/ddi_fw.egg-info/top_level.txt +0 -1
  32. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/README.md +0 -0
  33. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/ddi_fw.egg-info/dependency_links.txt +0 -0
  34. {ddi_fw-0.0.1 → ddi_fw-0.0.3}/setup.cfg +0 -0
{ddi_fw-0.0.1 → ddi_fw-0.0.3}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 1.2
  Name: ddi_fw
- Version: 0.0.1
+ Version: 0.0.3
  Summary: Do not use :)
  Home-page: UNKNOWN
  Author: Kıvanç Bayraktar
ddi_fw-0.0.3/ddi_fw/datasets/__init__.py
@@ -0,0 +1,12 @@
+ from .core import BaseDataset
+ from .ddi_mdl.base import DDIMDLDataset
+ from .mdf_sa_ddi.base import MDFSADDIDataset
+ from .custom import CustomDataset
+ from .embedding_generator import create_embeddings
+ from .embedding_generator_new import EmbeddingGenerator, PretrainedEmbeddingGenerator, SBertEmbeddingGenerator, LLMEmbeddingGenerator, create_embeddings_new
+ from .idf_helper import IDF
+ from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+
+ __all__ = ['BaseDataset', 'DDIMDLDataset', 'MDFSADDIDataset']
+
+
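For orientation, a minimal sketch of how one of these exports can be used on its own. The toy feature matrix is invented; only the call to create_jaccard_similarity_matrices mirrors how core.py (next file) actually invokes it:

import numpy as np

from ddi_fw.datasets import SimilarityMatrixGenerator

# Toy binary feature vectors for three drugs (hypothetical data).
feature_vectors = np.array([
    [1, 0, 1, 1],
    [1, 1, 0, 0],
    [0, 0, 1, 1],
])

# core.py passes one 2-D vector array per feature column and treats
# the result as a drug-by-drug Jaccard similarity matrix.
sim_matrix_gen = SimilarityMatrixGenerator()
sim_matrix = sim_matrix_gen.create_jaccard_similarity_matrices(feature_vectors)
print(sim_matrix)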
ddi_fw-0.0.3/ddi_fw/datasets/core.py
@@ -0,0 +1,416 @@
+ import glob
+ from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+ from sklearn.preprocessing import LabelBinarizer
+ from abc import ABC, abstractmethod
+ import numpy as np
+ import pandas as pd
+ import pathlib
+ from ddi_fw.datasets.idf_helper import IDF
+
+ from ddi_fw.utils.zip_helper import ZipHelper
+ from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+ from ner.ner import CTakesNER
+ from ddi_fw.utils import create_folder_if_not_exists
+ from stopwatch import Stopwatch, profile
+
+ HERE = pathlib.Path(__file__).resolve().parent
+
+
+ def stack(df_column):
+     return np.stack(df_column.values)
+
+
+ class BaseDataset(ABC):
+     def __init__(self, chemical_property_columns, embedding_columns, ner_columns,
+                  **kwargs):
+         self.__similarity_related_columns__ = []
+         self.__similarity_related_columns__.extend(chemical_property_columns)
+         self.__similarity_related_columns__.extend(ner_columns)
+
+         self.chemical_property_columns = chemical_property_columns
+         self.embedding_columns = embedding_columns
+         self.ner_columns = ner_columns
+         self.threshold_method = kwargs.get('threshold_method', 'idf')
+         self.tui_threshold = kwargs.get('tui_threshold', 0)
+         self.cui_threshold = kwargs.get('cui_threshold', 0)
+         self.entities_threshold = kwargs.get('entities_threshold', 0)
+
+         self.stopwatch = Stopwatch()
+
+         # self.store_similarity_matrices = kwargs.get('store_similarity_matrices', True)
+         # self.similarity_matrices_path = kwargs.get('similarity_matrices_path', True)
+
+     # load or split must be run first
+     def produce_inputs(self):
+         items = []
+         y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
+         for column in self.__similarity_related_columns__:
+             train_data, test_data = stack(
+                 self.X_train[column]), stack(self.X_test[column])
+             items.append([f'{column}', np.nan_to_num(train_data),
+                           y_train_label, np.nan_to_num(test_data), y_test_label])
+         for column in self.embedding_columns:
+             train_data, test_data = stack(
+                 self.X_train[column+'_embedding']), stack(self.X_test[column+'_embedding'])
+             items.append([f'{column}_embedding', train_data,
+                           y_train_label, test_data, y_test_label])
+         return items
+
+     # remove this function
+     def generate_sim_matrices(self, chemical_properties_df, two_d_dict):
+
+         jaccard_sim_dict = {}
+         sim_matrix_gen = SimilarityMatrixGenerator()
+
+         for column in self.__similarity_related_columns__:
+             key = '2D_'+column
+             jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+                 two_d_dict[key])
+
+         drugbank_ids = chemical_properties_df['id'].to_list()
+
+         similarity_matrices = {}
+
+         for column in self.__similarity_related_columns__:
+             sim_matrix = jaccard_sim_dict[column]
+             jaccard_sim_feature = {}
+             for i in range(len(drugbank_ids)):
+                 jaccard_sim_feature[drugbank_ids[i]] = sim_matrix[i]
+             similarity_matrices[column] = jaccard_sim_feature
+
+         return similarity_matrices
+
+     def generate_sim_matrices_new(self, chemical_properties_df):
+         self.stopwatch.reset()
+         self.stopwatch.start()
+         jaccard_sim_dict = {}
+         sim_matrix_gen = SimilarityMatrixGenerator()
+
+         for column in self.__similarity_related_columns__:
+             # key = '2D_'+column
+             key = column
+             jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+                 self.generated_vectors[key])
+         self.stopwatch.stop()
+         print(f'similarity_matrix_generation_part_1: {self.stopwatch.elapsed}')
+
+         self.stopwatch.reset()
+         self.stopwatch.start()
+         similarity_matrices = {}
+         drugbank_ids = chemical_properties_df['id'].to_list()
+         new_columns = {}
+         for idx in range(len(drugbank_ids)):
+             new_columns[idx] = drugbank_ids[idx]
+         for column in self.__similarity_related_columns__:
+             new_df = pd.DataFrame.from_dict(jaccard_sim_dict[column])
+             new_df = new_df.rename(index=new_columns, columns=new_columns)
+             similarity_matrices[column] = new_df
+         self.stopwatch.stop()
+         print(f'similarity_matrix_generation_part_2: {self.stopwatch.elapsed}')
+         return similarity_matrices
+
+     # to convert into matrix form
+     def transform_2d(self, chemical_properties_df):
+         two_d_dict = {}
+         for column in self.__similarity_related_columns__:
+             key = '2D_'+column
+             new_column = column + '_vectors'
+             two_d_dict[key] = np.stack(
+                 chemical_properties_df[new_column].to_numpy())
+
+         return two_d_dict
+
+     # TODO: return ndarrays inside a dictionary
+     def generate_vectors(self, chemical_properties_df):
+         self.stopwatch.reset()
+         self.stopwatch.start()
+         vectorGenerator = VectorGenerator(chemical_properties_df)
+
+         new_columns = [
+             c+'_vectors' for c in self.__similarity_related_columns__]
+         self.generated_vectors = vectorGenerator.generate_feature_vectors(
+             self.__similarity_related_columns__)
+
+         # for column, new_column in zip(self.__similarity_related_columns__, new_columns):
+         #     chemical_properties_df.loc[:,
+         #                                new_column] = generated_vectors[column]
+         # self.generated_vectors = generated_vectors
+         self.stopwatch.stop()
+         print(f'vector_generation: {self.stopwatch.elapsed}')
+
+
+     # remove this function
+     def sim(self, chemical_properties_df):
+         self.stopwatch.reset()
+         self.stopwatch.start()
+         from scipy.spatial.distance import pdist
+         sim_matrix_gen = SimilarityMatrixGenerator()
+
+         drugbank_ids = chemical_properties_df['id'].to_list()
+         similarity_matrices = {}
+         for column in self.__similarity_related_columns__:
+             df = pd.DataFrame(np.stack(chemical_properties_df[f'{column}_vectors'].values), index=drugbank_ids)
+             # similarity_matrices[column] = 1 - pdist(df.to_numpy(), metric='jaccard')
+             similarity_matrices[column] = sim_matrix_gen.create_jaccard_similarity_matrices(df.to_numpy())
+         self.stopwatch.stop()
+         print(f'sim: {self.stopwatch.elapsed}')
+         return similarity_matrices
+
+     # import pandas as pd
+     # a = [[0,0,1],[0,0,1],[0,0,0]]
+     # s = pd.Series(a)
+     # # print(np.vstack(s.to_numpy()))
+     # l = np.argmax(np.vstack(s.to_numpy()), axis=1)
+     # l
+     def split_dataset(self,
+                       fold_size=5,
+                       shuffle=True,
+                       test_size=0.2,
+                       save_indexes=False):
+         save_path = self.index_path
+         self.prep()
+         X = self.dataframe.drop('class', axis=1)
+         y = self.dataframe['class']
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, shuffle=shuffle, test_size=test_size, stratify=np.argmax(np.vstack(y.to_numpy()), axis=1))
+         # k_fold = KFold(n_splits=fold_size, shuffle=shuffle, random_state=1)
+         # folds = k_fold.split(X_train)
+
+         k_fold = StratifiedKFold(n_splits=fold_size, shuffle=shuffle, random_state=1)
+         folds = k_fold.split(X_train, np.argmax(np.vstack(y_train.to_numpy()), axis=1))
+         train_idx_arr = []
+         val_idx_arr = []
+         for i, (train_index, val_index) in enumerate(folds):
+             train_idx_arr.append(train_index)
+             val_idx_arr.append(val_index)
+
+         if save_indexes:
+             # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
+             self.__save_indexes__(
+                 save_path, 'train_indexes.txt', X_train['index'].values)
+             self.__save_indexes__(
+                 save_path, 'test_indexes.txt', X_test['index'].values)
+             # self.__save_indexes__(
+             #     save_path, 'train_indexes.txt', X_train.index.values)
+             # self.__save_indexes__(
+             #     save_path, 'test_indexes.txt', X_test.index.values)
+
+             for i, (train_idx, val_idx) in enumerate(zip(train_idx_arr, val_idx_arr)):
+                 self.__save_indexes__(
+                     save_path, f'train_fold_{i}.txt', train_idx)
+                 self.__save_indexes__(
+                     save_path, f'validation_fold_{i}.txt', val_idx)
+
+         self.X_train = X_train
+         self.X_test = X_test
+         self.y_train = y_train
+         self.y_test = y_test
+         self.train_indexes = X_train.index
+         self.test_indexes = X_test.index
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+         return X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr
+
+     def __get_indexes__(self, path):
+         train_index_path = path+'/train_indexes.txt'
+         test_index_path = path+'/test_indexes.txt'
+         train_fold_files = f'{path}/train_fold_*.txt'
+         val_fold_files = f'{path}/validation_fold_*.txt'
+         train_idx_arr = []
+         val_idx_arr = []
+         with open(train_index_path, 'r', encoding="utf8") as f:
+             train_idx_all = [int(r) for r in f.readlines()]
+         with open(test_index_path, 'r', encoding="utf8") as f:
+             test_idx_all = [int(r) for r in f.readlines()]
+
+         for filepath in glob.glob(train_fold_files):
+             with open(filepath, 'r', encoding="utf8") as f:
+                 train_idx = [int(r) for r in f.readlines()]
+                 train_idx_arr.append(train_idx)
+         for filepath in glob.glob(val_fold_files):
+             with open(filepath, 'r', encoding="utf8") as f:
+                 val_idx = [int(r) for r in f.readlines()]
+                 val_idx_arr.append(val_idx)
+         return train_idx_all, test_idx_all, train_idx_arr, val_idx_arr
+
+     def __save_indexes__(self, path, filename, indexes):
+         create_folder_if_not_exists(path)
+         file_path = path + '/'+filename
+         str_indexes = [str(index) for index in indexes]
+         with open(file_path, 'w') as f:
+             f.write('\n'.join(str_indexes))
+
+     # @abstractmethod
+     # def prep(self):
+     #     pass
+
+     # @abstractmethod
+     # def load(self):
+     #     pass
+
+     # edit this part if you create embeddings for each text type
+     def prep(self):
+         if self.embedding_columns:
+             zip_helper = ZipHelper()
+             zip_helper.extract(str(HERE.joinpath('zips/embeddings')),
+                                str(HERE.joinpath('zips/embeddings')))
+
+         embedding_dict = dict()
+         for embedding_column in self.embedding_columns:
+             embedding_file = HERE.joinpath(
+                 f'zips/embeddings/{embedding_column}_embeddings.pkl')
+             embedding_values = pd.read_pickle(embedding_file)
+             d = embedding_values.apply(
+                 lambda x: {x.id: x[f'{embedding_column}_embedding']}, axis=1)
+             x = {k: v for l in d.values.tolist() for k, v in l.items()}
+             embedding_dict[embedding_column] = x
+
+         self.ner_df = CTakesNER().load()
+         drug_names = self.drugs_df['name'].to_list()
+         drug_ids = self.drugs_df['id'].to_list()
+
+         # self.ddis_df = self.ddis_df[(self.ddis_df['name1'].isin(drug_names)) & (
+         #     self.ddis_df['name2'].isin(drug_names))]
+
+         filtered_df = self.drugs_df
+         filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
+             drug_ids)]
+         filtered_ner_df = self.ner_df.copy()
+
+         combined_df = filtered_df.copy()
+         # TODO: if the dataset in use has no tui, cui, or entity information, the code below is needed to add those columns to it
+
+         idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
+         idf_calc.calculate()
+         idf_scores_df = idf_calc.to_dataframe()
+
+         for key in filtered_ner_df.keys():
+             threshold = 0
+             if key.startswith('tui'):
+                 threshold = self.tui_threshold
+             if key.startswith('cui'):
+                 threshold = self.cui_threshold
+             if key.startswith('entities'):
+                 threshold = self.entities_threshold
+             combined_df[key] = filtered_ner_df[key]
+             valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
+
+             # print(f'{key}: valid code size = {len(valid_codes)}')
+             combined_df[key] = combined_df[key].apply(lambda items:
+                                                       [item for item in items if item in valid_codes])
+
+         # Evolved into the code above
+         # combined_df['tui_description'] = filtered_ner_df['description_tuis']
+         # combined_df['cui_description'] = filtered_ner_df['description_cuis']
+         # combined_df['entities_description'] = filtered_ner_df['description_entities']
+
+         # tui_idf = IDF(combined_df['tui_description'], self.tui_threshold)
+         # cui_idf = IDF(combined_df['cui_description'], self.cui_threshold)
+         # entities_idf = IDF(
+         #     combined_df['entities_description'], self.entities_threshold)
+
+         # tui_idf.calculate()
+         # cui_idf.calculate()
+         # entities_idf.calculate()
+
+         # valid_tui_codes = tui_idf.find_items_over_threshold()
+         # valid_cui_codes = cui_idf.find_items_over_threshold()
+         # valid_entities_codes = entities_idf.find_items_over_threshold()
+
+         # combined_df['tui_description'] = combined_df['tui_description'].apply(lambda items:
+         #     [item for item in items if item in valid_tui_codes])
+         # combined_df['cui_description'] = combined_df['cui_description'].apply(lambda items:
+         #     [item for item in items if item in valid_cui_codes])
+         # combined_df['entities_description'] = combined_df['entities_description'].apply(lambda items:
+         #     [item for item in items if item in valid_entities_codes])
+
+         moved_columns = ['id']
+         moved_columns.extend(self.__similarity_related_columns__)
+         chemical_properties_df = combined_df[moved_columns]
+
+         chemical_properties_df = chemical_properties_df.fillna("").apply(list)
+
+         # generate_vectors will return ndarrays inside a dictionary
+         self.generate_vectors(chemical_properties_df)
+
+         # two_d_dict = self.transform_2d(chemical_properties_df)
+
+         similarity_matrices = self.generate_sim_matrices_new(
+             chemical_properties_df)
+
+         # similarity_matrices = self.sim(chemical_properties_df)
+
+         event_categories = self.ddis_df['event_category']
+         labels = event_categories.tolist()
+         lb = LabelBinarizer()
+         lb.fit(labels)
+         classes = lb.transform(labels)
+
+         # def similarity_lambda_fnc(row, value):
+         #     if row['id1'] in value and row['id2'] in value:
+         #         return value[row['id1']][row['id2']]
+
+         def similarity_lambda_fnc(row, value):
+             if row['id1'] in value:
+                 return value[row['id1']]
+
+         def lambda_fnc(row, value):
+             if row['id1'] in value and row['id2'] in value:
+                 return np.float16(np.hstack(
+                     (value[row['id1']], value[row['id2']])))
+             # return np.hstack(
+             #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
+
+         def x_fnc(row, embedding_values, embedding_column):
+             # first = embedding_values[embedding_values.id == row['id1']]
+             # second = embedding_values[embedding_values.id == row['id2']]
+             # v1 = first.iloc[0][embedding_column+'_embedding']
+             # v2 = second.iloc[0][embedding_column+'_embedding']
+             v1 = embedding_dict[embedding_column][row['id1']]
+             v2 = embedding_dict[embedding_column][row['id2']]
+             # v1 = embedding_dict[row['id1']][embedding_column+'_embedding']
+             # v2 = embedding_dict[row['id2']][embedding_column+'_embedding']
+             return np.float16(np.hstack(
+                 (v1, v2)))
+
+         for key, value in similarity_matrices.items():
+
+             print(f'sim matrix: {key}')
+             self.ddis_df[key] = self.ddis_df.apply(
+                 lambda_fnc, args=(value,), axis=1)
+             print(self.ddis_df[key].head())
+
+         for embedding_column in self.embedding_columns:
+             print(f"concat {embedding_column} embeddings")
+             # column_embeddings_dict = embedding_values[embedding_column]
+             self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
+                 x_fnc, args=(embedding_values, embedding_column), axis=1)
+
+         self.dataframe = self.ddis_df.copy()
+         self.dataframe['class'] = list(classes)
+         print(self.dataframe.shape)
+
+     def load(self):
+         if self.index_path is None:
+             raise Exception(
+                 "There is no index path, please call the split function first")
+
+         # prep - split - load
+         train_idx_all, test_idx_all, train_idx_arr, val_idx_arr = self.__get_indexes__(
+             self.index_path)
+
+         self.prep()
+         train = self.dataframe[self.dataframe['index'].isin(train_idx_all)]
+         test = self.dataframe[self.dataframe['index'].isin(test_idx_all)]
+
+         self.X_train = train.drop('class', axis=1)
+         self.y_train = train['class']
+         self.X_test = test.drop('class', axis=1)
+         self.y_test = test['class']
+
+         self.train_indexes = self.X_train.index
+         self.test_indexes = self.X_test.index
+         self.train_idx_arr = train_idx_arr
+         self.val_idx_arr = val_idx_arr
+
+         return self.X_train, self.X_test, self.y_train, self.y_test, self.X_train.index, self.X_test.index, train_idx_arr, val_idx_arr
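Taken together, BaseDataset encodes a prep → split_dataset/load → produce_inputs lifecycle. A hedged sketch of that flow through the shipped DDIMDLDataset subclass follows; its base.py is not part of this diff, so the constructor arguments shown are assumptions inferred from BaseDataset.__init__ above:

from ddi_fw.datasets import DDIMDLDataset

# Column names follow the drug table used in db_utils.py; whether the
# subclass accepts exactly these keyword arguments is an assumption.
dataset = DDIMDLDataset(
    chemical_property_columns=['enzyme', 'target', 'pathway', 'smile'],
    embedding_columns=[],
    ner_columns=[],
    tui_threshold=0,
    cui_threshold=0,
    entities_threshold=0,
)

# split_dataset() calls prep(), draws a stratified train/test split,
# then builds stratified k-fold train/validation index arrays.
dataset.split_dataset(fold_size=5, test_size=0.2, save_indexes=False)

# produce_inputs() yields one [name, X_train, y_train, X_test, y_test]
# entry per similarity or embedding column, for per-feature models.
for name, x_tr, y_tr, x_te, y_te in dataset.produce_inputs():
    print(name, x_tr.shape, x_te.shape)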
ddi_fw-0.0.3/ddi_fw/datasets/db_utils.py
@@ -0,0 +1,204 @@
+
+ from sqlite3 import Error
+ import sqlite3
+ import pandas as pd
+ import numpy as np
+
+
+ def create_connection(db_file=r"./event.db"):
+     """ create a database connection to the SQLite database
+         specified by db_file
+     :param db_file: database file
+     :return: Connection object or None
+     """
+     conn = None
+     try:
+         conn = sqlite3.connect(db_file)
+     except Error as e:
+         print(e)
+
+     return conn
+
+
+ def select_all_drugs(conn):
+     cur = conn.cursor()
+     cur.execute(
+         '''select "index", id, name, target, enzyme, pathway, smile from drug''')
+     rows = cur.fetchall()
+     return rows
+
+
+ def select_all_drugs_as_dataframe(conn):
+     headers = ['index', 'id', 'name', 'target', 'enzyme', 'pathway', 'smile']
+     rows = select_all_drugs(conn)
+     df = pd.DataFrame(columns=headers, data=rows)
+     df['enzyme'] = df['enzyme'].apply(lambda x: x.split('|'))
+     df['target'] = df['target'].apply(lambda x: x.split('|'))
+     df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
+     df['smile'] = df['smile'].apply(lambda x: x.split('|'))
+     return df
+
+
+ def select_all_events(conn):
+     """
+     Query all rows in the event table
+     :param conn: the Connection object
+     :return:
+     """
+     cur = conn.cursor()
+     cur.execute("select * from event")
+
+     rows = cur.fetchall()
+     return rows
+
+
+ def select_all_events_as_dataframe(conn):
+     headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
+     rows = select_all_events(conn)
+     return pd.DataFrame(columns=headers, data=rows)
+
+
+ def select_events_with_category(conn):
+     sql = '''select id1, name1, id2, name2, mechanism || ' ' || action from event ev
+              join extraction ex
+              on ev.name1 = ex.drugA and ev.name2 = ex.drugB
+              union
+              select id1, name1, id2, name2, mechanism || ' ' || action from event ev
+              join extraction ex
+              on ev.name1 = ex.drugB and ev.name2 = ex.drugA
+           '''
+     cur = conn.cursor()
+     cur.execute(sql)
+
+     rows = cur.fetchall()
+
+     headers = ['id1', 'name1', 'id2', 'name2', 'event_category']
+     return pd.DataFrame(columns=headers, data=rows)
+
+
+ def select_all_interactions_tuple_as_dataframe(conn):
+     cur = conn.cursor()
+     cur.execute("select id1, id2 from event")
+     rows = cur.fetchall()
+     headers = ['id1', 'id2']
+
+     return pd.DataFrame(columns=headers, data=rows)
+
+
+ def select_ddi_pairs(conn):
+     cur = conn.cursor()
+     cur.execute('''
+         select d1.[index] as Drug1Index, d2.[index] as Drug2Index, 1 from event e
+         join drug d1 on e.id1 = d1.id
+         join drug d2 on e.id2 = d2.id
+     ''')
+     rows = cur.fetchall()
+     return rows
+
+
+ def select_ddi_pairs_as_dataframe(conn):
+     headers = ["Drug1Index", "Drug2Index", "Interaction"]
+     rows = select_ddi_pairs(conn)
+     return pd.DataFrame(columns=headers, data=rows)
+
+
+ def get_interactions(conn):
+     cur = conn.cursor()
+     cur.execute('''
+         select
+             drug_1_id,
+             drug_1,
+             drug_2_id,
+             drug_2,
+             mechanism_action,
+             interaction,
+             masked_interaction
+         from _Interactions
+     ''')
+
+     rows = cur.fetchall()
+
+     headers = ['id1', 'name1', 'id2', 'name2',
+                'event_category', 'interaction', 'masked_interaction']
+     df = pd.DataFrame(columns=headers, data=rows)
+     return df
+
+
+ def get_extended_version(conn):
+     cur = conn.cursor()
+     cur.execute('''
+         select
+             _Drugs."index",
+             drugbank_id,
+             _Drugs.name,
+             description,
+             synthesis_reference,
+             indication,
+             pharmacodynamics,
+             mechanism_of_action,
+             toxicity,
+             metabolism,
+             absorption,
+             half_life,
+             protein_binding,
+             route_of_elimination,
+             volume_of_distribution,
+             clearance,
+             smiles,
+             smiles_morgan_fingerprint,
+             enzymes_polypeptides,
+             targets_polypeptides
+
+         from drug
+         join _Drugs on drug.id = _Drugs.drugbank_id
+         where
+             targets_polypeptides is not null and
+             enzymes_polypeptides is not null and
+             smiles_morgan_fingerprint is not null
+     ''')
+     # pathway is absent
+
+     rows = cur.fetchall()
+     headers = ['index', 'id', 'name', 'description', 'synthesis_reference', 'indication', 'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism', 'absorption', 'half_life',
+                'protein_binding', 'route_of_elimination', 'volume_of_distribution', 'clearance', 'smiles_notation', 'smile', 'enzyme', 'target']
+     df = pd.DataFrame(columns=headers, data=rows)
+     df['smile'] = df['smile'].apply(lambda x:
+                                     np.fromstring(
+                                         x.replace(
+                                             '\n', '')
+                                         .replace('[', '')
+                                         .replace(']', '')
+                                         .replace(' ', ' '), sep=','))
+     df['enzyme'] = df['enzyme'].apply(
+         lambda x: x.split('|'))
+     df['target'] = df['target'].apply(
+         lambda x: x.split('|'))
+     return df
+
+
+ # SELECT
+ #     CASE
+ #         WHEN masked_interaction like '%'+drug_1+'%' THEN drug_1
+ #         WHEN masked_interaction like '%'+drug_2+'%' THEN drug_2
+ #         Else drug_2
+ #     END AS Absent,
+
+ #     drug_1, drug_2,
+ #     masked_interaction
+
+ # from _Interactions
+ # where LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', ''))
+ # or LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', '')) + 4
+
+ if __name__ == "__main__":
+     conn = create_connection(r"./event-extended.db")
+     extended_version_df = get_extended_version(conn)
+
+     df = select_all_events_as_dataframe(conn)
+     print(df.head())
+
+     events_with_category_df = select_events_with_category(conn)
+     print(events_with_category_df.head())
+
+     u = events_with_category_df['event_category'].unique()
+     print(len(u))
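For completeness, the helpers above are typically driven the same way as the __main__ block: open a connection, then pull whole tables into DataFrames. A minimal sketch, assuming a local event.db laid out with the drug and event tables these queries expect:

from ddi_fw.datasets.db_utils import (
    create_connection,
    select_all_drugs_as_dataframe,
    select_ddi_pairs_as_dataframe,
)

conn = create_connection(r"./event.db")  # returns None on failure
if conn is not None:
    # One row per drug; target/enzyme/pathway/smile come back as
    # '|'-split lists, as done in select_all_drugs_as_dataframe.
    drugs_df = select_all_drugs_as_dataframe(conn)
    # One row per interacting pair: Drug1Index, Drug2Index, Interaction.
    pairs_df = select_ddi_pairs_as_dataframe(conn)
    print(drugs_df.shape, pairs_df.shape)
    conn.close()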