ddi-fw 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/__init__.py +12 -0
- ddi_fw/datasets/core.py +416 -0
- ddi_fw/datasets/db_utils.py +204 -0
- ddi_fw/datasets/embedding_generator.py +66 -0
- ddi_fw/datasets/embedding_generator_new.py +105 -0
- ddi_fw/datasets/feature_vector_generation.py +100 -0
- ddi_fw/datasets/idf_helper.py +71 -0
- ddi_fw/drugbank/__init__.py +2 -0
- ddi_fw/drugbank/drugbank_parser.py +154 -0
- ddi_fw/drugbank/drugbank_processor.py +343 -0
- ddi_fw/drugbank/drugbank_processor_org.py +272 -0
- ddi_fw/drugbank/event_extractor.py +127 -0
- ddi_fw/experiments/__init__.py +2 -0
- ddi_fw/experiments/custom_torch_model.py +66 -0
- ddi_fw/experiments/evaluation_helper.py +232 -0
- ddi_fw/experiments/tensorflow_helper.py +296 -0
- ddi_fw/experiments/test.py +59 -0
- ddi_fw/ner/__init__.py +1 -0
- ddi_fw/ner/mmlrestclient.py +155 -0
- ddi_fw/ner/ner.py +340 -0
- ddi_fw/utils/__init__.py +3 -0
- ddi_fw/utils/enums.py +23 -0
- ddi_fw/utils/utils.py +103 -0
- ddi_fw/utils/zip_helper.py +66 -0
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/METADATA +1 -1
- ddi_fw-0.0.2.dist-info/RECORD +28 -0
- ddi_fw-0.0.2.dist-info/top_level.txt +5 -0
- ddi_fw-0.0.1.dist-info/RECORD +0 -4
- ddi_fw-0.0.1.dist-info/top_level.txt +0 -1
- {ddi_fw-0.0.1.dist-info → ddi_fw-0.0.2.dist-info}/WHEEL +0 -0
ddi_fw/datasets/__init__.py ADDED
```diff
@@ -0,0 +1,12 @@
+from .core import BaseDataset
+from .ddi_mdl.base import DDIMDLDataset
+from .mdf_sa_ddi.base import MDFSADDIDataset
+from .custom import CustomDataset
+from .embedding_generator import create_embeddings
+from .embedding_generator_new import EmbeddingGenerator,PretrainedEmbeddingGenerator,SBertEmbeddingGenerator,LLMEmbeddingGenerator,create_embeddings_new
+from .idf_helper import IDF
+from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+
+__all__ = ['BaseDataset','DDIMDLDataset','MDFSADDIDataset']
+
+
```
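The new `__init__.py` exposes the dataset classes and helpers at package level. A hypothetical usage sketch follows; the keyword arguments are inferred from the `BaseDataset.__init__` signature in `core.py` below, not from any documented API, and the column names are placeholders:

```python
# Hypothetical sketch; DDIMDLDataset's own constructor is not part of
# this diff, so the kwargs below mirror BaseDataset.__init__ and the
# column names are illustrative only.
from ddi_fw.datasets import DDIMDLDataset

dataset = DDIMDLDataset(
    chemical_property_columns=['enzyme', 'target', 'smile'],
    embedding_columns=['description'],
    ner_columns=['tui_description'],
    tui_threshold=0.05)

# split_dataset() runs prep(), makes a stratified hold-out split, then
# builds stratified k-folds; produce_inputs() yields one
# [name, X_train, y_train, X_test, y_test] entry per feature column.
dataset.split_dataset(fold_size=5, test_size=0.2, save_indexes=True)
inputs = dataset.produce_inputs()
```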
ddi_fw/datasets/core.py ADDED
```diff
@@ -0,0 +1,416 @@
+import glob
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
+from sklearn.preprocessing import LabelBinarizer
+from abc import ABC, abstractmethod
+import numpy as np
+import pandas as pd
+import pathlib
+from datasets.idf_helper import IDF
+
+from utils.zip_helper import ZipHelper
+from .feature_vector_generation import SimilarityMatrixGenerator, VectorGenerator
+from ner.ner import CTakesNER
+from utils import create_folder_if_not_exists
+from stopwatch import Stopwatch, profile
+
+HERE = pathlib.Path(__file__).resolve().parent
+
+
+def stack(df_column):
+    return np.stack(df_column.values)
+
+
+class BaseDataset(ABC):
+    def __init__(self, chemical_property_columns, embedding_columns, ner_columns,
+                 **kwargs):
+        self.__similarity_related_columns__ = []
+        self.__similarity_related_columns__.extend(chemical_property_columns)
+        self.__similarity_related_columns__.extend(ner_columns)
+
+        self.chemical_property_columns = chemical_property_columns
+        self.embedding_columns = embedding_columns
+        self.ner_columns = ner_columns
+        self.threshold_method = kwargs.get('threshold_method', 'idf')
+        self.tui_threshold = kwargs.get('tui_threshold', 0)
+        self.cui_threshold = kwargs.get('cui_threshold', 0)
+        self.entities_threshold = kwargs.get('entities_threshold', 0)
+
+        self.stopwatch = Stopwatch()
+
+        # self.store_similarity_matrices = kwargs.get('store_similarity_matrices', True)
+        # self.similarity_matrices_path = kwargs.get('similarity_matrices_path', True)
+
+    # load or split must be run first
+    def produce_inputs(self):
+        items = []
+        y_train_label, y_test_label = stack(self.y_train), stack(self.y_test)
+        for column in self.__similarity_related_columns__:
+            train_data, test_data = stack(
+                self.X_train[column]), stack(self.X_test[column])
+            items.append([f'{column}', np.nan_to_num(train_data),
+                          y_train_label, np.nan_to_num(test_data), y_test_label])
+        for column in self.embedding_columns:
+            train_data, test_data = stack(
+                self.X_train[column+'_embedding']), stack(self.X_test[column+'_embedding'])
+            items.append([f'{column}_embedding', train_data,
+                          y_train_label, test_data, y_test_label])
+        return items
+
+    ## remove this function
+    def generate_sim_matrices(self, chemical_properties_df, two_d_dict):
+
+        jaccard_sim_dict = {}
+        sim_matrix_gen = SimilarityMatrixGenerator()
+
+        for column in self.__similarity_related_columns__:
+            key = '2D_'+column
+            jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+                two_d_dict[key])
+
+        drugbank_ids = chemical_properties_df['id'].to_list()
+
+        similarity_matrices = {}
+
+        for column in self.__similarity_related_columns__:
+            sim_matrix = jaccard_sim_dict[column]
+            jaccard_sim_feature = {}
+            for i in range(len(drugbank_ids)):
+                jaccard_sim_feature[drugbank_ids[i]] = sim_matrix[i]
+            similarity_matrices[column] = jaccard_sim_feature
+
+        return similarity_matrices
+
+    def generate_sim_matrices_new(self, chemical_properties_df):
+        self.stopwatch.reset()
+        self.stopwatch.start()
+        jaccard_sim_dict = {}
+        sim_matrix_gen = SimilarityMatrixGenerator()
+
+        for column in self.__similarity_related_columns__:
+            # key = '2D_'+column
+            key = column
+            jaccard_sim_dict[column] = sim_matrix_gen.create_jaccard_similarity_matrices(
+                self.generated_vectors[key])
+        self.stopwatch.stop()
+        print(f'similarity_matrix_generation_part_1: {self.stopwatch.elapsed}')
+
+        self.stopwatch.reset()
+        self.stopwatch.start()
+        similarity_matrices = {}
+        drugbank_ids = chemical_properties_df['id'].to_list()
+        new_columns = {}
+        for idx in range(len(drugbank_ids)):
+            new_columns[idx] = drugbank_ids[idx]
+        for column in self.__similarity_related_columns__:
+            new_df = pd.DataFrame.from_dict(jaccard_sim_dict[column])
+            new_df = new_df.rename(index=new_columns, columns=new_columns)
+            similarity_matrices[column] = new_df
+        self.stopwatch.stop()
+        print(f'similarity_matrix_generation_part_2: {self.stopwatch.elapsed}')
+        return similarity_matrices
+
+    # converts the data to matrix form
+    def transform_2d(self, chemical_properties_df):
+        two_d_dict = {}
+        for column in self.__similarity_related_columns__:
+            key = '2D_'+column
+            new_column = column + '_vectors'
+            two_d_dict[key] = np.stack(
+                chemical_properties_df[new_column].to_numpy())
+
+        return two_d_dict
+
+    # TODO: should return ndarrays inside a dictionary
+    def generate_vectors(self, chemical_properties_df):
+        self.stopwatch.reset()
+        self.stopwatch.start()
+        vectorGenerator = VectorGenerator(chemical_properties_df)
+
+        new_columns = [
+            c+'_vectors' for c in self.__similarity_related_columns__]
+        self.generated_vectors = vectorGenerator.generate_feature_vectors(
+            self.__similarity_related_columns__)
+
+        # for column, new_column in zip(self.__similarity_related_columns__, new_columns):
+        #     chemical_properties_df.loc[:,
+        #                                new_column] = generated_vectors[column]
+        # self.generated_vectors = generated_vectors
+        self.stopwatch.stop()
+        print(f'vector_generation: {self.stopwatch.elapsed}')
+
+
+    ## remove this function
+    def sim(self, chemical_properties_df):
+        self.stopwatch.reset()
+        self.stopwatch.start()
+        from scipy.spatial.distance import pdist
+        sim_matrix_gen = SimilarityMatrixGenerator()
+
+        drugbank_ids = chemical_properties_df['id'].to_list()
+        similarity_matrices = {}
+        for column in self.__similarity_related_columns__:
+            df = pd.DataFrame(np.stack(chemical_properties_df[f'{column}_vectors'].values), index=drugbank_ids)
+            # similarity_matrices[column] = 1 - pdist(df.to_numpy(), metric='jaccard')
+            similarity_matrices[column] = sim_matrix_gen.create_jaccard_similarity_matrices(df.to_numpy())
+        self.stopwatch.stop()
+        print(f'sim: {self.stopwatch.elapsed}')
+        return similarity_matrices
+
+    # import pandas as pd
+    # a = [[0,0,1],[0,0,1],[0,0,0]]
+    # s = pd.Series(a)
+    # # print(np.vstack(s.to_numpy()))
+    # l = np.argmax(np.vstack(s.to_numpy()), axis=1)
+    # l
+    def split_dataset(self,
+                      fold_size=5,
+                      shuffle=True,
+                      test_size=0.2,
+                      save_indexes=False):
+        save_path = self.index_path
+        self.prep()
+        X = self.dataframe.drop('class', axis=1)
+        y = self.dataframe['class']
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, shuffle=shuffle, test_size=test_size, stratify=np.argmax(np.vstack(y.to_numpy()), axis=1))
+        # k_fold = KFold(n_splits=fold_size, shuffle=shuffle, random_state=1)
+        # folds = k_fold.split(X_train)
+
+        k_fold = StratifiedKFold(n_splits=fold_size, shuffle=shuffle, random_state=1)
+        folds = k_fold.split(X_train, np.argmax(np.vstack(y_train.to_numpy()), axis=1))
+        train_idx_arr = []
+        val_idx_arr = []
+        for i, (train_index, val_index) in enumerate(folds):
+            train_idx_arr.append(train_index)
+            val_idx_arr.append(val_index)
+
+        if save_indexes:
+            # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
+            self.__save_indexes__(
+                save_path, 'train_indexes.txt', X_train['index'].values)
+            self.__save_indexes__(
+                save_path, 'test_indexes.txt', X_test['index'].values)
+            # self.__save_indexes__(
+            #     save_path, 'train_indexes.txt', X_train.index.values)
+            # self.__save_indexes__(
+            #     save_path, 'test_indexes.txt', X_test.index.values)
+
+            for i, (train_idx, val_idx) in enumerate(zip(train_idx_arr, val_idx_arr)):
+                self.__save_indexes__(
+                    save_path, f'train_fold_{i}.txt', train_idx)
+                self.__save_indexes__(
+                    save_path, f'validation_fold_{i}.txt', val_idx)
+
+        self.X_train = X_train
+        self.X_test = X_test
+        self.y_train = y_train
+        self.y_test = y_test
+        self.train_indexes = X_train.index
+        self.test_indexes = X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+        return X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr
+
+    def __get_indexes__(self, path):
+        train_index_path = path+'/train_indexes.txt'
+        test_index_path = path+'/test_indexes.txt'
+        train_fold_files = f'{path}/train_fold_*.txt'
+        val_fold_files = f'{path}/validation_fold_*.txt'
+        train_idx_arr = []
+        val_idx_arr = []
+        with open(train_index_path, 'r', encoding="utf8") as f:
+            train_idx_all = [int(r) for r in f.readlines()]
+        with open(test_index_path, 'r', encoding="utf8") as f:
+            test_idx_all = [int(r) for r in f.readlines()]
+
+        for filepath in glob.glob(train_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                train_idx = [int(r) for r in f.readlines()]
+                train_idx_arr.append(train_idx)
+        for filepath in glob.glob(val_fold_files):
+            with open(filepath, 'r', encoding="utf8") as f:
+                val_idx = [int(r) for r in f.readlines()]
+                val_idx_arr.append(val_idx)
+        return train_idx_all, test_idx_all, train_idx_arr, val_idx_arr
+
+    def __save_indexes__(self, path, filename, indexes):
+        create_folder_if_not_exists(path)
+        file_path = path + '/'+filename
+        str_indexes = [str(index) for index in indexes]
+        with open(file_path, 'w') as f:
+            f.write('\n'.join(str_indexes))
+
+    # @abstractmethod
+    # def prep(self):
+    #     pass
+
+    # @abstractmethod
+    # def load(self):
+    #     pass
+
+    # adjust this part if embeddings are generated per text type
+    def prep(self):
+        if self.embedding_columns:
+            zip_helper = ZipHelper()
+            zip_helper.extract(str(HERE.joinpath('zips/embeddings')),
+                               str(HERE.joinpath('zips/embeddings')))
+
+            embedding_dict = dict()
+            for embedding_column in self.embedding_columns:
+                embedding_file = HERE.joinpath(
+                    f'zips/embeddings/{embedding_column}_embeddings.pkl')
+                embedding_values = pd.read_pickle(embedding_file)
+                d = embedding_values.apply(
+                    lambda x: {x.id: x[f'{embedding_column}_embedding']}, axis=1)
+                x = {k: v for l in d.values.tolist() for k, v in l.items()}
+                embedding_dict[embedding_column] = x
+
+        self.ner_df = CTakesNER().load()
+        drug_names = self.drugs_df['name'].to_list()
+        drug_ids = self.drugs_df['id'].to_list()
+
+        # self.ddis_df = self.ddis_df[(self.ddis_df['name1'].isin(drug_names)) & (
+        #     self.ddis_df['name2'].isin(drug_names))]
+
+        filtered_df = self.drugs_df
+        filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
+            drug_ids)]
+        filtered_ner_df = self.ner_df.copy()
+
+        combined_df = filtered_df.copy()
+        # TODO: if the dataset in use has no tui, cui or entity information, the block below is needed to add those columns to it
+
+        idf_calc = IDF(filtered_ner_df, [f for f in filtered_ner_df.keys()])
+        idf_calc.calculate()
+        idf_scores_df = idf_calc.to_dataframe()
+
+        for key in filtered_ner_df.keys():
+            threshold = 0
+            if key.startswith('tui'):
+                threshold = self.tui_threshold
+            if key.startswith('cui'):
+                threshold = self.cui_threshold
+            if key.startswith('entities'):
+                threshold = self.entities_threshold
+            combined_df[key] = filtered_ner_df[key]
+            valid_codes = idf_scores_df[idf_scores_df[key] > threshold].index
+
+            # print(f'{key}: valid code size = {len(valid_codes)}')
+            combined_df[key] = combined_df[key].apply(lambda items:
+                                                      [item for item in items if item in valid_codes])
+
+        # evolved into the code above
+        # combined_df['tui_description'] = filtered_ner_df['description_tuis']
+        # combined_df['cui_description'] = filtered_ner_df['description_cuis']
+        # combined_df['entities_description'] = filtered_ner_df['description_entities']
+
+        # tui_idf = IDF(combined_df['tui_description'], self.tui_threshold)
+        # cui_idf = IDF(combined_df['cui_description'], self.cui_threshold)
+        # entities_idf = IDF(
+        #     combined_df['entities_description'], self.entities_threshold)
+
+        # tui_idf.calculate()
+        # cui_idf.calculate()
+        # entities_idf.calculate()
+
+        # valid_tui_codes = tui_idf.find_items_over_threshold()
+        # valid_cui_codes = cui_idf.find_items_over_threshold()
+        # valid_entities_codes = entities_idf.find_items_over_threshold()
+
+        # combined_df['tui_description'] = combined_df['tui_description'].apply(lambda items:
+        #     [item for item in items if item in valid_tui_codes])
+        # combined_df['cui_description'] = combined_df['cui_description'].apply(lambda items:
+        #     [item for item in items if item in valid_cui_codes])
+        # combined_df['entities_description'] = combined_df['entities_description'].apply(lambda items:
+        #     [item for item in items if item in valid_entities_codes])
+
+        moved_columns = ['id']
+        moved_columns.extend(self.__similarity_related_columns__)
+        chemical_properties_df = combined_df[moved_columns]
+
+        chemical_properties_df = chemical_properties_df.fillna("").apply(list)
+
+        # generate_vectors returns ndarrays inside a dictionary
+        self.generate_vectors(chemical_properties_df)
+
+        # two_d_dict = self.transform_2d(chemical_properties_df)
+
+        similarity_matrices = self.generate_sim_matrices_new(
+            chemical_properties_df)
+
+        # similarity_matrices = self.sim(chemical_properties_df)
+
+        event_categories = self.ddis_df['event_category']
+        labels = event_categories.tolist()
+        lb = LabelBinarizer()
+        lb.fit(labels)
+        classes = lb.transform(labels)
+
+        # def similarity_lambda_fnc(row, value):
+        #     if row['id1'] in value and row['id2'] in value:
+        #         return value[row['id1']][row['id2']]
+
+        def similarity_lambda_fnc(row, value):
+            if row['id1'] in value:
+                return value[row['id1']]
+
+        def lambda_fnc(row, value):
+            if row['id1'] in value and row['id2'] in value:
+                return np.float16(np.hstack(
+                    (value[row['id1']], value[row['id2']])))
+            # return np.hstack(
+            #     (value[row['id1']], value[row['id2']]), dtype=np.float16)
+
+        def x_fnc(row, embedding_values, embedding_column):
+            # first = embedding_values[embedding_values.id == row['id1']]
+            # second = embedding_values[embedding_values.id == row['id2']]
+            # v1 = first.iloc[0][embedding_column+'_embedding']
+            # v2 = second.iloc[0][embedding_column+'_embedding']
+            v1 = embedding_dict[embedding_column][row['id1']]
+            v2 = embedding_dict[embedding_column][row['id2']]
+            # v1 = embedding_dict[row['id1']][embedding_column+'_embedding']
+            # v2 = embedding_dict[row['id2']][embedding_column+'_embedding']
+            return np.float16(np.hstack(
+                (v1, v2)))
+
+        for key, value in similarity_matrices.items():
+
+            print(f'sim matrix: {key}')
+            self.ddis_df[key] = self.ddis_df.apply(
+                lambda_fnc, args=(value,), axis=1)
+            print(self.ddis_df[key].head())
+
+        for embedding_column in self.embedding_columns:
+            print(f"concat {embedding_column} embeddings")
+            # column_embeddings_dict = embedding_values[embedding_column]
+            self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
+                x_fnc, args=(embedding_values, embedding_column), axis=1)
+
+        self.dataframe = self.ddis_df.copy()
+        self.dataframe['class'] = list(classes)
+        print(self.dataframe.shape)
+
+    def load(self):
+        if self.index_path == None:
+            raise Exception(
+                "There is no index path, please call split function")
+
+        # prep - split - load
+        train_idx_all, test_idx_all, train_idx_arr, val_idx_arr = self.__get_indexes__(
+            self.index_path)
+
+        self.prep()
+        train = self.dataframe[self.dataframe['index'].isin(train_idx_all)]
+        test = self.dataframe[self.dataframe['index'].isin(test_idx_all)]
+
+        self.X_train = train.drop('class', axis=1)
+        self.y_train = train['class']
+        self.X_test = test.drop('class', axis=1)
+        self.y_test = test['class']
+
+        self.train_indexes = self.X_train.index
+        self.test_indexes = self.X_test.index
+        self.train_idx_arr = train_idx_arr
+        self.val_idx_arr = val_idx_arr
+
+        return self.X_train, self.X_test, self.y_train, self.y_test, self.X_train.index, self.X_test.index, train_idx_arr, val_idx_arr
```
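Two things in `core.py` are worth flagging for a future release: the absolute imports (`datasets.idf_helper`, `utils.zip_helper`, `ner.ner`) only resolve if those directories are importable as top-level packages (the new `top_level.txt` does list five top-level entries), and in `prep()` the `filtered_ner_df` filtered by `drugbank_id` is immediately overwritten by `self.ner_df.copy()`, so that filter is currently a no-op.

The split logic stratifies both the hold-out split and the k folds on the argmax of the one-hot class rows. A minimal, self-contained sketch of that pattern with synthetic data (no `ddi_fw` required; shapes and names here are illustrative only):

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelBinarizer

# Synthetic multi-class labels, binarized the same way prep() does.
rng = np.random.default_rng(0)
X = rng.random((100, 8))
labels = rng.integers(0, 4, size=100)
y = LabelBinarizer().fit_transform(labels)

# Stratify on the argmax of the one-hot rows, as split_dataset() does.
strata = np.argmax(y, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=strata)

k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for i, (train_idx, val_idx) in enumerate(
        k_fold.split(X_train, np.argmax(y_train, axis=1))):
    print(f'fold {i}: {len(train_idx)} train / {len(val_idx)} validation rows')
```

Since `LabelBinarizer` assigns one column per class (for more than two classes), stratifying on the argmax of the binarized rows is effectively the same as stratifying on the raw label strings.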
ddi_fw/datasets/db_utils.py ADDED
```diff
@@ -0,0 +1,204 @@
+
+from sqlite3 import Error
+import sqlite3
+import pandas as pd
+import numpy as np
+
+
+def create_connection(db_file=r"./event.db"):
+    """ create a database connection to the SQLite database
+        specified by db_file
+    :param db_file: database file
+    :return: Connection object or None
+    """
+    conn = None
+    try:
+        conn = sqlite3.connect(db_file)
+    except Error as e:
+        print(e)
+
+    return conn
+
+
+def select_all_drugs(conn):
+    cur = conn.cursor()
+    cur.execute(
+        '''select "index", id, name, target, enzyme, pathway, smile from drug''')
+    rows = cur.fetchall()
+    return rows
+
+
+def select_all_drugs_as_dataframe(conn):
+    headers = ['index', 'id', 'name', 'target', 'enzyme', 'pathway', 'smile']
+    rows = select_all_drugs(conn)
+    df = pd.DataFrame(columns=headers, data=rows)
+    df['enzyme'] = df['enzyme'].apply(lambda x: x.split('|'))
+    df['target'] = df['target'].apply(lambda x: x.split('|'))
+    df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
+    df['smile'] = df['smile'].apply(lambda x: x.split('|'))
+    return df
+
+
+def select_all_events(conn):
+    """
+    Query all rows in the event table
+    :param conn: the Connection object
+    :return:
+    """
+    cur = conn.cursor()
+    cur.execute("select * from event")
+
+    rows = cur.fetchall()
+    return rows
+
+
+def select_all_events_as_dataframe(conn):
+    headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
+    rows = select_all_events(conn)
+    return pd.DataFrame(columns=headers, data=rows)
+
+
+def select_events_with_category(conn):
+    sql = '''select id1, name1, id2, name2, mechanism || ' ' ||action from event ev
+             join extraction ex
+             on ev.name1 = ex.drugA and ev.name2 = ex.drugB
+             union
+             select id1, name1, id2, name2, mechanism || ' ' ||action from event ev
+             join extraction ex
+             on ev.name1 = ex.drugB and ev.name2 = ex.drugA
+          '''
+    cur = conn.cursor()
+    cur.execute(sql)
+
+    rows = cur.fetchall()
+
+    headers = ['id1', 'name1', 'id2', 'name2', 'event_category']
+    return pd.DataFrame(columns=headers, data=rows)
+
+
+def select_all_interactions_tuple_as_dataframe(conn):
+    cur = conn.cursor()
+    cur.execute("select id1, id2 from event")
+    rows = cur.fetchall()
+    headers = ['id1', 'id2']
+
+    return pd.DataFrame(columns=headers, data=rows)
+
+
+def select_ddi_pairs(conn):
+    cur = conn.cursor()
+    cur.execute('''
+        select d1.[index] as Drug1Index, d2.[index] as Drug2Index, 1 from event e
+        join drug d1 on e.id1 = d1.id
+        join drug d2 on e.id2 = d2.id
+        ''')
+    rows = cur.fetchall()
+    return rows
+
+
+def select_ddi_pairs_as_dataframe(conn):
+    headers = ["Drug1Index", "Drug2Index", "Interaction"]
+    rows = select_ddi_pairs(conn)
+    return pd.DataFrame(columns=headers, data=rows)
+
+
+def get_interactions(conn):
+    cur = conn.cursor()
+    cur.execute('''
+        select
+            drug_1_id,
+            drug_1,
+            drug_2_id,
+            drug_2,
+            mechanism_action,
+            interaction,
+            masked_interaction
+        from _Interactions
+        ''')
+
+    rows = cur.fetchall()
+
+    headers = ['id1', 'name1', 'id2', 'name2',
+               'event_category', 'interaction', 'masked_interaction']
+    df = pd.DataFrame(columns=headers, data=rows)
+    return df
+
+
+def get_extended_version(conn):
+    cur = conn.cursor()
+    cur.execute('''
+        select
+            _Drugs."index",
+            drugbank_id,
+            _Drugs.name,
+            description,
+            synthesis_reference,
+            indication,
+            pharmacodynamics,
+            mechanism_of_action,
+            toxicity,
+            metabolism,
+            absorption,
+            half_life,
+            protein_binding,
+            route_of_elimination,
+            volume_of_distribution,
+            clearance,
+            smiles,
+            smiles_morgan_fingerprint,
+            enzymes_polypeptides,
+            targets_polypeptides
+
+        from drug
+        join _Drugs on drug.id = _Drugs.drugbank_id
+        where
+            targets_polypeptides is not null and
+            enzymes_polypeptides is not null and
+            smiles_morgan_fingerprint is not null
+        ''')
+    # pathway is absent
+
+    rows = cur.fetchall()
+    headers = ['index', 'id', 'name', 'description', 'synthesis_reference', 'indication', 'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism', 'absorption', 'half_life',
+               'protein_binding', 'route_of_elimination', 'volume_of_distribution', 'clearance', 'smiles_notation', 'smile', 'enzyme', 'target']
+    df = pd.DataFrame(columns=headers, data=rows)
+    df['smile'] = df['smile'].apply(lambda x:
+                                    np.fromstring(
+                                        x.replace(
+                                            '\n', '')
+                                        .replace('[', '')
+                                        .replace(']', '')
+                                        .replace('  ', ' '), sep=','))
+    df['enzyme'] = df['enzyme'].apply(
+        lambda x: x.split('|'))
+    df['target'] = df['target'].apply(
+        lambda x: x.split('|'))
+    return df
+
+
+# SELECT
+#   CASE
+#     WHEN masked_interaction like '%'+drug_1+'%' THEN drug_1
+#     WHEN masked_interaction like '%'+drug_2+'%' THEN drug_2
+#     Else drug_2
+#   END AS Absent,
+
+#   drug_1, drug_2,
+#   masked_interaction
+
+# from _Interactions
+# where LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', ''))
+# or LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', '')) + 4
+
+if __name__ == "__main__":
+    conn = create_connection(r"./event-extended.db")
+    extended_version_df = get_extended_version(conn)
+
+    df = select_all_events_as_dataframe(conn)
+    print(df.head())
+
+    events_with_category_df = select_events_with_category(conn)
+    print(events_with_category_df.head())
+
+    u = events_with_category_df['event_category'].unique()
+    print(len(u))
```