ddi-fw 0.0.128__py3-none-any.whl → 0.0.130__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/ddi_mdl_text/base.py +156 -0
- ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +14906 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +47697 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +47697 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +47698 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +47698 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +47698 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +59622 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +11925 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +11925 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +11924 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +11924 -0
- ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +11924 -0
- ddi_fw/langchain/storage.py +2 -1
- {ddi_fw-0.0.128.dist-info → ddi_fw-0.0.130.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.128.dist-info → ddi_fw-0.0.130.dist-info}/RECORD +19 -5
- {ddi_fw-0.0.128.dist-info → ddi_fw-0.0.130.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.128.dist-info → ddi_fw-0.0.130.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
import pathlib
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
import pandas as pd
|
5
|
+
|
6
|
+
from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
|
7
|
+
from ddi_fw.langchain.embeddings import PoolingStrategy
|
8
|
+
from .. import BaseDataset
|
9
|
+
from ..db_utils import create_connection
|
10
|
+
|
11
|
+
# Directory containing this module; used below to locate the packaged
# 'indexes' directory and the 'data/event.db' database.
HERE = pathlib.Path(__file__).resolve().parent
|
12
|
+
# Column names that DDIMDLDataset classifies as text-embedding features:
# every non-empty combination of the four base text fields (description,
# indication, mechanism_of_action, pharmacodynamics), joined with '_'.
# Each entry ends with a comma so that adjacent string literals can never be
# silently concatenated into a single bogus name.
list_of_embedding_columns = [
    # single fields
    'description',
    'indication',
    'mechanism_of_action',
    'pharmacodynamics',
    # pairs
    'description_indication',
    'description_mechanism_of_action',
    'description_pharmacodynamics',
    'indication_mechanism_of_action',
    'indication_pharmacodynamics',
    'mechanism_of_action_pharmacodynamics',
    # triples
    'description_indication_mechanism_of_action',
    'description_indication_pharmacodynamics',
    'description_mechanism_of_action_pharmacodynamics',
    'indication_mechanism_of_action_pharmacodynamics',
    # all four fields
    'description_indication_mechanism_of_action_pharmacodynamics',
]
|
28
|
+
|
29
|
+
# Column names that DDIMDLDataset classifies as chemical-property features.
list_of_chemical_property_columns = [
    'enzyme',
    'target',
    'pathway',
    'smile',
]

# Column names that DDIMDLDataset classifies as NER-derived features.
list_of_ner_columns = [
    'tui',
    'cui',
    'entities',
]
|
34
|
+
|
35
|
+
|
36
|
+
def indices_to_binary_vector(indices, vector_length=881):
    """Return a 0/1 list of ``vector_length`` items with ``indices`` set to 1.

    :param indices: iterable of integer positions to switch on.
    :param vector_length: length of the returned list (defaults to 881).
    :return: list of ints; positions outside ``[0, vector_length)`` are
        silently ignored and duplicates have no additional effect.
    """
    flags = [0] * vector_length

    for position in indices:
        # Skip anything that does not address a slot of the vector.
        if not (0 <= position < vector_length):
            continue
        flags[position] = 1

    return flags
|
47
|
+
|
48
|
+
|
49
|
+
class DDIMDLDataset(BaseDataset):
    """Dataset that loads drugs and drug-drug events from ``data/event.db``.

    On construction the ``drug`` table is loaded into ``self.drugs_df``, the
    joined ``extraction``/``drug`` rows into ``self.ddis_df``, and
    ``self.index_path`` is set to the ``indexes`` directory next to this
    module.
    """

    def __init__(self, embedding_size,
                 embedding_dict,
                 embeddings_pooling_strategy: PoolingStrategy,
                 ner_df,
                 chemical_property_columns=None,
                 embedding_columns=None,
                 ner_columns=None,
                 **kwargs):
        """Classify the requested columns, initialise the base and load the DB.

        :param embedding_size: forwarded unchanged to ``BaseDataset``.
        :param embedding_dict: forwarded unchanged to ``BaseDataset``.
        :param embeddings_pooling_strategy: forwarded unchanged to
            ``BaseDataset``.
        :param ner_df: forwarded unchanged to ``BaseDataset``.
        :param chemical_property_columns: chemical-property column names;
            ``None`` means ``['enzyme', 'target', 'pathway', 'smile']``.
        :param embedding_columns: embedding column names; ``None`` means none.
        :param ner_columns: NER column names; ``None`` means none.
        :param kwargs: forwarded to ``BaseDataset``. If it contains a
            non-empty ``columns`` iterable, the three column arguments above
            are ignored and rebuilt by sorting each name into its category.
        :raises ValueError: if a name in ``columns`` belongs to no category.
        """
        # None sentinels replace the former mutable list defaults so that no
        # list object is shared between instances.
        if chemical_property_columns is None:
            chemical_property_columns = ['enzyme', 'target', 'pathway', 'smile']
        if embedding_columns is None:
            embedding_columns = []
        if ner_columns is None:
            ner_columns = []

        # 'columns' is optional: .get() avoids a KeyError when it is omitted.
        columns = kwargs.get('columns')
        if columns:
            chemical_property_columns = []
            embedding_columns = []
            ner_columns = []
            for column in columns:
                if column in list_of_chemical_property_columns:
                    chemical_property_columns.append(column)
                elif column in list_of_embedding_columns:
                    embedding_columns.append(column)
                elif column in list_of_ner_columns:
                    ner_columns.append(column)
                # elif column == 'smile_2':
                #     continue
                else:
                    # ValueError is a subclass of Exception, so callers that
                    # catch Exception keep working.
                    raise ValueError(f"{column} is not related this dataset")

        super().__init__(embedding_size=embedding_size,
                         embedding_dict=embedding_dict,
                         embeddings_pooling_strategy=embeddings_pooling_strategy,
                         ner_df=ner_df,
                         chemical_property_columns=chemical_property_columns,
                         embedding_columns=embedding_columns,
                         ner_columns=ner_columns,
                         **kwargs)

        conn = create_connection(HERE.joinpath('data/event.db'))
        print("db prep")
        try:
            self.drugs_df = self.__select_all_drugs_as_dataframe__(conn)
            self.ddis_df = self.__select_all_events__(conn)
        finally:
            # Both frames are fully materialised via fetchall(), so the
            # connection is no longer needed; close it instead of leaking it.
            # NOTE(review): assumes create_connection returns a DB-API
            # connection (or None on failure) - confirm against db_utils.
            if conn is not None:
                conn.close()
        print("db bitti")

        # Always the packaged index directory, regardless of any
        # 'index_path' entry supplied through kwargs.
        self.index_path = str(HERE.joinpath('indexes'))

        # NOTE: disabled experiment kept for reference - Jaccard similarity
        # features computed from a 'smile_2' binary vector column.
        # jaccard_sim_dict = {}
        # sim_matrix_gen = SimilarityMatrixGenerator()
        # jaccard_sim_dict["smile_2"] = sim_matrix_gen.create_jaccard_similarity_matrices(
        #     self.drugs_df["smile_2"].to_list())

        # similarity_matrices = {}
        # drugbank_ids = self.drugs_df['id'].to_list()
        # new_columns = {}
        # for idx in range(len(drugbank_ids)):
        #     new_columns[idx] = drugbank_ids[idx]
        # new_df = pd.DataFrame.from_dict(jaccard_sim_dict["smile_2"])
        # new_df = new_df.rename(index=new_columns, columns=new_columns)
        # similarity_matrices["smile_2"] = new_df

        # def lambda_fnc(row, value):
        #     if row['id1'] in value and row['id2'] in value:
        #         return np.float16(np.hstack(
        #             (value[row['id1']], value[row['id2']])))
        # for key, value in similarity_matrices.items():

        #     print(f'sim matrix: {key}')
        #     self.ddis_df[key] = self.ddis_df.apply(
        #         lambda_fnc, args=(value,), axis=1)
        #     print(self.ddis_df[key].head())
        # print("init finished")

    def __select_all_drugs_as_dataframe__(self, conn):
        """Load every row of the ``drug`` table into a DataFrame.

        :param conn: open database connection exposing ``cursor()``.
        :return: DataFrame with columns ``index``, ``id``, ``name``,
            ``target``, ``enzyme``, ``pathway`` and ``smile``; the last four
            are converted from ``'|'``-separated strings to lists of strings.
        """
        headers = ['index', 'id', 'name',
                   'target', 'enzyme', 'pathway', 'smile']
        cur = conn.cursor()
        cur.execute(
            '''select "index", id, name, target, enzyme, pathway, smile from drug''')
        rows = cur.fetchall()
        df = pd.DataFrame(columns=headers, data=rows)

        # df['smile_2'] = df['smile'].apply(lambda x: indices_to_binary_vector(indices = list(map(int, x.split('|'))), vector_length = 881))
        # Multi-valued fields are stored as '|'-separated strings.
        for column in ('enzyme', 'target', 'pathway', 'smile'):
            df[column] = df[column].apply(lambda x: x.split('|'))

        return df

    def __select_all_events__(self, conn):
        """Load every extraction row joined with both of its drugs.

        :param conn: open database connection exposing ``cursor()``.
        :return: DataFrame with columns ``index``, ``id1``, ``name1``,
            ``id2``, ``name2`` and ``event_category``, where
            ``event_category`` is ``mechanism`` and ``action`` joined by a
            single space.
        """
        cur = conn.cursor()
        cur.execute('''
                select ex."index", d1.id, d1.name, d2.id, d2.name,  mechanism || ' ' ||action from extraction ex
                join drug d1 on  d1.name = ex.drugA
                join drug d2 on  d2.name = ex.drugB
        ''')

        rows = cur.fetchall()

        headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
        return pd.DataFrame(columns=headers, data=rows)
|
Binary file
|