ddi-fw 0.0.128__py3-none-any.whl → 0.0.130__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ import pathlib
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
7
+ from ddi_fw.langchain.embeddings import PoolingStrategy
8
+ from .. import BaseDataset
9
+ from ..db_utils import create_connection
10
+
11
# Directory that contains this module; bundled files are resolved against it.
HERE = pathlib.Path(__file__).resolve().parent

# Column names that DDIMDLDataset classifies as embedding columns: every
# non-empty combination of 'description', 'indication', 'mechanism_of_action'
# and 'pharmacodynamics', joined with '_' (4 + 6 + 4 + 1 = 15 names).
# Every entry ends with a comma on purpose: a missing comma makes Python
# silently concatenate two adjacent string literals into one bogus name.
list_of_embedding_columns = [
    'description',
    'indication',
    'mechanism_of_action',
    'pharmacodynamics',
    'description_indication',
    'description_mechanism_of_action',
    'description_pharmacodynamics',
    'indication_mechanism_of_action',
    'indication_pharmacodynamics',
    'mechanism_of_action_pharmacodynamics',
    'description_indication_mechanism_of_action',
    'description_indication_pharmacodynamics',
    'description_mechanism_of_action_pharmacodynamics',
    'indication_mechanism_of_action_pharmacodynamics',
    'description_indication_mechanism_of_action_pharmacodynamics',
]

# Column names that DDIMDLDataset classifies as chemical property columns.
list_of_chemical_property_columns = [
    'enzyme',
    'target',
    'pathway',
    'smile',
]

# Column names that DDIMDLDataset classifies as NER columns.
list_of_ner_columns = ['tui', 'cui', 'entities']
34
+
35
+
36
def indices_to_binary_vector(indices, vector_length=881):
    """Return a list of ``vector_length`` zeros with ones at ``indices``.

    :param indices: iterable of positions to switch on
    :param vector_length: length of the resulting list (881 by default)
    :return: list of 0/1 integers; positions outside
        ``[0, vector_length)`` are silently skipped
    """
    bits = [0] * vector_length

    for position in indices:
        # Out-of-range positions (including negative ones) are ignored
        # rather than wrapping around or raising.
        if not (0 <= position < vector_length):
            continue
        bits[position] = 1

    return bits
47
+
48
+
49
class DDIMDLDataset(BaseDataset):
    """Dataset read from the ``data/event.db`` file located beside this module.

    Construction reads the drug table into ``self.drugs_df`` and the
    drug-drug interaction events into ``self.ddis_df``. ``self.index_path``
    is set to the ``indexes`` directory beside this module.
    """

    def __init__(self, embedding_size,
                 embedding_dict,
                 embeddings_pooling_strategy: PoolingStrategy,
                 ner_df,
                 chemical_property_columns=None,
                 embedding_columns=None,
                 ner_columns=None,
                 **kwargs):
        """Classify the requested columns, initialise the base dataset and
        load the drug and event tables.

        :param embedding_size: forwarded unchanged to ``BaseDataset``
        :param embedding_dict: forwarded unchanged to ``BaseDataset``
        :param embeddings_pooling_strategy: forwarded unchanged to
            ``BaseDataset``
        :param ner_df: forwarded unchanged to ``BaseDataset``
        :param chemical_property_columns: chemical property column names;
            ``None`` means ``['enzyme', 'target', 'pathway', 'smile']``
        :param embedding_columns: embedding column names; ``None`` means
            an empty list
        :param ner_columns: NER column names; ``None`` means an empty list
        :param kwargs: forwarded unchanged to ``BaseDataset``. If it holds a
            non-empty ``columns`` entry, the three column-list arguments
            above are ignored and rebuilt by sorting every name in
            ``columns`` into its category.
        :raises ValueError: if a name in ``columns`` belongs to none of the
            known column categories
        """
        # None sentinels instead of list defaults: a default list object is
        # shared by every call and would leak mutations between instances.
        if chemical_property_columns is None:
            chemical_property_columns = list(list_of_chemical_property_columns)
        if embedding_columns is None:
            embedding_columns = []
        if ner_columns is None:
            ner_columns = []

        # 'columns' is optional; .get avoids a KeyError when it is omitted.
        columns = kwargs.get('columns')
        if columns:
            chemical_property_columns = []
            embedding_columns = []
            ner_columns = []
            for column in columns:
                if column in list_of_chemical_property_columns:
                    chemical_property_columns.append(column)
                elif column in list_of_embedding_columns:
                    embedding_columns.append(column)
                elif column in list_of_ner_columns:
                    ner_columns.append(column)
                else:
                    # ValueError is a subclass of Exception, so existing
                    # ``except Exception`` handlers keep working.
                    raise ValueError(f"{column} is not related this dataset")

        super().__init__(embedding_size=embedding_size,
                         embedding_dict=embedding_dict,
                         embeddings_pooling_strategy=embeddings_pooling_strategy,
                         ner_df=ner_df,
                         chemical_property_columns=chemical_property_columns,
                         embedding_columns=embedding_columns,
                         ner_columns=ner_columns,
                         **kwargs)

        db = HERE.joinpath('data/event.db')
        conn = create_connection(db)
        print("db prep")
        try:
            self.drugs_df = self.__select_all_drugs_as_dataframe__(conn)
            self.ddis_df = self.__select_all_events__(conn)
        finally:
            # The connection is only needed for the two reads above; release
            # it even when a query fails.
            if conn is not None:
                conn.close()
        # "bitti" is Turkish for "finished"; the message is kept verbatim.
        print("db bitti")

        # Always the bundled 'indexes' directory, regardless of kwargs.
        self.index_path = str(HERE.joinpath('indexes'))

        # NOTE: a disabled experiment used to follow here. It built a Jaccard
        # similarity matrix from a binary 'smile_2' vector per drug and added
        # the stacked id1/id2 rows to ``self.ddis_df`` as a 'smile_2' column.

    def __select_all_drugs_as_dataframe__(self, conn) -> pd.DataFrame:
        """Read every row of the ``drug`` table into a DataFrame.

        :param conn: open connection object providing ``cursor()``
        :return: DataFrame with columns ``index``, ``id``, ``name``,
            ``target``, ``enzyme``, ``pathway`` and ``smile``; the last four
            hold lists obtained by splitting the stored text on ``'|'``
        """
        headers = ['index', 'id', 'name',
                   'target', 'enzyme', 'pathway', 'smile']
        cur = conn.cursor()
        cur.execute(
            '''select "index", id, name, target, enzyme, pathway, smile from drug''')
        rows = cur.fetchall()
        df = pd.DataFrame(columns=headers, data=rows)

        # These columns are stored as '|'-separated text; turn each cell
        # into a list of its parts.
        for column in ('enzyme', 'target', 'pathway', 'smile'):
            df[column] = df[column].apply(lambda value: value.split('|'))

        return df

    def __select_all_events__(self, conn) -> pd.DataFrame:
        """Read every extracted interaction event joined with both drugs.

        Rows of ``extraction`` are joined to ``drug`` twice by name
        (``drugA`` and ``drugB``). ``event_category`` is the mechanism and
        the action concatenated with a single space.

        :param conn: open connection object providing ``cursor()``
        :return: DataFrame with columns ``index``, ``id1``, ``name1``,
            ``id2``, ``name2`` and ``event_category``
        """
        cur = conn.cursor()
        cur.execute('''
                select ex."index", d1.id, d1.name, d2.id, d2.name,  mechanism || ' ' ||action from extraction ex
                join drug d1 on  d1.name = ex.drugA
                join drug d2 on  d2.name = ex.drugB
        ''')

        rows = cur.fetchall()

        headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
        return pd.DataFrame(columns=headers, data=rows)