ddi-fw 0.0.111__py3-none-any.whl → 0.0.112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/ddi_mdl/base.py +47 -0
- ddi_fw/datasets/feature_vector_generation.py +30 -1
- {ddi_fw-0.0.111.dist-info → ddi_fw-0.0.112.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.111.dist-info → ddi_fw-0.0.112.dist-info}/RECORD +6 -6
- {ddi_fw-0.0.111.dist-info → ddi_fw-0.0.112.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.111.dist-info → ddi_fw-0.0.112.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/ddi_mdl/base.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
import pathlib
|
2
2
|
|
3
|
+
import numpy as np
|
3
4
|
import pandas as pd
|
4
5
|
|
6
|
+
from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
|
5
7
|
from ddi_fw.langchain.embeddings import PoolingStrategy
|
6
8
|
from .. import BaseDataset
|
7
9
|
from ..db_utils import create_connection
|
@@ -22,6 +24,18 @@ list_of_chemical_property_columns = ['enzyme',
|
|
22
24
|
list_of_ner_columns = ['tui', 'cui', 'entities']
|
23
25
|
|
24
26
|
|
27
|
+
def indices_to_binary_vector(indices, vector_length=881):
    """Convert a collection of bit positions into a fixed-length 0/1 list.

    Args:
        indices: Iterable of integer positions whose entries should be 1.
        vector_length: Length of the returned list. Defaults to 881
            (NOTE(review): presumably the fingerprint width used for the
            ``smile`` column -- confirm against the data source).

    Returns:
        A list of ``vector_length`` ints holding 1 at every in-range
        position named in ``indices`` and 0 everywhere else.
    """
    # Start from an all-zero vector of the requested length.
    binary_vector = [0] * vector_length

    for index in indices:
        # Positions outside [0, vector_length) are skipped rather than
        # raising; this also stops negative values from wrapping around
        # to the end of the list.
        if 0 <= index < vector_length:
            binary_vector[index] = 1

    return binary_vector
|
38
|
+
|
25
39
|
class DDIMDLDataset(BaseDataset):
|
26
40
|
def __init__(self, embedding_size,
|
27
41
|
embedding_dict,
|
@@ -70,6 +84,34 @@ class DDIMDLDataset(BaseDataset):
|
|
70
84
|
print("db bitti")
|
71
85
|
self.index_path = kwargs.get('index_path')
|
72
86
|
|
87
|
+
jaccard_sim_dict = {}
|
88
|
+
sim_matrix_gen = SimilarityMatrixGenerator()
|
89
|
+
jaccard_sim_dict["smile_2"] = sim_matrix_gen.create_jaccard_similarity_matrices(
|
90
|
+
self.drugs_df["smile_2"].to_list())
|
91
|
+
|
92
|
+
similarity_matrices = {}
|
93
|
+
drugbank_ids = self.drugs_df['id'].to_list()
|
94
|
+
new_columns = {}
|
95
|
+
for idx in range(len(drugbank_ids)):
|
96
|
+
new_columns[idx] = drugbank_ids[idx]
|
97
|
+
for idx in range(len(drugbank_ids)):
|
98
|
+
new_columns[idx] = drugbank_ids[idx]
|
99
|
+
new_df = pd.DataFrame.from_dict(jaccard_sim_dict["smile_2"])
|
100
|
+
new_df = new_df.rename(index=new_columns, columns=new_columns)
|
101
|
+
similarity_matrices["smile_2"] = new_df
|
102
|
+
|
103
|
+
def lambda_fnc(row, value):
|
104
|
+
if row['id1'] in value and row['id2'] in value:
|
105
|
+
return np.float16(np.hstack(
|
106
|
+
(value[row['id1']], value[row['id2']])))
|
107
|
+
for key, value in similarity_matrices.items():
|
108
|
+
|
109
|
+
print(f'sim matrix: {key}')
|
110
|
+
self.ddis_df[key] = self.ddis_df.apply(
|
111
|
+
lambda_fnc, args=(value,), axis=1)
|
112
|
+
print(self.ddis_df[key].head())
|
113
|
+
print("init finished")
|
114
|
+
|
73
115
|
def __select_all_drugs_as_dataframe__(self, conn):
|
74
116
|
headers = ['index', 'id', 'name',
|
75
117
|
'target', 'enzyme', 'pathway', 'smile']
|
@@ -82,6 +124,11 @@ class DDIMDLDataset(BaseDataset):
|
|
82
124
|
df['target'] = df['target'].apply(lambda x: x.split('|'))
|
83
125
|
df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
|
84
126
|
df['smile'] = df['smile'].apply(lambda x: x.split('|'))
|
127
|
+
df['smile_2'] = df['smile'].apply(lambda x: indices_to_binary_vector(indices = list(map(int, x.split('|'))), vector_length = 881))
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
85
132
|
return df
|
86
133
|
|
87
134
|
def __select_all_events__(self, conn):
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import numpy as np
|
2
|
+
import pandas as pd
|
2
3
|
from scipy.spatial.distance import pdist, squareform
|
3
4
|
|
4
5
|
# TODO: use pd.unique
|
@@ -48,12 +49,40 @@ class VectorGenerator:
|
|
48
49
|
def __init__(self, df):
|
49
50
|
self.df = df
|
50
51
|
|
52
|
+
# https://github.com/YifanDengWHU/DDIMDL/blob/master/DDIMDL.py#L86
|
53
|
+
# def generate_feature_vector(self, column):
|
54
|
+
# # Initialize list to store all distinct features across all rows
|
55
|
+
# all_features = []
|
56
|
+
|
57
|
+
# # Loop through the column to extract features, split by '|', and collect all distinct ones
|
58
|
+
# drug_list = np.array(self.df[column]).tolist()
|
59
|
+
# for i in drug_list:
|
60
|
+
# for each_feature in i.split('|'):
|
61
|
+
# if each_feature not in all_features:
|
62
|
+
# all_features.append(each_feature)
|
63
|
+
|
64
|
+
# # Initialize a matrix to hold feature vectors (rows for each element, columns for each distinct feature)
|
65
|
+
# feature_matrix = np.zeros((len(drug_list), len(all_features)), dtype=float)
|
66
|
+
|
67
|
+
# # Create a DataFrame to store the feature matrix with the column names as the distinct features
|
68
|
+
# df_feature = pd.DataFrame(feature_matrix, columns=all_features)
|
69
|
+
|
70
|
+
# # Fill the feature matrix (set value to 1 if feature is present for the specific item in the column)
|
71
|
+
# for i in range(len(drug_list)):
|
72
|
+
# for each_feature in drug_list[i].split('|'):
|
73
|
+
# if each_feature in all_features:
|
74
|
+
# df_feature[each_feature].iloc[i] = 1
|
75
|
+
|
76
|
+
# # Convert DataFrame to numpy array and return
|
77
|
+
# print("Feature vectors generated")
|
78
|
+
# return df_feature.to_numpy()
|
79
|
+
|
51
80
|
def generate_feature_vector(self, column):
|
52
81
|
bit_vectors = []
|
53
82
|
map = dict()
|
54
83
|
idx = 0
|
55
84
|
count = find_distinct_elements_count(self.df[column])
|
56
|
-
print(f"
|
85
|
+
print(f"{column} has {count} different items")
|
57
86
|
for ind in self.df.index:
|
58
87
|
e = self.df[column][ind]
|
59
88
|
# vector = np.zeros(len(sorted_features))
|
@@ -2,10 +2,10 @@ ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,3
|
|
2
2
|
ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
|
3
3
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
4
4
|
ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
|
5
|
-
ddi_fw/datasets/feature_vector_generation.py,sha256=
|
5
|
+
ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
|
6
6
|
ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
7
7
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
8
|
-
ddi_fw/datasets/ddi_mdl/base.py,sha256=
|
8
|
+
ddi_fw/datasets/ddi_mdl/base.py,sha256=QtDAaXpqDiPxQ-xOVA0Xd-wcm1sRY5FpO_4zNAKoksM,6146
|
9
9
|
ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
|
10
10
|
ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
|
11
11
|
ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
|
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
|
|
91
91
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
92
92
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
93
93
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
94
|
-
ddi_fw-0.0.
|
95
|
-
ddi_fw-0.0.
|
96
|
-
ddi_fw-0.0.
|
97
|
-
ddi_fw-0.0.
|
94
|
+
ddi_fw-0.0.112.dist-info/METADATA,sha256=tF5FdHznWj9YznDQbTGQtsspXYlHOxmEJLxeEcI7oLg,1967
|
95
|
+
ddi_fw-0.0.112.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
96
|
+
ddi_fw-0.0.112.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
97
|
+
ddi_fw-0.0.112.dist-info/RECORD,,
|
File without changes
|
File without changes
|