ddi-fw 0.0.110__py3-none-any.whl → 0.0.112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/datasets/ddi_mdl/base.py +47 -0
- ddi_fw/datasets/feature_vector_generation.py +30 -1
- ddi_fw/langchain/storage.py +49 -38
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/RECORD +8 -8
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED

```diff
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
         filtered_df = self.drugs_df
         combined_df = filtered_df.copy()
 
-        if self.ner_df:
+        if self.ner_df is not None and not self.ner_df.empty:
             filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
                 drug_ids)]
             filtered_ner_df = self.ner_df.copy()
```
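This guard change matters because pandas refuses to evaluate a DataFrame as a boolean, so the old `if self.ner_df:` raised rather than branched. A minimal sketch of the failure mode the new check avoids:

```python
import pandas as pd

ner_df = pd.DataFrame({'drugbank_id': ['DB00001']})

try:
    if ner_df:  # the old guard
        pass
except ValueError as err:
    print(err)  # "The truth value of a DataFrame is ambiguous. ..."

# The new guard is explicit, and also correct for None and for empty frames:
if ner_df is not None and not ner_df.empty:
    print("NER features present")
```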
ddi_fw/datasets/ddi_mdl/base.py
CHANGED

```diff
@@ -1,7 +1,9 @@
 import pathlib
 
+import numpy as np
 import pandas as pd
 
+from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
 from ddi_fw.langchain.embeddings import PoolingStrategy
 from .. import BaseDataset
 from ..db_utils import create_connection
```
```diff
@@ -22,6 +24,18 @@ list_of_chemical_property_columns = ['enzyme',
 list_of_ner_columns = ['tui', 'cui', 'entities']
 
 
+def indices_to_binary_vector(indices, vector_length=881):
+    # vector_length = len(indices)
+    # Initialize a zero vector of the given length
+    binary_vector = [0] * vector_length
+
+    # Set the positions specified by indices to 1
+    for index in indices:
+        if 0 <= index < vector_length:
+            binary_vector[index] = 1
+
+    return binary_vector
+
 class DDIMDLDataset(BaseDataset):
     def __init__(self, embedding_size,
                  embedding_dict,
```
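The new module-level helper turns a list of substructure indices into a fixed-length bit vector; the 881 default presumably matches the 881-bit PubChem substructure fingerprint. A quick illustration of its behavior (assuming ddi_fw is installed), including the silent drop of out-of-range indices:

```python
from ddi_fw.datasets.ddi_mdl.base import indices_to_binary_vector

raw = "2|45|880"  # '|'-separated substructure indices, as the smile column stores them
bits = indices_to_binary_vector(list(map(int, raw.split('|'))), vector_length=881)

print(len(bits), sum(bits))          # 881 3
print(bits[2], bits[45], bits[880])  # 1 1 1
# Out-of-range indices are ignored rather than raising:
print(sum(indices_to_binary_vector([999], vector_length=881)))  # 0
```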
```diff
@@ -70,6 +84,34 @@ class DDIMDLDataset(BaseDataset):
         print("db bitti")
         self.index_path = kwargs.get('index_path')
 
+        jaccard_sim_dict = {}
+        sim_matrix_gen = SimilarityMatrixGenerator()
+        jaccard_sim_dict["smile_2"] = sim_matrix_gen.create_jaccard_similarity_matrices(
+            self.drugs_df["smile_2"].to_list())
+
+        similarity_matrices = {}
+        drugbank_ids = self.drugs_df['id'].to_list()
+        new_columns = {}
+        for idx in range(len(drugbank_ids)):
+            new_columns[idx] = drugbank_ids[idx]
+        for idx in range(len(drugbank_ids)):
+            new_columns[idx] = drugbank_ids[idx]
+        new_df = pd.DataFrame.from_dict(jaccard_sim_dict["smile_2"])
+        new_df = new_df.rename(index=new_columns, columns=new_columns)
+        similarity_matrices["smile_2"] = new_df
+
+        def lambda_fnc(row, value):
+            if row['id1'] in value and row['id2'] in value:
+                return np.float16(np.hstack(
+                    (value[row['id1']], value[row['id2']])))
+        for key, value in similarity_matrices.items():
+
+            print(f'sim matrix: {key}')
+            self.ddis_df[key] = self.ddis_df.apply(
+                lambda_fnc, args=(value,), axis=1)
+            print(self.ddis_df[key].head())
+        print("init finished")
+
     def __select_all_drugs_as_dataframe__(self, conn):
         headers = ['index', 'id', 'name',
                    'target', 'enzyme', 'pathway', 'smile']
```
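Each interaction row thus gains, per similarity matrix, the concatenated similarity profiles of its two drugs. A toy reproduction of `lambda_fnc` with invented values (not the package's data):

```python
import numpy as np
import pandas as pd

# Stand-in for similarity_matrices["smile_2"]: a symmetric,
# DrugBank-id-indexed Jaccard matrix with made-up entries.
sim = pd.DataFrame(
    [[1.0, 0.2, 0.5],
     [0.2, 1.0, 0.1],
     [0.5, 0.1, 1.0]],
    index=['DB01', 'DB02', 'DB03'],
    columns=['DB01', 'DB02', 'DB03'])

ddis = pd.DataFrame({'id1': ['DB01'], 'id2': ['DB03']})

def pair_feature(row, value):
    # Mirrors lambda_fnc above: concatenate both drugs' similarity profiles.
    if row['id1'] in value and row['id2'] in value:
        return np.float16(np.hstack((value[row['id1']], value[row['id2']])))

ddis['smile_2'] = ddis.apply(pair_feature, args=(sim,), axis=1)
print(ddis['smile_2'][0])  # six float16 values: DB01's profile, then DB03's
```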
```diff
@@ -82,6 +124,11 @@ class DDIMDLDataset(BaseDataset):
         df['target'] = df['target'].apply(lambda x: x.split('|'))
         df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
         df['smile'] = df['smile'].apply(lambda x: x.split('|'))
+        df['smile_2'] = df['smile'].apply(lambda x: indices_to_binary_vector(indices = list(map(int, x.split('|'))), vector_length = 881))
+
+
+
+
         return df
 
     def __select_all_events__(self, conn):
```
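One caveat when reading this hunk: the preceding line has already replaced `df['smile']` with lists of fragments, so the inner `x.split('|')` in the new `smile_2` expression appears to receive a list rather than the raw string. A minimal reproduction of that type mismatch:

```python
import pandas as pd

s = pd.Series(['2|45|880']).apply(lambda x: x.split('|'))  # now a Series of lists
try:
    s.apply(lambda x: x.split('|'))  # same shape as the smile_2 lambda
except AttributeError as err:
    print(err)  # 'list' object has no attribute 'split'
```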
ddi_fw/datasets/feature_vector_generation.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from scipy.spatial.distance import pdist, squareform
 
 # todo pd.unique kullan
```
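`SimilarityMatrixGenerator` itself is outside this diff, but given the module's scipy imports, the Jaccard similarity matrix consumed in ddi_mdl/base.py can be sketched as follows. This is a hypothetical sketch, not the package's actual implementation:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Pairwise Jaccard similarity over binary fingerprint vectors:
# pdist yields condensed Jaccard *distances*; squareform expands them,
# and 1 - d converts to similarity (unit diagonal).
vectors = np.array([[1, 0, 1, 1],
                    [1, 1, 0, 1],
                    [0, 0, 1, 0]], dtype=bool)
similarity = 1 - squareform(pdist(vectors, metric='jaccard'))
print(similarity.round(2))  # 3x3 symmetric matrix, ones on the diagonal
```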
```diff
@@ -48,12 +49,40 @@ class VectorGenerator:
     def __init__(self, df):
         self.df = df
 
+    # https://github.com/YifanDengWHU/DDIMDL/blob/master/DDIMDL.py#L86
+    # def generate_feature_vector(self, column):
+    #     # Initialize list to store all distinct features across all rows
+    #     all_features = []
+
+    #     # Loop through the column to extract features, split by '|', and collect all distinct ones
+    #     drug_list = np.array(self.df[column]).tolist()
+    #     for i in drug_list:
+    #         for each_feature in i.split('|'):
+    #             if each_feature not in all_features:
+    #                 all_features.append(each_feature)
+
+    #     # Initialize a matrix to hold feature vectors (rows for each element, columns for each distinct feature)
+    #     feature_matrix = np.zeros((len(drug_list), len(all_features)), dtype=float)
+
+    #     # Create a DataFrame to store the feature matrix with the column names as the distinct features
+    #     df_feature = pd.DataFrame(feature_matrix, columns=all_features)
+
+    #     # Fill the feature matrix (set value to 1 if feature is present for the specific item in the column)
+    #     for i in range(len(drug_list)):
+    #         for each_feature in drug_list[i].split('|'):
+    #             if each_feature in all_features:
+    #                 df_feature[each_feature].iloc[i] = 1
+
+    #     # Convert DataFrame to numpy array and return
+    #     print("Feature vectors generated")
+    #     return df_feature.to_numpy()
+
     def generate_feature_vector(self, column):
         bit_vectors = []
         map = dict()
         idx = 0
         count = find_distinct_elements_count(self.df[column])
-        print(f"
+        print(f"{column} has {count} different items")
         for ind in self.df.index:
             e = self.df[column][ind]
             # vector = np.zeros(len(sorted_features))
```
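The commented-out block preserves the referenced DDIMDL implementation; the distilled idea is a multi-hot encoding of '|'-separated feature strings. A self-contained toy version of that idea (illustrative data, not the package's API):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'enzyme': ['CYP3A4|CYP2D6', 'CYP2D6', 'CYP1A2|CYP3A4']})

# Collect the distinct features, then set a 1 wherever a row carries one.
features = sorted({f for row in df['enzyme'] for f in row.split('|')})
matrix = np.zeros((len(df), len(features)))
for i, row in enumerate(df['enzyme']):
    for f in row.split('|'):
        matrix[i, features.index(f)] = 1

print(features)  # ['CYP1A2', 'CYP2D6', 'CYP3A4']
print(matrix)    # one row per drug, one column per distinct feature
```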
ddi_fw/langchain/storage.py
CHANGED

```diff
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_core.embeddings import Embeddings
+import time
 
 
 from langchain.docstore.document import Document
```
```diff
@@ -102,7 +103,7 @@ class DataFrameToVectorDB:
                  persist_directory,
                  embeddings: Embeddings,
                  text_splitter: TextSplitter,
-                 batch_size=
+                 batch_size=1024):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
```
```diff
@@ -118,10 +119,10 @@ class DataFrameToVectorDB:
 
     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s, e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list, batch_size):
             yield input_list[s:e+1]
 
-    def store_documents(self, df, columns, page_content_columns):
+    def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
         """
         Core function that processes the documents and adds them to the vector database.
         """
```
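`__split_list` previously dropped the caller's batch size, so `split_dataframe_indices` always fell back to its own default. That helper is defined elsewhere in ddi_fw and is not shown here; a hypothetical sketch of the contract the loop relies on, given the inclusive `s:e+1` slice:

```python
# Hypothetical stand-in for split_dataframe_indices: yield inclusive
# (start, end) index pairs covering the list in batch_size-sized runs.
def split_indices(items, batch_size):
    for s in range(0, len(items), batch_size):
        yield s, min(s + batch_size, len(items)) - 1

docs = list(range(10))
for s, e in split_indices(docs, 4):
    print(docs[s:e + 1])  # [0, 1, 2, 3] / [4, 5, 6, 7] / [8, 9]
```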
```diff
@@ -131,39 +132,49 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            documents
-
-
-
-
+            if partial_df_size:
+                total = 0
+                partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+                for partial_df in partial_dfs:
+                    import torch
+
+                    documents = []
+                    loader = DataFrameLoader(
+                        data_frame=partial_df, page_content_column=page_content_column)
+                    loaded_docs = loader.load()
+                    # print(loaded_docs)
+                    # documents.extend(self.__split_docs(loaded_docs))
+                    total += len(partial_df)
+
+                    self.vectordb.add_documents(loaded_docs)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}: {total}/{len(col_df)}")
+                    torch.cuda.empty_cache()
+                    # time.sleep(30)  # The GPU will not be used during this period
+
+                    # split_docs_chunked = self.__split_list(
+                    #     loaded_docs, self.batch_size)
+                    # print(f"Number of chunks: {len(split_docs_chunked)}")
+                    # for split_docs_chunk in split_docs_chunked:
+                    #     print(f"Split docs size: {len(split_docs_chunk)}")
+                    #     self.vectordb.add_documents(split_docs_chunk)
+                    #     self.vectordb.persist()
+            else:
+                documents = []
+                print(col_df.shape)
+                loader = DataFrameLoader(
+                    data_frame=col_df, page_content_column=page_content_column)
+                loaded_docs = loader.load()
+                documents.extend(self.__split_docs(loaded_docs))
+                print(f"Documents size: {len(loaded_docs)}")
+                split_docs_chunked = self.__split_list(
+                    documents, self.batch_size)
+                for split_docs_chunk in split_docs_chunked:
+                    import torch
+                    torch.cuda.empty_cache()
+                    self.vectordb.add_documents(split_docs_chunk)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}, size:{len(split_docs_chunk)}")
 
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
```
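The rewritten body trades a single full-DataFrame load for optional incremental ingestion: with `partial_df_size` set, each partial frame is loaded, added, and persisted before `torch.cuda.empty_cache()` releases embedding memory, which bounds GPU usage on large frames. `split_dataframe` is another ddi_fw helper not shown in this diff; a hypothetical sketch of the behavior the loop relies on (the real helper's tail handling may differ, e.g. merging a short final slice):

```python
import pandas as pd

# Hypothetical stand-in for split_dataframe: consecutive row slices of
# roughly `min_size` rows each.
def split_dataframe(df, min_size):
    for start in range(0, len(df), min_size):
        yield df.iloc[start:start + min_size]

frame = pd.DataFrame({'text': [f'doc {i}' for i in range(10)]})
print([len(part) for part in split_dataframe(frame, min_size=4)])  # [4, 4, 2]
```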
```diff
@@ -239,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
         persist_directory=persist_directory,
         embeddings=model,
         text_splitter=text_splitter,
-        batch_size=
+        batch_size=batch_size)
     to_vector_db.store_documents(
-        df, columns, page_content_columns)
+        df, columns, page_content_columns, partial_df_size=batch_size)
```
{ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/RECORD
CHANGED

```diff
@@ -1,11 +1,11 @@
 ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
-ddi_fw/datasets/feature_vector_generation.py,sha256=
+ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
 ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
-ddi_fw/datasets/ddi_mdl/base.py,sha256=
+ddi_fw/datasets/ddi_mdl/base.py,sha256=QtDAaXpqDiPxQ-xOVA0Xd-wcm1sRY5FpO_4zNAKoksM,6146
 ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
 ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
 ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.110.dist-info/METADATA,sha256=
-ddi_fw-0.0.110.dist-info/WHEEL,sha256=
-ddi_fw-0.0.110.dist-info/top_level.txt,sha256=
-ddi_fw-0.0.110.dist-info/RECORD,,
+ddi_fw-0.0.112.dist-info/METADATA,sha256=tF5FdHznWj9YznDQbTGQtsspXYlHOxmEJLxeEcI7oLg,1967
+ddi_fw-0.0.112.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.112.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.112.dist-info/RECORD,,
```
{ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/WHEEL
File without changes

{ddi_fw-0.0.110.dist-info → ddi_fw-0.0.112.dist-info}/top_level.txt
File without changes