ddi-fw 0.0.110__py3-none-any.whl → 0.0.112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
          filtered_df = self.drugs_df
          combined_df = filtered_df.copy()

-         if self.ner_df:
+         if self.ner_df is not None and not self.ner_df.empty:
              filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
                  drug_ids)]
              filtered_ner_df = self.ner_df.copy()
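Note: the stricter guard matters because a pandas DataFrame cannot be used directly in a boolean context; `if self.ner_df:` raises `ValueError: The truth value of a DataFrame is ambiguous` whenever NER data is actually loaded. A minimal standalone sketch of the behaviour (toy frame, not package data):

    import pandas as pd

    ner_df = pd.DataFrame({'drugbank_id': ['DB00001']})
    # bool(ner_df)  # would raise ValueError: The truth value of a DataFrame is ambiguous
    if ner_df is not None and not ner_df.empty:
        print("NER features available")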
ddi_fw/datasets/ddi_mdl/base.py CHANGED
@@ -1,7 +1,9 @@
  import pathlib

+ import numpy as np
  import pandas as pd

+ from ddi_fw.datasets.feature_vector_generation import SimilarityMatrixGenerator
  from ddi_fw.langchain.embeddings import PoolingStrategy
  from .. import BaseDataset
  from ..db_utils import create_connection
@@ -22,6 +24,18 @@ list_of_chemical_property_columns = ['enzyme',
  list_of_ner_columns = ['tui', 'cui', 'entities']


+ def indices_to_binary_vector(indices, vector_length=881):
+     # vector_length = len(indices)
+     # Initialize a zero vector of the given length
+     binary_vector = [0] * vector_length
+
+     # Set the positions specified by indices to 1
+     for index in indices:
+         if 0 <= index < vector_length:
+             binary_vector[index] = 1
+
+     return binary_vector
+
  class DDIMDLDataset(BaseDataset):
      def __init__(self, embedding_size,
                   embedding_dict,
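For context, the new `indices_to_binary_vector` helper expands a list of set-bit positions into a fixed-length 0/1 vector; the 881 default matches the length of a PubChem-style substructure fingerprint, which the pipe-separated `smile` indices appear to encode. A quick usage sketch with made-up indices:

    bits = indices_to_binary_vector([0, 5, 880], vector_length=881)
    print(len(bits), sum(bits))  # 881 3
    print(bits[:8])              # [1, 0, 0, 0, 0, 1, 0, 0]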
@@ -70,6 +84,34 @@ class DDIMDLDataset(BaseDataset):
          print("db bitti")
          self.index_path = kwargs.get('index_path')

+         jaccard_sim_dict = {}
+         sim_matrix_gen = SimilarityMatrixGenerator()
+         jaccard_sim_dict["smile_2"] = sim_matrix_gen.create_jaccard_similarity_matrices(
+             self.drugs_df["smile_2"].to_list())
+
+         similarity_matrices = {}
+         drugbank_ids = self.drugs_df['id'].to_list()
+         new_columns = {}
+         for idx in range(len(drugbank_ids)):
+             new_columns[idx] = drugbank_ids[idx]
+         for idx in range(len(drugbank_ids)):
+             new_columns[idx] = drugbank_ids[idx]
+         new_df = pd.DataFrame.from_dict(jaccard_sim_dict["smile_2"])
+         new_df = new_df.rename(index=new_columns, columns=new_columns)
+         similarity_matrices["smile_2"] = new_df
+
+         def lambda_fnc(row, value):
+             if row['id1'] in value and row['id2'] in value:
+                 return np.float16(np.hstack(
+                     (value[row['id1']], value[row['id2']])))
+         for key, value in similarity_matrices.items():
+
+             print(f'sim matrix: {key}')
+             self.ddis_df[key] = self.ddis_df.apply(
+                 lambda_fnc, args=(value,), axis=1)
+             print(self.ddis_df[key].head())
+         print("init finished")
+
      def __select_all_drugs_as_dataframe__(self, conn):
          headers = ['index', 'id', 'name',
                     'target', 'enzyme', 'pathway', 'smile']
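The added `__init__` block attaches pairwise similarity features to each interaction: a Jaccard similarity matrix over the `smile_2` bit vectors is re-labelled with DrugBank ids, and for every (id1, id2) row the two matching columns are concatenated into a single float16 vector. A standalone sketch of the same idea, assuming `SimilarityMatrixGenerator` computes 1 − Jaccard distance along the lines of scipy's `pdist` (toy ids and fingerprints, not the real data):

    import numpy as np
    import pandas as pd
    from scipy.spatial.distance import pdist, squareform

    ids = ['DB00001', 'DB00002', 'DB00003']
    fingerprints = np.array([[1, 0, 1, 1],
                             [1, 1, 0, 1],
                             [0, 1, 0, 0]], dtype=bool)

    # Jaccard similarity = 1 - Jaccard distance between binary fingerprints
    sim = 1 - squareform(pdist(fingerprints, metric='jaccard'))
    sim_df = pd.DataFrame(sim, index=ids, columns=ids)

    # one DDI row (id1, id2) -> concatenated similarity profiles of both drugs
    feature = np.float16(np.hstack((sim_df['DB00001'], sim_df['DB00003'])))
    print(feature.shape)  # (6,)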
@@ -82,6 +124,11 @@ class DDIMDLDataset(BaseDataset):
          df['target'] = df['target'].apply(lambda x: x.split('|'))
          df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
          df['smile'] = df['smile'].apply(lambda x: x.split('|'))
+         df['smile_2'] = df['smile'].apply(lambda x: indices_to_binary_vector(indices = list(map(int, x.split('|'))), vector_length = 881))
+
+
+
+
          return df

      def __select_all_events__(self, conn):
ddi_fw/datasets/feature_vector_generation.py CHANGED
@@ -1,4 +1,5 @@
  import numpy as np
+ import pandas as pd
  from scipy.spatial.distance import pdist, squareform

  # todo pd.unique kullan
@@ -48,12 +49,40 @@ class VectorGenerator:
      def __init__(self, df):
          self.df = df

+     # https://github.com/YifanDengWHU/DDIMDL/blob/master/DDIMDL.py#L86
+     # def generate_feature_vector(self, column):
+     #     # Initialize list to store all distinct features across all rows
+     #     all_features = []
+
+     #     # Loop through the column to extract features, split by '|', and collect all distinct ones
+     #     drug_list = np.array(self.df[column]).tolist()
+     #     for i in drug_list:
+     #         for each_feature in i.split('|'):
+     #             if each_feature not in all_features:
+     #                 all_features.append(each_feature)
+
+     #     # Initialize a matrix to hold feature vectors (rows for each element, columns for each distinct feature)
+     #     feature_matrix = np.zeros((len(drug_list), len(all_features)), dtype=float)
+
+     #     # Create a DataFrame to store the feature matrix with the column names as the distinct features
+     #     df_feature = pd.DataFrame(feature_matrix, columns=all_features)
+
+     #     # Fill the feature matrix (set value to 1 if feature is present for the specific item in the column)
+     #     for i in range(len(drug_list)):
+     #         for each_feature in drug_list[i].split('|'):
+     #             if each_feature in all_features:
+     #                 df_feature[each_feature].iloc[i] = 1
+
+     #     # Convert DataFrame to numpy array and return
+     #     print("Feature vectors generated")
+     #     return df_feature.to_numpy()
+
      def generate_feature_vector(self, column):
          bit_vectors = []
          map = dict()
          idx = 0
          count = find_distinct_elements_count(self.df[column])
-         print(f"find_distinct_elements_count bitti, boyut: {count}")
+         print(f"{column} has {count} different items")
          for ind in self.df.index:
              e = self.df[column][ind]
              # vector = np.zeros(len(sorted_features))
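The commented-out block preserves the reference DDIMDL encoding for comparison; the active `generate_feature_vector` keeps the same idea of a binary presence vector over all distinct '|'-separated feature values. A condensed standalone version of that encoding (toy column, independent of the class):

    import numpy as np
    import pandas as pd

    column = pd.Series(['CYP3A4|CYP2D6', 'CYP2D6', 'CYP1A2|CYP3A4'])

    # collect the distinct feature tokens, then mark presence per row
    all_features = sorted({f for row in column for f in row.split('|')})
    vectors = np.zeros((len(column), len(all_features)), dtype=float)
    for i, row in enumerate(column):
        for f in row.split('|'):
            vectors[i, all_features.index(f)] = 1

    print(pd.DataFrame(vectors, columns=all_features))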
ddi_fw/langchain/storage.py CHANGED
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
  # from langchain_community.vectorstores import Chroma
  from langchain_community.vectorstores.utils import filter_complex_metadata
  from langchain_core.embeddings import Embeddings
+ import time


  from langchain.docstore.document import Document
@@ -102,7 +103,7 @@ class DataFrameToVectorDB:
                   persist_directory,
                   embeddings: Embeddings,
                   text_splitter: TextSplitter,
-                  batch_size=1000):
+                  batch_size=1024):
          self.collection_name = collection_name
          self.persist_directory = persist_directory
          self.embeddings = embeddings
@@ -118,10 +119,10 @@ class DataFrameToVectorDB:

      def __split_list(self, input_list, batch_size):
          # for i in range(0, len(input_list), batch_size):
-         for s, e in split_dataframe_indices(input_list):
+         for s, e in split_dataframe_indices(input_list, batch_size):
              yield input_list[s:e+1]

-     def store_documents(self, df, columns, page_content_columns):
+     def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
          """
          Core function that processes the documents and adds them to the vector database.
          """
@@ -131,39 +132,49 @@ class DataFrameToVectorDB:
          col_df = df[copy_columns].copy()
          col_df.dropna(subset=[page_content_column], inplace=True)
          col_df['type'] = page_content_column  # Set the type column
-         # if partial_df_size:
-         #     documents = []
-
-         #     partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
-         #     for partial_df in partial_dfs:
-         #         print(f"{page_content_column}, size:{len(partial_df)}")
-         #         print(partial_df.count())
-         #         loader = DataFrameLoader(
-         #             data_frame=partial_df, page_content_column=page_content_column)
-         #         loaded_docs = loader.load()
-         #         documents.extend(self.__split_docs(loaded_docs))
-         #     print(f"Documents size: {len(documents)}")
-         #     split_docs_chunked = self.__split_list(
-         #         documents, self.batch_size)
-
-         #     for split_docs_chunk in split_docs_chunked:
-         #         print(f"Split docs size: {len(split_docs_chunk)}")
-         #         self.vectordb.add_documents(split_docs_chunk)
-         #         self.vectordb.persist()
-         # else:
-         documents = []
-
-         loader = DataFrameLoader(
-             data_frame=col_df, page_content_column=page_content_column)
-         loaded_docs = loader.load()
-         documents.extend(self.__split_docs(loaded_docs))
-
-         split_docs_chunked = self.__split_list(
-             documents, self.batch_size)
-
-         for split_docs_chunk in split_docs_chunked:
-             self.vectordb.add_documents(split_docs_chunk)
-             self.vectordb.persist()
+         if partial_df_size:
+             total = 0
+             partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+             for partial_df in partial_dfs:
+                 import torch
+
+                 documents = []
+                 loader = DataFrameLoader(
+                     data_frame=partial_df, page_content_column=page_content_column)
+                 loaded_docs = loader.load()
+                 # print(loaded_docs)
+                 # documents.extend(self.__split_docs(loaded_docs))
+                 total += len(partial_df)
+
+                 self.vectordb.add_documents(loaded_docs)
+                 self.vectordb.persist()
+                 print(f"{page_content_column}: {total}/{len(col_df)}")
+                 torch.cuda.empty_cache()
+                 # time.sleep(30)  # The GPU will not be used during this period
+
+                 # split_docs_chunked = self.__split_list(
+                 #     loaded_docs, self.batch_size)
+                 # print(f"Number of chunks: {len(split_docs_chunked)}")
+                 # for split_docs_chunk in split_docs_chunked:
+                 #     print(f"Split docs size: {len(split_docs_chunk)}")
+                 #     self.vectordb.add_documents(split_docs_chunk)
+                 #     self.vectordb.persist()
+         else:
+             documents = []
+             print(col_df.shape)
+             loader = DataFrameLoader(
+                 data_frame=col_df, page_content_column=page_content_column)
+             loaded_docs = loader.load()
+             documents.extend(self.__split_docs(loaded_docs))
+             print(f"Documents size: {len(loaded_docs)}")
+             split_docs_chunked = self.__split_list(
+                 documents, self.batch_size)
+             for split_docs_chunk in split_docs_chunked:
+                 import torch
+                 torch.cuda.empty_cache()
+                 self.vectordb.add_documents(split_docs_chunk)
+                 self.vectordb.persist()
+                 print(f"{page_content_column}, size:{len(split_docs_chunk)}")


  def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
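With `partial_df_size` set, `store_documents` now loads and persists the frame slice by slice, calling `torch.cuda.empty_cache()` between slices instead of accumulating one large document list. A hedged usage sketch; the import path for the class is inferred from the file layout, and the embedding model, splitter, and column names below are illustrative placeholders rather than values taken from the package config:

    import pandas as pd
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from ddi_fw.langchain.storage import DataFrameToVectorDB  # assumed import path

    db = DataFrameToVectorDB(collection_name="ddi_descriptions",
                             persist_directory="embeddings",
                             embeddings=HuggingFaceEmbeddings(
                                 model_name="sentence-transformers/all-MiniLM-L6-v2"),
                             text_splitter=RecursiveCharacterTextSplitter(chunk_size=512),
                             batch_size=1024)

    df = pd.DataFrame({"id": ["DB00001"], "description": ["An example drug description."]})
    # process the 'description' column in 128-row slices, persisting after each slice
    db.store_documents(df, columns=["id"], page_content_columns=["description"],
                       partial_df_size=128)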
@@ -239,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
              persist_directory=persist_directory,
              embeddings=model,
              text_splitter=text_splitter,
-             batch_size=1024)
+             batch_size=batch_size)
          to_vector_db.store_documents(
-             df, columns, page_content_columns)
+             df, columns, page_content_columns, partial_df_size=batch_size)
ddi_fw-0.0.112.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ddi_fw
- Version: 0.0.110
+ Version: 0.0.112
  Summary: Do not use :)
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
ddi_fw-0.0.112.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
  ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
- ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
+ ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
  ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
- ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
+ ddi_fw/datasets/feature_vector_generation.py,sha256=gvjpEzkgVV8dp4V8NMMv59u0v-1tNAJ7v83R-keWGoA,4748
  ddi_fw/datasets/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
  ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
- ddi_fw/datasets/ddi_mdl/base.py,sha256=45cUmDRyyD8CC07oj5Dka2DWfgWU4Qi7-Am0vCvRKbo,4237
+ ddi_fw/datasets/ddi_mdl/base.py,sha256=QtDAaXpqDiPxQ-xOVA0Xd-wcm1sRY5FpO_4zNAKoksM,6146
  ddi_fw/datasets/ddi_mdl/readme.md,sha256=WC6lpmsEKvIISnZqENY7TWtzCQr98HPpE3oRsBl8pIw,625
  ddi_fw/datasets/ddi_mdl/data/event.db,sha256=cmlSsf9MYjRzqR-mw3cUDnTnfT6FkpOG2yCl2mMwwew,30580736
  ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt,sha256=XVlDqYATckrQwNSXqMSKVBqyoN_Hg8SK6CL-XMdLADY,102176
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
  ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
- ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
+ ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
  ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
  ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
- ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
- ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
- ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
- ddi_fw-0.0.110.dist-info/RECORD,,
+ ddi_fw-0.0.112.dist-info/METADATA,sha256=tF5FdHznWj9YznDQbTGQtsspXYlHOxmEJLxeEcI7oLg,1967
+ ddi_fw-0.0.112.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ ddi_fw-0.0.112.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ ddi_fw-0.0.112.dist-info/RECORD,,