ddi-fw 0.0.110__py3-none-any.whl → 0.0.111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
274
274
  filtered_df = self.drugs_df
275
275
  combined_df = filtered_df.copy()
276
276
 
277
- if self.ner_df:
277
+ if self.ner_df is not None and not self.ner_df.empty:
278
278
  filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
279
279
  drug_ids)]
280
280
  filtered_ner_df = self.ner_df.copy()
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
3
3
  # from langchain_community.vectorstores import Chroma
4
4
  from langchain_community.vectorstores.utils import filter_complex_metadata
5
5
  from langchain_core.embeddings import Embeddings
6
+ import time
6
7
 
7
8
 
8
9
  from langchain.docstore.document import Document
@@ -102,7 +103,7 @@ class DataFrameToVectorDB:
102
103
  persist_directory,
103
104
  embeddings: Embeddings,
104
105
  text_splitter: TextSplitter,
105
- batch_size=1000):
106
+ batch_size=1024):
106
107
  self.collection_name = collection_name
107
108
  self.persist_directory = persist_directory
108
109
  self.embeddings = embeddings
@@ -118,10 +119,10 @@ class DataFrameToVectorDB:
118
119
 
119
120
  def __split_list(self, input_list, batch_size):
120
121
  # for i in range(0, len(input_list), batch_size):
121
- for s, e in split_dataframe_indices(input_list):
122
+ for s, e in split_dataframe_indices(input_list, batch_size):
122
123
  yield input_list[s:e+1]
123
124
 
124
- def store_documents(self, df, columns, page_content_columns):
125
+ def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
125
126
  """
126
127
  Core function that processes the documents and adds them to the vector database.
127
128
  """
@@ -131,39 +132,49 @@ class DataFrameToVectorDB:
131
132
  col_df = df[copy_columns].copy()
132
133
  col_df.dropna(subset=[page_content_column], inplace=True)
133
134
  col_df['type'] = page_content_column # Set the type column
134
- # if partial_df_size:
135
- # documents = []
136
-
137
- # partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
138
- # for partial_df in partial_dfs:
139
- # print(f"{page_content_column}, size:{len(partial_df)}")
140
- # print(partial_df.count())
141
- # loader = DataFrameLoader(
142
- # data_frame=partial_df, page_content_column=page_content_column)
143
- # loaded_docs = loader.load()
144
- # documents.extend(self.__split_docs(loaded_docs))
145
- # print(f"Documents size: {len(documents)}")
146
- # split_docs_chunked = self.__split_list(
147
- # documents, self.batch_size)
148
-
149
- # for split_docs_chunk in split_docs_chunked:
150
- # print(f"Split docs size: {len(split_docs_chunk)}")
151
- # self.vectordb.add_documents(split_docs_chunk)
152
- # self.vectordb.persist()
153
- # else:
154
- documents = []
155
-
156
- loader = DataFrameLoader(
157
- data_frame=col_df, page_content_column=page_content_column)
158
- loaded_docs = loader.load()
159
- documents.extend(self.__split_docs(loaded_docs))
160
-
161
- split_docs_chunked = self.__split_list(
162
- documents, self.batch_size)
163
-
164
- for split_docs_chunk in split_docs_chunked:
165
- self.vectordb.add_documents(split_docs_chunk)
166
- self.vectordb.persist()
135
+ if partial_df_size:
136
+ total = 0
137
+ partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
138
+ for partial_df in partial_dfs:
139
+ import torch
140
+
141
+ documents = []
142
+ loader = DataFrameLoader(
143
+ data_frame=partial_df, page_content_column=page_content_column)
144
+ loaded_docs = loader.load()
145
+ # print(loaded_docs)
146
+ # documents.extend(self.__split_docs(loaded_docs))
147
+ total += len(partial_df)
148
+
149
+ self.vectordb.add_documents(loaded_docs)
150
+ self.vectordb.persist()
151
+ print(f"{page_content_column}: {total}/{len(col_df)}")
152
+ torch.cuda.empty_cache()
153
+ # time.sleep(30) # The GPU will not be used during this period
154
+
155
+ # split_docs_chunked = self.__split_list(
156
+ # loaded_docs, self.batch_size)
157
+ # print(f"Number of chunks: {len(split_docs_chunked)}")
158
+ # for split_docs_chunk in split_docs_chunked:
159
+ # print(f"Split docs size: {len(split_docs_chunk)}")
160
+ # self.vectordb.add_documents(split_docs_chunk)
161
+ # self.vectordb.persist()
162
+ else:
163
+ documents = []
164
+ print(col_df.shape)
165
+ loader = DataFrameLoader(
166
+ data_frame=col_df, page_content_column=page_content_column)
167
+ loaded_docs = loader.load()
168
+ documents.extend(self.__split_docs(loaded_docs))
169
+ print(f"Documents size: {len(loaded_docs)}")
170
+ split_docs_chunked = self.__split_list(
171
+ documents, self.batch_size)
172
+ for split_docs_chunk in split_docs_chunked:
173
+ import torch
174
+ torch.cuda.empty_cache()
175
+ self.vectordb.add_documents(split_docs_chunk)
176
+ self.vectordb.persist()
177
+ print(f"{page_content_column}, size:{len(split_docs_chunk)}")
167
178
 
168
179
 
169
180
  def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
@@ -239,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
239
250
  persist_directory=persist_directory,
240
251
  embeddings=model,
241
252
  text_splitter=text_splitter,
242
- batch_size=1024)
253
+ batch_size=batch_size)
243
254
  to_vector_db.store_documents(
244
- df, columns, page_content_columns)
255
+ df, columns, page_content_columns, partial_df_size=batch_size)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.110
3
+ Version: 0.0.111
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,5 +1,5 @@
1
1
  ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
2
- ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
2
+ ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
3
3
  ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
4
4
  ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
5
5
  ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
58
58
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
59
59
  ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
60
60
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
61
- ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
61
+ ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
62
62
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
63
63
  ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
64
64
  ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
91
91
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
92
92
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
93
93
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
94
- ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
95
- ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
- ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
- ddi_fw-0.0.110.dist-info/RECORD,,
94
+ ddi_fw-0.0.111.dist-info/METADATA,sha256=MqKx9zRNVISPJE-dq49tMzh-EfbmxvD0Deq7N2klGeQ,1967
95
+ ddi_fw-0.0.111.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
+ ddi_fw-0.0.111.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
+ ddi_fw-0.0.111.dist-info/RECORD,,