ddi-fw 0.0.110__py3-none-any.whl → 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/langchain/storage.py +49 -38
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.111.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.111.dist-info}/RECORD +6 -6
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.111.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.110.dist-info → ddi_fw-0.0.111.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
|
|
274
274
|
filtered_df = self.drugs_df
|
275
275
|
combined_df = filtered_df.copy()
|
276
276
|
|
277
|
-
if self.ner_df:
|
277
|
+
if self.ner_df is not None and not self.ner_df.empty:
|
278
278
|
filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
|
279
279
|
drug_ids)]
|
280
280
|
filtered_ner_df = self.ner_df.copy()
|
ddi_fw/langchain/storage.py
CHANGED
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
|
|
3
3
|
# from langchain_community.vectorstores import Chroma
|
4
4
|
from langchain_community.vectorstores.utils import filter_complex_metadata
|
5
5
|
from langchain_core.embeddings import Embeddings
|
6
|
+
import time
|
6
7
|
|
7
8
|
|
8
9
|
from langchain.docstore.document import Document
|
@@ -102,7 +103,7 @@ class DataFrameToVectorDB:
|
|
102
103
|
persist_directory,
|
103
104
|
embeddings: Embeddings,
|
104
105
|
text_splitter: TextSplitter,
|
105
|
-
batch_size=
|
106
|
+
batch_size=1024):
|
106
107
|
self.collection_name = collection_name
|
107
108
|
self.persist_directory = persist_directory
|
108
109
|
self.embeddings = embeddings
|
@@ -118,10 +119,10 @@ class DataFrameToVectorDB:
|
|
118
119
|
|
119
120
|
def __split_list(self, input_list, batch_size):
|
120
121
|
# for i in range(0, len(input_list), batch_size):
|
121
|
-
for s, e in split_dataframe_indices(input_list):
|
122
|
+
for s, e in split_dataframe_indices(input_list, batch_size):
|
122
123
|
yield input_list[s:e+1]
|
123
124
|
|
124
|
-
def store_documents(self, df, columns, page_content_columns):
|
125
|
+
def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
|
125
126
|
"""
|
126
127
|
Core function that processes the documents and adds them to the vector database.
|
127
128
|
"""
|
@@ -131,39 +132,49 @@ class DataFrameToVectorDB:
|
|
131
132
|
col_df = df[copy_columns].copy()
|
132
133
|
col_df.dropna(subset=[page_content_column], inplace=True)
|
133
134
|
col_df['type'] = page_content_column # Set the type column
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
documents
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
135
|
+
if partial_df_size:
|
136
|
+
total = 0
|
137
|
+
partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
|
138
|
+
for partial_df in partial_dfs:
|
139
|
+
import torch
|
140
|
+
|
141
|
+
documents = []
|
142
|
+
loader = DataFrameLoader(
|
143
|
+
data_frame=partial_df, page_content_column=page_content_column)
|
144
|
+
loaded_docs = loader.load()
|
145
|
+
# print(loaded_docs)
|
146
|
+
# documents.extend(self.__split_docs(loaded_docs))
|
147
|
+
total += len(partial_df)
|
148
|
+
|
149
|
+
self.vectordb.add_documents(loaded_docs)
|
150
|
+
self.vectordb.persist()
|
151
|
+
print(f"{page_content_column}: {total}/{len(col_df)}")
|
152
|
+
torch.cuda.empty_cache()
|
153
|
+
# time.sleep(30) # The GPU will not be used during this period
|
154
|
+
|
155
|
+
# split_docs_chunked = self.__split_list(
|
156
|
+
# loaded_docs, self.batch_size)
|
157
|
+
# print(f"Number of chunks: {len(split_docs_chunked)}")
|
158
|
+
# for split_docs_chunk in split_docs_chunked:
|
159
|
+
# print(f"Split docs size: {len(split_docs_chunk)}")
|
160
|
+
# self.vectordb.add_documents(split_docs_chunk)
|
161
|
+
# self.vectordb.persist()
|
162
|
+
else:
|
163
|
+
documents = []
|
164
|
+
print(col_df.shape)
|
165
|
+
loader = DataFrameLoader(
|
166
|
+
data_frame=col_df, page_content_column=page_content_column)
|
167
|
+
loaded_docs = loader.load()
|
168
|
+
documents.extend(self.__split_docs(loaded_docs))
|
169
|
+
print(f"Documents size: {len(loaded_docs)}")
|
170
|
+
split_docs_chunked = self.__split_list(
|
171
|
+
documents, self.batch_size)
|
172
|
+
for split_docs_chunk in split_docs_chunked:
|
173
|
+
import torch
|
174
|
+
torch.cuda.empty_cache()
|
175
|
+
self.vectordb.add_documents(split_docs_chunk)
|
176
|
+
self.vectordb.persist()
|
177
|
+
print(f"{page_content_column}, size:{len(split_docs_chunk)}")
|
167
178
|
|
168
179
|
|
169
180
|
def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
|
@@ -239,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
|
|
239
250
|
persist_directory=persist_directory,
|
240
251
|
embeddings=model,
|
241
252
|
text_splitter=text_splitter,
|
242
|
-
batch_size=
|
253
|
+
batch_size=batch_size)
|
243
254
|
to_vector_db.store_documents(
|
244
|
-
df, columns, page_content_columns)
|
255
|
+
df, columns, page_content_columns, partial_df_size=batch_size)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
|
3
3
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
4
4
|
ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
|
5
5
|
ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
|
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
|
|
58
58
|
ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
|
59
59
|
ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
|
60
60
|
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
|
61
|
-
ddi_fw/langchain/storage.py,sha256=
|
61
|
+
ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
|
62
62
|
ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
|
63
63
|
ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
|
64
64
|
ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
|
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
|
|
91
91
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
92
92
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
93
93
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
94
|
-
ddi_fw-0.0.
|
95
|
-
ddi_fw-0.0.
|
96
|
-
ddi_fw-0.0.
|
97
|
-
ddi_fw-0.0.
|
94
|
+
ddi_fw-0.0.111.dist-info/METADATA,sha256=MqKx9zRNVISPJE-dq49tMzh-EfbmxvD0Deq7N2klGeQ,1967
|
95
|
+
ddi_fw-0.0.111.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
96
|
+
ddi_fw-0.0.111.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
97
|
+
ddi_fw-0.0.111.dist-info/RECORD,,
|
File without changes
|
File without changes
|