ddi-fw 0.0.109__py3-none-any.whl → 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +1 -1
- ddi_fw/langchain/storage.py +74 -64
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/RECORD +6 -6
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
         filtered_df = self.drugs_df
         combined_df = filtered_df.copy()
 
-        if self.ner_df:
+        if self.ner_df is not None and not self.ner_df.empty:
             filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
                 drug_ids)]
             filtered_ner_df = self.ner_df.copy()
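The new guard avoids Python truthiness on a pandas DataFrame, which raises an exception rather than returning a boolean. A minimal standalone sketch of the difference (plain pandas, independent of BaseDataset):

    import pandas as pd

    ner_df = pd.DataFrame({"drugbank_id": ["DB00001"], "entity": ["Lepirudin"]})

    # Old check: a DataFrame has no single truth value, so this raises.
    try:
        if ner_df:
            pass
    except ValueError as err:
        print(err)  # "The truth value of a DataFrame is ambiguous. ..."

    # New check: explicit, and safe for None, empty, and populated frames.
    if ner_df is not None and not ner_df.empty:
        print(f"NER data present: {len(ner_df)} rows")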
ddi_fw/langchain/storage.py
CHANGED
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_core.embeddings import Embeddings
+import time
 
 
 from langchain.docstore.document import Document
@@ -25,16 +26,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,62 +46,64 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
 def split_dataframe_indices(df, min_size=512):
     total_size = len(df)
-
+
     # If the dataframe is smaller than min_size, return the entire range
     if total_size <= min_size:
         return [(0, total_size - 1)]
-
+
     # List to store the start and end indices of each chunk
     chunk_indices = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
-
+
     # Split into chunks
     for i in range(num_chunks):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
         end_idx = start_idx + chunk_size - 1
         chunk_indices.append((start_idx, end_idx))
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows after the loop, they should form the last chunk
     if remaining_rows > 0:
         end_idx = start_idx + remaining_rows - 1
         chunk_indices.append((start_idx, end_idx))
-
+
     return chunk_indices
 
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
                  persist_directory,
                  embeddings: Embeddings,
                  text_splitter: TextSplitter,
-                 batch_size=
+                 batch_size=1024):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
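For orientation, a short usage sketch of the two splitting helpers (illustrative numbers; both guarantee every chunk holds at least min_size rows, with the final chunk absorbing the remainder):

    import pandas as pd

    df = pd.DataFrame({"text": [f"row {i}" for i in range(1200)]})

    # 1200 // 512 = 2 chunks; the last one takes the remaining 688 rows.
    parts = split_dataframe(df, min_size=512)
    print([len(p) for p in parts])                    # [512, 688]

    # Same arithmetic, but returning inclusive (start, end) index pairs.
    print(split_dataframe_indices(df, min_size=512))  # [(0, 511), (512, 1199)]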
@@ -115,10 +119,10 @@ class DataFrameToVectorDB:
 
     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s,e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list, batch_size):
            yield input_list[s:e+1]
 
-    def store_documents(self, df, columns, page_content_columns):
+    def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
         """
         Core function that processes the documents and adds them to the vector database.
         """
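The corrected __split_list now forwards batch_size to split_dataframe_indices, so yielded batches follow the configured size instead of the 512-row default. A rough sketch of the resulting slices (a plain list stands in for the loaded documents, since only len() and slicing are involved):

    docs = list(range(2500))

    # With batch_size=1024: 2500 // 1024 = 2 index pairs, the second absorbs the remainder.
    for s, e in split_dataframe_indices(docs, 1024):
        print(s, e, len(docs[s:e + 1]))   # 0 1023 1024, then 1024 2499 1476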
@@ -128,43 +132,49 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
-            [old lines 131-167 replaced; their content is not shown in this diff view]
+            if partial_df_size:
+                total = 0
+                partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+                for partial_df in partial_dfs:
+                    import torch
+
+                    documents = []
+                    loader = DataFrameLoader(
+                        data_frame=partial_df, page_content_column=page_content_column)
+                    loaded_docs = loader.load()
+                    # print(loaded_docs)
+                    # documents.extend(self.__split_docs(loaded_docs))
+                    total += len(partial_df)
+
+                    self.vectordb.add_documents(loaded_docs)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}: {total}/{len(col_df)}")
+                    torch.cuda.empty_cache()
+                    # time.sleep(30) # The GPU will not be used during this period
+
+                    # split_docs_chunked = self.__split_list(
+                    #     loaded_docs, self.batch_size)
+                    # print(f"Number of chunks: {len(split_docs_chunked)}")
+                    # for split_docs_chunk in split_docs_chunked:
+                    #     print(f"Split docs size: {len(split_docs_chunk)}")
+                    #     self.vectordb.add_documents(split_docs_chunk)
+                    #     self.vectordb.persist()
+            else:
+                documents = []
+                print(col_df.shape)
+                loader = DataFrameLoader(
+                    data_frame=col_df, page_content_column=page_content_column)
+                loaded_docs = loader.load()
+                documents.extend(self.__split_docs(loaded_docs))
+                print(f"Documents size: {len(loaded_docs)}")
+                split_docs_chunked = self.__split_list(
+                    documents, self.batch_size)
+                for split_docs_chunk in split_docs_chunked:
+                    import torch
+                    torch.cuda.empty_cache()
+                    self.vectordb.add_documents(split_docs_chunk)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}, size:{len(split_docs_chunk)}")
 
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
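The new partial_df_size argument selects between two strategies in store_documents: when set, each slice produced by split_dataframe is loaded and persisted immediately (freeing the CUDA cache between rounds); when None, all documents are loaded, split by the text splitter, and written in batch_size chunks. A hedged usage sketch follows; it assumes columns and page_content_columns take lists of column names, and FakeEmbeddings/CharacterTextSplitter are stand-ins rather than the models the package normally uses:

    import pandas as pd
    from langchain.text_splitter import CharacterTextSplitter
    from langchain_community.embeddings import FakeEmbeddings  # stand-in embedding model

    df = pd.DataFrame({
        "drugbank_id": ["DB00001", "DB00002"],
        "description": ["Lepirudin is a recombinant hirudin ...",
                        "Cetuximab is an EGFR-binding antibody ..."],
    })

    to_vector_db = DataFrameToVectorDB(
        collection_name="drug_descriptions",
        persist_directory="embeddings/example",
        embeddings=FakeEmbeddings(size=384),
        text_splitter=CharacterTextSplitter(chunk_size=512, chunk_overlap=0),
        batch_size=1024)

    # Partial-frame path: embed and persist each 512-row slice as soon as it is loaded.
    to_vector_db.store_documents(df, columns=["drugbank_id"],
                                 page_content_columns=["description"],
                                 partial_df_size=512)

    # Batched path (partial_df_size=None): load everything, split, then add in
    # batch_size-sized chunks.
    to_vector_db.store_documents(df, columns=["drugbank_id"],
                                 page_content_columns=["description"])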
@@ -196,11 +206,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         # print(f"Configuration for collection {id} not found.")
         # continue
 
-        embedding_model_type = collection_config
-        text_splitters_types = collection_config
-        batch_size = collection_config
-        columns = collection_config
-        page_content_columns = collection_config
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
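Since the collection settings are now read with .get(), each entry in the configuration file is expected to expose these keys, plus model_kwargs (used in the next hunk) and type/params/suffix inside each splitter entry. A hypothetical entry consistent with those accesses; the real schema shipped with ddi-fw may differ:

    # Illustrative only: key names follow the .get() calls above, values are made up.
    collection_config = {
        "id": "description_embedding",
        "embedding_model_type": "langchain_community.embeddings.HuggingFaceEmbeddings",
        "model_kwargs": {},
        "text_splitters_types": [
            {"type": "langchain.text_splitter.RecursiveCharacterTextSplitter",
             "params": {"chunk_size": 512, "chunk_overlap": 0},
             "suffix": "rc512"},
        ],
        "batch_size": 1024,
        "columns": ["drugbank_id"],
        "page_content_columns": ["description"],
    }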
@@ -208,8 +218,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -220,7 +229,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -240,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
             persist_directory=persist_directory,
             embeddings=model,
             text_splitter=text_splitter,
-            batch_size=
-        to_vector_db.
+            batch_size=batch_size)
+        to_vector_db.store_documents(
             df, columns, page_content_columns, partial_df_size=batch_size)
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
 ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.111.dist-info/METADATA,sha256=MqKx9zRNVISPJE-dq49tMzh-EfbmxvD0Deq7N2klGeQ,1967
+ddi_fw-0.0.111.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.111.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.111.dist-info/RECORD,,
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.111.dist-info}/top_level.txt
File without changes