ddi-fw 0.0.108__py3-none-any.whl → 0.0.110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/storage.py +80 -44
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/storage.py
CHANGED
```diff
@@ -25,16 +25,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,19 +45,57 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
+def split_dataframe_indices(df, min_size=512):
+    total_size = len(df)
+
+    # If the dataframe is smaller than min_size, return the entire range
+    if total_size <= min_size:
+        return [(0, total_size - 1)]
+
+    # List to store the start and end indices of each chunk
+    chunk_indices = []
+    start_idx = 0
+
+    # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
+    num_chunks = total_size // min_size
+    remaining_rows = total_size
+
+    # Split into chunks
+    for i in range(num_chunks):
+        chunk_size = min_size
+        if (remaining_rows - chunk_size) < min_size:
+            chunk_size = remaining_rows  # Last chunk takes all remaining rows
+
+        # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
+        end_idx = start_idx + chunk_size - 1
+        chunk_indices.append((start_idx, end_idx))
+
+        # Update the start index and remaining rows
+        start_idx += chunk_size
+        remaining_rows -= chunk_size
+
+    # If there are any remaining rows after the loop, they should form the last chunk
+    if remaining_rows > 0:
+        end_idx = start_idx + remaining_rows - 1
+        chunk_indices.append((start_idx, end_idx))
+
+    return chunk_indices
+
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
```
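The new split_dataframe_indices mirrors split_dataframe but returns (start, end) index pairs instead of slices. Note that the returned end indices are inclusive, despite the "exclusive" wording in the in-code comment. A minimal check of the chunking arithmetic (not part of the diff; assumes pandas is installed):

```python
# Minimal check of split_dataframe_indices; the import path follows the
# changed module, everything else here is illustrative.
import pandas as pd
from ddi_fw.langchain.storage import split_dataframe_indices

df = pd.DataFrame({"x": range(1200)})

# 1200 // 512 = 2 chunks; the second chunk absorbs the 176-row remainder
# because the 688 rows left after the first chunk cannot form two chunks of 512.
print(split_dataframe_indices(df, min_size=512))
# [(0, 511), (512, 1199)]  <- end indices are inclusive
```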
```diff
@@ -78,8 +117,9 @@ class DataFrameToVectorDB:
         return docs
 
     def __split_list(self, input_list, batch_size):
-        for i in range(0, len(input_list), batch_size):
-
+        # for i in range(0, len(input_list), batch_size):
+        for s, e in split_dataframe_indices(input_list):
+            yield input_list[s:e+1]
 
     def store_documents(self, df, columns, page_content_columns):
         """
```
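__split_list now delegates chunking to split_dataframe_indices, so the batch_size argument is accepted but no longer used, and batches are sized by the helper's min_size=512 default rather than by self.batch_size. A standalone sketch of the new behavior (names mirror the diff):

```python
# Standalone sketch of the new __split_list; batch_size is kept in the
# signature but ignored on this code path.
def split_list(input_list, batch_size):
    for s, e in split_dataframe_indices(input_list):
        yield input_list[s:e + 1]  # e is inclusive, hence the + 1
```

This works on a plain list of documents as well as on a DataFrame, because split_dataframe_indices only calls len() on its argument.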
```diff
@@ -91,6 +131,26 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
+            # if partial_df_size:
+            #     documents = []
+
+            #     partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+            #     for partial_df in partial_dfs:
+            #         print(f"{page_content_column}, size:{len(partial_df)}")
+            #         print(partial_df.count())
+            #         loader = DataFrameLoader(
+            #             data_frame=partial_df, page_content_column=page_content_column)
+            #         loaded_docs = loader.load()
+            #         documents.extend(self.__split_docs(loaded_docs))
+            #         print(f"Documents size: {len(documents)}")
+            #         split_docs_chunked = self.__split_list(
+            #             documents, self.batch_size)
+
+            #         for split_docs_chunk in split_docs_chunked:
+            #             print(f"Split docs size: {len(split_docs_chunk)}")
+            #             self.vectordb.add_documents(split_docs_chunk)
+            #             self.vectordb.persist()
+            # else:
             documents = []
 
             loader = DataFrameLoader(
@@ -98,37 +158,13 @@ class DataFrameToVectorDB:
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))
 
-            split_docs_chunked = self.__split_list(
+            split_docs_chunked = self.__split_list(
+                documents, self.batch_size)
 
             for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                #     collection_name=collection_name,
-                #     documents=split_docs_chunk,
-                #     embedding=embeddings,
-                #     persist_directory=persist_directory,
-                # )
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()
 
-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            #     batch = df[i: i + partial_df_size]
-            #     self.store_documents(df=batch, columns=columns,
-            #                          page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
-
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
     """
```
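With this hunk the store() wrapper, and with it the public partial_df_size option, is gone; generate_embeddings now calls store_documents directly. Callers that relied on the old chunked entry point can approximate it with the helpers that remain, for example via this hypothetical shim:

```python
# Hypothetical shim rebuilding the removed store() on top of 0.0.110's
# surviving helpers (split_dataframe and DataFrameToVectorDB.store_documents).
def store(to_vector_db, df, columns, page_content_columns, partial_df_size=None):
    if partial_df_size:
        # Feed the frame through in chunks of at least partial_df_size rows.
        for partial_df in split_dataframe(df, min_size=partial_df_size):
            to_vector_db.store_documents(df=partial_df, columns=columns,
                                         page_content_columns=page_content_columns)
    else:
        # No chunk size given: process the whole frame at once.
        to_vector_db.store_documents(df=df, columns=columns,
                                     page_content_columns=page_content_columns)
```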
```diff
@@ -159,11 +195,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
             # print(f"Configuration for collection {id} not found.")
             # continue
 
-        embedding_model_type = collection_config
-        text_splitters_types = collection_config
-        batch_size = collection_config
-        columns = collection_config
-        page_content_columns = collection_config
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
```
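The switch to collection_config.get(...) pins down the expected shape of each collection entry in the JSON configuration. Together with the "type"/"params"/"suffix" lookups further down, an entry presumably looks roughly like this (key names are from the diff; every value is invented for illustration):

```python
# Illustrative collection entry implied by the .get() calls; values are made up.
collection_config = {
    "embedding_model_type": "some.module.SomeEmbeddings",  # dotted path resolved by get_import
    "model_kwargs": {},                                    # extra kwargs for the model constructor
    "text_splitters_types": [
        {
            "type": "some.module.SomeTextSplitter",        # dotted path resolved by get_import
            "params": {"chunk_size": 512},                 # optional constructor kwargs
            "suffix": "split512",                          # splitter label (exact use assumed)
        },
    ],
    "batch_size": 1024,
    "columns": ["drug_name"],
    "page_content_columns": ["description"],
}
```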
```diff
@@ -171,8 +207,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -183,7 +218,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
```
```diff
@@ -204,5 +240,5 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
             embeddings=model,
             text_splitter=text_splitter,
             batch_size=1024)
-        to_vector_db.
-            df, columns, page_content_columns
+        to_vector_db.store_documents(
+            df, columns, page_content_columns)
```
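For completeness, a hypothetical end-to-end invocation; only the generate_embeddings signature comes from the diff, the argument values are invented:

```python
# Hypothetical call of the changed entry point; signature from the diff,
# file name, model name, and frame contents invented.
import pandas as pd
from ddi_fw.langchain.storage import generate_embeddings

df = pd.DataFrame({"description": ["drug A text", "drug B text"]})
generate_embeddings(df,
                    config_file="collections.json",
                    new_model_names=["my-model"],
                    persist_directory="embeddings")
```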
{ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/RECORD
CHANGED
```diff
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
+ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.110.dist-info/RECORD,,
```
{ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/WHEEL
File without changes

{ddi_fw-0.0.108.dist-info → ddi_fw-0.0.110.dist-info}/top_level.txt
File without changes