ddi-fw 0.0.109__py3-none-any.whl → 0.0.110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/storage.py +50 -51
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/storage.py
CHANGED
@@ -25,16 +25,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,55 +45,57 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
 def split_dataframe_indices(df, min_size=512):
     total_size = len(df)
-
+
     # If the dataframe is smaller than min_size, return the entire range
     if total_size <= min_size:
         return [(0, total_size - 1)]
-
+
     # List to store the start and end indices of each chunk
     chunk_indices = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
-
+
     # Split into chunks
     for i in range(num_chunks):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
         end_idx = start_idx + chunk_size - 1
         chunk_indices.append((start_idx, end_idx))
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows after the loop, they should form the last chunk
     if remaining_rows > 0:
         end_idx = start_idx + remaining_rows - 1
         chunk_indices.append((start_idx, end_idx))
-
+
     return chunk_indices
 
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
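The changes to `split_dataframe` and `split_dataframe_indices` in the two hunks above are whitespace-only, but the chunking contract they implement is easy to misread: every chunk holds at least `min_size` rows, and a short tail is merged into the last chunk rather than emitted separately. A minimal sketch of that behaviour, assuming the helpers as shown in the new revision and that the module's other dependencies are installed (the toy DataFrame and the expected values in the comments are illustrative, not taken from the package's tests):

```python
import pandas as pd

from ddi_fw.langchain.storage import split_dataframe, split_dataframe_indices

# 1200 rows with min_size=512: num_chunks = 1200 // 512 = 2.
# The first chunk takes 512 rows; a second 512-row chunk would leave only
# 176 rows (less than min_size), so it absorbs all 688 remaining rows instead.
df = pd.DataFrame({"text": [f"row {i}" for i in range(1200)]})

print([len(chunk) for chunk in split_dataframe(df, min_size=512)])
# expected: [512, 688]

print(split_dataframe_indices(df, min_size=512))
# expected: [(0, 511), (512, 1199)]  (inclusive start/end indices)
```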
@@ -115,7 +118,7 @@ class DataFrameToVectorDB:
 
     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s,e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list):
             yield input_list[s:e+1]
 
     def store_documents(self, df, columns, page_content_columns):
@@ -128,6 +131,26 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
+            # if partial_df_size:
+            # documents = []
+
+            # partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+            # for partial_df in partial_dfs:
+            # print(f"{page_content_column}, size:{len(partial_df)}")
+            # print(partial_df.count())
+            # loader = DataFrameLoader(
+            # data_frame=partial_df, page_content_column=page_content_column)
+            # loaded_docs = loader.load()
+            # documents.extend(self.__split_docs(loaded_docs))
+            # print(f"Documents size: {len(documents)}")
+            # split_docs_chunked = self.__split_list(
+            # documents, self.batch_size)
+
+            # for split_docs_chunk in split_docs_chunked:
+            # print(f"Split docs size: {len(split_docs_chunk)}")
+            # self.vectordb.add_documents(split_docs_chunk)
+            # self.vectordb.persist()
+            # else:
             documents = []
 
             loader = DataFrameLoader(
@@ -135,37 +158,13 @@ class DataFrameToVectorDB:
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))
 
-            split_docs_chunked = self.__split_list(
+            split_docs_chunked = self.__split_list(
+                documents, self.batch_size)
 
             for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                # collection_name=collection_name,
-                # documents=split_docs_chunk,
-                # embedding=embeddings,
-                # persist_directory=persist_directory,
-                # )
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()
 
-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            # batch = df[i: i + partial_df_size]
-            # self.store_documents(df=batch, columns=columns,
-            # page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
-
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
     """
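This hunk removes the `store()` wrapper that accepted `partial_df_size`. Callers that relied on partial processing can recreate it on their side with `split_dataframe` plus `store_documents`, much as the deleted method did; the helper below is a hypothetical stand-in (its name and the `partial_df_size` default are assumptions), not part of the 0.0.110 API:

```python
from ddi_fw.langchain.storage import split_dataframe

def store_in_chunks(to_vector_db, df, columns, page_content_columns,
                    partial_df_size=None):
    """Caller-side stand-in for the removed DataFrameToVectorDB.store()."""
    if partial_df_size:
        # Mirror the deleted method: split first, then store each partial frame.
        for partial_df in split_dataframe(df, min_size=partial_df_size):
            to_vector_db.store_documents(df=partial_df, columns=columns,
                                         page_content_columns=page_content_columns)
    else:
        # No chunking requested: store the whole frame in one pass.
        to_vector_db.store_documents(df=df, columns=columns,
                                     page_content_columns=page_content_columns)
```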
@@ -196,11 +195,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
         # print(f"Configuration for collection {id} not found.")
         # continue
 
-        embedding_model_type = collection_config
-        text_splitters_types = collection_config
-        batch_size = collection_config
-        columns = collection_config
-        page_content_columns = collection_config
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
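The switch from bare `collection_config` assignments to `collection_config.get(...)` makes the expected shape of each collection entry explicit. A hypothetical entry with the keys this code reads (key names come from the diff; every value below is an assumption, not the package's shipped configuration):

```python
# Hypothetical collection entry as generate_embeddings() now reads it.
collection_config = {
    "embedding_model_type": "langchain_huggingface.HuggingFaceEmbeddings",  # assumed dotted path
    "model_kwargs": {},                        # expanded as **model_kwargs when the model is built
    "text_splitters_types": [
        {
            "type": "langchain_text_splitters.RecursiveCharacterTextSplitter",  # assumed dotted path
            "params": {"chunk_size": 512, "chunk_overlap": 0},
            "suffix": "recursive",
        }
    ],
    "batch_size": 1024,
    "columns": ["name"],                       # assumed metadata columns
    "page_content_columns": ["description"],   # assumed text columns
}

embedding_model_type = collection_config.get("embedding_model_type")
text_splitters_types = collection_config.get("text_splitters_types")
batch_size = collection_config.get("batch_size")
```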
@@ -208,8 +207,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -220,7 +218,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -241,5 +240,5 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
             embeddings=model,
             text_splitter=text_splitter,
             batch_size=1024)
-        to_vector_db.
-        df, columns, page_content_columns
+        to_vector_db.store_documents(
+            df, columns, page_content_columns)
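In 0.0.109 this call site was left dangling (`to_vector_db.` followed by a bare argument line, a syntax error); 0.0.110 completes it as a call to `store_documents`. A hedged end-to-end sketch of how that path is reached from user code, using the `generate_embeddings` signature shown above (the file name, model name, and DataFrame columns are placeholders, not values from the package):

```python
import pandas as pd

from ddi_fw.langchain.storage import generate_embeddings

# Placeholder frame; real usage would pass the project's drug/event DataFrame.
df = pd.DataFrame({"name": ["drug A", "drug B"],
                   "description": ["text about drug A", "text about drug B"]})

generate_embeddings(
    df,
    config_file="collections.json",        # assumed path to the collections config
    new_model_names=["all-MiniLM-L6-v2"],  # assumed model name(s)
    collections=None,
    persist_directory="embeddings",
)
```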
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/RECORD
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
+ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.110.dist-info/RECORD,,
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.109.dist-info → ddi_fw-0.0.110.dist-info}/top_level.txt
File without changes