ddi-fw 0.0.109__py3-none-any.whl → 0.0.111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
         filtered_df = self.drugs_df
         combined_df = filtered_df.copy()
 
-        if self.ner_df:
+        if self.ner_df is not None and not self.ner_df.empty:
             filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
                 drug_ids)]
             filtered_ner_df = self.ner_df.copy()
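
Note: the old `if self.ner_df:` guard only behaves when `ner_df` is `None`; truth-testing an actual pandas DataFrame raises `ValueError: The truth value of a DataFrame is ambiguous`, and the new guard also skips empty frames. A minimal standalone sketch of the difference (not part of the package):

import pandas as pd

ner_df = pd.DataFrame()  # or None when no NER data was loaded

# Old guard: `if ner_df:` raises ValueError for any DataFrame object.
# New guard: explicit None check plus emptiness check.
if ner_df is not None and not ner_df.empty:
    print("NER data available")
else:
    print("no NER data, skipping NER features")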
ddi_fw/langchain/storage.py CHANGED
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_core.embeddings import Embeddings
+import time
 
 
 from langchain.docstore.document import Document
@@ -25,16 +26,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,62 +46,64 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
 def split_dataframe_indices(df, min_size=512):
     total_size = len(df)
-
+
     # If the dataframe is smaller than min_size, return the entire range
     if total_size <= min_size:
         return [(0, total_size - 1)]
-
+
     # List to store the start and end indices of each chunk
     chunk_indices = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
-
+
     # Split into chunks
     for i in range(num_chunks):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
         end_idx = start_idx + chunk_size - 1
         chunk_indices.append((start_idx, end_idx))
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows after the loop, they should form the last chunk
     if remaining_rows > 0:
         end_idx = start_idx + remaining_rows - 1
         chunk_indices.append((start_idx, end_idx))
-
+
     return chunk_indices
 
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
                  persist_directory,
                  embeddings: Embeddings,
                  text_splitter: TextSplitter,
-                 batch_size=1000):
+                 batch_size=1024):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
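
For context, both helpers split a DataFrame into chunks of at least `min_size` rows, with the final chunk absorbing any remainder; `split_dataframe` returns the partial DataFrames, `split_dataframe_indices` the inclusive `(start, end)` index pairs. A small standalone sketch of that behaviour (illustrative data only):

import pandas as pd

df = pd.DataFrame({"drugbank_id": range(1200)})

# 1200 // 512 = 2 chunks; the second absorbs the remainder of 176 rows.
parts = split_dataframe(df, min_size=512)
print([len(p) for p in parts])                     # [512, 688]
print(split_dataframe_indices(df, min_size=512))   # [(0, 511), (512, 1199)]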
@@ -115,10 +119,10 @@ class DataFrameToVectorDB:
 
     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s,e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list, batch_size):
             yield input_list[s:e+1]
 
-    def store_documents(self, df, columns, page_content_columns):
+    def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
         """
         Core function that processes the documents and adds them to the vector database.
         """
@@ -128,43 +132,49 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
-            documents = []
-
-            loader = DataFrameLoader(
-                data_frame=col_df, page_content_column=page_content_column)
-            loaded_docs = loader.load()
-            documents.extend(self.__split_docs(loaded_docs))
-
-            split_docs_chunked = self.__split_list(documents, self.batch_size)
-
-            for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                #     collection_name=collection_name,
-                #     documents=split_docs_chunk,
-                #     embedding=embeddings,
-                #     persist_directory=persist_directory,
-                # )
-                self.vectordb.add_documents(split_docs_chunk)
-                self.vectordb.persist()
-
-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            #     batch = df[i: i + partial_df_size]
-            #     self.store_documents(df=batch, columns=columns,
-            #                          page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
+            if partial_df_size:
+                total = 0
+                partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+                for partial_df in partial_dfs:
+                    import torch
+
+                    documents = []
+                    loader = DataFrameLoader(
+                        data_frame=partial_df, page_content_column=page_content_column)
+                    loaded_docs = loader.load()
+                    # print(loaded_docs)
+                    # documents.extend(self.__split_docs(loaded_docs))
+                    total += len(partial_df)
+
+                    self.vectordb.add_documents(loaded_docs)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}: {total}/{len(col_df)}")
+                    torch.cuda.empty_cache()
+                    # time.sleep(30)  # The GPU will not be used during this period
+
+                    # split_docs_chunked = self.__split_list(
+                    #     loaded_docs, self.batch_size)
+                    # print(f"Number of chunks: {len(split_docs_chunked)}")
+                    # for split_docs_chunk in split_docs_chunked:
+                    #     print(f"Split docs size: {len(split_docs_chunk)}")
+                    #     self.vectordb.add_documents(split_docs_chunk)
+                    #     self.vectordb.persist()
+            else:
+                documents = []
+                print(col_df.shape)
+                loader = DataFrameLoader(
+                    data_frame=col_df, page_content_column=page_content_column)
+                loaded_docs = loader.load()
+                documents.extend(self.__split_docs(loaded_docs))
+                print(f"Documents size: {len(loaded_docs)}")
+                split_docs_chunked = self.__split_list(
+                    documents, self.batch_size)
+                for split_docs_chunk in split_docs_chunked:
+                    import torch
+                    torch.cuda.empty_cache()
+                    self.vectordb.add_documents(split_docs_chunk)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}, size:{len(split_docs_chunk)}")
 
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
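
With this release the separate `store()` wrapper is gone; chunked ingestion now lives in `store_documents()`, which splits the per-column DataFrame itself when `partial_df_size` is given and persists each partial batch before clearing the CUDA cache. A usage sketch with illustrative column names, model and paths (none of these values come from the package):

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

df = pd.DataFrame({"drugbank_id": ["DB00001", "DB00002"],
                   "description": ["first drug description", "second drug description"]})

to_vector_db = DataFrameToVectorDB(
    collection_name="drug_descriptions",
    persist_directory="embeddings/demo",
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0),
    batch_size=1024)

# Each partial DataFrame (at least 256 rows here) is loaded, embedded and
# persisted before the next one is processed.
to_vector_db.store_documents(df,
                             columns=["drugbank_id"],
                             page_content_columns=["description"],
                             partial_df_size=256)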
@@ -196,11 +206,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         #     print(f"Configuration for collection {id} not found.")
         #     continue
 
-        embedding_model_type = collection_config['embedding_model_type']
-        text_splitters_types = collection_config['text_splitters_types']
-        batch_size = collection_config['batch_size']
-        columns = collection_config['columns']
-        page_content_columns = collection_config['page_content_columns']
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
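
Reading the collection settings with `dict.get()` means a key absent from the configuration now comes back as `None` instead of raising `KeyError`. Based on the keys read here and in the hunks below, a hypothetical collection entry might look like this (values are illustrative, not taken from the package):

collection_config = {
    "embedding_model_type": "langchain_community.embeddings.HuggingFaceEmbeddings",
    "text_splitters_types": [
        {"type": "langchain.text_splitter.RecursiveCharacterTextSplitter",
         "params": {"chunk_size": 512, "chunk_overlap": 0},
         "suffix": "rcts"}
    ],
    "batch_size": 1024,
    "columns": ["drugbank_id"],
    "page_content_columns": ["description"],
    "model_kwargs": {},  # optional; collection_config.get('model_kwargs') is None when omitted
}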
@@ -208,8 +218,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config['model_kwargs']
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -220,7 +229,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(text_splitter_type.get("type"))
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -240,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
                 persist_directory=persist_directory,
                 embeddings=model,
                 text_splitter=text_splitter,
-                batch_size=1024)
-            to_vector_db.store(
+                batch_size=batch_size)
+            to_vector_db.store_documents(
                 df, columns, page_content_columns, partial_df_size=batch_size)
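
The hard-coded `batch_size=1024` is replaced by the per-collection value from the configuration, and the call site now targets the consolidated `store_documents()`. A rough sketch of driving the whole pipeline; the file name, model name and argument values are assumptions, since their exact semantics are not visible in this diff:

import pandas as pd

df = pd.DataFrame({"drugbank_id": ["DB00001"],
                   "description": ["an example description"]})

# config.json is assumed to map collection ids to entries shaped like the
# hypothetical collection_config shown above.
generate_embeddings(df,
                    config_file="config.json",
                    new_model_names=["all-MiniLM-L6-v2"],
                    collections=None,
                    persist_directory="embeddings")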
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.109
+Version: 0.0.111
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
-ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
+ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
 ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=02cIFOrPxo2Nav5ufkKLOh8cfVY8QupxiO6rzRjNB7I,10399
+ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.109.dist-info/METADATA,sha256=0kpJYBb0pe7lXlETKf2vSuK0_dOizijcPX0N4h4aBN8,1967
-ddi_fw-0.0.109.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-ddi_fw-0.0.109.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.109.dist-info/RECORD,,
+ddi_fw-0.0.111.dist-info/METADATA,sha256=MqKx9zRNVISPJE-dq49tMzh-EfbmxvD0Deq7N2klGeQ,1967
+ddi_fw-0.0.111.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.111.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.111.dist-info/RECORD,,