ddi-fw 0.0.109__py3-none-any.whl → 0.0.111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -274,7 +274,7 @@ class BaseDataset(ABC):
         filtered_df = self.drugs_df
         combined_df = filtered_df.copy()
 
-        if self.ner_df:
+        if self.ner_df is not None and not self.ner_df.empty:
             filtered_ner_df = self.ner_df[self.ner_df['drugbank_id'].isin(
                 drug_ids)]
             filtered_ner_df = self.ner_df.copy()
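
Note: the old `if self.ner_df:` guard only behaves when `ner_df` is `None`; truth-testing an actual pandas DataFrame raises `ValueError: The truth value of a DataFrame is ambiguous`, and the new guard also skips empty frames. A minimal standalone sketch of the difference (not part of the package):

import pandas as pd

ner_df = pd.DataFrame()  # or None when no NER data was loaded

# Old guard: `if ner_df:` raises ValueError for any DataFrame object.
# New guard: explicit None check plus emptiness check.
if ner_df is not None and not ner_df.empty:
    print("NER data available")
else:
    print("no NER data, skipping NER features")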
ddi_fw/langchain/storage.py CHANGED
@@ -3,6 +3,7 @@ from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_core.embeddings import Embeddings
+import time
 
 
 from langchain.docstore.document import Document
@@ -25,16 +26,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,62 +46,64 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
 def split_dataframe_indices(df, min_size=512):
     total_size = len(df)
-
+
     # If the dataframe is smaller than min_size, return the entire range
     if total_size <= min_size:
         return [(0, total_size - 1)]
-
+
     # List to store the start and end indices of each chunk
     chunk_indices = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
-
+
     # Split into chunks
     for i in range(num_chunks):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
         end_idx = start_idx + chunk_size - 1
         chunk_indices.append((start_idx, end_idx))
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows after the loop, they should form the last chunk
     if remaining_rows > 0:
         end_idx = start_idx + remaining_rows - 1
         chunk_indices.append((start_idx, end_idx))
-
+
     return chunk_indices
 
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
                  persist_directory,
                  embeddings: Embeddings,
                  text_splitter: TextSplitter,
-                 batch_size=1000):
+                 batch_size=1024):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
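
For context, both helpers split a DataFrame into chunks of at least `min_size` rows, with the final chunk absorbing any remainder; `split_dataframe` returns the partial DataFrames, `split_dataframe_indices` the inclusive `(start, end)` index pairs. A small standalone sketch of that behaviour (illustrative data only):

import pandas as pd

df = pd.DataFrame({"drugbank_id": range(1200)})

# 1200 // 512 = 2 chunks; the second absorbs the remainder of 176 rows.
parts = split_dataframe(df, min_size=512)
print([len(p) for p in parts])                     # [512, 688]
print(split_dataframe_indices(df, min_size=512))   # [(0, 511), (512, 1199)]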
@@ -115,10 +119,10 @@ class DataFrameToVectorDB:
 
     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s,e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list, batch_size):
             yield input_list[s:e+1]
 
-    def store_documents(self, df, columns, page_content_columns):
+    def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
         """
         Core function that processes the documents and adds them to the vector database.
         """
@@ -128,43 +132,49 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
-            documents = []
-
-            loader = DataFrameLoader(
-                data_frame=col_df, page_content_column=page_content_column)
-            loaded_docs = loader.load()
-            documents.extend(self.__split_docs(loaded_docs))
-
-            split_docs_chunked = self.__split_list(documents, self.batch_size)
-
-            for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                #     collection_name=collection_name,
-                #     documents=split_docs_chunk,
-                #     embedding=embeddings,
-                #     persist_directory=persist_directory,
-                # )
-                self.vectordb.add_documents(split_docs_chunk)
-                self.vectordb.persist()
-
-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            #     batch = df[i: i + partial_df_size]
-            #     self.store_documents(df=batch, columns=columns,
-            #                          page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
+            if partial_df_size:
+                total = 0
+                partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+                for partial_df in partial_dfs:
+                    import torch
+
+                    documents = []
+                    loader = DataFrameLoader(
+                        data_frame=partial_df, page_content_column=page_content_column)
+                    loaded_docs = loader.load()
+                    # print(loaded_docs)
+                    # documents.extend(self.__split_docs(loaded_docs))
+                    total += len(partial_df)
+
+                    self.vectordb.add_documents(loaded_docs)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}: {total}/{len(col_df)}")
+                    torch.cuda.empty_cache()
+                    # time.sleep(30)  # The GPU will not be used during this period
+
+                    # split_docs_chunked = self.__split_list(
+                    #     loaded_docs, self.batch_size)
+                    # print(f"Number of chunks: {len(split_docs_chunked)}")
+                    # for split_docs_chunk in split_docs_chunked:
+                    #     print(f"Split docs size: {len(split_docs_chunk)}")
+                    #     self.vectordb.add_documents(split_docs_chunk)
+                    #     self.vectordb.persist()
+            else:
+                documents = []
+                print(col_df.shape)
+                loader = DataFrameLoader(
+                    data_frame=col_df, page_content_column=page_content_column)
+                loaded_docs = loader.load()
+                documents.extend(self.__split_docs(loaded_docs))
+                print(f"Documents size: {len(loaded_docs)}")
+                split_docs_chunked = self.__split_list(
+                    documents, self.batch_size)
+                for split_docs_chunk in split_docs_chunked:
+                    import torch
+                    torch.cuda.empty_cache()
+                    self.vectordb.add_documents(split_docs_chunk)
+                    self.vectordb.persist()
+                    print(f"{page_content_column}, size:{len(split_docs_chunk)}")
 
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
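
With this release the separate `store()` wrapper is gone; chunked ingestion now lives in `store_documents()`, which splits the per-column DataFrame itself when `partial_df_size` is given and persists each partial batch before clearing the CUDA cache. A usage sketch with illustrative column names, model and paths (none of these values come from the package):

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

df = pd.DataFrame({"drugbank_id": ["DB00001", "DB00002"],
                   "description": ["first drug description", "second drug description"]})

to_vector_db = DataFrameToVectorDB(
    collection_name="drug_descriptions",
    persist_directory="embeddings/demo",
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0),
    batch_size=1024)

# Each partial DataFrame (at least 256 rows here) is loaded, embedded and
# persisted before the next one is processed.
to_vector_db.store_documents(df,
                             columns=["drugbank_id"],
                             page_content_columns=["description"],
                             partial_df_size=256)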
@@ -196,11 +206,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         #     print(f"Configuration for collection {id} not found.")
         #     continue
 
-        embedding_model_type = collection_config['embedding_model_type']
-        text_splitters_types = collection_config['text_splitters_types']
-        batch_size = collection_config['batch_size']
-        columns = collection_config['columns']
-        page_content_columns = collection_config['page_content_columns']
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
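
Reading the collection settings with `dict.get()` means a key absent from the configuration now comes back as `None` instead of raising `KeyError`. Based on the keys read here and in the hunks below, a hypothetical collection entry might look like this (values are illustrative, not taken from the package):

collection_config = {
    "embedding_model_type": "langchain_community.embeddings.HuggingFaceEmbeddings",
    "text_splitters_types": [
        {"type": "langchain.text_splitter.RecursiveCharacterTextSplitter",
         "params": {"chunk_size": 512, "chunk_overlap": 0},
         "suffix": "rcts"}
    ],
    "batch_size": 1024,
    "columns": ["drugbank_id"],
    "page_content_columns": ["description"],
    "model_kwargs": {},  # optional; collection_config.get('model_kwargs') is None when omitted
}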
@@ -208,8 +218,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config['model_kwargs']
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -220,7 +229,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(text_splitter_type.get("type"))
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -240,6 +250,6 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
                 persist_directory=persist_directory,
                 embeddings=model,
                 text_splitter=text_splitter,
-                batch_size=1024)
-            to_vector_db.store(
+                batch_size=batch_size)
+            to_vector_db.store_documents(
                 df, columns, page_content_columns, partial_df_size=batch_size)
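
The hard-coded `batch_size=1024` is replaced by the per-collection value from the configuration, and the call site now targets the consolidated `store_documents()`. A rough sketch of driving the whole pipeline; the file name, model name and argument values are assumptions, since their exact semantics are not visible in this diff:

import pandas as pd

df = pd.DataFrame({"drugbank_id": ["DB00001"],
                   "description": ["an example description"]})

# config.json is assumed to map collection ids to entries shaped like the
# hypothetical collection_config shown above.
generate_embeddings(df,
                    config_file="config.json",
                    new_model_names=["all-MiniLM-L6-v2"],
                    collections=None,
                    persist_directory="embeddings")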
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.109
+Version: 0.0.111
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=HSwQrqnzrEjIG4gif41pwJ_cST3t2XHGDxqFyuEBRwo,351
-ddi_fw/datasets/core.py,sha256=9RaUPhAYCn4RDeTZpHATtJaqNWsO17bduYyVqxAZWs0,17001
+ddi_fw/datasets/core.py,sha256=0bEJSxqO22x0XBoCKOYmxXNa2j7_CqFqKiBZ4KFd9Mk,17039
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/embedding_generator.py,sha256=Jqrlv88RCu0Lg812KsA12X0cSaZuxbckJ4LNRKNy_qw,2173
 ddi_fw/datasets/feature_vector_generation.py,sha256=EImavcALxkIB0YG_smOzagMNzuWMbK9SaWSKwARx_qU,3254
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=02cIFOrPxo2Nav5ufkKLOh8cfVY8QupxiO6rzRjNB7I,10399
+ddi_fw/langchain/storage.py,sha256=Vz1aICIyZzKwOnOwxkhWAMYWJ9X6kOfqEkIeQJyQIHY,10762
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.109.dist-info/METADATA,sha256=0kpJYBb0pe7lXlETKf2vSuK0_dOizijcPX0N4h4aBN8,1967
-ddi_fw-0.0.109.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-ddi_fw-0.0.109.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.109.dist-info/RECORD,,
+ddi_fw-0.0.111.dist-info/METADATA,sha256=MqKx9zRNVISPJE-dq49tMzh-EfbmxvD0Deq7N2klGeQ,1967
+ddi_fw-0.0.111.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.111.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.111.dist-info/RECORD,,