PyPI - ddi-fw - Versions diffs - 0.0.251__py3-none-any.whl → 0.0.253__py3-none-any.whl - Mend

ddi-fw 0.0.251__py3-none-any.whl → 0.0.253__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

ddi_fw/langchain/faiss_storage.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pandas as pd
 from uuid import uuid4
 from langchain_community.vectorstores.faiss import FAISS
 from langchain_community.docstore.in_memory import InMemoryDocstore
-from typing import Callable, Optional, Dict, Any, Type
+from typing import Callable, List, Optional, Dict, Any, Type
 from langchain_core.documents import Document
 import numpy as np  # optional, if you're using NumPy vectors
 from langchain_core.embeddings import Embeddings
@@ -56,16 +56,20 @@ class FaissVectorStoreManager(BaseVectorStoreManager):
     #     uuids = [str(uuid4()) for _ in range(len(docs))]
     #     self.vector_store.add_documents(documents=docs, ids=uuids)
     def initialize_embedding_dict(self, **kwargs):
-        # vector_db_persist_directory = kwargs.get("vector_db_persist_directory")
+        """
+        Initializes a dictionary where keys are types (e.g., 'description', 'indication'),
+        and values are dictionaries mapping drugbank_ids to a list of their embeddings.
+        Returns:
+            dict: A dictionary with the structure {type: {drugbank_id: [embedding]}}.
+        """
         self.load(self.persist_directory)
         df = self.as_dataframe(formatter_fn=custom_formatter)
-        type_dict = (
-            df.groupby('type')
-            .apply(lambda group: dict(zip(group['id'], group['embedding'])))
-            .to_dict()
-            )
+        type_dict = {}
+        for drug_type, group in df.groupby('type'):
+            type_dict[drug_type] = dict(zip(group['id'], group['embedding'].apply(lambda x: [x])))
         return type_dict
     def generate_vector_store(self, docs, handle_empty='zero'):
@@ -256,12 +260,106 @@ def load_configuration(config_file):
     return config
+# def generate_embeddings(
+#     df,
+#     vector_store_manager_type:Type[BaseVectorStoreManager],
+#     config_file,
+#     new_model_names,
+#     collections,
+#     persist_directory="embeddings",
+# ):
+#     """
+#     Generate embeddings for collections based on a configuration file.
+#     collections: List of collections that contain metadata for embedding generation.
+#     config_file: Path to the configuration file containing model settings.
+#     new_model_names: List of model names to generate embeddings for.
+#     vector_store_manager_type: Class type of the vector store manager (e.g., FaissVectorStoreManager or ChromaVectorStoreManager)
+#     """
+#     if not collections and not config_file:
+#         raise ValueError("Either 'collections' or 'config_file' must be provided.")
+#     if collections and config_file:
+#         raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
+#     if not collections:
+#         collections = load_configuration(config_file)
+#     for collection_config in collections:
+#         id = collection_config['id']
+#         name = collection_config['name']
+#         if name not in new_model_names:
+#             continue
+#         embedding_model_type = collection_config.get('embedding_model_type')
+#         text_splitters_types = collection_config.get('text_splitters_types')
+#         batch_size = collection_config.get('batch_size')
+#         partial_df_size = collection_config.get('partial_dataframe_size')
+#         columns = collection_config.get('columns')
+#         page_content_columns = collection_config.get('page_content_columns')
+#         persist_dir = f'{persist_directory}/{id}'
+#         # Load embedding model
+#         try:
+#             model_kwargs = collection_config.get('model_kwargs')
+#             model = get_import(embedding_model_type)(
+#                 model_name=name, **model_kwargs)
+#         except Exception as e:
+#             raise Exception(f"Unknown embedding model: {embedding_model_type}") from e
+#         # Load text splitters
+#         text_splitters = []
+#         text_splitters_suffixes = []
+#         for text_splitter_type in text_splitters_types:
+#             try:
+#                 type_of_text_splitter = get_import(
+#                     text_splitter_type.get("type"))
+#                 kwargs = text_splitter_type.get("params")
+#                 suffix = text_splitter_type.get("suffix")
+#                 if kwargs:
+#                     text_splitter = type_of_text_splitter(**kwargs)
+#                 else:
+#                     text_splitter = type_of_text_splitter()
+#                 text_splitters.append(text_splitter)
+#                 text_splitters_suffixes.append(suffix)
+#             except Exception as e:
+#                 raise Exception(f"Unknown text splitter: {text_splitter_type}") from e
+#         for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
+#             print(f"{id}_{suffix}")
+#             # Prepare manager parameters
+#             manager_params = {
+#                 "collection_name": f"{id}_{suffix}",
+#                 "persist_directory": persist_dir,
+#                 "embeddings": model,
+#                 "text_splitter": text_splitter,
+#                 "batch_size": batch_size
+#             }
+#             # Instantiate the manager class
+#             vector_store_manager = vector_store_manager_type(**manager_params)
+#             # Prepare documents
+#             # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
+#             loader = DataFrameLoader(
+#                 data_frame=df, page_content_column=page_content_columns[0]
+#             )
+#             docs = loader.load()
+#             # Generate vector store
+#             vector_store_manager.generate_vector_store(docs)
+#             # Optionally persist/save
+#             vector_store_manager.save(persist_dir)
 def generate_embeddings(
-    df,
-    vector_store_manager_type:Type[BaseVectorStoreManager],
-    config_file,
-    new_model_names,
-    collections,
+    docs,
+    vector_store_manager_type:Type[BaseVectorStoreManager],
+    config_file:Optional[str],
+    new_model_names:Optional[List],
+    collections:Optional[Dict],
     persist_directory="embeddings",
 ):
     """
@@ -276,17 +374,16 @@ def generate_embeddings(
         raise ValueError("Either 'collections' or 'config_file' must be provided.")
     if collections and config_file:
         raise ValueError("Only one of 'collections' or 'config_file' should be provided.")
     if not collections:
         collections = load_configuration(config_file)
+    if collections is None:
+        raise ValueError("No collections found in the configuration file.")
     for collection_config in collections:
         id = collection_config['id']
         name = collection_config['name']
         if name not in new_model_names:
             continue
         embedding_model_type = collection_config.get('embedding_model_type')
         text_splitters_types = collection_config.get('text_splitters_types')
         batch_size = collection_config.get('batch_size')
@@ -336,15 +433,8 @@ def generate_embeddings(
             # Instantiate the manager class
             vector_store_manager = vector_store_manager_type(**manager_params)
-            # Prepare documents
-            # You may need to use a DataFrameLoader or similar to convert df to LangChain Documents
-            loader = DataFrameLoader(
-                data_frame=df, page_content_column=page_content_columns[0]
-            )
-            docs = loader.load()
             # Generate vector store
             vector_store_manager.generate_vector_store(docs)
             # Optionally persist/save
-            vector_store_manager.save(persist_dir)
+            vector_store_manager.save(persist_dir)

{ddi_fw-0.0.251.dist-info → ddi_fw-0.0.253.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.251
+Version: 0.0.253
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>

{ddi_fw-0.0.251.dist-info → ddi_fw-0.0.253.dist-info}/RECORD RENAMED Viewed

@@ -6,7 +6,7 @@ ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,10
 ddi_fw/langchain/__init__.py,sha256=xGNaTEZCUxyc_aT1zvzVWGRfsj-9VXqMvPKtV_G7ChA,399
 ddi_fw/langchain/chroma_storage.py,sha256=7LSUhdiPdQHZvKC_NapOeVbHtS71iE5ABZVTrI0YQ-A,15520
 ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
-ddi_fw/langchain/faiss_storage.py,sha256=os7OuUwB1WbeX5Ptw6kxES6MntKXjTTcpPtkJMBgi14,13608
+ddi_fw/langchain/faiss_storage.py,sha256=e6WbjKAlCbFZUq9P3LTjTeXmWTAh6Oxp_NMVbvfK-kc,17727
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
@@ -38,7 +38,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=QQQGhCti653BdU343Ag1bH_g1fzi2hlic7dgNy7otjE,7694
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.251.dist-info/METADATA,sha256=GxHCFYuKnptAcelnZaLCOziDuHD798ptjslXgW6xXHk,2623
-ddi_fw-0.0.251.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ddi_fw-0.0.251.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.251.dist-info/RECORD,,
+ddi_fw-0.0.253.dist-info/METADATA,sha256=vYOtl4WgNa-ydlgj2dcuC2eRuFfMRlT-9OubOT3ep2U,2623
+ddi_fw-0.0.253.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ddi_fw-0.0.253.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.253.dist-info/RECORD,,

{ddi_fw-0.0.251.dist-info → ddi_fw-0.0.253.dist-info}/WHEEL RENAMED Viewed

File without changes

{ddi_fw-0.0.251.dist-info → ddi_fw-0.0.253.dist-info}/top_level.txt RENAMED Viewed

File without changes

ddi-fw 0.0.251__py3-none-any.whl → 0.0.253__py3-none-any.whl

ddi-fw 0.0.251__py3-none-any.whl → 0.0.253__py3-none-any.whl