ddi-fw 0.0.98__py3-none-any.whl → 0.0.99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/__init__.py +1 -1
- ddi_fw/langchain/embeddings.py +1 -1
- ddi_fw/langchain/storage.py +113 -12
- ddi_fw/utils/__init__.py +2 -0
- ddi_fw/utils/kaggle.py +56 -0
- ddi_fw/utils/package_helper.py +31 -0
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/RECORD +10 -8
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/__init__.py
CHANGED
@@ -1,3 +1,3 @@
 from ..langchain.embeddings import PoolingStrategy,SumPoolingStrategy,MeanPoolingStrategy,SentenceTransformerDecorator,PretrainedEmbeddings,SBertEmbeddings
 from .sentence_splitter import SentenceSplitter
-from .storage import DataFrameToVectorDB
+from .storage import DataFrameToVectorDB, generate_embeddings
ddi_fw/langchain/embeddings.py
CHANGED
@@ -69,7 +69,7 @@ class SentenceTransformerDecorator(Embeddings):
 
 class PretrainedEmbeddings(Embeddings):
     def __init__(self, model_name):
-        self.
+        self.model_name = model_name
         self.model = AutoModel.from_pretrained(model_name)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.shape = self.model.get_input_embeddings().weight.shape
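The change above completes the truncated assignment so the model name is kept on the instance. A minimal usage sketch, assuming any Hugging Face checkpoint id (the name below is illustrative, not from the package):

from ddi_fw.langchain import PretrainedEmbeddings

emb = PretrainedEmbeddings(model_name="bert-base-uncased")  # hypothetical checkpoint id
print(emb.model_name)   # now stored on the instance
print(emb.shape)        # shape of the model's input embedding matrix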
ddi_fw/langchain/storage.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 from langchain.vectorstores import Chroma
 # from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores.utils import filter_complex_metadata
@@ -11,6 +12,16 @@ from langchain.document_loaders import DataFrameLoader
 from langchain.text_splitter import TextSplitter
 
 # from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
+from utils import get_import
+
+
+def load_configuration(config_file):
+    """
+    Load the configuration from a JSON file.
+    """
+    with open(config_file, 'r') as f:
+        config = json.load(f)
+    return config
 
 
 class DataFrameToVectorDB:
@@ -18,15 +29,13 @@ class DataFrameToVectorDB:
                  collection_name,
                  persist_directory,
                  embeddings: Embeddings,
-                 text_splitter: TextSplitter,
-
-                 chunk_overlap=20):
+                 text_splitter: TextSplitter,
+                 batch_size=1000):
         self.collection_name = collection_name
         self.persist_directory = persist_directory
         self.embeddings = embeddings
         self.text_splitter = text_splitter
-        self.
-        self.chunk_overlap = chunk_overlap
+        self.batch_size = batch_size  # to store chunks partially
         self.vectordb = Chroma(collection_name=collection_name,
                                persist_directory=persist_directory,
                                embedding_function=embeddings)
@@ -35,11 +44,14 @@ class DataFrameToVectorDB:
         docs = self.text_splitter.split_documents(documents)
         return docs
 
-    def __split_list(self, input_list,
-        for i in range(0, len(input_list),
-            yield input_list[i:i +
+    def __split_list(self, input_list, batch_size):
+        for i in range(0, len(input_list), batch_size):
+            yield input_list[i:i + batch_size]
 
-    def
+    def store_documents(self, df, columns, page_content_columns):
+        """
+        Core function that processes the documents and adds them to the vector database.
+        """
         for page_content_column in page_content_columns:
             copy_columns = columns.copy()
             copy_columns.append(page_content_column)
@@ -48,11 +60,12 @@ class DataFrameToVectorDB:
             col_df['type'] = page_content_column  # Set the type column
             documents = []
 
-            loader = DataFrameLoader(
+            loader = DataFrameLoader(
+                data_frame=col_df, page_content_column=page_content_column)
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))
 
-            split_docs_chunked = self.__split_list(documents,
+            split_docs_chunked = self.__split_list(documents, self.batch_size)
 
             for split_docs_chunk in split_docs_chunked:
                 # vectordb = Chroma.from_documents(
@@ -64,5 +77,93 @@ class DataFrameToVectorDB:
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()
 
+    def store(self, df, columns, page_content_columns, partial_df_size=None):
+        """
+        Store function to handle both full and partial dataframe processing.
+        """
+        if partial_df_size:
+            # Process the dataframe in chunks if partial_df_size is provided
+            for i in range(0, len(df), partial_df_size):
+                batch = df[i: i + partial_df_size]
+                self.store_documents(df=batch, columns=columns,
+                                     page_content_columns=page_content_columns, batch_size=self.batch_size)
+        else:
+            # Process the entire dataframe if no partial_df_size is specified
+            self.store_documents(df=df, columns=columns,
+                                 page_content_columns=page_content_columns, batch_size=self.batch_size)
+
+
+def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
+    """
+    Generate embeddings for collections based on a configuration file.
+
+    collections: List of collections that contain metadata for embedding generation.
+    config_file: Path to the configuration file containing model settings.
+    new_model_names: List of model names to generate embeddings for.
+    """
+    # Load the configuration from the provided file
+    if not collections:
+        collections = load_configuration(config_file)
+
+    # Process each collection
+    for collection_config in collections:
+        id = collection_config['id']
+        name = collection_config['name']
+
+        # Skip if the collection's name is not in the list of new model names
+        if name not in new_model_names:
+            continue
+
+        # # Find the matching configuration for the collection
+        # collection_config = next(
+        #     (item for item in collections if item['id'] == id), None)
+
+        # if not collection_config:
+        #     print(f"Configuration for collection {id} not found.")
+        #     continue
+
+        embedding_model_type = collection_config['embedding_model_tpe']
+        text_splitters_types = collection_config['text_splitters_types']
+        batch_size = collection_config['batch_size']
+        columns = collection_config['columns']
+        page_content_columns = collection_config['page_content_columns']
+        persist_directory = f'{persist_directory}/{id}'
+
+        # Load the embedding model and text splitter dynamically
+        print(f"Generating embeddings for {id} with model {name}...")
+
+        # Assuming the classes for the embeddings and splitters are available
+        try:
+            model = get_import(embedding_model_type)(
+                model_name=name, model_kwargs=c['model_kwargs'])
+        except:
+            # print(f"Unknown embedding model: {embedding_model_type}")
+            raise Exception(f"Unknown embedding model: {embedding_model_type}")
+
+        text_splitters = []
+        text_splitters_suffixes = []
+        for text_splitter_type in text_splitters_types:
+            try:
+                type_of_text_splitter = get_import(text_splitter_type)
+                text_splitter_params = text_splitter.get("params")
+                suffix = text_splitter.get("suffix")
+                if text_splitter_params:
+                    text_splitter = type_of_text_splitter(
+                        **text_splitter_params)
+                else:
+                    text_splitter = type_of_text_splitter()
+                text_splitters.append(text_splitter)
+                text_splitters_suffixes.append(suffix)
+            except:
+                print(f"Unknown text splitter: {text_splitter_type}")
+                raise Exception(f"Unknown text splitter: {text_splitter_type}")
 
-
+        for text_splitter, suffix in zip(text_splitters, text_splitters_suffixes):
+            print(f"{id}_{suffix}")
+            to_vector_db = DataFrameToVectorDB(collection_name=f"{id}_{suffix}",
+                                               persist_directory=persist_directory,
+                                               embeddings=model,
+                                               text_splitter=text_splitter,
+                                               batch_size=1024)
+            to_vector_db.store(
+                df, columns, page_content_columns, partial_df_size=batch_size)
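For context, a minimal sketch of the new storage path in this release. The embedding model, splitter, and DataFrame below are illustrative placeholders, not values shipped with the package:

import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ddi_fw.langchain import DataFrameToVectorDB

# Toy DataFrame: 'id' is carried over as metadata, 'description' becomes the page content.
df = pd.DataFrame({"id": ["DB00001"],
                   "description": ["Lepirudin is a recombinant hirudin..."]})

to_vector_db = DataFrameToVectorDB(
    collection_name="drugs_description",
    persist_directory="embeddings/demo",
    embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20),
    batch_size=1000)  # chunks are added to Chroma and persisted in batches of this size

to_vector_db.store_documents(df, columns=["id"], page_content_columns=["description"])

The new generate_embeddings function wraps this same flow: it resolves the embedding class and text splitters by dotted path (via get_import) from a JSON configuration and stores one collection per configured model/splitter pair.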
ddi_fw/utils/__init__.py
CHANGED
@@ -2,3 +2,5 @@ from .utils import clear_directory,create_folder_if_not_exists, utc_time_as_stri
 from .zip_helper import ZipHelper
 from .py7zr_helper import Py7ZipHelper
 from .enums import UMLSCodeTypes, DrugBankTextDataTypes
+from .package_helper import get_import
+from .kaggle import create_kaggle_dataset
ddi_fw/utils/kaggle.py
ADDED
@@ -0,0 +1,56 @@
+import os
+import json
+
+def create_kaggle_dataset(base_path: str, collections: list):
+    """
+    This function creates metadata JSON files and uploads datasets to Kaggle from folders.
+
+    Args:
+        base_path (str): The base path containing dataset folders.
+        collections (list): A list of dictionaries containing metadata about collections (e.g., model names).
+        path (str): The path to your root directory (default is "/content" for Google Colab).
+
+    Returns:
+        None
+    """
+
+    # Step 1: Loop through each folder in base_path
+    for folder_name in os.listdir(base_path):
+        folder_path = os.path.join(base_path, folder_name)
+
+        # Step 2: Get metadata for the current folder
+        model_info = next((c for c in collections if c['id'] == folder_name), None)
+        if model_info is None:
+            continue  # Skip if model info is not found
+
+        title = model_info['kaggle_title']
+        id = model_info['kaggle_id'].lower().replace(' ', '-')
+        licenses = model_info['kaggle_licenses']
+        description = model_info['kaggle_description']
+
+        # Ensure title is between 6 and 50 characters
+        if not (6 <= len(title) <= 50):
+            continue  # Skip if title length is out of the expected range
+
+        # Step 3: Define the metadata content
+        metadata = {
+            "title": title,
+            "id": id,
+            "licenses": licenses,
+            "description": description,
+        }
+
+        # Step 4: Write the metadata to a JSON file in the folder
+        metadata_file_path = os.path.join(folder_path, 'dataset-metadata.json')
+        with open(metadata_file_path, 'w') as f:
+            json.dump(metadata, f, indent=4)
+
+        print(f"Created metadata for {folder_name}: {metadata_file_path}")
+
+    # Step 5: Create datasets on Kaggle using the Kaggle API
+    for folder_name in os.listdir(base_path):
+        folder_path = os.path.join(base_path, folder_name)
+        if os.path.isdir(folder_path):
+            # Run the Kaggle dataset creation command
+            os.system(f"kaggle datasets create -p {folder_path} --dir-mode zip")
+            print(f"Dataset created for {folder_name}.")
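A usage sketch for the new helper. The folder name and metadata values below are hypothetical, and the upload step assumes the Kaggle CLI is installed and authenticated:

from ddi_fw.utils import create_kaggle_dataset

# One entry per folder under base_path; keys mirror those read by create_kaggle_dataset.
collections = [{
    "id": "drugbank_embeddings",                # must match a folder name under base_path
    "kaggle_title": "DrugBank embeddings",      # titles outside 6-50 characters are skipped
    "kaggle_id": "some-user/drugbank-embeddings",
    "kaggle_licenses": [{"name": "CC0-1.0"}],
    "kaggle_description": "Vector stores built from DrugBank texts.",
}]

create_kaggle_dataset(base_path="embeddings", collections=collections)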
ddi_fw/utils/package_helper.py
ADDED
@@ -0,0 +1,31 @@
+import importlib
+
+
+def get_import(full_path_of_import):
+    """Dynamically imports an object from a module given its full path.
+
+    Args:
+        full_path_of_import (str): The full path of the import (e.g., 'module.submodule.ClassName').
+
+    Returns:
+        object: The imported object.
+
+    Raises:
+        ImportError: If the module cannot be imported.
+        AttributeError: If the attribute does not exist in the module.
+    """
+    if not full_path_of_import:
+        raise ValueError("The import path cannot be empty.")
+
+    parts = full_path_of_import.split('.')
+    import_name = parts[-1]
+    module_name = ".".join(parts[:-1]) if len(parts) > 1 else ""
+
+    try:
+        module = importlib.import_module(module_name)
+        return getattr(module, import_name)
+    except ModuleNotFoundError as e:
+        raise ImportError(f"Module '{module_name}' could not be found.") from e
+    except AttributeError as e:
+        raise AttributeError(
+            f"'{module_name}' has no attribute '{import_name}'") from e
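A minimal sketch of the new helper, resolving a class from its dotted path (the target class below is just an example):

from ddi_fw.utils import get_import

splitter_cls = get_import("langchain.text_splitter.RecursiveCharacterTextSplitter")
splitter = splitter_cls(chunk_size=500, chunk_overlap=20)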
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/RECORD
CHANGED
@@ -55,10 +55,10 @@ ddi_fw/drugbank/drugbank_parser.py,sha256=lxUuhB0s8ef_aPNDs0V8ClKF7-KIWugNIV9gVs
 ddi_fw/drugbank/drugbank_processor.py,sha256=vmkt68n9nFLevufgGyXhOSDtTo4G1XzwT9PVncGTXtk,18127
 ddi_fw/drugbank/drugbank_processor_org.py,sha256=eO5Yset50P91qkic79RUXPoEuxRxQKFkKW0l4G29Mas,13322
 ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHrBXsI8,4600
-ddi_fw/langchain/__init__.py,sha256=
-ddi_fw/langchain/embeddings.py,sha256=
+ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
+ddi_fw/langchain/embeddings.py,sha256=b9BUG73Ayx3Wy8MQrfsVeZ-qBB41vjVECSp2YhH-CIY,7514
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=
+ddi_fw/langchain/storage.py,sha256=ljK_ybzjtrikb4XdJ1qkNFTqqyw5r62OBAnBJ5B-X_k,7408
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -84,12 +84,14 @@ ddi_fw/test/sklearn-tfidf.py,sha256=cjtg27vLskEMXgrsqUR_EapRGVd4xgwOQ9zYsu72zjs,
 ddi_fw/test/test.py,sha256=zJh9ZBcZl-vZIFDvuftcRrRV8WAwtiFVhPPd6Et4OU4,2997
 ddi_fw/test/torch_cuda_test.py,sha256=R-4VGVErl_Ufk54DoZbgL_YXWoCYFyanIVWd6P39IEk,312
 ddi_fw/test/type_guarding_test.py,sha256=KxjyBxohDu7lwpejalCj-REjtJ-k1S1wQbOB6TGY0O8,766
-ddi_fw/utils/__init__.py,sha256=
+ddi_fw/utils/__init__.py,sha256=77563ikqAtdzjjgRlLp5OAsJBbpLA1Cao8iecGaVUXQ,354
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
+ddi_fw/utils/kaggle.py,sha256=FjWR1ncOEif6XCCzDYpErLDz_9fxAQub0L7X4aVPw24,2266
+ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.99.dist-info/METADATA,sha256=NrGCSF9-dHrO0FteW7HO0OFPF7dfgeNs2gDVvLMtiNg,1966
+ddi_fw-0.0.99.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.99.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.99.dist-info/RECORD,,
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.98.dist-info → ddi_fw-0.0.99.dist-info}/top_level.txt
File without changes