ddi-fw 0.0.214__tar.gz → 0.0.215__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/PKG-INFO +1 -1
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/pyproject.toml +1 -1
- ddi_fw-0.0.215/src/ddi_fw/datasets/db_utils.py +204 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/base.py +14 -14
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/base.py +4 -4
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/base.py +0 -1
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/pipeline/multi_pipeline.py +7 -1
- ddi_fw-0.0.215/src/ddi_fw/pipeline/multi_pipeline_v2.py +231 -0
- ddi_fw-0.0.215/src/ddi_fw/pipeline/ner_pipeline.py +134 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw.egg-info/PKG-INFO +1 -1
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw.egg-info/SOURCES.txt +1 -0
- ddi_fw-0.0.214/src/ddi_fw/datasets/db_utils.py +0 -204
- ddi_fw-0.0.214/src/ddi_fw/pipeline/ner_pipeline.py +0 -139
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/README.md +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/setup.cfg +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/core.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/data/event.db +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/debug.log +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/test_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/train_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/indexes_old/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl/readme.md +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/data/event.db +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/ddi_mdl_text/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/df_extraction_cleanxiaoyu50.csv +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/drug_information_del_noDDIxiaoyu50.csv +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/test_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/train_indexes.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_0.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_1.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_2.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_3.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/indexes/validation_fold_4.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/mdf_sa_ddi/mdf-sa-ddi.zip +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/datasets/setup_._py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/drugbank.xsd +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/drugbank_parser.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/drugbank_processor.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/drugbank_processor_org.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/drugbank/event_extractor.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/langchain/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/langchain/embeddings.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/langchain/storage.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/evaluation_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/ml_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/model_wrapper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ner/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ner/mmlrestclient.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/ner/ner.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/pipeline/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/pipeline/pipeline.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/enums.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/json_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/kaggle.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/numpy_utils.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/package_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/py7zr_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/utils.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/utils/zip_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/vectorization/__init__.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw/vectorization/idf_helper.py +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw.egg-info/requires.txt +0 -0
- {ddi_fw-0.0.214 → ddi_fw-0.0.215}/src/ddi_fw.egg-info/top_level.txt +0 -0

--- /dev/null
+++ ddi_fw-0.0.215/src/ddi_fw/datasets/db_utils.py
@@ -0,0 +1,204 @@
+
+from sqlite3 import Error
+import sqlite3
+import pandas as pd
+import numpy as np
+
+
+def create_connection(db_file=r"./event.db"):
+    """ create a database connection to the SQLite database
+        specified by db_file
+    :param db_file: database file
+    :return: Connection object or None
+    """
+    conn = None
+    try:
+        conn = sqlite3.connect(db_file)
+    except Error as e:
+        print(e)
+
+    return conn
+
+
+# def select_all_drugs(conn):
+#     cur = conn.cursor()
+#     cur.execute(
+#         '''select "index", id, name, target, enzyme, pathway, smile from drug''')
+#     rows = cur.fetchall()
+#     return rows
+
+
+# def select_all_drugs_as_dataframe(conn):
+#     headers = ['index','id', 'name', 'target', 'enzyme', 'pathway', 'smile']
+#     rows = select_all_drugs(conn)
+#     df = pd.DataFrame(columns=headers, data=rows)
+#     df['enzyme'] = df['enzyme'].apply(lambda x: x.split('|'))
+#     df['target'] = df['target'].apply(lambda x: x.split('|'))
+#     df['pathway'] = df['pathway'].apply(lambda x: x.split('|'))
+#     df['smile'] = df['smile'].apply(lambda x: x.split('|'))
+#     return df
+
+
+# def select_all_events(conn):
+#     """
+#     Query all rows in the event table
+#     :param conn: the Connection object
+#     :return:
+#     """
+#     cur = conn.cursor()
+#     cur.execute("select * from event")
+
+#     rows = cur.fetchall()
+#     return rows
+
+
+# def select_all_events_as_dataframe(conn):
+#     headers = ["index", "id1", "name1", "id2", "name2", "event_category"]
+#     rows = select_all_events(conn)
+#     return pd.DataFrame(columns=headers, data=rows)
+
+
+# def select_events_with_category(conn):
+#     sql = '''select id1, name1, id2, name2, mechanism || ' ' ||action from event ev
+#             join extraction ex
+#             on ev.name1 = ex.drugA and ev.name2 = ex.drugB
+#             union
+#             select id1, name1, id2, name2, mechanism || ' ' ||action from event ev
+#             join extraction ex
+#             on ev.name1 = ex.drugB and ev.name2 = ex.drugA
+#             '''
+#     cur = conn.cursor()
+#     cur.execute(sql)
+
+#     rows = cur.fetchall()
+
+#     headers = ['id1', 'name1', 'id2', 'name2', 'event_category']
+#     return pd.DataFrame(columns=headers, data=rows)
+
+
+# def select_all_interactions_tuple_as_dataframe(conn):
+#     cur = conn.cursor()
+#     cur.execute("select id1, id2 from event")
+#     rows = cur.fetchall()
+#     headers = ['id1', 'id2']
+
+#     return pd.DataFrame(columns=headers, data=rows)
+
+
+# def select_ddi_pairs(conn):
+#     cur = conn.cursor()
+#     cur.execute('''
+#         select d1.[index] as Drug1Index, d2.[index] as Drug2Index, 1 from event e
+#         join drug d1 on e.id1 = d1.id
+#         join drug d2 on e.id2 = d2.id
+#         ''')
+#     rows = cur.fetchall()
+#     return rows
+
+
+# def select_ddi_pairs_as_dataframe(conn):
+#     headers = ["Drug1Index", "Drug2Index", "Interaction"]
+#     rows = select_ddi_pairs(conn)
+#     return pd.DataFrame(columns=headers, data=rows)
+
+
+# def get_interactions(conn):
+#     cur = conn.cursor()
+#     cur.execute('''
+#     select
+#         drug_1_id,
+#         drug_1,
+#         drug_2_id,
+#         drug_2,
+#         mechanism_action,
+#         interaction,
+#         masked_interaction
+#     from _Interactions
+#     ''')
+
+#     rows = cur.fetchall()
+
+#     headers = ['id1', 'name1', 'id2', 'name2',
+#                'event_category', 'interaction', 'masked_interaction']
+#     df = pd.DataFrame(columns=headers, data=rows)
+#     return df
+
+
+# def get_extended_version(conn):
+#     cur = conn.cursor()
+#     cur.execute('''
+#     select
+#         _Drugs."index",
+#         drugbank_id,
+#         _Drugs.name,
+#         description,
+#         synthesis_reference,
+#         indication,
+#         pharmacodynamics,
+#         mechanism_of_action,
+#         toxicity,
+#         metabolism,
+#         absorption,
+#         half_life,
+#         protein_binding,
+#         route_of_elimination,
+#         volume_of_distribution,
+#         clearance,
+#         smiles,
+#         smiles_morgan_fingerprint,
+#         enzymes_polypeptides,
+#         targets_polypeptides
+
+#     from drug
+#     join _Drugs on drug.id = _Drugs.drugbank_id
+#     where
+#         targets_polypeptides is not null and
+#         enzymes_polypeptides is not null and
+#         smiles_morgan_fingerprint is not null
+#     ''')
+#     # pathway is absent
+
+#     rows = cur.fetchall()
+#     headers = ['index', 'id', 'name', 'description', 'synthesis_reference', 'indication', 'pharmacodynamics', 'mechanism_of_action', 'toxicity', 'metabolism', 'absorption', 'half_life',
+#                'protein_binding', 'route_of_elimination', 'volume_of_distribution', 'clearance', 'smiles_notation', 'smile', 'enzyme', 'target']
+#     df = pd.DataFrame(columns=headers, data=rows)
+#     df['smile'] = df['smile'].apply(lambda x:
+#                                     np.fromstring(
+#                                         x.replace(
+#                                             '\n', '')
+#                                         .replace('[', '')
+#                                         .replace(']', '')
+#                                         .replace('  ', ' '), sep=','))
+#     df['enzyme'] = df['enzyme'].apply(
+#         lambda x: x.split('|'))
+#     df['target'] = df['target'].apply(
+#         lambda x: x.split('|'))
+#     return df
+
+
+# SELECT
+#   CASE
+#     WHEN masked_interaction like '%'+drug_1+'%' THEN drug_1
+#     WHEN masked_interaction like '%'+drug_2+'%' THEN drug_2
+#     Else drug_2
+#   END AS Absent,
+
+#   drug_1, drug_2,
+#   masked_interaction
+
+#   from _Interactions
+#   where LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', ''))
+#   or LENGTH(masked_interaction) = LENGTH(REPLACE(masked_interaction, 'DRUG', '')) + 4
+
+# if __name__ == "__main__":
+#     conn = create_connection(r"./event-extended.db")
+#     extended_version_df = get_extended_version(conn)
+
+#     df = select_all_events_as_dataframe(conn)
+#     print(df.head())
+
+#     events_with_category_df = select_events_with_category(conn)
+#     print(events_with_category_df.head())
+
+#     u = events_with_category_df['event_category'].unique()
+#     print(len(u))

--- ddi_fw-0.0.214/src/ddi_fw/datasets/ddi_mdl/base.py
+++ ddi_fw-0.0.215/src/ddi_fw/datasets/ddi_mdl/base.py
@@ -96,10 +96,10 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
         logger.info(f'{self.dataset_name} is initialized')
 
     def load_drugs_and_events(self):
-        self.drugs_df = self.
-        self.ddis_df = self.
+        self.drugs_df = self.__select_all_drugs_as_dataframe()
+        self.ddis_df = self.__select_all_events()
 
-    def
+    def __select_all_drugs_as_dataframe(self):
         headers = ['index', 'id', 'name',
                    'target', 'enzyme', 'pathway', 'smile']
         if self._conn is None:
@@ -117,7 +117,7 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
 
         return df
 
-    def
+    def __select_all_events(self):
         if self._conn is None:
             raise Exception("There is no connection")
         cur = self._conn.cursor()
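
Both hunks above give the query helpers double-underscore names, which Python name-mangles into class-private attributes. A short illustration of that mechanism; `Dataset` and `__select` are hypothetical stand-ins, not names from the package:

```python
# Name mangling: inside the class, self.__select resolves to
# self._Dataset__select, hiding the helper from subclasses and callers.
class Dataset:
    def load(self):
        return self.__select()

    def __select(self):
        return ['row']

d = Dataset()
print(d.load())                         # ['row']
print(hasattr(d, '_Dataset__select'))   # True: the mangled attribute name
```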

@@ -221,16 +221,16 @@ class DDIMDLDataset(BaseDataset,TextDatasetMixin):
                 lambda_fnc, args=(value,), axis=1)
             self.columns.append(key)
             print(self.ddis_df[key].head())
-
-
-
-
-
-
-
-
-
-
+        if isinstance(self, TextDatasetMixin):
+            if self.embedding_dict is not None:
+                for embedding_column in self.embedding_columns:
+                    print(f"concat {embedding_column} embeddings")
+                    embeddings_after_pooling = {k: self.pooling_strategy.apply(
+                        v) for k, v in self.embedding_dict[embedding_column].items()}
+                    # column_embeddings_dict = embedding_values[embedding_column]
+                    self.ddis_df[embedding_column+'_embedding'] = self.ddis_df.apply(
+                        x_fnc, args=(embeddings_after_pooling,), axis=1)
+                    self.columns.append(embedding_column+'_embedding')
 
         dataframe = self.ddis_df.copy()
         if not isinstance(classes, (list, pd.Series, np.ndarray)):
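
The new guard applies the dataset's pooling strategy to each per-drug token-embedding matrix before attaching it as a column. A minimal sketch of that step, assuming a mean-pooling strategy; `MeanPooling` is hypothetical and the real PoolingStrategy interface in ddi_fw.langchain.embeddings may differ:

```python
# Hedged sketch of the pooling step: collapse each (num_tokens, dim)
# matrix into a single (dim,) vector per drug.
import numpy as np

class MeanPooling:
    def apply(self, token_vectors):
        return np.asarray(token_vectors).mean(axis=0)

embedding_dict = {'description': {'DB00001': np.random.rand(5, 8)}}
strategy = MeanPooling()
pooled = {k: strategy.apply(v) for k, v in embedding_dict['description'].items()}
print(pooled['DB00001'].shape)  # (8,)
```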

--- ddi_fw-0.0.214/src/ddi_fw/datasets/ddi_mdl_text/base.py
+++ ddi_fw-0.0.215/src/ddi_fw/datasets/ddi_mdl_text/base.py
@@ -90,8 +90,8 @@ class DDIMDLDatasetV2(BaseDataset):
         db = HERE.joinpath('data/event.db')
         conn = create_connection(db)
         print("db prep")
-        self.drugs_df = self.
-        self.ddis_df = self.
+        self.drugs_df = self.__select_all_drugs_as_dataframe(conn)
+        self.ddis_df = self.__select_all_events(conn)
         print("db bitti")
         self.index_path = kwargs.get('index_path')
 
@@ -121,7 +121,7 @@ class DDIMDLDatasetV2(BaseDataset):
         # print(self.ddis_df[key].head())
         # print("init finished")
 
-    def
+    def __select_all_drugs_as_dataframe(self, conn):
         headers = ['index', 'id', 'name',
                    'target', 'enzyme', 'pathway', 'smile']
         cur = conn.cursor()
@@ -137,7 +137,7 @@ class DDIMDLDatasetV2(BaseDataset):
 
         return df
 
-    def
+    def __select_all_events(self, conn):
         """
         Query all rows in the event table
         :param conn: the Connection object

--- ddi_fw-0.0.214/src/ddi_fw/datasets/mdf_sa_ddi/base.py
+++ ddi_fw-0.0.215/src/ddi_fw/datasets/mdf_sa_ddi/base.py
@@ -9,7 +9,6 @@ from ddi_fw.utils import ZipHelper
 from .. import BaseDataset
 from ddi_fw.langchain.embeddings import PoolingStrategy
 from ..db_utils import create_connection
-# from ..db_utils import create_connection, select_all_drugs_as_dataframe, select_events_with_category
 
 HERE = pathlib.Path(__file__).resolve().parent
 list_of_embedding_columns = ['all_text', 'description',

--- ddi_fw-0.0.214/src/ddi_fw/pipeline/multi_pipeline.py
+++ ddi_fw-0.0.215/src/ddi_fw/pipeline/multi_pipeline.py
@@ -120,6 +120,8 @@ class MultiPipeline():
         columns = config.get("columns")
         ner_data_file = config.get("ner_data_file")
         ner_threshold = config.get("ner_threshold")
+        ner_min_threshold_dict = config.get("ner_min_threshold_dict")
+        ner_max_threshold_dict = config.get("ner_max_threshold_dict")
         column_embedding_configs = config.get("column_embedding_configs")
         vector_db_persist_directory = config.get("vector_db_persist_directory")
         vector_db_collection_name = config.get("vector_db_collection_name")
@@ -170,10 +172,14 @@ class MultiPipeline():
                 experiment_tags=experiment_tags,
                 tracking_uri=tracking_uri,
                 dataset_type=dataset_type,
+                dataset_splitter_type=dataset_splitter_type,
                 umls_code_types = None,
                 text_types = None,
-
+                min_threshold_dict=ner_min_threshold_dict,
+                max_threshold_dict=ner_max_threshold_dict,
+                columns=columns,
                 ner_data_file=ner_data_file,
+                default_model=default_model,
                 multi_modal= multi_modal
             )
 
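
Together these two hunks thread per-column NER bounds from the experiment config through to the NER parameter search. A hedged config fragment exercising the new keys; only ner_data_file, ner_min_threshold_dict, and ner_max_threshold_dict are confirmed by this diff, and the remaining fields are assumptions about the schema:

```python
# Hypothetical experiment entry; everything except the two *_threshold_dict
# keys and ner_data_file is an assumed field name, not a documented format.
experiment = {
    "type": "ner_search",
    "experiment_name": "ner-threshold-sweep",
    "ner_data_file": "ner_output.csv",
    "ner_min_threshold_dict": {"tui": 0.0, "cui": 0.0, "entities": 0.0},
    "ner_max_threshold_dict": {"tui": 0.5, "cui": 0.5, "entities": 0.5},
}
```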

--- /dev/null
+++ ddi_fw-0.0.215/src/ddi_fw/pipeline/multi_pipeline_v2.py
@@ -0,0 +1,231 @@
+import json
+from typing import Optional
+from ddi_fw.pipeline.pipeline import Pipeline
+from ddi_fw.pipeline.ner_pipeline import NerParameterSearch
+import importlib
+
+
+def load_config(file_path):
+    with open(file_path, 'r') as file:
+        config = json.load(file)
+    return config
+
+
+def get_import(full_path_of_import):
+    """Dynamically imports an object from a module given its full path.
+
+    Args:
+        full_path_of_import (str): The full path of the import (e.g., 'module.submodule.ClassName').
+
+    Returns:
+        object: The imported object.
+
+    Raises:
+        ImportError: If the module cannot be imported.
+        AttributeError: If the attribute does not exist in the module.
+    """
+    if not full_path_of_import:
+        raise ValueError("The import path cannot be empty.")
+
+    parts = full_path_of_import.split('.')
+    import_name = parts[-1]
+    module_name = ".".join(parts[:-1]) if len(parts) > 1 else ""
+
+    try:
+        module = importlib.import_module(module_name)
+        return getattr(module, import_name)
+    except ModuleNotFoundError as e:
+        raise ImportError(f"Module '{module_name}' could not be found.") from e
+    except AttributeError as e:
+        raise AttributeError(
+            f"'{module_name}' has no attribute '{import_name}'") from e
+
+
+class MultiPipeline():
+    # def __init__(self, experiments_config_file, experiments_config):
+    #     if experiments_config_file is None and experiments_config is None:
+    #         raise ValueError("Either experiments_config_file or experiments_config must be provided.")
+    #     if experiments_config_file is not None and experiments_config is not None:
+    #         raise ValueError("Only one of experiments_config_file or experiments_config should be provided.")
+    #     if experiments_config_file is not None:
+    #         self.experiments_config = load_config(experiments_config_file)
+    #     else:
+    #         self.experiments_config = experiments_config
+    #     self.items = []
+    #     self.pipeline_resuts = dict()
+
+    def __init__(self, experiments_config_file: Optional[str] = None, experiments_config: Optional[dict] = None):
+        """
+        Initialize the MultiPipeline.
+
+        Args:
+            experiments_config_file (str, optional): Path to the experiments configuration file.
+            experiments_config (dict, optional): Dictionary containing the experiments configuration.
+
+        Raises:
+            ValueError: If neither or both of the parameters are provided.
+        """
+        self.experiments_config = self._validate_and_load_config(experiments_config_file, experiments_config)
+        self.items = []
+        # self.pipeline_results = {}
+        self.pipeline_resuts = dict()
+
+    def _validate_and_load_config(self, experiments_config_file: Optional[str], experiments_config: Optional[dict]) -> dict:
+        """
+        Validate and load the experiments configuration.
+
+        Args:
+            experiments_config_file (str, optional): Path to the experiments configuration file.
+            experiments_config (dict, optional): Dictionary containing the experiments configuration.
+
+        Returns:
+            dict: The loaded experiments configuration.
+
+        Raises:
+            ValueError: If neither or both of the parameters are provided.
+        """
+        if experiments_config_file is None and experiments_config is None:
+            raise ValueError("Either 'experiments_config_file' or 'experiments_config' must be provided.")
+        if experiments_config_file is not None and experiments_config is not None:
+            raise ValueError("Only one of 'experiments_config_file' or 'experiments_config' should be provided.")
+
+        if experiments_config_file is not None:
+            try:
+                config = load_config(experiments_config_file)
+            except FileNotFoundError:
+                raise FileNotFoundError(f"Configuration file '{experiments_config_file}' not found.")
+        else:
+            config = experiments_config
+        if config is None:
+            raise ValueError("Configuration cannot be None.")
+        if not isinstance(config, dict):
+            raise ValueError("Configuration must be a dictionary.")
+        # if "experiments" not in config:
+        #     raise ValueError("Configuration must contain 'experiments' key.")
+        return config
+
+    def __create_pipeline(self, config):
+        type = config.get("type")
+        library = config.get("library")
+        experiment_name = config.get("experiment_name")
+        experiment_description = config.get("experiment_description")
+        experiment_tags = config.get("experiment_tags")
+
+        # Tracking configuration
+        tracking_config = config.get("tracking_config", {})
+        tracking_library = tracking_config.get("library")
+        use_tracking = tracking_config.get("use_tracking", False)
+        tracking_params = tracking_config.get("params", {}).get(tracking_library, {})
+
+        # tracking_uri = config.get("tracking_uri")
+        # artifact_location = config.get("artifact_location")
+
+        # Dataset configuration
+        dataset_config = config.get("dataset", {})
+        dataset_type = get_import(dataset_config.get("dataset_type"))
+        dataset_splitter_type = get_import(dataset_config.get("dataset_splitter_type"))
+        columns = dataset_config.get("columns", [])
+        additional_config = dataset_config.get("additional_config", {})
+
+        # Vector database configuration
+        vector_database = config.get("vector_databases", {})
+        vector_db_persist_directory = None
+        vector_db_collection_name = None
+        embedding_pooling_strategy = None
+        if vector_database:
+            vector_db_persist_directory = vector_database.get("vector_db_persist_directory")
+            vector_db_collection_name = vector_database.get("vector_db_collection_name")
+            embedding_pooling_strategy = get_import(vector_database.get("embedding_pooling_strategy"))
+            column_embedding_configs = vector_database.get("column_embedding_configs")
+
+        # Combination strategy
+        combination_strategy_config = config.get("combination_strategy", {})
+        combination_type = get_import(combination_strategy_config.get("type")) if combination_strategy_config else None
+        kwargs_combination_params = combination_strategy_config.get("params", {})
+        combinations = combination_type(**kwargs_combination_params).generate() if combination_type else []
+
+        # Default model configuration
+        default_model_config = config.get("default_model", {})
+        default_model_type = get_import(default_model_config.get("model_type"))
+        default_model_params = default_model_config.get("params", {})
+
+        multi_modal = config.get("multi_modal")
+
+
+
+        #ner move it to related dataset
+
+        # ner_data_file = config.get("ner_data_file")
+        # ner_threshold = config.get("ner_threshold")
+
+
+        combination_type = None
+        kwargs_combination_params=None
+        if config.get("combination_strategy"):
+            combination_type = get_import(config.get("combination_strategy").get("type"))
+            kwargs_combination_params = config.get("combination_strategy").get("params")
+        combinations = []
+        if combination_type is not None:
+            combinations = combination_type(**kwargs_combination_params).generate()
+
+
+        pipeline = None
+        if type == "general":
+            pipeline = Pipeline(
+                library=library,
+                use_mlflow=use_mlflow,
+                experiment_name=experiment_name,
+                experiment_description=experiment_description,
+                experiment_tags=experiment_tags,
+                artifact_location=artifact_location,
+                tracking_uri=tracking_uri,
+                dataset_type=dataset_type,
+                dataset_splitter_type=dataset_splitter_type,
+                columns=columns,
+                column_embedding_configs=column_embedding_configs,
+                vector_db_persist_directory=vector_db_persist_directory,
+                vector_db_collection_name=vector_db_collection_name,
+                embedding_pooling_strategy_type=embedding_pooling_strategy,
+                ner_data_file=ner_data_file,
+                ner_threshold=ner_threshold,
+                combinations=combinations,
+                default_model=default_model,
+                multi_modal= multi_modal)
+        elif type== "ner_search":
+            pipeline = NerParameterSearch(
+                library=library,
+                experiment_name=experiment_name,
+                experiment_description=experiment_description,
+                experiment_tags=experiment_tags,
+                tracking_uri=tracking_uri,
+                dataset_type=dataset_type,
+                umls_code_types = None,
+                text_types = None,
+                columns=['tui', 'cui', 'entities'],
+                ner_data_file=ner_data_file,
+                multi_modal= multi_modal
+            )
+
+
+        return {
+            "name": experiment_name,
+            "library": library,
+            "pipeline": pipeline}
+
+    def build(self):
+        for config in self.experiments_config['experiments']:
+            item = self.__create_pipeline(config)
+            self.items.append(item)
+        return self
+
+    def run(self):
+        for item in self.items:
+            print(f"{item['name']} is running")
+            pipeline = item['pipeline']
+            pipeline.build()
+            result = pipeline.run()
+            self.pipeline_resuts[item['name']] = result
+        return self
+
+    def results(self):
+        return self.pipeline_resuts
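
The one fully self-contained piece of the new module is get_import; as shipped, the __create_pipeline body still references names (use_mlflow, artifact_location, tracking_uri, ner_data_file, ner_threshold, default_model) that are only assigned in comments, so the sketch below exercises only the helper. The dotted path is just an illustration:

```python
# Hedged usage sketch of the dynamic-import helper added in this release.
from ddi_fw.pipeline.multi_pipeline_v2 import get_import

loads = get_import("json.loads")  # module "json", attribute "loads"
print(loads('{"ok": true}'))      # {'ok': True}

try:
    get_import("json.nope")
except AttributeError as e:
    print(e)                      # 'json' has no attribute 'nope'
```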