PyPI - aiagents4pharma - Versions diffs - 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl - Mend

aiagents4pharma 1.44.0py3-none-any.whl → 1.45.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (289) hide show

aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py CHANGED Viewed

@@ -3,7 +3,7 @@ Tool for performing multimodal subgraph extraction.
 """
 import logging
-from typing import Annotated, Type
+from typing import Annotated
 import hydra
 import pandas as pd
@@ -39,14 +39,10 @@ class MultimodalSubgraphExtractionInput(BaseModel):
         arg_data: Argument for analytical process over graph data.
     """
-    tool_call_id: Annotated[str, InjectedToolCallId] = Field(
-        description="Tool call ID."
-    )
+    tool_call_id: Annotated[str, InjectedToolCallId] = Field(description="Tool call ID.")
     state: Annotated[dict, InjectedState] = Field(description="Injected state.")
     prompt: str = Field(description="Prompt to interact with the backend.")
-    arg_data: ArgumentData = Field(
-        description="Experiment over graph data.", default=None
-    )
+    arg_data: ArgumentData = Field(description="Experiment over graph data.", default=None)
 class MultimodalSubgraphExtractionTool(BaseTool):
@@ -57,18 +53,19 @@ class MultimodalSubgraphExtractionTool(BaseTool):
     name: str = "subgraph_extraction"
     description: str = "A tool for subgraph extraction based on user's prompt."
-    args_schema: Type[BaseModel] = MultimodalSubgraphExtractionInput
+    args_schema: type[BaseModel] = MultimodalSubgraphExtractionInput
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         # Initialize hardware detection and dynamic library loading
-        object.__setattr__(self, 'detector', SystemDetector())
-        object.__setattr__(self, 'loader', DynamicLibraryLoader(self.detector))
-        logger.info("MultimodalSubgraphExtractionTool initialized with %s mode",
-                   "GPU" if self.loader.use_gpu else "CPU")
+        object.__setattr__(self, "detector", SystemDetector())
+        object.__setattr__(self, "loader", DynamicLibraryLoader(self.detector))
+        logger.info(
+            "MultimodalSubgraphExtractionTool initialized with %s mode",
+            "GPU" if self.loader.use_gpu else "CPU",
+        )
-    def _read_multimodal_files(self,
-                               state: Annotated[dict, InjectedState]):
+    def _read_multimodal_files(self, state: Annotated[dict, InjectedState]):
         """
         Read the uploaded multimodal files and return a DataFrame.
@@ -86,8 +83,9 @@ class MultimodalSubgraphExtractionTool(BaseTool):
             # Check if multimodal file is uploaded
             if state["uploaded_files"][i]["file_type"] == "multimodal":
                 # Read the Excel file
-                multimodal_df = pd.read_excel(state["uploaded_files"][i]["file_path"],
-                                                sheet_name=None)
+                multimodal_df = pd.read_excel(
+                    state["uploaded_files"][i]["file_path"], sheet_name=None
+                )
         # Check if the multimodal_df is empty
         logger.log(logging.INFO, "Checking if multimodal_df is empty")
@@ -98,11 +96,12 @@ class MultimodalSubgraphExtractionTool(BaseTool):
             multimodal_df = pd.concat(multimodal_df).reset_index()
             multimodal_df = self.loader.df.DataFrame(multimodal_df)
             multimodal_df.drop(columns=["level_1"], inplace=True)
-            multimodal_df.rename(columns={"level_0": "q_node_type",
-                                        "name": "q_node_name"}, inplace=True)
+            multimodal_df.rename(
+                columns={"level_0": "q_node_type", "name": "q_node_name"}, inplace=True
+            )
             # Since an excel sheet name could not contain a `/`,
             # but the node type can be 'gene/protein' as exists in the PrimeKG
-            multimodal_df["q_node_type"] = multimodal_df["q_node_type"].str.replace('-', '_')
+            multimodal_df["q_node_type"] = multimodal_df["q_node_type"].str.replace("-", "_")
         return multimodal_df
@@ -115,30 +114,28 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         collection.load()
         # Query the collection with node names from multimodal_df
-        node_names_series = node_type_df['q_node_name']
-        q_node_names = getattr(node_names_series,
-                               "to_pandas",
-                               lambda series=node_names_series: series)().tolist()
-        q_columns = ["node_id", "node_name", "node_type",
-                     "feat", "feat_emb", "desc", "desc_emb"]
+        node_names_series = node_type_df["q_node_name"]
+        q_node_names = getattr(
+            node_names_series, "to_pandas", lambda series=node_names_series: series
+        )().tolist()
+        q_columns = ["node_id", "node_name", "node_type", "feat", "feat_emb", "desc", "desc_emb"]
         res = collection.query(
-            expr=f'node_name IN [{','.join(f'"{name}"' for name in q_node_names)}]',
+            expr=f"node_name IN [{','.join(f'"{name}"' for name in q_node_names)}]",
             output_fields=q_columns,
         )
         # Convert the embeedings into floats
         for r_ in res:
-            r_['feat_emb'] = [float(x) for x in r_['feat_emb']]
-            r_['desc_emb'] = [float(x) for x in r_['desc_emb']]
+            r_["feat_emb"] = [float(x) for x in r_["feat_emb"]]
+            r_["desc_emb"] = [float(x) for x in r_["desc_emb"]]
         # Convert the result to a DataFrame
         res_df = self.loader.df.DataFrame(res)[q_columns]
         res_df["use_description"] = False
         return res_df
-    def _prepare_query_modalities(self,
-                                  prompt: dict,
-                                  state: Annotated[dict, InjectedState],
-                                  cfg_db: dict):
+    def _prepare_query_modalities(
+        self, prompt: dict, state: Annotated[dict, InjectedState], cfg_db: dict
+    ):
         """
         Prepare the modality-specific query for subgraph extraction.
@@ -153,16 +150,18 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         # Initialize dataframes
         logger.log(logging.INFO, "Initializing dataframes")
         query_df = []
-        prompt_df = self.loader.df.DataFrame({
-            'node_id': 'user_prompt',
-            'node_name': 'User Prompt',
-            'node_type': 'prompt',
-            'feat': prompt["text"],
-            'feat_emb': prompt["emb"],
-            'desc': prompt["text"],
-            'desc_emb': prompt["emb"],
-            'use_description': True # set to True for user prompt embedding
-        })
+        prompt_df = self.loader.df.DataFrame(
+            {
+                "node_id": "user_prompt",
+                "node_name": "User Prompt",
+                "node_type": "prompt",
+                "feat": prompt["text"],
+                "feat_emb": prompt["emb"],
+                "desc": prompt["text"],
+                "desc_emb": prompt["emb"],
+                "use_description": True,  # set to True for user prompt embedding
+            }
+        )
         # Read multimodal files uploaded by the user
         multimodal_df = self._read_multimodal_files(state)
@@ -171,7 +170,10 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         logger.log(logging.INFO, "Prepare query modalities")
         if len(multimodal_df) > 0:
             # Query the Milvus database for each node type in multimodal_df
-            logger.log(logging.INFO, "Querying Milvus database for each node type in multimodal_df")
+            logger.log(
+                logging.INFO,
+                "Querying Milvus database for each node type in multimodal_df",
+            )
             for node_type, node_type_df in multimodal_df.groupby("q_node_type"):
                 print(f"Processing node type: {node_type}")
                 res_df = self._query_milvus_collection(node_type, node_type_df, cfg_db)
@@ -183,11 +185,12 @@ class MultimodalSubgraphExtractionTool(BaseTool):
             # Update the state by adding the the selected node IDs
             logger.log(logging.INFO, "Updating state with selected node IDs")
-            state["selections"] = getattr(query_df,
-                                          "to_pandas",
-                                          lambda: query_df)().groupby(
-                "node_type"
-            )["node_id"].apply(list).to_dict()
+            state["selections"] = (
+                getattr(query_df, "to_pandas", lambda: query_df)()
+                .groupby("node_type")["node_id"]
+                .apply(list)
+                .to_dict()
+            )
             # Append a user prompt to the query dataframe
             logger.log(logging.INFO, "Adding user prompt to query dataframe")
@@ -198,11 +201,13 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         return query_df
-    def _perform_subgraph_extraction(self,
-                                     state: Annotated[dict, InjectedState],
-                                     cfg: dict,
-                                     cfg_db: dict,
-                                     query_df: pd.DataFrame) -> dict:
+    def _perform_subgraph_extraction(
+        self,
+        state: Annotated[dict, InjectedState],
+        cfg: dict,
+        cfg_db: dict,
+        query_df: pd.DataFrame,
+    ) -> dict:
         """
         Perform multimodal subgraph extraction based on modal-specific embeddings.
@@ -217,10 +222,7 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         """
         # Initialize the subgraph dictionary
         subgraphs = []
-        unified_subgraph = {
-            "nodes": [],
-            "edges": []
-        }
+        unified_subgraph = {"nodes": [], "edges": []}
         # subgraphs = {}
         # subgraphs["nodes"] = []
         # subgraphs["edges"] = []
@@ -228,22 +230,21 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         # Loop over query embeddings and modalities
         for q in getattr(query_df, "to_pandas", lambda: query_df)().iterrows():
             logger.log(logging.INFO, "===========================================")
-            logger.log(logging.INFO, "Processing query: %s", q[1]['node_name'])
+            logger.log(logging.INFO, "Processing query: %s", q[1]["node_name"])
             # Prepare the PCSTPruning object and extract the subgraph
             # Parameters were set in the configuration file obtained from Hydra
             # start = datetime.datetime.now()
             # Get dynamic metric type (overrides any config setting)
             # Get dynamic metric type (overrides any config setting)
-            has_vector_processing = hasattr(cfg, 'vector_processing')
+            has_vector_processing = hasattr(cfg, "vector_processing")
             if has_vector_processing:
-                dynamic_metrics_enabled = getattr(cfg.vector_processing, 'dynamic_metrics', True)
+                dynamic_metrics_enabled = getattr(cfg.vector_processing, "dynamic_metrics", True)
             else:
                 dynamic_metrics_enabled = False
             if has_vector_processing and dynamic_metrics_enabled:
                 dynamic_metric_type = self.loader.metric_type
             else:
-                dynamic_metric_type = getattr(cfg, 'search_metric_type',
-                                               self.loader.metric_type)
+                dynamic_metric_type = getattr(cfg, "search_metric_type", self.loader.metric_type)
             subgraph = MultimodalPCSTPruning(
                 topk=state["topk_nodes"],
@@ -254,20 +255,21 @@ class MultimodalSubgraphExtractionTool(BaseTool):
                 num_clusters=cfg.num_clusters,
                 pruning=cfg.pruning,
                 verbosity_level=cfg.verbosity_level,
-                use_description=q[1]['use_description'],
+                use_description=q[1]["use_description"],
                 metric_type=dynamic_metric_type,  # Use dynamic or config metric type
-                loader=self.loader  # Pass the loader instance
-            ).extract_subgraph(q[1]['desc_emb'],
-                               q[1]['feat_emb'],
-                               q[1]['node_type'],
-                               cfg_db)
+                loader=self.loader,  # Pass the loader instance
+            ).extract_subgraph(q[1]["desc_emb"], q[1]["feat_emb"], q[1]["node_type"], cfg_db)
             # Append the extracted subgraph to the dictionary
             unified_subgraph["nodes"].append(subgraph["nodes"].tolist())
             unified_subgraph["edges"].append(subgraph["edges"].tolist())
-            subgraphs.append((q[1]['node_name'],
-                              subgraph["nodes"].tolist(),
-                              subgraph["edges"].tolist()))
+            subgraphs.append(
+                (
+                    q[1]["node_name"],
+                    subgraph["nodes"].tolist(),
+                    subgraph["edges"].tolist(),
+                )
+            )
             # end = datetime.datetime.now()
             # logger.log(logging.INFO, "Subgraph extraction time: %s seconds",
@@ -284,10 +286,10 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         ).tolist()
         # Convert the unified subgraph and subgraphs to DataFrames
-        unified_subgraph = self.loader.df.DataFrame([("Unified Subgraph",
-                                                      unified_subgraph["nodes"],
-                                                      unified_subgraph["edges"])],
-                                                     columns=["name", "nodes", "edges"])
+        unified_subgraph = self.loader.df.DataFrame(
+            [("Unified Subgraph", unified_subgraph["nodes"], unified_subgraph["edges"])],
+            columns=["name", "nodes", "edges"],
+        )
         subgraphs = self.loader.df.DataFrame(subgraphs, columns=["name", "nodes", "edges"])
         # Concatenate both DataFrames
@@ -295,11 +297,9 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         return subgraphs
-    def _prepare_final_subgraph(self,
-                                state:Annotated[dict, InjectedState],
-                                subgraph: dict,
-                                cfg: dict,
-                                cfg_db) -> dict:
+    def _prepare_final_subgraph(
+        self, state: Annotated[dict, InjectedState], subgraph: dict, cfg: dict, cfg_db
+    ) -> dict:
         """
         Prepare the subgraph based on the extracted subgraph.
@@ -314,53 +314,62 @@ class MultimodalSubgraphExtractionTool(BaseTool):
             A dictionary containing the PyG graph, NetworkX graph, and textualized graph.
         """
         # Convert the dict to a DataFrame
-        node_colors = {n: cfg.node_colors_dict[k]
-                        for k, v in state["selections"].items() for n in v}
+        node_colors = {
+            n: cfg.node_colors_dict[k] for k, v in state["selections"].items() for n in v
+        }
         color_df = self.loader.df.DataFrame(list(node_colors.items()), columns=["node_id", "color"])
         # print(color_df)
         # Prepare the subgraph dictionary
-        graph_dict = {
-            "name": [],
-            "nodes": [],
-            "edges": [],
-            "text": ""
-        }
+        graph_dict = {"name": [], "nodes": [], "edges": [], "text": ""}
         for sub in getattr(subgraph, "to_pandas", lambda: subgraph)().itertuples(index=False):
             graph_nodes, graph_edges = self._process_subgraph_data(sub, cfg_db, color_df)
             # Prepare lists for visualization
             graph_dict["name"].append(sub.name)
-            graph_dict["nodes"].append([(
-                row.node_id,
-                {'hover': "Node Name : " + row.node_name + "\n" +\
-                    "Node Type : " + row.node_type + "\n" +
-                    "Desc : " + row.desc,
-                'click': '$hover',
-                'color': row.color})
-                for row in getattr(graph_nodes,
-                                   "to_pandas",
-                                   lambda graph_nodes=graph_nodes: graph_nodes)()
-                                   .itertuples(index=False)])
-            graph_dict["edges"].append([(
-                row.head_id,
-                row.tail_id,
-                {'label': tuple(row.edge_type)})
-                for row in getattr(graph_edges,
-                                   "to_pandas",
-                                   lambda graph_edges=graph_edges: graph_edges)()
-                                   .itertuples(index=False)])
+            graph_dict["nodes"].append(
+                [
+                    (
+                        row.node_id,
+                        {
+                            "hover": "Node Name : "
+                            + row.node_name
+                            + "\n"
+                            + "Node Type : "
+                            + row.node_type
+                            + "\n"
+                            + "Desc : "
+                            + row.desc,
+                            "click": "$hover",
+                            "color": row.color,
+                        },
+                    )
+                    for row in getattr(
+                        graph_nodes, "to_pandas", lambda graph_nodes=graph_nodes: graph_nodes
+                    )().itertuples(index=False)
+                ]
+            )
+            graph_dict["edges"].append(
+                [
+                    (row.head_id, row.tail_id, {"label": tuple(row.edge_type)})
+                    for row in getattr(
+                        graph_edges, "to_pandas", lambda graph_edges=graph_edges: graph_edges
+                    )().itertuples(index=False)
+                ]
+            )
             # Prepare the textualized subgraph
             if sub.name == "Unified Subgraph":
-                graph_nodes = graph_nodes[['node_id', 'desc']]
-                graph_nodes.rename(columns={'desc': 'node_attr'}, inplace=True)
-                graph_edges = graph_edges[['head_id', 'edge_type', 'tail_id']]
-                nodes_pandas = getattr(graph_nodes, "to_pandas",
-                                        lambda graph_nodes=graph_nodes: graph_nodes)()
+                graph_nodes = graph_nodes[["node_id", "desc"]]
+                graph_nodes.rename(columns={"desc": "node_attr"}, inplace=True)
+                graph_edges = graph_edges[["head_id", "edge_type", "tail_id"]]
+                nodes_pandas = getattr(
+                    graph_nodes, "to_pandas", lambda graph_nodes=graph_nodes: graph_nodes
+                )()
                 nodes_csv = nodes_pandas.to_csv(index=False)
-                edges_pandas = getattr(graph_edges, "to_pandas",
-                                        lambda graph_edges=graph_edges: graph_edges)()
+                edges_pandas = getattr(
+                    graph_edges, "to_pandas", lambda graph_edges=graph_edges: graph_edges
+                )()
                 edges_csv = edges_pandas.to_csv(index=False)
                 graph_dict["text"] = nodes_csv + "\n" + edges_csv
@@ -369,44 +378,43 @@ class MultimodalSubgraphExtractionTool(BaseTool):
     def _process_subgraph_data(self, sub, cfg_db, color_df):
         """Helper method to process individual subgraph data."""
         print(f"Processing subgraph: {sub.name}")
-        print('---')
+        print("---")
         print(sub.nodes)
-        print('---')
+        print("---")
         print(sub.edges)
-        print('---')
+        print("---")
         # Prepare graph dataframes - Nodes
         coll_name = f"{cfg_db.milvus_db.database_name}_nodes"
         node_coll = Collection(name=coll_name)
         node_coll.load()
         graph_nodes = node_coll.query(
-            expr=f'node_index IN [{",".join(f"{n}" for n in sub.nodes)}]',
-            output_fields=['node_id', 'node_name', 'node_type', 'desc']
+            expr=f"node_index IN [{','.join(f'{n}' for n in sub.nodes)}]",
+            output_fields=["node_id", "node_name", "node_type", "desc"],
         )
         graph_nodes = self.loader.df.DataFrame(graph_nodes)
-        graph_nodes.drop(columns=['node_index'], inplace=True)
+        graph_nodes.drop(columns=["node_index"], inplace=True)
         if not color_df.empty:
             graph_nodes = graph_nodes.merge(color_df, on="node_id", how="left")
         else:
-            graph_nodes["color"] = 'black'
-        graph_nodes['color'] = graph_nodes['color'].fillna('black')
+            graph_nodes["color"] = "black"
+        graph_nodes["color"] = graph_nodes["color"].fillna("black")
         # Edges
         coll_name = f"{cfg_db.milvus_db.database_name}_edges"
         edge_coll = Collection(name=coll_name)
         edge_coll.load()
         graph_edges = edge_coll.query(
-            expr=f'triplet_index IN [{",".join(f"{e}" for e in sub.edges)}]',
-            output_fields=['head_id', 'tail_id', 'edge_type']
+            expr=f"triplet_index IN [{','.join(f'{e}' for e in sub.edges)}]",
+            output_fields=["head_id", "tail_id", "edge_type"],
         )
         graph_edges = self.loader.df.DataFrame(graph_edges)
-        graph_edges.drop(columns=['triplet_index'], inplace=True)
-        graph_edges['edge_type'] = graph_edges['edge_type'].str.split('|')
+        graph_edges.drop(columns=["triplet_index"], inplace=True)
+        graph_edges["edge_type"] = graph_edges["edge_type"].str.split("|")
         return graph_nodes, graph_edges
-    def normalize_vector(self,
-                         v : list) -> list:
+    def normalize_vector(self, v: list) -> list:
         """
         Normalize a vector using appropriate library (CuPy for GPU, NumPy for CPU).
@@ -448,7 +456,8 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         # Load hydra configuration
         with hydra.initialize(version_base=None, config_path="../configs"):
             cfg = hydra.compose(
-                config_name="config", overrides=["tools/multimodal_subgraph_extraction=default"]
+                config_name="config",
+                overrides=["tools/multimodal_subgraph_extraction=default"],
             )
             cfg_db = cfg.app.frontend
             cfg = cfg.tools.multimodal_subgraph_extraction
@@ -468,10 +477,9 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         logger.log(logging.INFO, "_prepare_query_modalities")
         # start = datetime.datetime.now()
         query_df = self._prepare_query_modalities(
-            {"text": prompt,
-             "emb": [self.normalize_vector(
-                 state["embedding_model"].embed_query(prompt)
-                 )]
+            {
+                "text": prompt,
+                "emb": [self.normalize_vector(state["embedding_model"].embed_query(prompt))],
             },
             state,
             cfg_db,
@@ -483,10 +491,7 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         # Perform subgraph extraction
         logger.log(logging.INFO, "_perform_subgraph_extraction")
         # start = datetime.datetime.now()
-        subgraphs = self._perform_subgraph_extraction(state,
-                                                      cfg,
-                                                      cfg_db,
-                                                      query_df)
+        subgraphs = self._perform_subgraph_extraction(state, cfg, cfg_db, query_df)
         # end = datetime.datetime.now()
         # logger.log(logging.INFO, "_perform_subgraph_extraction time: %s seconds",
         #            (end - start).total_seconds())
@@ -495,10 +500,7 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         logger.log(logging.INFO, "_prepare_final_subgraph")
         logger.log(logging.INFO, "Subgraphs extracted: %s", len(subgraphs))
         # start = datetime.datetime.now()
-        final_subgraph = self._prepare_final_subgraph(state,
-                                                      subgraphs,
-                                                      cfg,
-                                                      cfg_db)
+        final_subgraph = self._prepare_final_subgraph(state, subgraphs, cfg, cfg_db)
         # end = datetime.datetime.now()
         # logger.log(logging.INFO, "_prepare_final_subgraph time: %s seconds",
         #            (end - start).total_seconds())
@@ -534,7 +536,8 @@ class MultimodalSubgraphExtractionTool(BaseTool):
         # Return the updated state of the tool
         return Command(
-            update=dic_updated_state_for_model | {
+            update=dic_updated_state_for_model
+            | {
                 # update the message history
                 "messages": [
                     ToolMessage(

aiagents4pharma 1.44.0__py3-none-any.whl → 1.45.1__py3-none-any.whl

aiagents4pharma 1.44.0py3-none-any.whl → 1.45.1py3-none-any.whl