PyPI - aiagents4pharma - Versions diffs - 0.0.0__py3-none-any.whl - Mend

aiagents4pharma 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (336) hide show

aiagents4pharma/talk2knowledgegraphs/states/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""
+This file is used to import all the states in the package.
+"""
+from . import state_talk2knowledgegraphs

aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""
+This is the state file for the Talk2KnowledgeGraphs agent.
+"""
+from typing import Annotated
+# import operator
+from langchain_core.embeddings.embeddings import Embeddings
+from langchain_core.language_models.chat_models import BaseChatModel
+from langgraph.prebuilt.chat_agent_executor import AgentState
+def add_data(data1: list[dict], data2: list[dict]) -> list[dict]:
+    """
+    A reducer function to merge two lists of named dictionaries.
+    Entries are matched on their "name" key: an entry of data2 replaces the
+    entry of the same name (keeping its position), any other entry is
+    appended at the end. Neither input list is modified.
+    Args:
+        data1: The current list of dictionaries; each needs a "name" key.
+        data2: The incoming list of dictionaries; each needs a "name" key.
+    Returns:
+        A new list holding the merged entries.
+    """
+    merged = list(data1)
+    idx_by_name = {data["name"]: idx for idx, data in enumerate(merged)}
+    for data in data2:
+        idx = idx_by_name.get(data["name"])
+        if idx is None:
+            # Remember the new position so that a later entry of data2 with
+            # the same name replaces this one instead of being duplicated.
+            idx_by_name[data["name"]] = len(merged)
+            merged.append(data)
+        else:
+            merged[idx] = data
+    return merged
+class Talk2KnowledgeGraphs(AgentState):
+    """
+    The state for the Talk2KnowledgeGraphs agent.
+    Extends AgentState with the models, user selections, uploaded files,
+    top-k settings and graph records declared below.
+    """
+    # Chat model carried in the state.
+    llm_model: BaseChatModel
+    # Embedding model carried in the state.
+    embedding_model: Embeddings
+    # NOTE(review): presumably maps node types to the selected items - confirm with callers.
+    selections: dict
+    # NOTE(review): presumably one record per uploaded file - confirm the record schema.
+    uploaded_files: list
+    # NOTE(review): presumably the number of top-ranked nodes/edges to keep - confirm.
+    topk_nodes: int
+    topk_edges: int
+    # Lists of dicts annotated with the add_data reducer, which merges entries by "name".
+    dic_source_graph: Annotated[list[dict], add_data]
+    dic_extracted_graph: Annotated[list[dict], add_data]

aiagents4pharma/talk2knowledgegraphs/tests/__init__.py ADDED Viewed

File without changes

aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py ADDED Viewed

@@ -0,0 +1,318 @@
+"""
+Test cases for agents/t2kg_agent.py
+"""
+from contextlib import ExitStack
+from unittest.mock import MagicMock, patch
+import pandas as pd
+import pytest
+from langchain_core.messages import HumanMessage, ToolMessage
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langgraph.types import Command
+from ..agents.t2kg_agent import get_app
+from ..tools.milvus_multimodal_subgraph_extraction import (
+    MultimodalSubgraphExtractionTool,
+)
+DATA_PATH = "aiagents4pharma/talk2knowledgegraphs/tests/files"
+@pytest.fixture(name="input_dict")
+def input_dict_fixture():
+    """
+    Build the initial state dictionary handed to the agent in the tests.
+    The models are left as None so that each test can plug in its own.
+    """
+    node_types = (
+        "gene/protein",
+        "molecular_function",
+        "cellular_component",
+        "biological_process",
+        "drug",
+        "disease",
+    )
+    # One empty selection list per node type
+    selections = {node_type: [] for node_type in node_types}
+    uploaded_file = dict(
+        file_name="adalimumab.pdf",
+        file_path=f"{DATA_PATH}/adalimumab.pdf",
+        file_type="drug_data",
+        uploaded_by="VPEUser",
+        uploaded_timestamp="2024-11-05 00:00:00",
+    )
+    source_graph = dict(
+        name="BioBridge",
+        kg_pyg_path=f"{DATA_PATH}/biobridge_multimodal_pyg_graph.pkl",
+        kg_text_path=f"{DATA_PATH}/biobridge_multimodal_text_graph.pkl",
+    )
+    return {
+        "llm_model": None,
+        "embedding_model": None,
+        "selections": selections,
+        "uploaded_files": [uploaded_file],
+        "topk_nodes": 3,
+        "topk_edges": 3,
+        "dic_source_graph": [source_graph],
+        "dic_extracted_graph": [],
+    }
+def mock_milvus_collection(name):
+    """
+    Stand-in for the Milvus Collection constructor used in the tests.
+    A nodes mock and an edges mock are configured, but neither is returned:
+    the function yields None whatever the collection name is.
+    """
+    # NOTE(review): both mocks are discarded when the function returns - confirm this is intended.
+    # name is intentionally unused in this simplified mock
+    del name
+    node_rows = [
+        {
+            "node_index": 0,
+            "node_id": "id1",
+            "node_name": "Adalimumab",
+            "node_type": "drug",
+            "feat": "featA",
+            "feat_emb": [0.1, 0.2, 0.3],
+            "desc": "descA",
+            "desc_emb": [0.1, 0.2, 0.3],
+        },
+        {
+            "node_index": 1,
+            "node_id": "id2",
+            "node_name": "TNF",
+            "node_type": "gene/protein",
+            "feat": "featB",
+            "feat_emb": [0.4, 0.5, 0.6],
+            "desc": "descB",
+            "desc_emb": [0.4, 0.5, 0.6],
+        },
+    ]
+    edge_rows = [
+        {
+            "triplet_index": 0,
+            "head_id": "id1",
+            "head_index": 0,
+            "tail_id": "id2",
+            "tail_index": 1,
+            "edge_type": "drug,acts_on,gene/protein",
+            "display_relation": "acts_on",
+            "feat": "featC",
+            "feat_emb": [0.7, 0.8, 0.9],
+        }
+    ]
+    # Configure query()/load() on each mock through constructor keywords
+    MagicMock(**{"query.return_value": node_rows, "load.return_value": None})
+    MagicMock(**{"query.return_value": edge_rows, "load.return_value": None})
+    # Default path in tests expects None for unknown collections
+    return None
+def _invoke_app_with_mocks(unique_id, input_dict):
+    """
+    Run the app with patched Milvus + tool stack and return (app, config, response).
+    Args:
+        unique_id: Identifier passed to get_app, used as the thread id and
+            embedded in the extraction name requested in the prompt.
+        input_dict: State dictionary written into the app before invoking it;
+            its "llm_model" entry is also passed to get_app.
+    Returns:
+        A tuple (app, config, response), where response is the result of
+        app.invoke on the prompt.
+    """
+    app = get_app(unique_id, llm_model=input_dict["llm_model"])
+    config = {"configurable": {"thread_id": unique_id}}
+    # Seed the thread's state with the test inputs before the run
+    app.update_state(config, input_dict)
+    prompt = (
+        "Adalimumab is a fully human monoclonal antibody (IgG1) that "
+        "specifically binds to tumor necrosis factor-alpha (TNF-α), a "
+        "pro-inflammatory cytokine.\n\n"
+        "I would like to get evidence from the knowledge graph about the "
+        "mechanism of actions related to Adalimumab in treating inflammatory "
+        "bowel disease (IBD). Please follow these steps:\n"
+        "- Extract a subgraph from the PrimeKG that contains information about "
+        "Adalimumab.\n- Summarize the extracted subgraph.\n"
+        "- Reason about the mechanism of action of Adalimumab in treating IBD.\n\n"
+        "Please set the extraction name for the extraction process as `subkg_"
+        f"{unique_id}`."
+    )
+    # One MagicMock per patched target; their return values are configured
+    # further down, once all patches are in place
+    mocks = {
+        "pcst": MagicMock(),
+        "connections": MagicMock(),
+        "compose": MagicMock(),
+        "connections_manager": MagicMock(),
+        "db": MagicMock(),
+        "conn_mgr": MagicMock(),
+    }
+    # ExitStack undoes every patch entered below when the with-block ends
+    with ExitStack() as stack:
+        # Collection as referenced by the subgraph extraction tool module
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.tools."
+                "milvus_multimodal_subgraph_extraction.Collection",
+                side_effect=mock_milvus_collection,
+            )
+        )
+        # PCST pruning class as referenced by the subgraph extraction tool module
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.tools."
+                "milvus_multimodal_subgraph_extraction.MultimodalPCSTPruning",
+                mocks["pcst"],
+            )
+        )
+        stack.enter_context(patch("pymilvus.connections", mocks["connections"]))
+        # hydra.initialize is replaced by a default mock; hydra.compose returns
+        # the configuration object assembled below
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.tools."
+                "milvus_multimodal_subgraph_extraction.hydra.initialize"
+            )
+        )
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.tools."
+                "milvus_multimodal_subgraph_extraction.hydra.compose",
+                mocks["compose"],
+            )
+        )
+        # connections, Collection and db as referenced by the connection manager module
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.utils.database."
+                "milvus_connection_manager.connections",
+                mocks["connections_manager"],
+            )
+        )
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.utils.database."
+                "milvus_connection_manager.Collection",
+                side_effect=mock_milvus_collection,
+            )
+        )
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.utils.database.milvus_connection_manager.db",
+                mocks["db"],
+            )
+        )
+        # Connection manager class as referenced by the subgraph extraction tool module
+        stack.enter_context(
+            patch(
+                "aiagents4pharma.talk2knowledgegraphs.tools."
+                "milvus_multimodal_subgraph_extraction.MilvusConnectionManager",
+                mocks["conn_mgr"],
+            )
+        )
+        def mock_tool_execution(tool_call_id, state, prompt, arg_data=None):
+            """
+            Replacement for MultimodalSubgraphExtractionTool._run: returns a
+            Command carrying a fixed extracted graph and a tool message.
+            """
+            del prompt, arg_data
+            mock_extracted_graph = {
+                "name": f"subkg_{unique_id}",
+                "tool_call_id": tool_call_id,
+                "graph_source": "BioBridge",
+                "topk_nodes": 3,
+                "topk_edges": 3,
+                "graph_dict": {
+                    "name": "extracted_subgraph",
+                    "nodes": ["Adalimumab", "TNF"],
+                    "edges": [("Adalimumab", "acts_on", "TNF")],
+                },
+                "graph_text": (
+                    "Adalimumab acts on TNF protein for treating inflammatory diseases."
+                ),
+                "graph_summary": None,
+            }
+            tool_message = ToolMessage(
+                content=(
+                    "Subgraph extraction completed successfully. "
+                    "Extracted subgraph containing Adalimumab and TNF interactions."
+                ),
+                tool_call_id=tool_call_id,
+                name="subgraph_extraction",
+            )
+            # Append the fixed graph to whatever the state already holds
+            return Command(
+                update={
+                    "messages": [tool_message],
+                    "dic_extracted_graph": state.get("dic_extracted_graph", [])
+                    + [mock_extracted_graph],
+                }
+            )
+        stack.enter_context(
+            patch.object(MultimodalSubgraphExtractionTool, "_run", side_effect=mock_tool_execution)
+        )
+        # set return values via the mocks dict
+        mocks["connections"].has_connection.return_value = True
+        mocks["connections_manager"].has_connection.return_value = True
+        mocks["db"].using_database.return_value = None
+        # Instances of the patched pruning class return node/edge index series
+        pcst_instance = MagicMock()
+        pcst_instance.extract_subgraph.return_value = {
+            "nodes": pd.Series([0, 1]),
+            "edges": pd.Series([0]),
+        }
+        mocks["pcst"].return_value = pcst_instance
+        # Configuration exposed as tools.multimodal_subgraph_extraction on the composed config
+        cfg = MagicMock()
+        for k, v in {
+            "cost_e": 1.0,
+            "c_const": 1.0,
+            "root": 0,
+            "num_clusters": 1,
+            "pruning": True,
+            "verbosity_level": 0,
+            "search_metric_type": "L2",
+        }.items():
+            setattr(cfg, k, v)
+        cfg.node_colors_dict = {"drug": "blue", "gene/protein": "red"}
+        mocks["compose"].return_value = MagicMock()
+        mocks["compose"].return_value.tools.multimodal_subgraph_extraction = cfg
+        mocks[
+            "compose"
+        ].return_value.tools.subgraph_summarization.prompt_subgraph_summarization = (
+            "Summarize the following subgraph: {textualized_subgraph}"
+        )
+        # Database settings exposed as utils.database.milvus on the composed config
+        db_cfg = MagicMock()
+        for k, v in {
+            "alias": "test_alias",
+            "host": "localhost",
+            "port": "19530",
+            "user": "root",
+            "password": "password",
+            "database_name": "test_db",
+        }.items():
+            setattr(db_cfg.milvus_db, k, v)
+        mocks["compose"].return_value.utils.database.milvus = db_cfg.milvus_db
+        # Instances of the patched connection manager report a working connection
+        conn = MagicMock()
+        conn.ensure_connection.return_value = True
+        conn.get_connection_info.return_value = {"database": "test_db", "connected": True}
+        conn.test_connection.return_value = True
+        mocks["conn_mgr"].return_value = conn
+        # Invoke inside the with-block so that every patch is still active
+        response = app.invoke({"messages": [HumanMessage(content=prompt)]}, config=config)
+    return app, config, response
+def test_t2kg_agent_openai_milvus_mock(input_dict):
+    """
+    Run the T2KG agent with OpenAI models against the mocked Milvus stack and
+    check the extracted graph stored in the state and the final answer.
+    Args:
+        input_dict: Input dictionary
+    """
+    input_dict.update(
+        llm_model=ChatOpenAI(model="gpt-4o-mini", temperature=0.0),
+        embedding_model=OpenAIEmbeddings(model="text-embedding-3-small"),
+    )
+    app, config, response = _invoke_app_with_mocks(12345, input_dict)
+    final_answer = response["messages"][-1].content
+    assert isinstance(final_answer, str)
+    # First extracted graph recorded in the state
+    extracted = app.get_state(config).values["dic_extracted_graph"][0]
+    assert isinstance(extracted, dict)
+    expected_fields = {
+        "name": "subkg_12345",
+        "graph_source": "BioBridge",
+        "topk_nodes": 3,
+        "topk_edges": 3,
+    }
+    for field, expected in expected_fields.items():
+        assert extracted[field] == expected
+    graph_dict = extracted["graph_dict"]
+    assert isinstance(graph_dict, dict)
+    for part in ("nodes", "edges"):
+        assert len(graph_dict[part]) > 0
+    for field in ("graph_text", "graph_summary"):
+        assert isinstance(extracted[field], str)
+    for keyword in ("Adalimumab", "TNF"):
+        assert keyword in final_answer
+    # The collection mock yields None for an unknown collection
+    assert mock_milvus_collection("unknown") is None

aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py ADDED Viewed

@@ -0,0 +1,248 @@
+"""
+Test cases for datasets/biobridge_primekg.py
+"""
+import os
+import shutil
+import pytest
+from ..datasets.biobridge_primekg import BioBridgePrimeKG
+# Local directories used by the tests in this module
+PRIMEKG_LOCAL_DIR = "../data/primekg_test/"
+LOCAL_DIR = "../data/biobridge_primekg_test/"
+# Remove the BioBridge data folder for testing if it exists. This runs when the
+# module is imported; the PrimeKG folder is not removed here.
+shutil.rmtree(LOCAL_DIR, ignore_errors=True)
+@pytest.fixture(name="biobridge_primekg")
+def biobridge_primekg_fixture():
+    """
+    Fixture providing a BioBridgePrimeKG instance bound to the test directories.
+    """
+    dataset = BioBridgePrimeKG(
+        primekg_dir=PRIMEKG_LOCAL_DIR,
+        local_dir=LOCAL_DIR,
+    )
+    return dataset
+def _check_biobridge_primekg(biobridge_primekg):
+    """
+    Load the BioBridge-PrimeKG data and check the files on disk and the loaded objects.
+    Shared by the download test and the load-existing test, which run exactly
+    the same checks.
+    Args:
+        biobridge_primekg: The BioBridgePrimeKG instance under test.
+    """
+    # Load BioBridge-PrimeKG data
+    biobridge_primekg.load_data()
+    primekg_nodes = biobridge_primekg.get_primekg().get_nodes()
+    primekg_edges = biobridge_primekg.get_primekg().get_edges()
+    biobridge_data_config = biobridge_primekg.get_data_config()
+    biobridge_emb_dict = biobridge_primekg.get_node_embeddings()
+    biobridge_triplets = biobridge_primekg.get_primekg_triplets()
+    biobridge_splits = biobridge_primekg.get_train_test_split()
+    biobridge_node_info = biobridge_primekg.get_node_info_dict()
+    # Check if the local directories exists
+    assert os.path.exists(biobridge_primekg.primekg_dir)
+    assert os.path.exists(biobridge_primekg.local_dir)
+    # Check if downloaded and processed files exist
+    # PrimeKG files
+    files = ["nodes.tab", "primekg_nodes.tsv.gz", "edges.csv", "primekg_edges.tsv.gz"]
+    for file in files:
+        path = f"{biobridge_primekg.primekg_dir}/{file}"
+        assert os.path.exists(path)
+    # BioBridge data config
+    assert os.path.exists(f"{biobridge_primekg.local_dir}/data_config.json")
+    # BioBridge embeddings
+    files = [
+        "protein.pkl",
+        "mf.pkl",
+        "cc.pkl",
+        "bp.pkl",
+        "drug.pkl",
+        "disease.pkl",
+        "embedding_dict.pkl",
+    ]
+    for file in files:
+        path = f"{biobridge_primekg.local_dir}/embeddings/{file}"
+        assert os.path.exists(path)
+    # BioBridge processed files
+    files = [
+        "protein.csv",
+        "mf.csv",
+        "cc.csv",
+        "bp.csv",
+        "drug.csv",
+        "disease.csv",
+        "triplet_full.tsv.gz",
+        "triplet_full_altered.tsv.gz",
+        "node_train.tsv.gz",
+        "triplet_train.tsv.gz",
+        "node_test.tsv.gz",
+        "triplet_test.tsv.gz",
+    ]
+    for file in files:
+        path = f"{biobridge_primekg.local_dir}/processed/{file}"
+        assert os.path.exists(path)
+    # Check processed PrimeKG dataframes
+    # Nodes
+    assert primekg_nodes is not None
+    assert len(primekg_nodes) > 0
+    assert primekg_nodes.shape[0] == 129375
+    # Edges
+    assert primekg_edges is not None
+    assert len(primekg_edges) > 0
+    assert primekg_edges.shape[0] == 8100498
+    # Check processed BioBridge data config
+    assert biobridge_data_config is not None
+    assert len(biobridge_data_config) > 0
+    assert len(biobridge_data_config["node_type"]) == 10
+    assert len(biobridge_data_config["relation_type"]) == 18
+    assert len(biobridge_data_config["emb_dim"]) == 6
+    # Check processed BioBridge embeddings
+    assert biobridge_emb_dict is not None
+    assert len(biobridge_emb_dict) > 0
+    assert len(biobridge_emb_dict) == 85466
+    # Check processed BioBridge triplets
+    assert biobridge_triplets is not None
+    assert len(biobridge_triplets) > 0
+    assert biobridge_triplets.shape[0] == 3904610
+    assert list(biobridge_splits.keys()) == ["train", "node_train", "test", "node_test"]
+    assert len(biobridge_splits["train"]) == 3510930
+    assert len(biobridge_splits["node_train"]) == 76486
+    assert len(biobridge_splits["test"]) == 393680
+    assert len(biobridge_splits["node_test"]) == 8495
+    # Check node info dictionary
+    assert list(biobridge_node_info.keys()) == [
+        "gene/protein",
+        "molecular_function",
+        "cellular_component",
+        "biological_process",
+        "drug",
+        "disease",
+    ]
+    assert len(biobridge_node_info["gene/protein"]) == 19162
+    assert len(biobridge_node_info["molecular_function"]) == 10966
+    assert len(biobridge_node_info["cellular_component"]) == 4013
+    assert len(biobridge_node_info["biological_process"]) == 27478
+    assert len(biobridge_node_info["drug"]) == 6948
+    assert len(biobridge_node_info["disease"]) == 44133
+def test_download_primekg(biobridge_primekg):
+    """
+    Test the loading method of the BioBridge-PrimeKG class by downloading data from repository.
+    """
+    _check_biobridge_primekg(biobridge_primekg)
+def test_load_existing_primekg(biobridge_primekg):
+    """
+    Test the loading method of the BioBridge-PrimeKG class by loading existing data in local.
+    """
+    # Runs the same checks as the download test on a fresh instance
+    _check_biobridge_primekg(biobridge_primekg)
+# def test_load_existing_primekg_with_negative_triplets(biobridge_primekg):
+#     """
+#     Test the loading method of the BioBridge-PrimeKG class by loading existing data in local.
+#     In addition, it builds negative triplets for training data.
+#     """
+#     # Load BioBridge-PrimeKG data
+#     # Using 1 negative sample per positive triplet
+#     biobridge_primekg.load_data(build_neg_triplest=True, n_neg_samples=1)
+#     biobridge_neg_triplets = biobridge_primekg.get_primekg_triplets_negative()
+#     # Check if the local directories exists
+#     assert os.path.exists(biobridge_primekg.primekg_dir)
+#     assert os.path.exists(biobridge_primekg.local_dir)
+#     # Check if downloaded and processed files exist
+#     path = f"{biobridge_primekg.local_dir}/processed/triplet_train_negative.tsv.gz"
+#     assert os.path.exists(path)
+#     # Check processed BioBridge triplets
+#     assert biobridge_neg_triplets is not None
+#     assert len(biobridge_neg_triplets) > 0
+#     assert biobridge_neg_triplets.shape[0] == 3510930
+#     assert len(biobridge_neg_triplets.negative_tail_index[0]) == 1

aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""
+Test cases for datasets/dataset.py
+"""
+from ..datasets.dataset import Dataset
+class MockDataset(Dataset):
+    """
+    Dataset subclass for the tests whose setup and load_data do nothing.
+    """
+    def setup(self):
+        return None
+    def load_data(self):
+        return None
+def test_dataset_setup():
+    """
+    Check that setup() on the mock dataset runs and returns None.
+    """
+    result = MockDataset().setup()
+    assert result is None
+def test_dataset_load_data():
+    """
+    Check that load_data() on the mock dataset runs and returns None.
+    """
+    result = MockDataset().load_data()
+    assert result is None