PyPI - aiagents4pharma - Versions diffs - 0.0.0__py3-none-any.whl - Mend

aiagents4pharma 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (336) hide show

aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+Class for loading PrimeKG dataset.
+"""
+import os
+import pandas as pd
+import requests
+from tqdm import tqdm
+from .dataset import Dataset
+class PrimeKG(Dataset):
+    """
+    Loader for the PrimeKG knowledge graph.
+    The raw node and edge tables are downloaded from the Harvard Dataverse into
+    ``local_dir`` the first time they are needed, reduced to the columns used here,
+    and cached in ``local_dir`` as gzip-compressed TSV files. Later calls read the
+    cached files instead of downloading again.
+    """
+    # Suffixes of the ``node_*`` columns kept from the raw node table. The same
+    # suffixes are attached to every edge under the ``head_`` and ``tail_`` prefixes.
+    _NODE_FIELDS = ("index", "name", "source", "id", "type")
+    def __init__(self, local_dir: str = "../../../data/primekg/"):
+        """
+        Constructor for PrimeKG class.
+        Args:
+            local_dir (str): The local directory where the data will be stored.
+        """
+        self.name: str = "primekg"
+        self.server_path: str = "https://dataverse.harvard.edu/api/access/datafile/"
+        self.file_ids: dict = {"nodes": 6180617, "edges": 6180616}
+        self.local_dir: str = local_dir
+        # Attributes to store the data (None until load_data() has been called)
+        self.nodes: pd.DataFrame = None
+        self.edges: pd.DataFrame = None
+        # Set up the dataset
+        self.setup()
+    def setup(self):
+        """
+        Create the local data directory if it does not exist yet.
+        """
+        # Create ``local_dir`` itself. Going through os.path.dirname() creates only
+        # the parent when the path has no trailing separator, and fails outright
+        # on an empty path. An empty path means "current directory": nothing to do.
+        if self.local_dir:
+            os.makedirs(self.local_dir, exist_ok=True)
+    def _download_file(self, remote_url: str, local_path: str, chunk_size: int = 1024):
+        """
+        A helper function to download a file from remote URL to the local directory.
+        Args:
+            remote_url (str): The remote URL of the file to be downloaded.
+            local_path (str): The local path where the file will be saved.
+            chunk_size (int): Number of bytes requested per streamed chunk.
+        Raises:
+            requests.HTTPError: If the server answers with an error status.
+        """
+        # Context managers release the connection, the progress bar and the file
+        # handle even when the transfer is interrupted half-way.
+        with requests.get(remote_url, stream=True, timeout=300) as response:
+            response.raise_for_status()
+            # A missing content-length header is reported as 0
+            total_size = int(response.headers.get("content-length", 0))
+            with tqdm(total=total_size, unit="iB", unit_scale=True) as progress_bar:
+                with open(local_path, "wb") as file:
+                    for data in response.iter_content(chunk_size):
+                        progress_bar.update(len(data))
+                        file.write(data)
+    @staticmethod
+    def _write_cache(frame: pd.DataFrame, cache_file: str):
+        """
+        Store a dataframe as gzip-compressed TSV without exposing a partial file.
+        Args:
+            frame (pd.DataFrame): The dataframe to be stored.
+            cache_file (str): The final path of the cache file.
+        """
+        # Write next to the target and rename afterwards: an interrupted write must
+        # not leave a truncated file that later runs would treat as a valid cache.
+        partial_file = f"{cache_file}.tmp"
+        frame.to_csv(partial_file, index=False, sep="\t", compression="gzip")
+        os.replace(partial_file, cache_file)
+    @classmethod
+    def _attach_node_columns(
+        cls, edges: pd.DataFrame, nodes: pd.DataFrame, index_column: str, prefix: str
+    ) -> pd.DataFrame:
+        """
+        Replace an edge endpoint index column by the attributes of the matching node.
+        Args:
+            edges (pd.DataFrame): The edges dataframe holding ``index_column``.
+            nodes (pd.DataFrame): The nodes dataframe with the ``node_*`` columns.
+            index_column (str): The endpoint column to resolve ("x_index" or "y_index").
+            prefix (str): The prefix replacing ``node`` in the attached columns.
+        Returns:
+            The edges dataframe with ``{prefix}_*`` columns instead of ``index_column``.
+        """
+        # Default (inner) merge: edges whose endpoint is not in ``nodes`` are dropped
+        merged = edges.merge(nodes, left_on=index_column, right_on="node_index")
+        merged = merged.drop(columns=[index_column])
+        return merged.rename(
+            columns={f"node_{field}": f"{prefix}_{field}" for field in cls._NODE_FIELDS}
+        )
+    def _load_nodes(self) -> pd.DataFrame:
+        """
+        Private method to load the nodes dataframe of PrimeKG dataset.
+        The cached file in the local directory is used when present. Otherwise the
+        raw node file is downloaded from the Harvard Dataverse, reduced to the
+        ``node_*`` columns, and cached for later calls.
+        Returns:
+            The nodes dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_nodes.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+            return pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        print(f"Downloading node file from {self.server_path}{self.file_ids['nodes']}")
+        # Download the file from the Harvard Dataverse with designated file_id for node
+        raw_file = os.path.join(self.local_dir, "nodes.tab")
+        self._download_file(f"{self.server_path}{self.file_ids['nodes']}", raw_file)
+        # The raw node file is tab-separated
+        nodes = pd.read_csv(raw_file, sep="\t", low_memory=False)
+        # Keep only the columns used downstream, in a fixed order
+        nodes = nodes[[f"node_{field}" for field in self._NODE_FIELDS]]
+        # Store compressed dataframe in the local directory
+        self._write_cache(nodes, local_file)
+        return nodes
+    def _load_edges(self, nodes: pd.DataFrame) -> pd.DataFrame:
+        """
+        Private method to load the edges dataframe of PrimeKG dataset.
+        The cached file in the local directory is used when present. Otherwise the
+        raw edge file is downloaded from the Harvard Dataverse, both endpoints of
+        every edge are resolved against ``nodes``, and the result is cached.
+        Args:
+            nodes (pd.DataFrame): The nodes dataframe of PrimeKG dataset.
+        Returns:
+            The edges dataframe of PrimeKG dataset.
+        """
+        local_file = os.path.join(self.local_dir, f"{self.name}_edges.tsv.gz")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+            return pd.read_csv(local_file, sep="\t", compression="gzip", low_memory=False)
+        print(f"Downloading edge file from {self.server_path}{self.file_ids['edges']}")
+        # Download the file from the Harvard Dataverse with designated file_id for edge
+        raw_file = os.path.join(self.local_dir, "edges.csv")
+        self._download_file(f"{self.server_path}{self.file_ids['edges']}", raw_file)
+        # The raw edge file is comma-separated
+        edges = pd.read_csv(raw_file, sep=",", low_memory=False)
+        # x_index is the head endpoint and y_index the tail endpoint of an edge
+        edges = self._attach_node_columns(edges, nodes, "x_index", "head")
+        edges = self._attach_node_columns(edges, nodes, "y_index", "tail")
+        # Fixed column order: head attributes, tail attributes, then the relation
+        edges = edges[
+            [f"head_{field}" for field in self._NODE_FIELDS]
+            + [f"tail_{field}" for field in self._NODE_FIELDS]
+            + ["display_relation", "relation"]
+        ]
+        # Store compressed dataframe in the local directory
+        self._write_cache(edges, local_file)
+        return edges
+    def load_data(self):
+        """
+        Load the PrimeKG dataset into pandas DataFrame of nodes and edges.
+        """
+        print("Loading nodes of PrimeKG dataset ...")
+        self.nodes = self._load_nodes()
+        # Edges are resolved against the nodes, so the nodes must be loaded first
+        print("Loading edges of PrimeKG dataset ...")
+        self.edges = self._load_edges(self.nodes)
+    def get_nodes(self) -> pd.DataFrame:
+        """
+        Get the nodes dataframe of PrimeKG dataset.
+        Returns:
+            The nodes dataframe of PrimeKG dataset (None before load_data() is called).
+        """
+        return self.nodes
+    def get_edges(self) -> pd.DataFrame:
+        """
+        Get the edges dataframe of PrimeKG dataset.
+        Returns:
+            The edges dataframe of PrimeKG dataset (None before load_data() is called).
+        """
+        return self.edges

aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py ADDED Viewed

@@ -0,0 +1,210 @@
+"""
+Class for loading StarkQAPrimeKG dataset.
+"""
+import os
+import shutil
+import gdown
+import joblib
+import numpy as np
+import pandas as pd
+import torch
+from huggingface_hub import hf_hub_download, list_repo_files
+from tqdm import tqdm
+from .dataset import Dataset
+class StarkQAPrimeKG(Dataset):
+    """
+    Class for loading StarkQAPrimeKG dataset.
+    It downloads the data from the HuggingFace repo and stores it in the local directory.
+    The data is then loaded into pandas DataFrame of QA pairs, dictionary of split indices,
+    and node information. Precomputed query and node embeddings are downloaded from
+    Google Drive and loaded as well.
+    """
+    def __init__(self, local_dir: str = "../../../data/starkqa_primekg/"):
+        """
+        Constructor for StarkQAPrimeKG class.
+        Args:
+            local_dir (str): The local directory to store the dataset files.
+        """
+        self.name: str = "starkqa_primekg"
+        self.hf_repo_id: str = "snap-stanford/stark"
+        self.local_dir: str = local_dir
+        # Attributes to store the data (None until load_data() has been called)
+        self.starkqa: pd.DataFrame = None
+        self.starkqa_split_idx: dict = None
+        self.starkqa_node_info: dict = None
+        self.query_emb_dict: dict = None
+        self.node_emb_dict: dict = None
+        # Set up the dataset
+        self.setup()
+    def setup(self):
+        """
+        Create the local data directory if it does not exist yet.
+        """
+        # Create ``local_dir`` itself. Going through os.path.dirname() creates only
+        # the parent when the path has no trailing separator, and fails outright
+        # on an empty path. An empty path means "current directory": nothing to do.
+        if self.local_dir:
+            os.makedirs(self.local_dir, exist_ok=True)
+    def _load_stark_repo(self) -> tuple[pd.DataFrame, dict, dict]:
+        """
+        Private method to load related files of StarkQAPrimeKG dataset.
+        The files are downloaded from the HuggingFace Hub unless the QA file is
+        already present in the local directory.
+        Returns:
+            The QA dataframe of StarkQAPrimeKG dataset.
+            The split indices of StarkQAPrimeKG dataset: for each split, an array of
+            positions into the ascending list of QA ids.
+            The node information of StarkQAPrimeKG dataset.
+        Raises:
+            ValueError: If a split file names a query id missing from the QA file.
+        """
+        # The QA file doubles as the marker for "repository already downloaded"
+        local_file = os.path.join(self.local_dir, "qa/prime/stark_qa/stark_qa.csv")
+        if os.path.exists(local_file):
+            print(f"{local_file} already exists. Loading the data from the local directory.")
+        else:
+            print(f"Downloading files from {self.hf_repo_id}")
+            # Keep the PrimeKG-related files of the repository, skipping raw data
+            files = [
+                f
+                for f in list_repo_files(self.hf_repo_id, repo_type="dataset")
+                if f.startswith(("qa/prime/", "skb/prime/")) and "raw" not in f
+            ]
+            # Download and save each file in the specified folder
+            for file in tqdm(files):
+                hf_hub_download(
+                    self.hf_repo_id, file, repo_type="dataset", local_dir=self.local_dir
+                )
+            # Unzip the processed files
+            shutil.unpack_archive(
+                os.path.join(self.local_dir, "skb/prime/processed.zip"),
+                os.path.join(self.local_dir, "skb/prime/"),
+            )
+        # Load StarkQA dataframe
+        starkqa = pd.read_csv(local_file, low_memory=False)
+        # Map every QA id to its position in the ascending id list. A one-off lookup
+        # table replaces a linear list.index() scan per query id; setdefault keeps
+        # the first position of a repeated id, exactly as list.index() would.
+        position_of = {}
+        for position, qa_id in enumerate(sorted(starkqa["id"].tolist())):
+            position_of.setdefault(qa_id, position)
+        # Read split indices
+        starkqa_split_idx = {}
+        for split in ["train", "val", "test", "test-0.1"]:
+            indices_file = os.path.join(self.local_dir, "qa/prime/split", f"{split}.index")
+            with open(indices_file, encoding="utf-8") as f:
+                # split() tolerates an empty file, blank lines and CRLF line endings
+                query_ids = [int(idx) for idx in f.read().split()]
+            try:
+                positions = [position_of[query_id] for query_id in query_ids]
+            except KeyError as err:
+                raise ValueError(
+                    f"Query id {err.args[0]} of {indices_file} is not in {local_file}"
+                ) from err
+            # Explicit integer dtype so that an empty split is still an index array
+            starkqa_split_idx[split] = np.array(positions, dtype=int)
+        # Load the node info of PrimeKG preprocessed for StarkQA
+        # NOTE(review): joblib.load unpickles the file and can therefore run arbitrary
+        # code; confirm the upstream repository is trusted.
+        starkqa_node_info = joblib.load(
+            os.path.join(self.local_dir, "skb/prime/processed/node_info.pkl")
+        )
+        return starkqa, starkqa_split_idx, starkqa_node_info
+    def _load_stark_embeddings(self) -> tuple[dict, dict]:
+        """
+        Private method to load the embeddings of StarkQAPrimeKG dataset.
+        Each embedding file is downloaded only if it is missing locally.
+        Returns:
+            The query embeddings of StarkQAPrimeKG dataset.
+            The node embeddings of StarkQAPrimeKG dataset.
+        """
+        # Load the provided embeddings of query and nodes
+        # Note that they utilized 'text-embedding-ada-002' for embeddings
+        emb_model = "text-embedding-ada-002"
+        query_emb_url = "https://drive.google.com/uc?id=1MshwJttPZsHEM2cKA5T13SIrsLeBEdyU"
+        node_emb_url = "https://drive.google.com/uc?id=16EJvCMbgkVrQ0BuIBvLBp-BYPaye-Edy"
+        # Prepare respective directories to store the embeddings
+        emb_dir = os.path.join(self.local_dir, emb_model)
+        query_emb_dir = os.path.join(emb_dir, "query")
+        node_emb_dir = os.path.join(emb_dir, "doc")
+        os.makedirs(query_emb_dir, exist_ok=True)
+        os.makedirs(node_emb_dir, exist_ok=True)
+        query_emb_path = os.path.join(query_emb_dir, "query_emb_dict.pt")
+        node_emb_path = os.path.join(node_emb_dir, "candidate_emb_dict.pt")
+        # Fetch only what is missing, so that an existing file is not downloaded
+        # again just because the other one is absent
+        for emb_url, emb_path in (
+            (query_emb_url, query_emb_path),
+            (node_emb_url, node_emb_path),
+        ):
+            if not os.path.exists(emb_path):
+                gdown.download(emb_url, emb_path, quiet=False)
+        # Load the embeddings
+        # NOTE(review): torch.load is pickle-based; confirm these files are trusted.
+        query_emb_dict = torch.load(query_emb_path)
+        node_emb_dict = torch.load(node_emb_path)
+        return query_emb_dict, node_emb_dict
+    def load_data(self):
+        """
+        Load the StarkQAPrimeKG dataset into pandas DataFrame of QA pairs,
+        dictionary of split indices, and node information, followed by the
+        query and node embeddings.
+        """
+        print("Loading StarkQAPrimeKG dataset...")
+        self.starkqa, self.starkqa_split_idx, self.starkqa_node_info = self._load_stark_repo()
+        print("Loading StarkQAPrimeKG embeddings...")
+        self.query_emb_dict, self.node_emb_dict = self._load_stark_embeddings()
+    def get_starkqa(self) -> pd.DataFrame:
+        """
+        Get the dataframe of StarkQAPrimeKG dataset, containing the QA pairs.
+        Returns:
+            The QA dataframe of StarkQAPrimeKG dataset (None before load_data() is called).
+        """
+        return self.starkqa
+    def get_starkqa_split_indices(self) -> dict:
+        """
+        Get the split indices of StarkQAPrimeKG dataset.
+        Returns:
+            The split indices of StarkQAPrimeKG dataset (None before load_data() is called).
+        """
+        return self.starkqa_split_idx
+    def get_starkqa_split_indicies(self) -> dict:
+        """
+        Get the split indices of StarkQAPrimeKG dataset.
+        The misspelled name is kept for existing callers; it returns the same
+        object as get_starkqa_split_indices().
+        Returns:
+            The split indices of StarkQAPrimeKG dataset.
+        """
+        return self.get_starkqa_split_indices()
+    def get_starkqa_node_info(self) -> dict:
+        """
+        Get the node information of StarkQAPrimeKG dataset.
+        Returns:
+            The node information of StarkQAPrimeKG dataset.
+        """
+        return self.starkqa_node_info
+    def get_query_embeddings(self) -> dict:
+        """
+        Get the query embeddings of StarkQAPrimeKG dataset.
+        Returns:
+            The query embeddings of StarkQAPrimeKG dataset.
+        """
+        return self.query_emb_dict
+    def get_node_embeddings(self) -> dict:
+        """
+        Get the node embeddings of StarkQAPrimeKG dataset.
+        Returns:
+            The node embeddings of StarkQAPrimeKG dataset.
+        """
+        return self.node_emb_dict

aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example ADDED Viewed

@@ -0,0 +1,23 @@
+# .env.example (DO NOT put actual API keys here, read the README.md)
+# Template for the `.env` file that docker-compose.yml in this directory loads
+# through `env_file`.
+# OPENAI API KEY
+OPENAI_API_KEY=your_openai_api_key_here
+# LangSmith API KEY
+LANGCHAIN_TRACING_V2=true
+LANGCHAIN_API_KEY=your_langchain_api_key_here
+# NVIDIA API KEY
+NVIDIA_API_KEY=your_nvidia_api_key_here
+# Set environment variables for data loader
+# MILVUS_HOST and MILVUS_PORT are also set under `environment` in docker-compose.yml,
+# and those entries take precedence over the values given here.
+MILVUS_HOST=milvus-standalone
+MILVUS_PORT=19530
+# NOTE(review): root/Milvus look like the stock Milvus credentials - confirm, and
+# change them for anything beyond a local test setup.
+MILVUS_USER=root
+MILVUS_PASSWORD=Milvus
+MILVUS_DATABASE=t2kg_primekg
+# Uncomment and set DATA_DIR to the absolute path of your own multimodal data
+# directory. docker-compose.yml mounts it read-only at /mnt/external_data and falls
+# back to ./default_data when DATA_DIR is unset.
+# DATA_DIR=/your_absolute_path_to_your_data_dir/
+BATCH_SIZE=500

aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml ADDED Viewed

@@ -0,0 +1,93 @@
+# CPU-only stack: the talk2knowledgegraphs app plus a standalone Milvus instance
+# with its etcd and MinIO dependencies.
+# The top-level `version` key was dropped: "1.0.0" is not a Compose file-format
+# version, and the key is obsolete under the Compose Specification.
+services:
+  # talk2knowledgegraphs with automatic data loading via entrypoint
+  talk2knowledgegraphs:
+    container_name: talk2knowledgegraphs
+    # NOTE(review): floating tag; pin a version or digest for reproducible deployments
+    image: vpatientengine/talk2knowledgegraphs:latest-cpu
+    platform: linux/amd64
+    ports:
+      - "8501:8501"
+    # Entries under `environment` take precedence over the same keys in `env_file`
+    environment:
+      - MILVUS_HOST=milvus-standalone
+      - MILVUS_PORT=19530
+    env_file:
+      - .env
+    volumes:
+      # Mount external data directory if DATA_DIR is specified in .env
+      - ${DATA_DIR:-./default_data}:/mnt/external_data:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8501/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 120s # Allow time for data loading
+    # Start only once Milvus reports healthy
+    depends_on:
+      milvus-standalone:
+        condition: service_healthy
+  # Milvus Dependencies
+  etcd:
+    container_name: milvus-etcd
+    image: quay.io/coreos/etcd:v3.5.18
+    environment:
+      - ETCD_AUTO_COMPACTION_MODE=revision
+      - ETCD_AUTO_COMPACTION_RETENTION=1000
+      - ETCD_QUOTA_BACKEND_BYTES=4294967296
+      - ETCD_SNAPSHOT_COUNT=50000
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
+    command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
+    healthcheck:
+      test: ["CMD", "etcdctl", "endpoint", "health"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+  minio:
+    container_name: milvus-minio
+    image: minio/minio:RELEASE.2024-05-28T17-19-04Z
+    # NOTE(review): these keys and the published ports below expose MinIO on the
+    # host with well-known credentials; confirm this is limited to local use.
+    environment:
+      MINIO_ACCESS_KEY: minioadmin
+      MINIO_SECRET_KEY: minioadmin
+    ports:
+      - "9001:9001"
+      - "9000:9000"
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
+    command: minio server /minio_data --console-address ":9001"
+    # NOTE(review): confirm `curl` is available inside this image; otherwise the
+    # check below can never pass.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+  # Milvus Vector Database (CPU-only)
+  milvus-standalone:
+    container_name: milvus-standalone
+    image: milvusdb/milvus:v2.5.14
+    command: ["milvus", "run", "standalone"]
+    security_opt:
+      - seccomp:unconfined
+    environment:
+      MINIO_REGION: us-east-1
+      ETCD_ENDPOINTS: etcd:2379
+      MINIO_ADDRESS: minio:9000
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
+    ports:
+      - "19530:19530"
+      - "9091:9091"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+      interval: 30s
+      start_period: 90s
+      timeout: 20s
+      retries: 3
+    # Start order only; this short form does not wait for etcd/minio to be healthy
+    depends_on:
+      - "etcd"
+      - "minio"
+# No service declares `networks`, so a custom network entry would stay unused.
+# Naming the default network puts every service above on the network "milvus".
+networks:
+  default:
+    name: milvus

aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example ADDED Viewed

@@ -0,0 +1,23 @@
+# .env.example (DO NOT put actual API keys here, read the README.md)
+# Template for the `.env` file that docker-compose.yml in this directory loads
+# through `env_file`.
+# OPENAI API KEY
+OPENAI_API_KEY=your_openai_api_key_here
+# LangSmith API KEY
+LANGCHAIN_TRACING_V2=true
+LANGCHAIN_API_KEY=your_langchain_api_key_here
+# NVIDIA API KEY
+NVIDIA_API_KEY=your_nvidia_api_key_here
+# Set environment variables for data loader
+# MILVUS_HOST and MILVUS_PORT are also set under `environment` in docker-compose.yml,
+# and those entries take precedence over the values given here.
+MILVUS_HOST=milvus-standalone
+MILVUS_PORT=19530
+# NOTE(review): root/Milvus look like the stock Milvus credentials - confirm, and
+# change them for anything beyond a local test setup.
+MILVUS_USER=root
+MILVUS_PASSWORD=Milvus
+MILVUS_DATABASE=t2kg_primekg
+# Uncomment and set DATA_DIR to the absolute path of your own multimodal data
+# directory. docker-compose.yml mounts it read-only at /mnt/external_data and falls
+# back to ./default_data when DATA_DIR is unset.
+# DATA_DIR=/your_absolute_path_to_your_data_dir/
+BATCH_SIZE=500

aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml ADDED Viewed

@@ -0,0 +1,108 @@
+# GPU stack: the talk2knowledgegraphs app plus a GPU-enabled standalone Milvus
+# instance with its etcd and MinIO dependencies.
+# The top-level `version` key was dropped: "1.0.0" is not a Compose file-format
+# version, and the key is obsolete under the Compose Specification.
+services:
+  # talk2knowledgegraphs with automatic data loading via entrypoint
+  talk2knowledgegraphs:
+    container_name: talk2knowledgegraphs
+    # NOTE(review): floating tag; pin a version or digest for reproducible deployments
+    image: vpatientengine/talk2knowledgegraphs:latest-gpu
+    platform: linux/amd64
+    ports:
+      - "8501:8501"
+    # Reserve NVIDIA GPU "0" for this container
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: ["gpu"]
+              device_ids: ["0"]
+    # Entries under `environment` take precedence over the same keys in `env_file`
+    environment:
+      - MILVUS_HOST=milvus-standalone
+      - MILVUS_PORT=19530
+    env_file:
+      - .env
+    volumes:
+      # Mount external data directory if DATA_DIR is specified in .env
+      - ${DATA_DIR:-./default_data}:/mnt/external_data:ro
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8501/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 120s # Allow time for data loading
+    # Start only once Milvus reports healthy
+    depends_on:
+      milvus-standalone:
+        condition: service_healthy
+  # Milvus Dependencies
+  etcd:
+    container_name: milvus-etcd
+    image: quay.io/coreos/etcd:v3.5.18
+    environment:
+      - ETCD_AUTO_COMPACTION_MODE=revision
+      - ETCD_AUTO_COMPACTION_RETENTION=1000
+      - ETCD_QUOTA_BACKEND_BYTES=4294967296
+      - ETCD_SNAPSHOT_COUNT=50000
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
+    command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
+    healthcheck:
+      test: ["CMD", "etcdctl", "endpoint", "health"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+  minio:
+    container_name: milvus-minio
+    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
+    # NOTE(review): these keys and the published ports below expose MinIO on the
+    # host with well-known credentials; confirm this is limited to local use.
+    environment:
+      MINIO_ACCESS_KEY: minioadmin
+      MINIO_SECRET_KEY: minioadmin
+    ports:
+      - "9001:9001"
+      - "9000:9000"
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
+    command: minio server /minio_data --console-address ":9001"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+  # Milvus Vector Database (GPU-enabled)
+  milvus-standalone:
+    container_name: milvus-standalone
+    # NOTE(review): release-candidate image; confirm it is the intended version
+    image: milvusdb/milvus:v2.6.0-rc1-gpu
+    command: ["milvus", "run", "standalone"]
+    security_opt:
+      - seccomp:unconfined
+    environment:
+      MINIO_REGION: us-east-1
+      ETCD_ENDPOINTS: etcd:2379
+      MINIO_ADDRESS: minio:9000
+      MQ_TYPE: woodpecker
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
+    ports:
+      - "19530:19530"
+      - "9091:9091"
+    # Reserve NVIDIA GPU "0"; the app container above reserves the same device
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: ["gpu"]
+              device_ids: ["0"]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+      interval: 30s
+      start_period: 90s
+      timeout: 20s
+      retries: 3
+    # Start order only; this short form does not wait for etcd/minio to be healthy
+    depends_on:
+      - "etcd"
+      - "minio"
+# No service declares `networks`, so a custom network entry would stay unused.
+# Naming the default network puts every service above on the network "milvus".
+networks:
+  default:
+    name: milvus