aiagents4pharma 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/__init__.py +11 -0
- aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
- aiagents4pharma/talk2aiagents4pharma/Dockerfile +133 -0
- aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
- aiagents4pharma/talk2aiagents4pharma/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +6 -0
- aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +70 -0
- aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +29 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/__init__.py +0 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/__init__.py +0 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/default.yaml +102 -0
- aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +4 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2aiagents4pharma/install.md +154 -0
- aiagents4pharma/talk2aiagents4pharma/states/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +18 -0
- aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +3 -0
- aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +312 -0
- aiagents4pharma/talk2biomodels/.dockerignore +13 -0
- aiagents4pharma/talk2biomodels/Dockerfile +104 -0
- aiagents4pharma/talk2biomodels/README.md +1 -0
- aiagents4pharma/talk2biomodels/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/agents/__init__.py +6 -0
- aiagents4pharma/talk2biomodels/agents/t2b_agent.py +104 -0
- aiagents4pharma/talk2biomodels/api/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/api/ols.py +75 -0
- aiagents4pharma/talk2biomodels/api/uniprot.py +36 -0
- aiagents4pharma/talk2biomodels/configs/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/agents/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +14 -0
- aiagents4pharma/talk2biomodels/configs/app/__init__.py +0 -0
- aiagents4pharma/talk2biomodels/configs/app/frontend/__init__.py +0 -0
- aiagents4pharma/talk2biomodels/configs/app/frontend/default.yaml +72 -0
- aiagents4pharma/talk2biomodels/configs/config.yaml +7 -0
- aiagents4pharma/talk2biomodels/configs/tools/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +30 -0
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +8 -0
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +8 -0
- aiagents4pharma/talk2biomodels/install.md +63 -0
- aiagents4pharma/talk2biomodels/models/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/models/basico_model.py +125 -0
- aiagents4pharma/talk2biomodels/models/sys_bio_model.py +60 -0
- aiagents4pharma/talk2biomodels/states/__init__.py +6 -0
- aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +49 -0
- aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
- aiagents4pharma/talk2biomodels/tests/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
- aiagents4pharma/talk2biomodels/tests/test_api.py +31 -0
- aiagents4pharma/talk2biomodels/tests/test_ask_question.py +42 -0
- aiagents4pharma/talk2biomodels/tests/test_basico_model.py +67 -0
- aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +190 -0
- aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +92 -0
- aiagents4pharma/talk2biomodels/tests/test_integration.py +116 -0
- aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +35 -0
- aiagents4pharma/talk2biomodels/tests/test_param_scan.py +71 -0
- aiagents4pharma/talk2biomodels/tests/test_query_article.py +184 -0
- aiagents4pharma/talk2biomodels/tests/test_save_model.py +47 -0
- aiagents4pharma/talk2biomodels/tests/test_search_models.py +35 -0
- aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +44 -0
- aiagents4pharma/talk2biomodels/tests/test_steady_state.py +86 -0
- aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +67 -0
- aiagents4pharma/talk2biomodels/tools/__init__.py +17 -0
- aiagents4pharma/talk2biomodels/tools/ask_question.py +125 -0
- aiagents4pharma/talk2biomodels/tools/custom_plotter.py +165 -0
- aiagents4pharma/talk2biomodels/tools/get_annotation.py +342 -0
- aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +159 -0
- aiagents4pharma/talk2biomodels/tools/load_arguments.py +134 -0
- aiagents4pharma/talk2biomodels/tools/load_biomodel.py +44 -0
- aiagents4pharma/talk2biomodels/tools/parameter_scan.py +310 -0
- aiagents4pharma/talk2biomodels/tools/query_article.py +64 -0
- aiagents4pharma/talk2biomodels/tools/save_model.py +98 -0
- aiagents4pharma/talk2biomodels/tools/search_models.py +96 -0
- aiagents4pharma/talk2biomodels/tools/simulate_model.py +137 -0
- aiagents4pharma/talk2biomodels/tools/steady_state.py +187 -0
- aiagents4pharma/talk2biomodels/tools/utils.py +23 -0
- aiagents4pharma/talk2cells/README.md +1 -0
- aiagents4pharma/talk2cells/__init__.py +5 -0
- aiagents4pharma/talk2cells/agents/__init__.py +6 -0
- aiagents4pharma/talk2cells/agents/scp_agent.py +87 -0
- aiagents4pharma/talk2cells/states/__init__.py +6 -0
- aiagents4pharma/talk2cells/states/state_talk2cells.py +15 -0
- aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +22 -0
- aiagents4pharma/talk2cells/tools/__init__.py +6 -0
- aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +6 -0
- aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +27 -0
- aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
- aiagents4pharma/talk2knowledgegraphs/Dockerfile +131 -0
- aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
- aiagents4pharma/talk2knowledgegraphs/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +99 -0
- aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +62 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +79 -0
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +13 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +24 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +33 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +43 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +9 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/default.yaml +61 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +607 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +25 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +212 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +210 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +180 -0
- aiagents4pharma/talk2knowledgegraphs/install.md +165 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +886 -0
- aiagents4pharma/talk2knowledgegraphs/states/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +40 -0
- aiagents4pharma/talk2knowledgegraphs/tests/__init__.py +0 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +318 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +248 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +33 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +86 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +125 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +257 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +1444 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +159 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +152 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +201 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_database_milvus_connection_manager.py +812 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +51 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +49 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +59 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +63 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py +47 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +40 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +94 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +70 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +45 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +48 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +759 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +123 -0
- aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +11 -0
- aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +138 -0
- aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py +22 -0
- aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +965 -0
- aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +291 -0
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +123 -0
- aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/database/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/database/milvus_connection_manager.py +586 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +81 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +111 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +54 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +87 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +73 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +12 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +37 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +129 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +89 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +71 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +98 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +762 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +298 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +229 -0
- aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +67 -0
- aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +104 -0
- aiagents4pharma/talk2scholars/.dockerignore +13 -0
- aiagents4pharma/talk2scholars/Dockerfile +104 -0
- aiagents4pharma/talk2scholars/README.md +1 -0
- aiagents4pharma/talk2scholars/__init__.py +7 -0
- aiagents4pharma/talk2scholars/agents/__init__.py +13 -0
- aiagents4pharma/talk2scholars/agents/main_agent.py +89 -0
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +96 -0
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +101 -0
- aiagents4pharma/talk2scholars/agents/s2_agent.py +135 -0
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +127 -0
- aiagents4pharma/talk2scholars/configs/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +52 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +44 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/app/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +72 -0
- aiagents4pharma/talk2scholars/configs/config.yaml +16 -0
- aiagents4pharma/talk2scholars/configs/tools/__init__.py +21 -0
- aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/paper_download/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +62 -0
- aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/default.yaml +12 -0
- aiagents4pharma/talk2scholars/configs/tools/search/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +57 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
- aiagents4pharma/talk2scholars/install.md +122 -0
- aiagents4pharma/talk2scholars/state/__init__.py +7 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +98 -0
- aiagents4pharma/talk2scholars/tests/__init__.py +3 -0
- aiagents4pharma/talk2scholars/tests/test_agents_main_agent.py +256 -0
- aiagents4pharma/talk2scholars/tests/test_agents_paper_agents_download_agent.py +139 -0
- aiagents4pharma/talk2scholars/tests/test_agents_pdf_agent.py +114 -0
- aiagents4pharma/talk2scholars/tests/test_agents_s2_agent.py +198 -0
- aiagents4pharma/talk2scholars/tests/test_agents_zotero_agent.py +160 -0
- aiagents4pharma/talk2scholars/tests/test_s2_tools_display_dataframe.py +91 -0
- aiagents4pharma/talk2scholars/tests/test_s2_tools_query_dataframe.py +191 -0
- aiagents4pharma/talk2scholars/tests/test_states_state.py +38 -0
- aiagents4pharma/talk2scholars/tests/test_tools_paper_downloader.py +507 -0
- aiagents4pharma/talk2scholars/tests/test_tools_question_and_answer_tool.py +105 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_multi.py +307 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_retrieve.py +67 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_search.py +286 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_single.py +298 -0
- aiagents4pharma/talk2scholars/tests/test_utils_arxiv_downloader.py +469 -0
- aiagents4pharma/talk2scholars/tests/test_utils_base_paper_downloader.py +598 -0
- aiagents4pharma/talk2scholars/tests/test_utils_biorxiv_downloader.py +669 -0
- aiagents4pharma/talk2scholars/tests/test_utils_medrxiv_downloader.py +500 -0
- aiagents4pharma/talk2scholars/tests/test_utils_nvidia_nim_reranker.py +117 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_answer_formatter.py +67 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_batch_processor.py +92 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_collection_manager.py +173 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_document_processor.py +68 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_generate_answer.py +72 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_gpu_detection.py +129 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_paper_loader.py +116 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_rag_pipeline.py +88 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_retrieve_chunks.py +190 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_singleton_manager.py +159 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_normalization.py +121 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_store.py +406 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pubmed_downloader.py +1007 -0
- aiagents4pharma/talk2scholars/tests/test_utils_read_helper_utils.py +106 -0
- aiagents4pharma/talk2scholars/tests/test_utils_s2_utils_ext_ids.py +403 -0
- aiagents4pharma/talk2scholars/tests/test_utils_tool_helper_utils.py +85 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_human_in_the_loop.py +266 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_path.py +496 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_pdf_downloader_utils.py +46 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_read.py +743 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_write.py +151 -0
- aiagents4pharma/talk2scholars/tools/__init__.py +9 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +12 -0
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +442 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +22 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +207 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +336 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +313 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +196 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +323 -0
- aiagents4pharma/talk2scholars/tools/pdf/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +170 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +37 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +198 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +59 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +150 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +97 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +113 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +197 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +86 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +150 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +327 -0
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +21 -0
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +110 -0
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +111 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +233 -0
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +128 -0
- aiagents4pharma/talk2scholars/tools/s2/search.py +101 -0
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +102 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +5 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +223 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +205 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +216 -0
- aiagents4pharma/talk2scholars/tools/zotero/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +270 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +74 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +194 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +180 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +133 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +105 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +162 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +91 -0
- aiagents4pharma-0.0.0.dist-info/METADATA +335 -0
- aiagents4pharma-0.0.0.dist-info/RECORD +336 -0
- aiagents4pharma-0.0.0.dist-info/WHEEL +4 -0
- aiagents4pharma-0.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding class using HuggingFace model based on LangChain Embeddings class.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
|
7
|
+
|
|
8
|
+
from .embeddings import Embeddings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EmbeddingWithHuggingFace(Embeddings):
    """
    Embedding class using HuggingFace model based on LangChain Embeddings class.
    """

    def __init__(
        self,
        model_name: str,
        model_cache_dir: str = None,
        truncation: bool = True,
        device: str = "cpu",
    ):
        """
        Initialize the EmbeddingWithHuggingFace class.

        Args:
            model_name: The name of the HuggingFace model to be used.
            model_cache_dir: The directory to cache the HuggingFace model
                (``None`` uses the HuggingFace default cache).
            truncation: The truncation flag for the HuggingFace tokenizer.
            device: The device to run the model on (e.g. "cpu" or "cuda").

        Raises:
            ValueError: If the model is not available on HuggingFace Hub.
        """
        # Set parameters
        self.model_name = model_name
        self.model_cache_dir = model_cache_dir
        self.truncation = truncation
        self.device = device

        # Fail early with a clear error if the model does not exist on the Hub.
        try:
            AutoConfig.from_pretrained(self.model_name)
        except OSError as e:
            raise ValueError(f"Model {self.model_name} is not available on HuggingFace Hub.") from e

        # Load HuggingFace tokenizer and model. The model is moved to the
        # target device once here, instead of on every embedding call.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )
        self.model = AutoModel.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        ).to(self.device)

    def meanpooling(self, output, mask) -> torch.Tensor:
        """
        Mean Pooling - Take attention mask into account for correct averaging.
        According to the following documentation:
        https://huggingface.co/NeuML/pubmedbert-base-embeddings

        Args:
            output: The output of the model (token embeddings are element 0).
            mask: The attention mask of the tokenized batch.

        Returns:
            A (batch, hidden_size) tensor of mask-weighted mean embeddings.
        """
        embeddings = output[0]  # First element of model_output contains all token embeddings
        mask = mask.unsqueeze(-1).expand(embeddings.size()).float()
        # Clamp avoids division by zero for all-padding rows.
        return torch.sum(embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def embed_documents(self, texts: list[str]) -> torch.Tensor:
        """
        Generate embedding for a list of input texts using HuggingFace model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            A 2-D tensor of shape (len(texts), hidden_size), one row per text.
            NOTE(review): despite the LangChain Embeddings convention of
            returning nested lists, this returns a torch.Tensor on CPU;
            callers appear to rely on that — confirm before changing.
        """
        # Inference only: no gradients needed.
        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=self.truncation,
                return_tensors="pt",
            ).to(self.device)
            outputs = self.model(**inputs)
            embeddings = self.meanpooling(outputs, inputs["attention_mask"]).cpu()

        return embeddings

    def embed_query(self, text: str) -> torch.Tensor:
        """
        Generate embeddings for an input text using HuggingFace model.

        Args:
            text: A query to be embedded.
        Returns:
            A 1-D tensor (hidden_size,) with the embedding for the given query.
        """
        # Inference only: no gradients needed.
        with torch.no_grad():
            inputs = self.tokenizer(
                text,
                padding=True,
                truncation=self.truncation,
                return_tensors="pt",
            ).to(self.device)
            outputs = self.model(**inputs)
            # Single query: unwrap the singleton batch dimension.
            embeddings = self.meanpooling(outputs, inputs["attention_mask"]).cpu()[0]

        return embeddings
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding class using MOLMIM model from NVIDIA NIM.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from .embeddings import Embeddings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EmbeddingWithMOLMIM(Embeddings):
    """
    Embedding class using MOLMIM model from NVIDIA NIM
    """

    def __init__(self, base_url: str):
        """
        Initialize the EmbeddingWithMOLMIM class.

        Args:
            base_url: The base URL for the NIM/MOLMIM model.
        """
        # Set base URL
        self.base_url = base_url

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embedding for a list of SMILES strings using MOLMIM model.

        Args:
            texts: The list of SMILES strings to be embedded.

        Returns:
            The list of embeddings, one vector per SMILES string.

        Raises:
            requests.HTTPError: If the NIM endpoint returns an error status.
        """
        headers = {"accept": "application/json", "Content-Type": "application/json"}
        data = json.dumps({"sequences": texts})
        response = requests.post(self.base_url, headers=headers, data=data, timeout=60)
        # Surface HTTP errors explicitly instead of failing later with a
        # confusing KeyError when the "embeddings" key is absent.
        response.raise_for_status()
        embeddings = response.json()["embeddings"]
        return embeddings

    def embed_query(self, text: str) -> list[float]:
        """
        Generate embeddings for an input query using MOLMIM model.

        Args:
            text: A query to be embedded.
        Returns:
            The embedding vector for the given query.
        """
        # Embed the single query and unwrap the singleton batch so callers
        # receive one vector. (Previously the whole one-element batch was
        # returned, unlike the sibling HuggingFace/Ollama embedding classes
        # and the LangChain embed_query contract.)
        return self.embed_documents([text])[0]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Embedding class using Ollama model based on LangChain Embeddings class.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
import ollama
|
|
9
|
+
from langchain_ollama import OllamaEmbeddings
|
|
10
|
+
|
|
11
|
+
from .embeddings import Embeddings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EmbeddingWithOllama(Embeddings):
    """
    Embedding class using Ollama model based on LangChain Embeddings class.
    """

    def __init__(self, model_name: str):
        """
        Initialize the EmbeddingWithOllama class.

        Args:
            model_name: The name of the Ollama model to be used.

        Raises:
            ValueError: Propagated from ``__setup`` when the model had to be
                pulled or the Ollama server was unreachable (see note there).
        """
        # Setup the Ollama server
        self.__setup(model_name)

        # Set parameters
        self.model_name = model_name

        # Prepare model
        self.model = OllamaEmbeddings(model=self.model_name)

    def __setup(self, model_name: str) -> None:
        """
        Check if the Ollama model is available and run the Ollama server if needed.

        Args:
            model_name: The name of the Ollama model to be used.
        """
        try:
            # List locally available models; raises if the server is not running.
            models_list = ollama.list()["models"]
            # Model tags default to ":latest"; strip it for the comparison.
            if model_name not in [m["model"].replace(":latest", "") for m in models_list]:
                ollama.pull(model_name)
                # Crude wait for the pull to settle — TODO confirm 30s suffices.
                time.sleep(30)
                # NOTE(review): this raises even though the pull SUCCEEDED,
                # which is caught by the broad except below and turned into a
                # server (re)start plus a ValueError from __init__ — so
                # construction fails on first pull. Looks intentional as a
                # "retry after setup" signal, but confirm before changing.
                raise ValueError(f"Pulled {model_name} model")
        except Exception as e:
            # Any failure above (server down, pull needed, network error) lands
            # here: start `ollama serve` in the background, give it a moment,
            # then re-raise so the caller knows setup was interrupted.
            with subprocess.Popen(
                "ollama serve",
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            ):
                # NOTE(review): exiting this `with` only closes the pipes; the
                # spawned server keeps running in the background.
                time.sleep(10)
                raise ValueError(f"Error: {e} and restarted Ollama server.") from e

    def embed_documents(self, texts: list[str]) -> list[float]:
        """
        Generate embedding for a list of input texts using Ollama model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            The list of embeddings for the given texts.
        """

        # Delegate to the LangChain OllamaEmbeddings client.
        embeddings = self.model.embed_documents(texts)

        return embeddings

    def embed_query(self, text: str) -> list[float]:
        """
        Generate embeddings for an input text using Ollama model.

        Args:
            text: A query to be embedded.
        Returns:
            The embeddings for the given query.
        """

        # Delegate to the LangChain OllamaEmbeddings client.
        embeddings = self.model.embed_query(text)

        return embeddings
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Embedding class using SentenceTransformer model based on LangChain Embeddings class.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from sentence_transformers import SentenceTransformer
|
|
8
|
+
|
|
9
|
+
from .embeddings import Embeddings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EmbeddingWithSentenceTransformer(Embeddings):
    """
    Embedding class using SentenceTransformer model based on LangChain Embeddings class.
    """

    def __init__(
        self,
        model_name: str,
        model_cache_dir: "str | None" = None,
        trust_remote_code: bool = True,
    ):
        """
        Initialize the EmbeddingWithSentenceTransformer class.

        Args:
            model_name: The name of the SentenceTransformer model to be used.
            model_cache_dir: The directory to cache the SentenceTransformer model,
                or None to use the library default.
            trust_remote_code: Whether to trust the remote code of the model.
        """
        # Set parameters
        self.model_name = model_name
        self.model_cache_dir = model_cache_dir
        self.trust_remote_code = trust_remote_code

        # Load the model
        self.model = SentenceTransformer(
            self.model_name,
            cache_folder=self.model_cache_dir,
            trust_remote_code=self.trust_remote_code,
        )

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embedding for a list of input texts using SentenceTransformer model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            The embeddings for the given texts, one vector per text.
            NOTE: SentenceTransformer.encode returns a numpy array of shape
            (n_texts, dim), which is list-like along both axes.
        """
        # Generate the embedding
        embeddings = self.model.encode(texts, show_progress_bar=False)

        return embeddings

    def embed_query(self, text: str) -> list[float]:
        """
        Generate embeddings for an input text using SentenceTransformer model.

        Args:
            text: A query to be embedded.

        Returns:
            The embedding vector for the given query (a 1-D numpy array).
        """
        # Generate the embedding
        embeddings = self.model.encode(text, show_progress_bar=False)

        return embeddings
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enrichments interface
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Enrichments(ABC):
    """Abstract base class for text-enrichment backends.

    Concrete subclasses implement models that augment node or relation
    features of a given knowledge graph with additional textual information.
    """

    @abstractmethod
    def enrich_documents(self, texts: list[str]) -> list[list[str]]:
        """Return an enriched version of each input document.

        Args:
            texts: List of documents to enrich.

        Returns:
            List of enriched documents.
        """

    @abstractmethod
    def enrich_documents_with_rag(self, texts: list[str], docs: list[str]) -> list[str]:
        """Return each input document enriched via retrieval-augmented generation.

        Args:
            texts: List of documents to enrich.
            docs: List of reference documents used to enrich the input texts.

        Returns:
            List of enriched documents with RAG.
        """
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Enrichment class using Ollama model based on LangChain Enrichment class.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import ast
|
|
8
|
+
import subprocess
|
|
9
|
+
import time
|
|
10
|
+
|
|
11
|
+
import ollama
|
|
12
|
+
from langchain_core.output_parsers import StrOutputParser
|
|
13
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
14
|
+
from langchain_ollama import ChatOllama
|
|
15
|
+
|
|
16
|
+
from .enrichments import Enrichments
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EnrichmentWithOllama(Enrichments):
    """
    Enrichment class using Ollama model based on the Enrichment abstract class.
    """

    def __init__(
        self,
        model_name: str,
        prompt_enrichment: str,
        temperature: float,
        streaming: bool,
    ):
        """
        Initialize the EnrichmentWithOllama class.

        Args:
            model_name: The name of the Ollama model to be used.
            prompt_enrichment: The prompt enrichment template.
            temperature: The temperature for the Ollama model.
            streaming: The streaming flag for the Ollama model.
        """
        # Ensure the Ollama server is reachable and the model is available
        self.__setup(model_name)

        # Set parameters
        self.model_name = model_name
        self.prompt_enrichment = prompt_enrichment
        self.temperature = temperature
        self.streaming = streaming

        # Prepare prompt template
        self.prompt_template = ChatPromptTemplate.from_messages(
            [
                ("system", self.prompt_enrichment),
                ("human", "{input}"),
            ]
        )

        # Prepare model
        self.model = ChatOllama(
            model=self.model_name,
            temperature=self.temperature,
            streaming=self.streaming,
        )

    def __setup(self, model_name: str) -> None:
        """
        Check if the Ollama model is available and run the Ollama server if needed.

        Args:
            model_name: The name of the Ollama model to be used.

        Raises:
            Exception: re-raised from `ollama.list()` if the server is still
                unreachable after attempting to start it.
        """
        try:
            models_list = ollama.list()["models"]
        except Exception:
            # The server is likely not running. Start it in the background and
            # retry once. stdout/stderr go to DEVNULL: the original code used
            # PIPE without ever reading it, which can deadlock the child once
            # the OS pipe buffer fills up.
            subprocess.Popen(
                "ollama serve",
                shell=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            time.sleep(10)
            models_list = ollama.list()["models"]
        # Pull the model if it is not available locally.
        # (The original implementation raised a ValueError after a *successful*
        # pull, which made __init__ always fail whenever a pull was needed.)
        if model_name not in [m["model"].replace(":latest", "") for m in models_list]:
            ollama.pull(model_name)
            # Give the server time to register the freshly pulled model
            time.sleep(30)

    def enrich_documents(self, texts: list[str]) -> list[str]:
        """
        Enrich a list of input texts with additional textual features using OLLAMA model.
        Important: Make sure the input is a list of texts based on the defined prompt template
        with 'input' as the variable name.

        Args:
            texts: The list of texts to be enriched.

        Returns:
            The list of enriched texts.

        Raises:
            ValueError: if the model does not return one enriched text per input.
        """
        # Perform enrichment
        chain = self.prompt_template | self.model | StrOutputParser()

        # Generate the enriched node
        # Important: Make sure the input is a list of texts based on the defined prompt template
        # with 'input' as the variable name
        enriched_texts = chain.invoke({"input": "[" + ", ".join(texts) + "]"})

        # Convert the enriched nodes to a list of dictionary.
        # NOTE(review): the model output is parsed with ast.literal_eval, which
        # only accepts Python literals — no arbitrary code execution is possible.
        enriched_texts = ast.literal_eval(enriched_texts.replace("```", ""))

        # Final check for the enriched texts — use an explicit exception
        # instead of assert, because asserts are stripped under `python -O`.
        if len(enriched_texts) != len(texts):
            raise ValueError(
                f"Expected {len(texts)} enriched texts, got {len(enriched_texts)}"
            )

        return enriched_texts

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input texts with additional textual features using OLLAMA model with RAG.
        As of now, we don't have a RAG model to test this method yet.
        Thus, we will just call the enrich_documents method instead.

        Args:
            texts: The list of texts to be enriched.
            docs: The list of reference documents to enrich the input texts.

        Returns:
            The list of enriched texts
        """
        return self.enrich_documents(texts)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Enrichment class for enriching OLS terms with textual descriptions
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
import hydra
|
|
11
|
+
import requests
|
|
12
|
+
|
|
13
|
+
from .enrichments import Enrichments
|
|
14
|
+
|
|
15
|
+
# Initialize logger
|
|
16
|
+
logging.basicConfig(level=logging.INFO)
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EnrichmentWithOLS(Enrichments):
    """
    Enrichment class using OLS terms
    """

    def enrich_documents(self, texts: list[str]) -> list[str]:
        """
        Enrich a list of input OLS terms with their descriptions, synonyms and labels.

        Args:
            texts: The list of OLS term IDs to be enriched.

        Returns:
            The list of enriched descriptions, one string per input term
            (the empty string when the term could not be resolved).
        """
        ols_ids = texts

        logger.log(logging.INFO, "Load Hydra configuration for OLS enrichments.")
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(
                config_name="config", overrides=["utils/enrichments/ols_terms=default"]
            )
            cfg = cfg.utils.enrichments.ols_terms

        descriptions = []
        for ols_id in ols_ids:
            params = {"short_form": ols_id}
            r = requests.get(
                cfg.base_url,
                headers={"Accept": "application/json"},
                params=params,
                timeout=cfg.timeout,
            )
            response_body = json.loads(r.text)
            # if the response body is empty
            if "_embedded" not in response_body:
                descriptions.append("")
                continue
            # Collect description fragments for this term
            description = []
            for term in response_body["_embedded"]["terms"]:
                # If the term has a description, add it to the list
                description += term.get("description", [])
                # Add synonyms to the description
                description += term.get("synonyms", [])
                # Add the label to the description
                # Label is not provided as list, so we need to convert it to a list
                label = term.get("label", "")
                if label:
                    description.append(label)
            # Deduplicate while preserving insertion order. The original used
            # list(set(...)), whose iteration order changes between runs
            # (string hash randomization), making results non-reproducible.
            description = "\n".join(dict.fromkeys(description))
            descriptions.append(description)
        return descriptions

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input OLS terms.

        No RAG model is used here; this simply delegates to enrich_documents.

        Args:
            texts: The list of OLS terms to be enriched.
            docs: Unused list of reference documents.

        Returns:
            The list of enriched descriptions
        """
        return self.enrich_documents(texts)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
Enrichment class for enriching PubChem IDs with their STRINGS representation and descriptions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
|
|
9
|
+
import hydra
|
|
10
|
+
import requests
|
|
11
|
+
|
|
12
|
+
from ..pubchem_utils import pubchem_cid_description
|
|
13
|
+
from .enrichments import Enrichments
|
|
14
|
+
|
|
15
|
+
# Initialize logger
|
|
16
|
+
logging.basicConfig(level=logging.INFO)
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class EnrichmentWithPubChem(Enrichments):
    """
    Enrichment class using PubChem
    """

    def enrich_documents(self, texts: list[str]) -> "tuple[list[str], list[str]]":
        """
        Enrich a list of input PubChem IDs with their SMILES representation
        and textual descriptions.

        Args:
            texts: The list of PubChem CIDs to be enriched.

        Returns:
            A pair (descriptions, smiles) of lists aligned with the input
            order; entries are None for CIDs that could not be resolved.
            NOTE: the original annotation claimed `list[str]`, but the method
            has always returned this 2-tuple — the annotation is corrected
            here without changing the returned value.
        """
        enriched_pubchem_ids_smiles = []
        enriched_pubchem_ids_descriptions = []

        # Load Hydra configuration to get the base URL for PubChem
        with hydra.initialize(version_base=None, config_path="../../configs"):
            cfg = hydra.compose(config_name="config", overrides=["utils/pubchem_utils=default"])
            cfg = cfg.utils.pubchem_utils
        # Iterate over each PubChem ID in the input list
        pubchem_cids = texts
        for pubchem_cid in pubchem_cids:
            # Prepare the URL
            pubchem_url = f"{cfg.pubchem_cid2smiles_url}/{pubchem_cid}/property/smiles/JSON"
            # Get the data
            response = requests.get(pubchem_url, timeout=60)
            data = response.json()
            # Extract the PubChem CID SMILES
            smiles = ""
            description = ""
            if "PropertyTable" in data:
                # If several property records are returned, the last one wins
                for prop in data["PropertyTable"]["Properties"]:
                    smiles = prop.get("SMILES", "")
                    description = pubchem_cid_description(pubchem_cid)
            else:
                # If the PubChem ID is not found, set smiles and description to None
                smiles = None
                description = None
            enriched_pubchem_ids_smiles.append(smiles)
            enriched_pubchem_ids_descriptions.append(description)

        return enriched_pubchem_ids_descriptions, enriched_pubchem_ids_smiles

    def enrich_documents_with_rag(self, texts, docs):
        """
        Enrich a list of input PubChem IDs with their SMILES representation.

        No RAG model is used here; this simply delegates to enrich_documents.

        Args:
            texts: The list of PubChem IDs to be enriched.
            docs: None (unused).

        Returns:
            The (descriptions, smiles) pair produced by enrich_documents.
        """
        return self.enrich_documents(texts)
|