aiagents4pharma 1.5.4__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,6 @@
1
+ '''
2
+ This file is used to import all the models in the package.
3
+ '''
4
+ from . import embeddings
5
+ from . import sentence_transformer
6
+ from . import huggingface
@@ -0,0 +1,77 @@
1
+ """
2
+ Embeddings interface from LangChain Core.
3
+ https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/embeddings/embeddings.py
4
+ """
5
+ from abc import ABC, abstractmethod
6
+ from langchain_core.runnables.config import run_in_executor
7
+
8
class Embeddings(ABC):
    """Abstract contract for text embedding models.

    An embedding model maps a piece of text to a vector, i.e. a point in an
    n-dimensional space. Similar texts are usually mapped to nearby points;
    what counts as "similar" and how "distance" is measured depends on the
    concrete model.

    Two synchronous entry points must be supplied by every implementation:

    * ``embed_documents`` -- embeds a list of texts and yields a list of
      vectors (one per text).
    * ``embed_query`` -- embeds one query text and yields a single vector.

    Query and document embeddings are usually produced the same way, but the
    two methods are kept separate so an implementation may treat them
    differently.

    Asynchronous counterparts (``aembed_documents`` / ``aembed_query``) are
    provided here on top of the synchronous methods. Implementations may
    override them with a natively asynchronous version for performance.
    """

    @abstractmethod
    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of search documents.

        Args:
            texts: The texts to embed.

        Returns:
            The embeddings, one per input text.
        """

    @abstractmethod
    def embed_query(self, text: str) -> list[float]:
        """Embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            The embedding of the query.
        """

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        """Asynchronously embed a batch of search documents.

        Hands the synchronous ``embed_documents`` to ``run_in_executor``
        (with ``None`` as the executor argument) and awaits its result.

        Args:
            texts: The texts to embed.

        Returns:
            The embeddings, one per input text.
        """
        document_vectors = await run_in_executor(
            None, self.embed_documents, texts
        )
        return document_vectors

    async def aembed_query(self, text: str) -> list[float]:
        """Asynchronously embed a single query text.

        Hands the synchronous ``embed_query`` to ``run_in_executor``
        (with ``None`` as the executor argument) and awaits its result.

        Args:
            text: The text to embed.

        Returns:
            The embedding of the query.
        """
        query_vector = await run_in_executor(None, self.embed_query, text)
        return query_vector
@@ -0,0 +1,114 @@
1
+ """
2
+ Embedding class using HuggingFace model based on LangChain Embeddings class.
3
+ """
4
+
5
+ from typing import List
6
+ import torch
7
+ from transformers import AutoModel, AutoTokenizer, AutoConfig
8
+ from .embeddings import Embeddings
9
+
10
class EmbeddingWithHuggingFace(Embeddings):
    """
    Embedding class using HuggingFace model based on LangChain Embeddings class.

    Texts are tokenized, passed through the model, and the token embeddings
    are mean-pooled (weighted by the attention mask) into one vector per text.
    """

    def __init__(
        self,
        model_name: str,
        model_cache_dir: str = None,
        truncation: bool = True,
        device: str = "cpu",
    ):
        """
        Initialize the EmbeddingWithHuggingFace class.

        Args:
            model_name: The name of the HuggingFace model to be used.
            model_cache_dir: The directory to cache the HuggingFace model.
            truncation: The truncation flag for the HuggingFace tokenizer.
            device: The device to run the model on.

        Raises:
            ValueError: If the configuration of ``model_name`` cannot be
                loaded (``AutoConfig.from_pretrained`` raised
                ``EnvironmentError``).
        """

        # Set parameters
        self.model_name = model_name
        self.model_cache_dir = model_cache_dir
        self.truncation = truncation
        self.device = device

        # Probe that the model can be resolved before loading the weights.
        # The probe uses the same cache directory as the tokenizer/model so
        # that a custom cache location is honoured consistently.
        try:
            AutoConfig.from_pretrained(
                self.model_name, cache_dir=self.model_cache_dir
            )
        except EnvironmentError as e:
            raise ValueError(
                f"Model {self.model_name} is not available on HuggingFace Hub."
            ) from e

        # Load HuggingFace tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )
        self.model = AutoModel.from_pretrained(
            self.model_name, cache_dir=self.model_cache_dir
        )

    def meanpooling(self, output, mask) -> torch.Tensor:
        """
        Mean Pooling - Take attention mask into account for correct averaging.
        According to the following documentation:
        https://huggingface.co/NeuML/pubmedbert-base-embeddings

        Args:
            output: The output of the model; its first element holds the
                token embeddings.
            mask: The attention mask produced by the tokenizer.

        Returns:
            The masked average of the token embeddings along dimension 1.
        """
        token_embeddings = output[0]  # First element contains all token embeddings
        # Broadcast the mask to the embedding shape so masked-out tokens
        # contribute zero to the sum.
        mask = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(
            mask.sum(1), min=1e-9
        )

    def embed_documents(self, texts: List[str]) -> torch.Tensor:
        """
        Generate embedding for a list of input texts using HuggingFace model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            A CPU tensor holding the pooled embeddings of the given texts
            (first dimension indexes the texts).
        """

        # Generate the embedding without tracking gradients
        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                padding=True,
                truncation=self.truncation,
                return_tensors="pt",
            ).to(self.device)
            # The model is moved on every call so that a change of
            # `self.device` after construction is still respected.
            outputs = self.model.to(self.device)(**inputs)
            embeddings = self.meanpooling(outputs, inputs['attention_mask']).cpu()

        return embeddings

    def embed_query(self, text: str) -> torch.Tensor:
        """
        Generate embeddings for an input text using HuggingFace model.

        Args:
            text: A query to be embedded.

        Returns:
            A CPU tensor holding the pooled embedding of the given query.
        """

        # A query is a batch of one document: reuse the document pipeline
        # and take its single row.
        return self.embed_documents([text])[0]
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Embedding class using SentenceTransformer model based on LangChain Embeddings class.
5
+ """
6
+
7
+ from typing import List
8
+ from sentence_transformers import SentenceTransformer
9
+ from .embeddings import Embeddings
10
+
11
+
12
class EmbeddingWithSentenceTransformer(Embeddings):
    """
    Embedding class using SentenceTransformer model based on LangChain Embeddings class.
    """

    def __init__(
        self,
        model_name: str,
        model_cache_dir: str = None,
        trust_remote_code: bool = True,
    ):
        """
        Initialize the EmbeddingWithSentenceTransformer class.

        Args:
            model_name: The name of the SentenceTransformer model to be used.
            model_cache_dir: The directory to cache the SentenceTransformer model.
            trust_remote_code: Whether to trust the remote code of the model.
        """

        # Set parameters
        self.model_name = model_name
        self.model_cache_dir = model_cache_dir
        # NOTE(security): this defaults to True, which allows code shipped
        # with the model repository to run locally. The default is kept for
        # backward compatibility; pass False for models from untrusted sources.
        self.trust_remote_code = trust_remote_code

        # Load the model
        self.model = SentenceTransformer(
            self.model_name,
            cache_folder=self.model_cache_dir,
            trust_remote_code=self.trust_remote_code,
        )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embedding for a list of input texts using SentenceTransformer model.

        Args:
            texts: The list of texts to be embedded.

        Returns:
            The embeddings for the given texts, one vector per text, exactly
            as returned by ``SentenceTransformer.encode`` (presumably a NumPy
            array with the library defaults -- confirm against the installed
            sentence_transformers version).
        """

        # Generate the embedding (progress bar suppressed)
        return self.model.encode(texts, show_progress_bar=False)

    def embed_query(self, text: str) -> List[float]:
        """
        Generate embeddings for an input text using SentenceTransformer model.

        Args:
            text: A query to be embedded.

        Returns:
            The embedding for the given query, exactly as returned by
            ``SentenceTransformer.encode`` for a single string.
        """

        # Generate the embedding (progress bar suppressed)
        return self.model.encode(text, show_progress_bar=False)
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+
3
+ '''A utility module for knowledge graph operations'''
4
+
5
+ from typing import Tuple
6
+ import networkx as nx
7
+ import pandas as pd
8
+
9
def kg_to_df_pandas(kg: nx.DiGraph) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a directed knowledge graph into node and edge DataFrames.

    Args:
        kg: The directed knowledge graph in networkX format.

    Returns:
        A ``(df_nodes, df_edges)`` tuple: the node table is built from
        ``kg.nodes`` with one row per node (node identifier as index), and
        the edge table is the graph's edge list with the endpoint columns
        named ``node_source`` and ``node_target``.
    """

    # Edge list with fixed endpoint column names
    edge_table = nx.to_pandas_edgelist(
        kg, source='node_source', target='node_target'
    )

    # Node table, indexed by node identifier
    node_table = pd.DataFrame.from_dict(kg.nodes, orient='index')

    return node_table, edge_table
30
+
31
def df_pandas_to_kg(df: pd.DataFrame,
                    df_nodes_attrs: pd.DataFrame,
                    node_source: str,
                    node_target: str
                    ) -> nx.DiGraph:
    """
    Convert a pandas DataFrame to a directed knowledge graph.

    Args:
        df: A pandas DataFrame of the edges in the knowledge graph.
        df_nodes_attrs: A pandas DataFrame of the nodes in the knowledge graph,
            indexed by node identifier; its columns become node attributes.
        node_source: The column name of the source node in the df.
        node_target: The column name of the target node in the df.

    Returns:
        kg: The directed knowledge graph in networkX format.

    Raises:
        AssertionError: If ``node_source`` or ``node_target`` is not a column
            of ``df``, or if ``df_nodes_attrs`` is indexed by a node that does
            not occur in either endpoint column of ``df``.
    """

    # Validate explicitly instead of with `assert` statements: those are
    # removed when Python runs with -O, which would silently skip the checks.
    # AssertionError is kept as the exception type for existing callers.
    for column in (node_source, node_target):
        if column not in df.columns:
            raise AssertionError(f'{column} not in df')

    # Every node that carries attributes must appear as an edge endpoint
    edge_nodes = set(df[node_source]) | set(df[node_target])
    if not set(df_nodes_attrs.index).issubset(edge_nodes):
        raise AssertionError('Nodes in index of df_nodes not found in df_edges')

    # Create a knowledge graph from the dataframes:
    # edges first (remaining df columns become edge attributes) ...
    kg = nx.from_pandas_edgelist(df,
                                 source=node_source,
                                 target=node_target,
                                 create_using=nx.DiGraph,
                                 edge_attr=True)
    # ... then attach the per-node attributes
    kg.add_nodes_from(df_nodes_attrs.to_dict('index').items())

    return kg
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: aiagents4pharma
3
- Version: 1.5.4
3
+ Version: 1.6.0
4
4
  Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D
5
5
  Classifier: Programming Language :: Python :: 3
6
6
  Classifier: License :: OSI Approved :: MIT License
@@ -10,6 +10,7 @@ Description-Content-Type: text/markdown
10
10
  License-File: LICENSE
11
11
  Requires-Dist: copasi_basico==0.78
12
12
  Requires-Dist: coverage==7.6.4
13
+ Requires-Dist: einops==0.8.0
13
14
  Requires-Dist: gdown==5.2.0
14
15
  Requires-Dist: huggingface_hub==0.26.5
15
16
  Requires-Dist: joblib==1.4.2
@@ -25,10 +26,14 @@ Requires-Dist: plotly==5.24.1
25
26
  Requires-Dist: pydantic==2.9.2
26
27
  Requires-Dist: pylint==3.3.1
27
28
  Requires-Dist: pytest==8.3.3
29
+ Requires-Dist: pytest-asyncio==0.25.2
28
30
  Requires-Dist: streamlit==1.39.0
31
+ Requires-Dist: sentence_transformers==3.3.1
29
32
  Requires-Dist: tabulate==0.9.0
30
- Requires-Dist: torch==2.5.1
33
+ Requires-Dist: torch==2.2.2
34
+ Requires-Dist: torch_geometric==2.6.1
31
35
  Requires-Dist: tqdm==4.66.6
36
+ Requires-Dist: transformers==4.48.0
32
37
  Requires-Dist: mkdocs==1.6.1
33
38
  Requires-Dist: mkdocs-jupyter==0.25.1
34
39
  Requires-Dist: mkdocs-material==9.5.47
@@ -16,8 +16,14 @@ aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py,sha256=QlzDXm
16
16
  aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py,sha256=-LaPLse8BkALqwFetNK7wch2dt9Dz6QKGKZKBKM6bIk,409
17
17
  aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py,sha256=KBMhCJ7yjMWqQJJctFYdpjYAlwv48Jl6i1dddXP4f08,7599
18
18
  aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py,sha256=Y-6-nORsnBJlU6rH0skyfr9S9J4PfTWK-af_p5UuknQ,7483
19
- aiagents4pharma-1.5.4.dist-info/LICENSE,sha256=IcIbyB1Hyk5ZDah03VNQvJkbNk2hkBCDqQ8qtnCvB4Q,1077
20
- aiagents4pharma-1.5.4.dist-info/METADATA,sha256=h2EUjQ_tbIGPYJkx7solW6NNWArVlykxyZkLI9uY0Gk,6746
21
- aiagents4pharma-1.5.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
22
- aiagents4pharma-1.5.4.dist-info/top_level.txt,sha256=-AH8rMmrSnJtq7HaAObS78UU-cTCwvX660dSxeM7a0A,16
23
- aiagents4pharma-1.5.4.dist-info/RECORD,,
19
+ aiagents4pharma/talk2knowledgegraphs/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py,sha256=6vQnPkeOWae_8jePjhma3sJuMTngy0I0tqzdFt6OqKg,2507
21
+ aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py,sha256=xRb0x7SoAb0nSVZYgjrqxWvENOMDuqIdL43NMjoOaCs,153
22
+ aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py,sha256=1nGznrAj-xT0xuSMBGz2dOujJ7M_IwSR84njxtxsy9A,2523
23
+ aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py,sha256=2vi_elf6EgzfagFAO5QnL3a_aXZyN7B1EBziu44MTfM,3806
24
+ aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py,sha256=36iKlisOpMtGR5xfTAlSHXWvPqVC_Jbezod8kbBBMVg,2136
25
+ aiagents4pharma-1.6.0.dist-info/LICENSE,sha256=IcIbyB1Hyk5ZDah03VNQvJkbNk2hkBCDqQ8qtnCvB4Q,1077
26
+ aiagents4pharma-1.6.0.dist-info/METADATA,sha256=kad1BCtIEP5RVf2x6uI_w4UJyM1mnDZz2R2JT7ilnZo,6931
27
+ aiagents4pharma-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
28
+ aiagents4pharma-1.6.0.dist-info/top_level.txt,sha256=-AH8rMmrSnJtq7HaAObS78UU-cTCwvX660dSxeM7a0A,16
29
+ aiagents4pharma-1.6.0.dist-info/RECORD,,