gsurgeon 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gsurgeon-1.0.0/LICENSE +21 -0
- gsurgeon-1.0.0/PKG-INFO +111 -0
- gsurgeon-1.0.0/README.md +83 -0
- gsurgeon-1.0.0/pyproject.toml +29 -0
- gsurgeon-1.0.0/src/gsurgeon/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/metrics/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/metrics/bootstrapping.py +24 -0
- gsurgeon-1.0.0/src/gsurgeon/metrics/finemapping.py +27 -0
- gsurgeon-1.0.0/src/gsurgeon/metrics/network.py +28 -0
- gsurgeon-1.0.0/src/gsurgeon/operations/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/operations/standard.py +80 -0
- gsurgeon-1.0.0/src/gsurgeon/procedures/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/procedures/react.py +81 -0
- gsurgeon-1.0.0/src/gsurgeon/procedures/standard.py +75 -0
- gsurgeon-1.0.0/src/gsurgeon/surgeon/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/surgeon/agent.py +176 -0
- gsurgeon-1.0.0/src/gsurgeon/surgeon/prompts.py +41 -0
- gsurgeon-1.0.0/src/gsurgeon/tools/__init__.py +0 -0
- gsurgeon-1.0.0/src/gsurgeon/tools/general.py +95 -0
- gsurgeon-1.0.0/src/gsurgeon/tools/gn.py +219 -0
- gsurgeon-1.0.0/src/gsurgeon/tools/ncbi.py +81 -0
gsurgeon-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 JoM
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gsurgeon-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gsurgeon
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A tool to dissect biology of model organisms using genomic information
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Johannes Medagbe
|
|
8
|
+
Author-email: johanmedagbe@gmail.com
|
|
9
|
+
Requires-Python: >=3.11,<3.15
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Dist: bio (>=1.8.3,<2.0.0)
|
|
16
|
+
Requires-Dist: botocore (>=1.43.11,<2.0.0)
|
|
17
|
+
Requires-Dist: dotenv (>=0.9.9,<0.10.0)
|
|
18
|
+
Requires-Dist: dspy (>=3.2.1,<4.0.0)
|
|
19
|
+
Requires-Dist: httpx (>=0.28.1,<0.29.0)
|
|
20
|
+
Requires-Dist: langchain-core (>=1.4.0,<2.0.0)
|
|
21
|
+
Requires-Dist: langgraph (>=1.2.0,<2.0.0)
|
|
22
|
+
Requires-Dist: pydantic (>=2.13.4,<3.0.0)
|
|
23
|
+
Requires-Dist: sparqlwrapper (>=2.0.0,<3.0.0)
|
|
24
|
+
Requires-Dist: typing-extensions (>=4.15.0,<5.0.0)
|
|
25
|
+
Project-URL: Homepage, https://github.com/johanmed/gsurgeon
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# GSurgeon: the genomic surgeon
|
|
29
|
+
|
|
30
|
+
## What is GSurgeon?
|
|
31
|
+
|
|
32
|
+
**GSurgeon** is an AI tool to dissect biology of model organisms through genomic information. It leverages LLM capabilities to send dynamic requests in natural language to genomic databases and extract any biological information.
|
|
33
|
+
|
|
34
|
+
## What questions can you ask GSurgeon?
|
|
35
|
+
|
|
36
|
+
**GSurgeon** has been tested on questions related to model organisms involving markers, genes and traits.
|
|
37
|
+
|
|
38
|
+
As such it has good performance on queries such as:
|
|
39
|
+
|
|
40
|
+
- Which genes on chromosome 1 of the mouse genome are related to inflammation and diabetes at the same time?
|
|
41
|
+
- List traits measured in GeneNetwork that are related to diabetes.
|
|
42
|
+
|
|
43
|
+
Other queries in the realm of biology and genomics are also possible.
|
|
44
|
+
|
|
45
|
+
## How to install and run GSurgeon?
|
|
46
|
+
|
|
47
|
+
#### 1. Get the source code
|
|
48
|
+
|
|
49
|
+
You can clone this repository.
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
git clone https://github.com/johanmed/gsurgeon.git
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
#### 2. Set tool parameters
|
|
56
|
+
|
|
57
|
+
**GSurgeon** expects a number of parameters to be defined for the surgery:
|
|
58
|
+
|
|
59
|
+
- N_ITERATIONS: number of operations
|
|
60
|
+
- MODEL_NAME: DSPy model identifier
|
|
61
|
+
- API_KEY: provider key
|
|
62
|
+
- EMAIL: email address for NCBI authentication
|
|
63
|
+
|
|
64
|
+
We recommend creating them in an environment file. For more details, see file `env_example`.
|
|
65
|
+
|
|
66
|
+
#### 3. Add gsurgeon path to your search path
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
export PATH="$PATH:/path/to/project"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Replace the path above with yours. You can also add it to your `~/.bashrc` file.
|
|
73
|
+
|
|
74
|
+
#### 4. Run your query
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
gsurgeon --env-file env_example "Which genes on chromosome 1 of the mouse genome are related to inflammation and diabetes at the same time?"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Replace the query above with yours.
|
|
81
|
+
|
|
82
|
+
## Why use GSurgeon?
|
|
83
|
+
|
|
84
|
+
#### 1. Access to up-to-date biological information
|
|
85
|
+
|
|
86
|
+
Accessing genomic information is a pain. It requires knowing the right databases to query as well as the skills to dig deep and find relevant information. **GSurgeon** makes the process easier for the community by providing a simple yet powerful interface to interact with biological databases in real time.
|
|
87
|
+
|
|
88
|
+
In the research ecosystem, this can be used for a variety of applications:
|
|
89
|
+
|
|
90
|
+
- literature review
|
|
91
|
+
- cross-checking of research findings against current knowledge
|
|
92
|
+
- hypothesis exploration
|
|
93
|
+
- biological link discovery
|
|
94
|
+
- advanced bioinformatic analyses
|
|
95
|
+
|
|
96
|
+
#### 2. Prevent hallucination and trust language models used in biology a bit more
|
|
97
|
+
|
|
98
|
+
Despite advances in language AI, hallucination remains a serious concern in biological research. **GSurgeon** offers a scalable solution by grounding generation in true information from biological databases. Current databases supported include:
|
|
99
|
+
- [GeneNetwork](https://genenetwork.org/): database service to explore biology of model organisms with bioinformatic tools
|
|
100
|
+
- [NCBI](https://www.ncbi.nlm.nih.gov/): database service for access and analysis of biological information
|
|
101
|
+
|
|
102
|
+
#### 3. Empower your LLM to handle with surgical precision the hard work for you with no limits
|
|
103
|
+
|
|
104
|
+
**GSurgeon** exploits the reasoning capabilities of LLMs to orchestrate the search for biological information. Using its knowledge of biological databases, it dynamically finds the best approach to answering your question or completing the task you have in mind.
|
|
105
|
+
|
|
106
|
+
The execution logic is abstracted to give full control to the agents. No need for extra coding!
|
|
107
|
+
|
|
108
|
+
The tool footprint is lightweight. Most of the computational resources required to run the system are handled by the provider. No need to have monstrous specs to get started!
|
|
109
|
+
|
|
110
|
+
**GSurgeon** can be executed on the command-line on any model, provided sufficient training.
|
|
111
|
+
|
gsurgeon-1.0.0/README.md
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# GSurgeon: the genomic surgeon
|
|
2
|
+
|
|
3
|
+
## What is GSurgeon?
|
|
4
|
+
|
|
5
|
+
**GSurgeon** is an AI tool to dissect biology of model organisms through genomic information. It leverages LLM capabilities to send dynamic requests in natural language to genomic databases and extract any biological information.
|
|
6
|
+
|
|
7
|
+
## What questions can you ask GSurgeon?
|
|
8
|
+
|
|
9
|
+
**GSurgeon** has been tested on questions related to model organisms involving markers, genes and traits.
|
|
10
|
+
|
|
11
|
+
As such it has good performance on queries such as:
|
|
12
|
+
|
|
13
|
+
- Which genes on chromosome 1 of the mouse genome are related to inflammation and diabetes at the same time?
|
|
14
|
+
- List traits measured in GeneNetwork that are related to diabetes.
|
|
15
|
+
|
|
16
|
+
Other queries in the realm of biology and genomics are also possible.
|
|
17
|
+
|
|
18
|
+
## How to install and run GSurgeon?
|
|
19
|
+
|
|
20
|
+
#### 1. Get the source code
|
|
21
|
+
|
|
22
|
+
You can clone this repository.
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/johanmed/gsurgeon.git
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
#### 2. Set tool parameters
|
|
29
|
+
|
|
30
|
+
**GSurgeon** expects a number of parameters to be defined for the surgery:
|
|
31
|
+
|
|
32
|
+
- N_ITERATIONS: number of operations
|
|
33
|
+
- MODEL_NAME: DSPy model identifier
|
|
34
|
+
- API_KEY: provider key
|
|
35
|
+
- EMAIL: email address for NCBI authentication
|
|
36
|
+
|
|
37
|
+
We recommend creating them in an environment file. For more details, see file `env_example`.
|
|
38
|
+
|
|
39
|
+
#### 3. Add gsurgeon path to your search path
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
export PATH="$PATH:/path/to/project"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Replace the path above with yours. You can also add it to your `~/.bashrc` file.
|
|
46
|
+
|
|
47
|
+
#### 4. Run your query
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
gsurgeon --env-file env_example "Which genes on chromosome 1 of the mouse genome are related to inflammation and diabetes at the same time?"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Replace the query above with yours.
|
|
54
|
+
|
|
55
|
+
## Why use GSurgeon?
|
|
56
|
+
|
|
57
|
+
#### 1. Access to up-to-date biological information
|
|
58
|
+
|
|
59
|
+
Accessing genomic information is a pain. It requires knowing the right databases to query as well as the skills to dig deep and find relevant information. **GSurgeon** makes the process easier for the community by providing a simple yet powerful interface to interact with biological databases in real time.
|
|
60
|
+
|
|
61
|
+
In the research ecosystem, this can be used for a variety of applications:
|
|
62
|
+
|
|
63
|
+
- literature review
|
|
64
|
+
- cross-checking of research findings against current knowledge
|
|
65
|
+
- hypothesis exploration
|
|
66
|
+
- biological link discovery
|
|
67
|
+
- advanced bioinformatic analyses
|
|
68
|
+
|
|
69
|
+
#### 2. Prevent hallucination and trust language models used in biology a bit more
|
|
70
|
+
|
|
71
|
+
Despite advances in language AI, hallucination remains a serious concern in biological research. **GSurgeon** offers a scalable solution by grounding generation in true information from biological databases. Current databases supported include:
|
|
72
|
+
- [GeneNetwork](https://genenetwork.org/): database service to explore biology of model organisms with bioinformatic tools
|
|
73
|
+
- [NCBI](https://www.ncbi.nlm.nih.gov/): database service for access and analysis of biological information
|
|
74
|
+
|
|
75
|
+
#### 3. Empower your LLM to handle with surgical precision the hard work for you with no limits
|
|
76
|
+
|
|
77
|
+
**GSurgeon** exploits the reasoning capabilities of LLMs to orchestrate the search for biological information. Using its knowledge of biological databases, it dynamically finds the best approach to answering your question or completing the task you have in mind.
|
|
78
|
+
|
|
79
|
+
The execution logic is abstracted to give full control to the agents. No need for extra coding!
|
|
80
|
+
|
|
81
|
+
The tool footprint is lightweight. Most of the computational resources required to run the system are handled by the provider. No need to have monstrous specs to get started!
|
|
82
|
+
|
|
83
|
+
**GSurgeon** can be executed on the command-line on any model, provided sufficient training.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "gsurgeon"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A tool to dissect biology of model organisms using genomic information"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Johannes Medagbe",email = "johanmedagbe@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
license = "MIT"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11,<3.15"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"langchain-core (>=1.4.0,<2.0.0)",
|
|
13
|
+
"langgraph (>=1.2.0,<2.0.0)",
|
|
14
|
+
"bio (>=1.8.3,<2.0.0)",
|
|
15
|
+
"httpx (>=0.28.1,<0.29.0)",
|
|
16
|
+
"pydantic (>=2.13.4,<3.0.0)",
|
|
17
|
+
"typing-extensions (>=4.15.0,<5.0.0)",
|
|
18
|
+
"dotenv (>=0.9.9,<0.10.0)",
|
|
19
|
+
"dspy (>=3.2.1,<4.0.0)",
|
|
20
|
+
"botocore (>=1.43.11,<2.0.0)",
|
|
21
|
+
"sparqlwrapper (>=2.0.0,<3.0.0)",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/johanmed/gsurgeon"
|
|
26
|
+
|
|
27
|
+
[build-system]
|
|
28
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
29
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Module with bootstrapping logic"""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap(element_ranks: dict[str, list[int]]) -> dict[tuple, float]:
    """
    Compute bootstrap rank for each element

    The bootstrap rank of an element is its most frequent rank. The value
    attached to it is the fraction of that element's observations that
    agree with this rank.

    Args:
        element_ranks: dictionary of elements and ranks (list)
    Output:
        Dictionary keyed by (element, most frequent rank) whose values are
        the supporting fractions, ordered by ascending rank. Elements with
        no recorded rank are left out; a missing or empty input gives an
        empty dictionary.
    """
    scored = []
    for element, ranks in (element_ranks or {}).items():
        if not ranks:
            # Nothing was observed for this element, so there is no most
            # frequent rank and no denominator for the support.
            continue
        counts = Counter(ranks)
        # Highest frequency wins; ties go to the lowest rank.
        top_rank, frequency = min(
            counts.items(), key=lambda item: (-item[1], item[0])
        )
        scored.append(((element, top_rank), frequency / len(ranks)))
    # Stable sort: elements sharing a rank keep their input order.
    scored.sort(key=lambda entry: entry[0][1])
    return dict(scored)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Module with constructs to compute metrics for finemapping"""
|
|
2
|
+
|
|
3
|
+
import dspy
|
|
4
|
+
from gsurgeon.metrics.bootstrapping import bootstrap
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ExtractGene(dspy.Signature):
|
|
8
|
+
"""
|
|
9
|
+
Construct a list of genes consistent across answers.
|
|
10
|
+
Scan answers and extract rank for each gene in the previous list.
|
|
11
|
+
Build a dictionary with gene as key and list of ranks as value.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
query: str = dspy.InputField(desc="Finemapping query")
|
|
15
|
+
answers: list[str] = dspy.InputField(
|
|
16
|
+
desc="List of answers to the query, each reporting ranked list of genes"
|
|
17
|
+
)
|
|
18
|
+
gene_ranks: dict[str, list[int]] = dspy.OutputField(
|
|
19
|
+
desc="Dictionary of genes and assigned ranks. Example: {'gene A': [1, 1, 2], 'gene B': [2, 3, 2]}"
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def bootstrap_rank(query: str, answers: list[str]) -> dict[tuple, float]:
    """
    Compute a bootstrap rank for each gene

    Args:
        query: finemapping query the answers respond to
        answers: answers to the query, each reporting a ranked list of genes
    Output:
        Result of `bootstrap` applied to the gene ranks extracted from the
        answers; empty when no gene ranks were extracted
    """
    extract = dspy.Predict(ExtractGene)
    prediction = extract(query=query, answers=answers)
    # `.get` returns None when the field is absent; hand `bootstrap` an
    # empty mapping instead so it never receives None.
    gene_ranks = prediction.get("gene_ranks") or {}
    return bootstrap(gene_ranks)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Module with constructs to compute metrics for network analysis"""
|
|
2
|
+
|
|
3
|
+
import dspy
|
|
4
|
+
from gsurgeon.metrics.bootstrapping import bootstrap
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ExtractEdge(dspy.Signature):
    """
    Construct a list of connections consistent across answers.
    Scan answers and extract rank for each connection in the previous list.
    Build a dictionary with edge as key and list of ranks as value.
    Example: {"gene A -> gene C": [1, 1, 2], "gene B -> gene D": [2, 3, 2]}
    """

    # NOTE: per the dspy Signature documentation, the class docstring is the
    # task instruction and each `desc` is field-level guidance, so this text
    # is part of the prompt.
    query: str = dspy.InputField(desc="Network analysis query")
    answers: list[str] = dspy.InputField(
        desc="List of answers to the query, each reporting a ranked list of connections between genes"
    )
    # Maps each connection to the ranks it received across the answers.
    edge_ranks: dict[str, list[int]] = dspy.OutputField(
        desc="Dictionary of connections and assigned ranks"
    )
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def bootstrap_rank(query: str, answers: list[str]) -> dict[tuple, float]:
    """
    Compute a bootstrap rank for each edge

    Args:
        query: network analysis query the answers respond to
        answers: answers to the query, each reporting ranked connections
            between genes
    Output:
        Result of `bootstrap` applied to the edge ranks extracted from the
        answers; empty when no edge ranks were extracted
    """
    extract = dspy.Predict(ExtractEdge)
    prediction = extract(query=query, answers=answers)
    # `.get` returns None when the field is absent; hand `bootstrap` an
    # empty mapping instead so it never receives None.
    edge_ranks = prediction.get("edge_ranks") or {}
    return bootstrap(edge_ranks)
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Module with standard operation constructs for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
import dspy
|
|
6
|
+
from gsurgeon.procedures.standard import Reproduce
|
|
7
|
+
from gsurgeon.surgeon.agent import GSurgeon
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def operate(query: str, n_steps: int = 5) -> str:
    """Run one GSurgeon operation on the query, capped at n_steps steps"""
    agent = GSurgeon(max_steps=n_steps)
    answer = await agent.handle(query)
    return answer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def reoperate(query: str, n_steps: int = 5, n_iterations: int = 5) -> str:
    """
    Execute operation or analysis a given number of times for reproducibility

    Args:
        query: inquiry
        n_steps: max number of steps allowed during operation
        n_iterations: number of operation repetitions
    Output:
        Consensus resulting from different runs
    """
    print(f"Bootstrapping operation {n_iterations} times for query...")
    # Start every repetition concurrently; gather returns the results in
    # the order the coroutines were passed in.
    results = await asyncio.gather(
        *(operate(query, n_steps) for _ in range(n_iterations))
    )
    print("Bootstrapped run completed")
    reproduce = dspy.Predict(Reproduce)
    # The predictor call is synchronous. Running it in a worker thread keeps
    # the event loop free for sibling coroutines (several `reoperate` calls
    # are gathered together by `serialize` and `meta_analyze`).
    outcome = await asyncio.to_thread(reproduce, query=query, results=results)
    return outcome.get("consensus")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def serialize(
    queries: list, n_steps: int = 5, n_iterations: int = 5
) -> dict:
    """
    Execute operation a given number of times for a set/series of queries

    Each query is answered twice: once by a plain chain-of-thought baseline
    and once by the bootstrapped surgeon, so the two can be compared.

    Args:
        queries: list of queries to investigate
        n_steps: max number of steps allowed during operation
        n_iterations: number of operation repetitions
    Output:
        Dictionary of query and responses: each key is a label built from
        the query, each value is [baseline response, surgeon response]
    """

    def _baseline(query):
        # A fresh predictor per call, so worker threads never share one.
        return dspy.ChainOfThought("query -> answer: str")(query=query).get("answer")

    # The baseline calls are synchronous; push them to worker threads so they
    # overlap with each other and do not block the event loop.
    base_results = await asyncio.gather(
        *(asyncio.to_thread(_baseline, query) for query in queries)
    )

    surgeon_results = await asyncio.gather(
        *(reoperate(query, n_steps, n_iterations) for query in queries)
    )

    # gather preserves input order, so the three sequences line up; strict
    # zip turns any mismatch into an error instead of silent truncation.
    return {
        f"Query was '{query}'": [
            f"Base response was '{base_result}'",
            f"Surgeon response was: '{surgeon_result}'",
        ]
        for query, base_result, surgeon_result in zip(
            queries, base_results, surgeon_results, strict=True
        )
    }
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def meta_analyze(
    query: str, n_steps: int = 5, n_iterations: int = 5, n_bootstraps: int = 10
) -> list[str]:
    """
    Run the bootstrapped operation on one genomic task several times over

    Args:
        query: genomic task
        n_steps: max number of steps allowed during operation
        n_iterations: number of operation repetitions
        n_bootstraps: number of repetition sampling for statistical support
    Output:
        One consensus per bootstrap sample, in launch order
    """
    samples = []
    for _ in range(n_bootstraps):
        # Collect the coroutines first; they only start once gathered below.
        samples.append(reoperate(query, n_steps, n_iterations))
    return await asyncio.gather(*samples)
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Module with ReAct procedures (dspy programs) for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import dspy
|
|
4
|
+
from gsurgeon.tools.general import checker, reformulator, splitter
|
|
5
|
+
from gsurgeon.tools.gn import make_sparql_tool
|
|
6
|
+
from gsurgeon.tools.ncbi import ncbi_searcher, record_fetcher, record_synthesizer
|
|
7
|
+
from langchain_core.messages import BaseMessage
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ReactSig(dspy.Signature):
|
|
11
|
+
query: list[BaseMessage] = dspy.InputField()
|
|
12
|
+
reasoning: str = dspy.OutputField(desc="Concise explanation of solution")
|
|
13
|
+
solution: str = dspy.OutputField(
|
|
14
|
+
desc="Final answer to the query in 2000 words max making use of all relevant information in accumulated context."
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Research(dspy.Module):
|
|
19
|
+
"""
|
|
20
|
+
Address a query or plan to completion using GeneNetwork resources only.
|
|
21
|
+
For efficiency, only call a tool when it is strictly necessary in completing the next task.
|
|
22
|
+
Use splitter when input query is too complex to be handled in a single step.
|
|
23
|
+
Harness the reformulator to clarify a request when it seems ambiguous.
|
|
24
|
+
To get a specific information, call the fetcher. It has access to data and can extract any information.
|
|
25
|
+
Once an information is extracted, check its relevance with the checker before proceeding.
|
|
26
|
+
For reproducibility, seed your reasoning on the following master thoughts:
|
|
27
|
+
1. GeneNetwork has the data requested by the user and it can be obtained with targeted SPARQL queries.
|
|
28
|
+
2. Reasoning does not need to be complicated. Prefer the simplest approach.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(self):
|
|
32
|
+
super().__init__()
|
|
33
|
+
fetcher = make_sparql_tool("https://sparql.genenetwork.org/sparql")
|
|
34
|
+
self.tools = [splitter, checker, reformulator, fetcher]
|
|
35
|
+
|
|
36
|
+
self.react = dspy.ReAct(
|
|
37
|
+
signature=ReactSig,
|
|
38
|
+
tools=self.tools,
|
|
39
|
+
max_iters=10, # maximum number of steps for reasoning and tool calling
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
def forward(self, query: list[BaseMessage]):
|
|
43
|
+
return self.react(query=query)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Consult(dspy.Module):
|
|
47
|
+
"""
|
|
48
|
+
Address a query or plan to completion using NCBI resources only.
|
|
49
|
+
For effficiency, only call a tool when it is strictly necessary in completing the next task.
|
|
50
|
+
Use splitter when input query is too complex to be handled in a single step.
|
|
51
|
+
Harness the reformulator to clarify a request when it seems ambiguous.
|
|
52
|
+
Extract answers from NCBI by performing first a search with ncbi_searcher. The search terms must be as specific as possible.
|
|
53
|
+
When search results contain records, fetch information with record_fetcher.
|
|
54
|
+
For records with a lot of data specifically, take some time to synthesize informations.
|
|
55
|
+
Check relevance of generated information with the checker before proceeding.
|
|
56
|
+
Every information you extracted regarding genes and/or functions must be verified with another search using proper terms with ncbi_searcher.
|
|
57
|
+
You must ascertain that all informations in the final answer are true at all cost.
|
|
58
|
+
For reproducibility, seed your reasoning on the following master thoughts:
|
|
59
|
+
1. NCBI has the data requested by the user and it can be obtained by using a combination of terms with AND or OR keywords.
|
|
60
|
+
2. Reasoning does not need to be complicated. Prefer the simplest approach.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
def __init__(self):
|
|
64
|
+
super().__init__()
|
|
65
|
+
self.tools = [
|
|
66
|
+
splitter,
|
|
67
|
+
checker,
|
|
68
|
+
reformulator,
|
|
69
|
+
ncbi_searcher,
|
|
70
|
+
record_fetcher,
|
|
71
|
+
record_synthesizer,
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
self.react = dspy.ReAct(
|
|
75
|
+
signature=ReactSig,
|
|
76
|
+
tools=self.tools,
|
|
77
|
+
max_iters=20,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
def forward(self, query: list[BaseMessage]):
|
|
81
|
+
return self.react(query=query)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Module with standard procedures (dspy modules) for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import dspy
|
|
4
|
+
from langchain_core.messages import BaseMessage
|
|
5
|
+
from typing_extensions import Literal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Plan(dspy.Signature):
|
|
9
|
+
"""
|
|
10
|
+
Generate plan to solve query in background.
|
|
11
|
+
For reproducibility, seed the plan generation on the following master thought:
|
|
12
|
+
The query submitted by the user can be addressed in less than 5 straightforward and targeted steps. No need to complicate things.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
background: list[BaseMessage] = dspy.InputField()
|
|
16
|
+
answer: str = dspy.OutputField(desc="The plan to solve the task")
|
|
17
|
+
reasoning: str = dspy.OutputField(
|
|
18
|
+
desc="Concise explanation of the output in 50 words"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Tune(dspy.Signature):
|
|
23
|
+
"""Make recommendations to improve user satisfaction to answer generated so far to query"""
|
|
24
|
+
|
|
25
|
+
background: list[BaseMessage] = dspy.InputField()
|
|
26
|
+
answer: str = dspy.OutputField(desc="The new questions")
|
|
27
|
+
reasoning: str = dspy.OutputField(
|
|
28
|
+
desc="Concise explanation of the output in 50 words"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Supervise(dspy.Signature):
|
|
33
|
+
"""
|
|
34
|
+
Decide the next action the system should take.
|
|
35
|
+
To select the next step, you must take into account the query and the curent context.
|
|
36
|
+
If the query is not related to GeneNetwork traits, do not call gn_researcher. ncbi_expert should be the main actor.
|
|
37
|
+
Similarly, do not call the ncbi_expert if the query is GeneNetwork specific.
|
|
38
|
+
When the query is related to genes, finemapping or network analysis, you must call the ncbi_expert and not the gn_researcher.
|
|
39
|
+
Call the reflector only to improve generation from gn_researcher and ncbi_expert.
|
|
40
|
+
Act on suggestions proposed by reflector using the most appropriate actor between gn_researcher and ncbi_expert depending on the query.
|
|
41
|
+
End execution if there is nothing else to do.
|
|
42
|
+
"""
|
|
43
|
+
|
|
44
|
+
background: list[BaseMessage] = dspy.InputField()
|
|
45
|
+
next_decision: Literal["gn_researcher", "ncbi_expert", "reflector", "end"] = (
|
|
46
|
+
dspy.OutputField(desc="The next step to take based on instructions")
|
|
47
|
+
)
|
|
48
|
+
reasoning: str = dspy.OutputField(
|
|
49
|
+
desc="Concise explanation of the decision in 50 words"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Finalize(dspy.Signature):
|
|
54
|
+
"""Build the final synthesis to send back to the user in less than 500 words"""
|
|
55
|
+
|
|
56
|
+
messages: list[BaseMessage] = dspy.InputField()
|
|
57
|
+
feedback: str = dspy.OutputField(
|
|
58
|
+
desc="Detailed and comprehensive final feedback combining AI outputs in the list of messages and linking them when necessary"
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Reproduce(dspy.Signature):
|
|
63
|
+
"""
|
|
64
|
+
Extract answers that are consistent across results.
|
|
65
|
+
Do not include information that is missing in some results for reproducibility.
|
|
66
|
+
Synthesize a coherent and detailed solution to the query using answer consensus.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
query: str = dspy.InputField()
|
|
70
|
+
results: list = dspy.InputField(
|
|
71
|
+
desc="List of results generated to the same query by the system"
|
|
72
|
+
)
|
|
73
|
+
consensus: str = dspy.OutputField(
|
|
74
|
+
desc="Final output built from consistent answers across results"
|
|
75
|
+
)
|
|
File without changes
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""GSurgeon: Multi-agent system to dissect genomic information"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import dspy
|
|
12
|
+
from gsurgeon.procedures.react import Consult, Research
|
|
13
|
+
from gsurgeon.procedures.standard import Finalize, Plan, Supervise, Tune
|
|
14
|
+
from gsurgeon.surgeon.prompts import (
|
|
15
|
+
expert_prompt,
|
|
16
|
+
planner_prompt,
|
|
17
|
+
reflector_prompt,
|
|
18
|
+
researcher_prompt,
|
|
19
|
+
supervisor_prompt1,
|
|
20
|
+
supervisor_prompt2,
|
|
21
|
+
)
|
|
22
|
+
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
|
|
23
|
+
from langgraph.graph import END, START, StateGraph
|
|
24
|
+
from langgraph.graph.message import add_messages
|
|
25
|
+
from langgraph.prebuilt import ToolNode, tools_condition
|
|
26
|
+
from pydantic import BaseModel
|
|
27
|
+
from typing_extensions import Annotated, Literal
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AgentState(BaseModel):
|
|
31
|
+
"""
|
|
32
|
+
Represent agent state
|
|
33
|
+
Avail 02 attributes to allow communication between agents
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
messages: Annotated[list[BaseMessage], add_messages]
|
|
37
|
+
next_decision: Literal[
|
|
38
|
+
"gn_researcher", "planner", "reflector", "ncbi_expert", "end"
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class GSurgeon:
    """
    Represent Search Agent

    Input:
        max_steps: threshold used by the supervisor to stop the run; it is
            compared against the number of messages the supervisor assembles
            (conversation history plus its two prompts)
    Operations:
        Initialization of multi-agent graph
        Run of query through system
    """

    max_steps: int = 5
    _graph: Any = field(init=False)

    def __post_init__(self):
        # Compile the graph once per instance; every query reuses it.
        self._graph = self._build_graph()

    async def _researcher(self, state: AgentState) -> dict:
        """Answer query with GeneNetwork information"""
        print("Calling the researcher...")
        if len(state.messages) < 3:  # handle first call to researcher
            input_text = state.messages[0]  # use original query
        else:
            input_text = state.messages[-1]  # use reflection insights
        input_text = [researcher_prompt, input_text.content]
        research = Research()
        # The research call is synchronous; run it in a worker thread so the
        # event loop is not blocked.
        result = await asyncio.to_thread(research, query=input_text)
        print("Researcher performed analysis")
        return {
            "messages": [AIMessage(result.get("solution"))],
        }

    async def _expert(self, state: AgentState) -> dict:
        """Answer query with NCBI information"""
        print("Calling the expert...")
        if len(state.messages) < 4:  # handle first call to expert
            # NOTE(review): this relies on `+` between two message objects;
            # confirm the combined value is what `Consult` expects, since the
            # researcher passes `.content` strings instead.
            input_text = state.messages[1] + state.messages[0]  # use plan and query
        else:
            # NOTE(review): the researcher reads messages[-1] for the
            # reflection while this reads messages[-2] -- confirm intended.
            input_text = state.messages[-2]  # use reflection insights
        input_text = [expert_prompt, input_text]
        consult = Consult()
        result = await asyncio.to_thread(consult, query=input_text)
        print("Expert produced answers")
        return {
            "messages": [AIMessage(result.get("solution"))],
        }

    async def _planner(self, state: AgentState) -> dict:
        """Generate a plan to solve query"""
        print("Generating a plan to solve the problem...")
        plan = dspy.Predict(Plan)
        input_text = [planner_prompt] + state.messages
        result = await asyncio.to_thread(plan, background=input_text)
        print("Plan acquired")
        return {
            "messages": [AIMessage(result.get("answer"))],
        }

    async def _reflector(self, state: AgentState) -> dict:
        """Propose improvements to answer"""
        print("Calling the reflector...")
        tune = dspy.Predict(Tune)
        # Swap the roles of every message after the original query, so that
        # AI messages are presented as human ones and vice versa.
        trans_map = {AIMessage: HumanMessage, HumanMessage: AIMessage}
        translated_messages = [reflector_prompt, state.messages[0]] + [
            trans_map[msg.__class__](content=msg.content) for msg in state.messages[1:]
        ]
        result = await asyncio.to_thread(tune, background=translated_messages)
        print("Reflector made suggestions")
        return {
            "messages": [
                HumanMessage(
                    f"Progress has been made. Use now all the resources to addess this new suggestion: {result.get('answer')}"
                )
            ],
        }

    async def _supervisor(self, state: AgentState) -> dict:
        """Orchestrate agentic system"""
        print("Getting guidance from the supervisor...")
        supervise = dspy.Predict(Supervise)
        messages = [
            supervisor_prompt1,
            *state.messages,
            supervisor_prompt2,
        ]
        # The count includes the two supervisor prompts, so the run stops
        # once the history holds more than `max_steps - 2` messages.
        if len(messages) > self.max_steps:
            return {"next_decision": "end"}
        result = await asyncio.to_thread(supervise, background=messages)
        print("Supervisor selected the next worker")
        return {
            "next_decision": result.get("next_decision"),
        }

    def _build_graph(self) -> Any:
        """Wire the nodes into a compiled state graph.

        The planner runs first; every worker reports back to the supervisor,
        which routes to the next worker or to END via `next_decision`.
        """
        graph_builder = StateGraph(AgentState)
        graph_builder.add_node("gn_researcher", self._researcher)
        graph_builder.add_node("planner", self._planner)
        graph_builder.add_node("reflector", self._reflector)
        graph_builder.add_node("supervisor", self._supervisor)
        graph_builder.add_node("ncbi_expert", self._expert)
        graph_builder.add_edge(START, "planner")
        graph_builder.add_edge("planner", "supervisor")
        graph_builder.add_edge("gn_researcher", "supervisor")
        graph_builder.add_edge("ncbi_expert", "supervisor")
        graph_builder.add_edge("reflector", "supervisor")
        graph_builder.add_conditional_edges(
            "supervisor",
            lambda state: state.next_decision,
            {
                "reflector": "reflector",
                "gn_researcher": "gn_researcher",
                "ncbi_expert": "ncbi_expert",
                "end": END,
            },
        )
        return graph_builder.compile()

    async def _run_graph(self, query: str) -> Any:
        """Invoke the compiled graph with *query* as the first message."""
        initial_state = {
            "messages": [HumanMessage(query)],
            "next_decision": "planner",  # always plan first
        }
        return await self._graph.ainvoke(initial_state)

    async def handle(self, query: str) -> str:
        """Run query through the system

        Returns a string holding the raw feedback (content of one message of
        the run) followed by the feedback produced by the `Finalize`
        predictor over the whole message history.
        """
        print("Starting operation...")
        result = await self._run_graph(query)
        messages = result.get("messages") or []
        # messages[0] is the query and messages[1] the plan, so index 2 is
        # the first worker output. The run can end before any worker has
        # answered; fall back to the latest message instead of raising
        # IndexError.
        if len(messages) > 2:
            unprocessed_result = messages[2].content
        elif messages:
            unprocessed_result = messages[-1].content
        else:
            unprocessed_result = ""
        finalize = dspy.Predict(Finalize)
        processed_result = await asyncio.to_thread(
            lambda: finalize(messages=messages).get("feedback")
        )
        print("Operation complete")
        return f"Raw feedback: {unprocessed_result}\nProcessed feedback: {processed_result}"
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Module with system prompts"""
|
|
2
|
+
|
|
3
|
+
from langchain_core.messages import SystemMessage
|
|
4
|
+
|
|
5
|
+
# System prompt describing the GeneNetwork researcher role.
researcher_prompt = SystemMessage(
    """
    You are a researcher who have access to a variety of tools built on GeneNetwork RDF knowledge base. You can investigate any question users might have. Leverage the tools at your disposal to efficiently extract the answer to the query. While carrying out your job, you must follow the plan proposed by the planner. Do not leave anything out. Verify the final answer before reporting it.
    """
)

# Opening supervisor prompt: introduces the workers and the routing task.
supervisor_prompt1 = SystemMessage(
    """
    You are a supervisor for a genomic analysis. You tasked with managing a conversation between the following workers: [gn_researcher, ncbi_expert, reflector]. Given the following user request, respond with the worker to act next. Each worker will perform a task and respond with its results.
    Follow the plan made by the planner to decide the next node. Do not finish before completing the plan. When finished, respond with end. Make sure to reflect right before finishing.
    """
)

# Closing supervisor prompt: asks for the next worker (or end) given the
# conversation that precedes it.
supervisor_prompt2 = SystemMessage(
    """
    Given the conversation above, who should act next? Or should we end? Select one of: [gn_researcher, ncbi_expert, reflector, end]. You must help in making progress towards executing and completing the plan. Look at the messages. Do not repeat the same step consecutively. For example, do not call the expert two times consecutively.
    """
)

# System prompt asking for a step-by-step plan for the query.
planner_prompt = SystemMessage(
    """
    You are an experienced and powerful task planner for genomic analysis. Generate a list of clear and relevant steps to take to solve the query below.
    """
)

# System prompt asking for follow-up questions that improve the answer.
reflector_prompt = SystemMessage(
    """
    You have been doing research for almost 50 years and have a very deep knowledge of biology, genomics and bioinformatics. You always have relevant follow questions. Improve the system answer by providing follow up questions.
    """
)

# System prompt describing the tool-using expert role and asking it to
# execute the plan and report intermediary results.
expert_prompt = SystemMessage(
    """
    You are a powerful system that have access to specialized tools to fetch relevant information and help you achieve your task. With those tools, you can extract any information in biology and specifically in genetics and genomics. Regardless of the organism, you can find the information that is requested.
    Follow and execute step-by-step the plan below to solve the query further below using your knowledge and the tools at your disposal. Make sure to return the final solution alongside with intermediary results. Be accurate and thorough. Always countercheck your results and their relevance to ensure satisfaction.
    """
)
|
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Modules with general tools for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import dspy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Split(dspy.Signature):
    """Split query into multiple atomic subqueries easier to handle for better satisfaction"""

    # NOTE: DSPy uses a Signature docstring as the task instructions, so the
    # docstring above is runtime text and is kept verbatim.
    # Input: the query to decompose.
    query: str = dspy.InputField()
    # Output: the subqueries, as a list of strings.
    answer: list[str] = dspy.OutputField(desc="The list of smaller tasks")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def split_query(query: str) -> list[str]:
    """Decompose *query* with a `Split` predictor and return its `answer` field."""
    predictor = dspy.Predict(Split)
    prediction = predictor(query=query)
    return prediction.get("answer")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Tool wrapper exposing `split_query` under the name "splitter".
splitter = dspy.Tool(
    func=split_query,
    name="splitter",
    desc="Process a query by splitting into atomic subqueries for efficiency",
    args={"query": {"type": "string", "desc": "Query to process"}},
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Check(dspy.Signature):
    """Check if info is relevant to query"""

    # NOTE: DSPy uses a Signature docstring as the task instructions, so the
    # docstring above is runtime text and is kept verbatim.
    # Inputs: the query and the information to judge against it.
    query: str = dspy.InputField()
    info: str = dspy.InputField()
    # Output: the verdict as free text; the field description asks for
    # 'yes' or 'no' but nothing here enforces it.
    decision: str = dspy.OutputField(desc="Say 'yes' or 'no'")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def check_relevance(query: str, info: str) -> str:
    """Judge *info* against *query* with a `Check` predictor.

    Returns the `decision` field of the prediction.
    """
    predictor = dspy.Predict(Check)
    verdict = predictor(query=query, info=info)
    return verdict.get("decision")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Tool wrapper exposing `check_relevance` under the name "checker".
checker = dspy.Tool(
    func=check_relevance,
    name="checker",
    desc="Check if information previously extracted is relevant for the query",
    args={
        "query": {"type": "string", "desc": "Query to address"},
        "info": {
            "type": "string",
            "desc": "Information extracted in attempt to provide answer to query",
        },
    },
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Rephrase(dspy.Signature):
    """Reformulate query given target and context accumulated so far"""

    # NOTE: DSPy uses a Signature docstring as the task instructions, so the
    # docstring above is runtime text and is kept verbatim.
    # Inputs: the query to rewrite, the target it serves, and the context
    # gathered so far.
    query: str = dspy.InputField()
    target: str = dspy.InputField()
    background: str = dspy.InputField()
    # Output: the rewritten query.
    reformulation: str = dspy.OutputField(desc="Reformulated query")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def rephrase_query(query: str, target: str, background: str) -> str:
    """Rewrite *query* with a `Rephrase` predictor, given *target* and *background*.

    Returns the `reformulation` field of the prediction.
    """
    predictor = dspy.Predict(Rephrase)
    prediction = predictor(query=query, target=target, background=background)
    return prediction.get("reformulation")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Tool wrapper exposing `rephrase_query` under the name "reformulator".
reformulator = dspy.Tool(
    func=rephrase_query,
    name="reformulator",
    desc="Reformulate the query to be next processed in light of the context accumulated so far (background) and the target",
    args={
        "query": {"type": "string", "desc": "Query to be reformulated"},
        "target": {"type": "string", "desc": "Original query or target"},
        "background": {
            "type": "string",
            "desc": "Accumulated context in effort to achieve the target",
        },
    },
)
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Module with GeneNetwork SPARQL tools for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import concurrent.futures
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import dspy
|
|
8
|
+
import httpx
|
|
9
|
+
from SPARQLWrapper import JSON, SPARQLWrapper
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class QueryTranslation(dspy.Signature):
    """Compare object snapshot in schema hint to keywords in the original query to find best semantic matches.
    Use matches to generate valid SPARQL SELECT queries that can retrieve relevant information for the query.
    CRITICAL:
    1. Every query MUST start with the PREFIX declarations. Only use declared prefixes.
    2. Leverage as many schema hints as possible.

    When querying SPARQL, prefer fast, efficient SPARQL SELECT queries
    that avoid Virtuoso timeouts (504 errors).

    PERFORMANCE RULES:
    1. Always add `LIMIT` - start with `LIMIT 50`, increase only if needed. Never omit `LIMIT`.
    2. Never use `SELECT *` - list only the variables you actually need.
    3. Avoid expensive operations: no Cartesian products, no cross joins, no full graph scans.
    4. Use specific FILTER patterns that leverage indexes:
    - Prefer `STRSTARTS(?label, "prefix")` over `CONTAINS` or regex.
    - Avoid `FILTER regex(...)` - it disables indexes.
    - Use `FILTER(?value = "exact")` or `IN` with small lists.
    5. Prefer property paths over multiple joins when traversing a chain.
    6. Use VALUES blocks for small sets of constants instead of UNION or OPTIONAL.
    7. Avoid ORDER BY on large result sets - if needed, combine with `LIMIT` and a narrow `WHERE` clause.
    8. Never use nested subqueries unless absolutely necessary; flatten them.
    9. Use `OPTIONAL` only for truly optional patterns – otherwise, use a simple triple pattern.
    """

    # NOTE: DSPy uses a Signature docstring as the task instructions, so the
    # docstring above is runtime text and is kept verbatim.
    # Inputs: the user's query and the schema hint text.
    original_query: str = dspy.InputField(desc="User query")
    schema_hint: str = dspy.InputField(desc="GeneNetwork schema from Virtuoso")
    # Output: the generated SPARQL queries, as a list of strings.
    translated_queries: list[str] = dspy.OutputField(
        desc="Top 10 valid SPARQL SELECT query with PREFIX declarations."
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def make_sparql_tool(sparql_uri: str) -> dspy.Tool:
    """Create the `sparql_fetcher` tool bound to the endpoint *sparql_uri*.

    The tool's function translates the incoming query into SPARQL SELECT
    queries with a `QueryTranslation` predictor (guided by a schema hint
    built from the endpoint), runs them concurrently over HTTP and returns a
    text report with one entry per query.

    Args:
        sparql_uri: URL of the SPARQL endpoint, used both to discover the
            properties listed in the schema hint and to run the queries.

    Returns:
        A `dspy.Tool` named "sparql_fetcher" wrapping that function.
    """
    # Imported locally because the module does not import `random` at file
    # level; without it the retry back-off raised NameError, which was then
    # reported as a failed query instead of being retried.
    import random

    prefix_map = {
        "http://rdf.genenetwork.org/v1/term/": "gnt",
        "http://rdf.genenetwork.org/v1/category/": "gnc",
        "http://rdf.genenetwork.org/v1/id/": "gn",
        "http://purl.org/dc/terms/": "dct",
        "http://www.w3.org/ns/dcat#": "dcat",
        "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
        "http://www.w3.org/2004/02/skos/core#": "skos",
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
        "http://www.w3.org/2002/07/owl#": "owl",
        "http://purl.org/linked-data/cube#": "qb",
        "http://purl.org/linked-data/sdmx/2009/measure#": "sdmx-measure",
        "http://rdf-vocabulary.ddialliance.org/xkos#": "xkos",
        "https://schema.org/": "schema",
        "http://rdf.ncbi.nlm.nih.gov/pubmed/": "pubmed",
        "http://xmlns.com/foaf/0.1/": "foaf",
        "http://purl.org/spar/fabio/": "fabio",
        "http://prismstandard.org/namespaces/basic/2.0/": "prism",
    }
    # Longest namespace first so the most specific prefix is tried first.
    ordered_prefixes = sorted(prefix_map.items(), key=lambda item: -len(item[0]))

    def uri_to_qname(uri: str) -> str:
        """Convert a full URI to a prefixed name, or return the URI in angle brackets."""
        for ns, prefix in ordered_prefixes:
            if uri.startswith(ns):
                return f"{prefix}:{uri[len(ns):]}"
        return f"<{uri}>"

    def fetch_schema(sparql_uri: str) -> tuple[set[str], set[str]]:
        """Fetch literal and object properties from the live Virtuoso endpoint.
        Return (literal_props, iri_props) where each is a set of full URIs.
        """
        sparql = SPARQLWrapper(sparql_uri)
        sparql.setReturnFormat(JSON)

        def distinct_predicates(object_test: str) -> set[str]:
            """Return the predicates whose objects satisfy *object_test*."""
            sparql.setQuery(
                f"""
                SELECT DISTINCT ?p
                WHERE {{ ?s ?p ?o . FILTER {object_test}(?o) }}
                """
            )
            converted = sparql.queryAndConvert()
            return {
                b["p"]["value"]
                for b in converted.get("results", {}).get("bindings", [])
                if b.get("p")
            }

        literal_props = distinct_predicates("isLiteral")
        iri_props = distinct_predicates("isIRI")
        return literal_props, iri_props

    def build_schema_hint(sparql_uri: str) -> str:
        """Build a compact schema hint from the live Virtuoso endpoint."""
        literal_props, iri_props = fetch_schema(sparql_uri)
        # Sorted so the hint text is identical from one run to the next
        # (set iteration order is not stable across processes).
        literal_list = ", ".join(sorted(uri_to_qname(uri) for uri in literal_props))
        iri_list = ", ".join(sorted(uri_to_qname(uri) for uri in iri_props))
        return f"""=== GENENETWORK SCHEMA (from Virtuoso) ===
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX gn: <http://rdf.genenetwork.org/v1/id/>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX gnc: <http://rdf.genenetwork.org/v1/category/>
PREFIX gnt: <http://rdf.genenetwork.org/v1/term/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

LITERAL PROPERTIES (object is a string/number/date):
{literal_list}

IRI PROPERTIES (object is a URI / another resource):
{iri_list}

SPECIAL HINTS FOR TRIPLE GENERATION:
1. To check if a trait is mapped, use: `?trait a gnt:mappedTrait .`
2. To get trait id, use: `?trait gnt:traitId ?trait_id .`
3. To fetch trait description, use: `?trait dct:description ?trait_description .`
4. To extract lod score at a specific locus, use: `?trait gnt:locus ?locus; gnt:lodScore ?lod_score .`
5. To fetch information related to QTL for a trait, use:
`?trait gnt:qtlChr ?chromosome; gnt:qtlStart ?start; gnt:qtlStop ?stop; gnt:qtlLOD ?lod_score .`

CRITICAL RULES:
1. Only use properties listed above. Do NOT invent new ones.
2. Literal properties give strings/numbers — use FILTER, not ?o a ...
3. Object properties link to other resources — you can chain ?o a <Class>.
4. Do NOT use taxon: for species. Use gn:Mus_musculus, gn:Rattus_norvegicus, gn:Homo_sapiens, etc.
5. gnt:has_trait_page gives the URL directly. Never build trait URLs manually.
"""

    # Building the hint costs two whole-graph DISTINCT scans, so it is built
    # on first use and reused by later calls of this tool.
    schema_hint_cache: list[str] = []

    def get_schema_hint() -> str:
        """Return the schema hint, building it on the first call."""
        if not schema_hint_cache:
            schema_hint_cache.append(build_schema_hint(sparql_uri))
        return schema_hint_cache[0]

    async def run_sparql(
        sparql_uri: str,
        query: str,
        max_retries: int = 3,
        base_delay: float = 2,
    ) -> dict:
        """Execute a single SPARQL query with retry + exponential jitter via httpx."""
        # `async with` closes the client on every exit path.
        async with httpx.AsyncClient(timeout=5000) as client:
            for attempt in range(max_retries):
                try:
                    resp = await client.post(
                        sparql_uri,
                        data={"query": query},
                        headers={"Accept": "application/sparql-results+json"},
                    )
                    resp.raise_for_status()
                    return resp.json()
                except httpx.HTTPStatusError as e:
                    # Only gateway-type errors are retried, and never on the
                    # last attempt; anything else propagates.
                    if (
                        e.response.status_code in (504, 503, 502)
                        and attempt < max_retries - 1
                    ):
                        await asyncio.sleep(
                            base_delay * (2**attempt) + random.uniform(0, 1)
                        )
                        continue
                    raise
        return {}

    async def sparql_fetch(
        sparql_queries: list[str],
        sparql_uri: str,
        max_retries: int = 3,
        base_delay: float = 0.5,
    ) -> str:
        """Execute *sparql_queries* concurrently against *sparql_uri*."""
        if not sparql_queries:
            return "No SPARQL queries to run."

        async def _fetch_one(query: str, idx: int) -> str:
            # Failures are reported in the text result rather than raised, so
            # one bad query does not discard the others.
            try:
                result = await run_sparql(
                    sparql_uri, query, max_retries, base_delay
                )
                bindings = result.get("results", {}).get("bindings", [])
                return f"Query {idx} succeeded ({len(bindings)} rows): {bindings}"
            except Exception as e:
                return f"Query {idx} failed: {e}\nQuery was:\n{query}"

        tasks = [_fetch_one(q, i) for i, q in enumerate(sparql_queries)]
        results = await asyncio.gather(*tasks)
        return "\n\n".join(results)

    def sparql_fetcher(query: str) -> Any:
        """Translate *query* to SPARQL, run the queries and return the report."""
        translate_sparql = dspy.Predict(QueryTranslation)
        sparql_queries = translate_sparql(
            original_query=query, schema_hint=get_schema_hint()
        ).get("translated_queries")

        # `asyncio.run` is executed in a worker thread; presumably this is to
        # stay clear of an event loop already running in the caller.
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            future = executor.submit(
                asyncio.run, sparql_fetch(sparql_queries, sparql_uri)
            )
            return future.result()

    return dspy.Tool(
        name="sparql_fetcher",
        desc="Fetch RDF data around GeneNetwork data through SPARQL",
        args={
            "query": {
                "type": "string",
                "desc": "SPARQL query to run to fetch relevant data",
            },
        },
        func=sparql_fetcher,
    )
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Modules with NCBI tools for GSurgeon"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import dspy
|
|
6
|
+
from Bio.Entrez import efetch, esearch, esummary, read
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def search_ncbi(database: str, term: str, max_results: int = 10) -> str:
    """Search *database* for *term* and return the parsed result as JSON text.

    Args:
        database: database name passed to `esearch` as `db`.
        term: search term passed to `esearch`.
        max_results: value passed to `esearch` as `retmax`.

    Returns:
        The parsed response serialised with sorted keys; when it contains an
        "IdList", that list is sorted as well.
    """
    handle = esearch(db=database, term=term, retmax=max_results)
    try:
        records = read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    # Order records for determinism
    if isinstance(records, dict) and "IdList" in records:
        records["IdList"] = sorted(records["IdList"])
    return json.dumps(records, sort_keys=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Tool wrapper exposing `search_ncbi` under the name "ncbi_searcher".
ncbi_searcher = dspy.Tool(
    name="ncbi_searcher",
    desc="Search an NCBI database (e.g., nucleotide, protein, pubmed) for a term",
    args={
        "database": {
            "type": "string",
            "desc": "Database name like 'nucleotide' or 'pubmed'",
        },
        "term": {"type": "string", "desc": "Search term or query"},
        # The advertised default matches `search_ncbi`'s own default (10),
        # which is what applies when the argument is omitted.
        "max_results": {
            "type": "integer",
            "desc": "Max results (default 10)",
            "default": 10,
        },
    },
    func=search_ncbi,
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def fetch_record(database: str, record_id: str, rettype: str) -> str:
    """Fetch a record in text mode and return the first line of the response.

    Args:
        database: database name passed to `efetch` as `db`.
        record_id: identifier passed to `efetch` as `id`.
        rettype: return type passed to `efetch`.

    Returns:
        The first line of the response with surrounding whitespace removed.
    """
    handle = efetch(db=database, id=record_id, rettype=rettype, retmode="text")
    try:
        # NOTE(review): only the first line is read; confirm this is
        # intended, since a text record usually spans many lines.
        result = handle.readline().strip()
    finally:
        # Close the handle even when reading fails.
        handle.close()
    return result
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Tool wrapper exposing `fetch_record` under the name "record_fetcher".
record_fetcher = dspy.Tool(
    func=fetch_record,
    name="record_fetcher",
    desc="Fetch a record from an NCBI database (e.g., nucleotide, protein, pubmed)",
    args={
        "database": {
            "type": "string",
            "desc": "Database name like 'nucleotide' or 'pubmed'",
        },
        "record_id": {"type": "string", "desc": "Identifier of record"},
        "rettype": {
            "type": "string",
            "desc": "Return type compatible with database",
        },
    },
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def summarize_record(database: str, record_id: str) -> str:
    """Get the summary of a record and return it as JSON text.

    Args:
        database: database name passed to `esummary` as `db`.
        record_id: identifier passed to `esummary` as `id`.

    Returns:
        The parsed response serialised with sorted keys; a list response is
        first sorted by each entry's "Id".
    """
    handle = esummary(db=database, id=record_id)
    try:
        result = read(handle)
    finally:
        # Close the handle even when parsing the response fails.
        handle.close()
    # If a list of summaries, sort by Id for determinism
    if isinstance(result, list):
        result = sorted(result, key=lambda x: x.get("Id", ""))
    return json.dumps(result, sort_keys=True)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# Tool wrapper exposing `summarize_record`; note the registered tool name is
# spelled "record_synthesiser".
record_synthesizer = dspy.Tool(
    func=summarize_record,
    name="record_synthesiser",
    desc="Get summary on a record from an NCBI database (e.g., nucleotide, protein, pubmed)",
    args={
        "database": {
            "type": "string",
            "desc": "Database name like 'nucleotide' or 'pubmed'",
        },
        "record_id": {"type": "string", "desc": "Identifier of record"},
    },
)
|