aurelian 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aurelian/__init__.py +9 -0
- aurelian/agents/__init__.py +0 -0
- aurelian/agents/amigo/__init__.py +3 -0
- aurelian/agents/amigo/amigo_agent.py +77 -0
- aurelian/agents/amigo/amigo_config.py +85 -0
- aurelian/agents/amigo/amigo_evals.py +73 -0
- aurelian/agents/amigo/amigo_gradio.py +52 -0
- aurelian/agents/amigo/amigo_mcp.py +152 -0
- aurelian/agents/amigo/amigo_tools.py +152 -0
- aurelian/agents/biblio/__init__.py +42 -0
- aurelian/agents/biblio/biblio_agent.py +94 -0
- aurelian/agents/biblio/biblio_config.py +40 -0
- aurelian/agents/biblio/biblio_gradio.py +67 -0
- aurelian/agents/biblio/biblio_mcp.py +115 -0
- aurelian/agents/biblio/biblio_tools.py +164 -0
- aurelian/agents/biblio_agent.py +46 -0
- aurelian/agents/checklist/__init__.py +44 -0
- aurelian/agents/checklist/checklist_agent.py +85 -0
- aurelian/agents/checklist/checklist_config.py +28 -0
- aurelian/agents/checklist/checklist_gradio.py +70 -0
- aurelian/agents/checklist/checklist_mcp.py +86 -0
- aurelian/agents/checklist/checklist_tools.py +141 -0
- aurelian/agents/checklist/content/checklists.yaml +7 -0
- aurelian/agents/checklist/content/streams.csv +136 -0
- aurelian/agents/checklist_agent.py +40 -0
- aurelian/agents/chemistry/__init__.py +3 -0
- aurelian/agents/chemistry/chemistry_agent.py +46 -0
- aurelian/agents/chemistry/chemistry_config.py +71 -0
- aurelian/agents/chemistry/chemistry_evals.py +79 -0
- aurelian/agents/chemistry/chemistry_gradio.py +50 -0
- aurelian/agents/chemistry/chemistry_mcp.py +120 -0
- aurelian/agents/chemistry/chemistry_tools.py +121 -0
- aurelian/agents/chemistry/image_agent.py +15 -0
- aurelian/agents/d4d/__init__.py +30 -0
- aurelian/agents/d4d/d4d_agent.py +72 -0
- aurelian/agents/d4d/d4d_config.py +46 -0
- aurelian/agents/d4d/d4d_gradio.py +58 -0
- aurelian/agents/d4d/d4d_mcp.py +71 -0
- aurelian/agents/d4d/d4d_tools.py +157 -0
- aurelian/agents/d4d_agent.py +64 -0
- aurelian/agents/diagnosis/__init__.py +33 -0
- aurelian/agents/diagnosis/diagnosis_agent.py +53 -0
- aurelian/agents/diagnosis/diagnosis_config.py +48 -0
- aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
- aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
- aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
- aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
- aurelian/agents/diagnosis_agent.py +28 -0
- aurelian/agents/draw/__init__.py +3 -0
- aurelian/agents/draw/draw_agent.py +39 -0
- aurelian/agents/draw/draw_config.py +26 -0
- aurelian/agents/draw/draw_gradio.py +50 -0
- aurelian/agents/draw/draw_mcp.py +94 -0
- aurelian/agents/draw/draw_tools.py +100 -0
- aurelian/agents/draw/judge_agent.py +18 -0
- aurelian/agents/filesystem/__init__.py +0 -0
- aurelian/agents/filesystem/filesystem_config.py +27 -0
- aurelian/agents/filesystem/filesystem_gradio.py +49 -0
- aurelian/agents/filesystem/filesystem_mcp.py +89 -0
- aurelian/agents/filesystem/filesystem_tools.py +95 -0
- aurelian/agents/filesystem/py.typed +0 -0
- aurelian/agents/github/__init__.py +0 -0
- aurelian/agents/github/github_agent.py +83 -0
- aurelian/agents/github/github_cli.py +248 -0
- aurelian/agents/github/github_config.py +22 -0
- aurelian/agents/github/github_gradio.py +152 -0
- aurelian/agents/github/github_mcp.py +252 -0
- aurelian/agents/github/github_tools.py +408 -0
- aurelian/agents/github/github_tools.py.tmp +413 -0
- aurelian/agents/goann/__init__.py +13 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
- aurelian/agents/goann/goann_agent.py +90 -0
- aurelian/agents/goann/goann_config.py +90 -0
- aurelian/agents/goann/goann_evals.py +104 -0
- aurelian/agents/goann/goann_gradio.py +62 -0
- aurelian/agents/goann/goann_mcp.py +0 -0
- aurelian/agents/goann/goann_tools.py +65 -0
- aurelian/agents/gocam/__init__.py +43 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
- aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
- aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
- aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
- aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
- aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
- aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
- aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
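- aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.pdf +0 -0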
- aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
- aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
- aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
- aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
- aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
- aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
- aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
- aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
- aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
- aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
- aurelian/agents/gocam/gocam_agent.py +240 -0
- aurelian/agents/gocam/gocam_config.py +85 -0
- aurelian/agents/gocam/gocam_curator_agent.py +46 -0
- aurelian/agents/gocam/gocam_evals.py +67 -0
- aurelian/agents/gocam/gocam_gradio.py +89 -0
- aurelian/agents/gocam/gocam_mcp.py +224 -0
- aurelian/agents/gocam/gocam_tools.py +294 -0
- aurelian/agents/linkml/__init__.py +0 -0
- aurelian/agents/linkml/linkml_agent.py +62 -0
- aurelian/agents/linkml/linkml_config.py +48 -0
- aurelian/agents/linkml/linkml_evals.py +66 -0
- aurelian/agents/linkml/linkml_gradio.py +45 -0
- aurelian/agents/linkml/linkml_mcp.py +186 -0
- aurelian/agents/linkml/linkml_tools.py +102 -0
- aurelian/agents/literature/__init__.py +3 -0
- aurelian/agents/literature/literature_agent.py +55 -0
- aurelian/agents/literature/literature_config.py +35 -0
- aurelian/agents/literature/literature_gradio.py +52 -0
- aurelian/agents/literature/literature_mcp.py +174 -0
- aurelian/agents/literature/literature_tools.py +182 -0
- aurelian/agents/monarch/__init__.py +25 -0
- aurelian/agents/monarch/monarch_agent.py +44 -0
- aurelian/agents/monarch/monarch_config.py +45 -0
- aurelian/agents/monarch/monarch_gradio.py +51 -0
- aurelian/agents/monarch/monarch_mcp.py +65 -0
- aurelian/agents/monarch/monarch_tools.py +113 -0
- aurelian/agents/oak/__init__.py +0 -0
- aurelian/agents/oak/oak_config.py +27 -0
- aurelian/agents/oak/oak_gradio.py +57 -0
- aurelian/agents/ontology_mapper/__init__.py +31 -0
- aurelian/agents/ontology_mapper/ontology_mapper_agent.py +56 -0
- aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
- aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
- aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
- aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
- aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
- aurelian/agents/phenopackets/__init__.py +3 -0
- aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
- aurelian/agents/phenopackets/phenopackets_config.py +72 -0
- aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
- aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
- aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
- aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
- aurelian/agents/rag/__init__.py +40 -0
- aurelian/agents/rag/rag_agent.py +83 -0
- aurelian/agents/rag/rag_config.py +80 -0
- aurelian/agents/rag/rag_gradio.py +67 -0
- aurelian/agents/rag/rag_mcp.py +107 -0
- aurelian/agents/rag/rag_tools.py +189 -0
- aurelian/agents/rag_agent.py +54 -0
- aurelian/agents/robot/__init__.py +0 -0
- aurelian/agents/robot/assets/__init__.py +3 -0
- aurelian/agents/robot/assets/template.md +384 -0
- aurelian/agents/robot/robot_config.py +25 -0
- aurelian/agents/robot/robot_gradio.py +46 -0
- aurelian/agents/robot/robot_mcp.py +100 -0
- aurelian/agents/robot/robot_ontology_agent.py +139 -0
- aurelian/agents/robot/robot_tools.py +50 -0
- aurelian/agents/talisman/__init__.py +3 -0
- aurelian/agents/talisman/talisman_agent.py +126 -0
- aurelian/agents/talisman/talisman_config.py +66 -0
- aurelian/agents/talisman/talisman_gradio.py +50 -0
- aurelian/agents/talisman/talisman_mcp.py +168 -0
- aurelian/agents/talisman/talisman_tools.py +720 -0
- aurelian/agents/ubergraph/__init__.py +40 -0
- aurelian/agents/ubergraph/ubergraph_agent.py +71 -0
- aurelian/agents/ubergraph/ubergraph_config.py +79 -0
- aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
- aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
- aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
- aurelian/agents/uniprot/__init__.py +37 -0
- aurelian/agents/uniprot/uniprot_agent.py +43 -0
- aurelian/agents/uniprot/uniprot_config.py +43 -0
- aurelian/agents/uniprot/uniprot_evals.py +99 -0
- aurelian/agents/uniprot/uniprot_gradio.py +48 -0
- aurelian/agents/uniprot/uniprot_mcp.py +168 -0
- aurelian/agents/uniprot/uniprot_tools.py +136 -0
- aurelian/agents/web/__init__.py +0 -0
- aurelian/agents/web/web_config.py +27 -0
- aurelian/agents/web/web_gradio.py +48 -0
- aurelian/agents/web/web_mcp.py +50 -0
- aurelian/agents/web/web_tools.py +108 -0
- aurelian/chat.py +23 -0
- aurelian/cli.py +800 -0
- aurelian/dependencies/__init__.py +0 -0
- aurelian/dependencies/workdir.py +78 -0
- aurelian/mcp/__init__.py +0 -0
- aurelian/mcp/amigo_mcp_test.py +86 -0
- aurelian/mcp/config_generator.py +123 -0
- aurelian/mcp/example_config.json +43 -0
- aurelian/mcp/generate_sample_config.py +37 -0
- aurelian/mcp/gocam_mcp_test.py +126 -0
- aurelian/mcp/linkml_mcp_tools.py +190 -0
- aurelian/mcp/mcp_discovery.py +87 -0
- aurelian/mcp/mcp_test.py +31 -0
- aurelian/mcp/phenopackets_mcp_test.py +103 -0
- aurelian/tools/__init__.py +0 -0
- aurelian/tools/web/__init__.py +0 -0
- aurelian/tools/web/url_download.py +51 -0
- aurelian/utils/__init__.py +0 -0
- aurelian/utils/async_utils.py +15 -0
- aurelian/utils/data_utils.py +32 -0
- aurelian/utils/documentation_manager.py +59 -0
- aurelian/utils/doi_fetcher.py +238 -0
- aurelian/utils/ontology_utils.py +68 -0
- aurelian/utils/pdf_fetcher.py +23 -0
- aurelian/utils/process_logs.py +100 -0
- aurelian/utils/pubmed_utils.py +238 -0
- aurelian/utils/pytest_report_to_markdown.py +67 -0
- aurelian/utils/robot_ontology_utils.py +112 -0
- aurelian/utils/search_utils.py +95 -0
- aurelian-0.3.2.dist-info/LICENSE +22 -0
- aurelian-0.3.2.dist-info/METADATA +105 -0
- aurelian-0.3.2.dist-info/RECORD +254 -0
- aurelian-0.3.2.dist-info/WHEEL +4 -0
- aurelian-0.3.2.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
"""
|
2
|
+
Tools for the phenopackets agent.
|
3
|
+
"""
|
4
|
+
from typing import List, Dict, Optional
|
5
|
+
|
6
|
+
from pydantic_ai import RunContext, ModelRetry
|
7
|
+
|
8
|
+
from aurelian.agents.phenopackets.phenopackets_config import PhenopacketsDependencies
|
9
|
+
from aurelian.utils.data_utils import flatten
|
10
|
+
from aurelian.agents.literature.literature_tools import (
|
11
|
+
lookup_pmid as literature_lookup_pmid,
|
12
|
+
search_literature_web,
|
13
|
+
retrieve_literature_page
|
14
|
+
)
|
15
|
+
|
16
|
+
|
17
|
+
async def search_phenopackets(ctx: RunContext[PhenopacketsDependencies], query: str) -> List[Dict]:
    """
    Performs a retrieval search over the Phenopackets database.

    The query can be any text, such as name of a disease, phenotype, gene, etc.

    The objects returned are "Phenopackets" which is a structured representation
    of a patient. Each is uniquely identified by a phenopacket ID (essentially
    the patient ID).

    The objects returned are summaries of Phenopackets; some details such
    as phenotypes are omitted. Use `lookup_phenopacket` to retrieve full details.

    Args:
        ctx: The run context
        query: The search query text

    Returns:
        List[Dict]: List of phenopackets matching the query

    Raises:
        ModelRetry: If no phenopacket matches, or if the search itself fails
    """
    print(f"SEARCH PHENOPACKETS: {query} // {ctx.deps}")
    try:
        qr = ctx.deps.collection.search(query, index_name="llm", limit=ctx.deps.max_results)
        objs = []
        for score, row in qr.ranked_rows:
            obj = flatten(row, preserve_keys=["interpretations", "diseases"])
            # Attach the ranking score so the caller can judge relevance.
            obj["relevancy_score"] = score
            objs.append(obj)
            print(f"RESULT: {obj}")

        if not objs:
            raise ModelRetry(f"No phenopackets found matching the query: {query}. Try a different search term.")

        return objs
    except ModelRetry:
        # Our own "no results" signal: let it through unchanged.
        raise
    except Exception as e:
        # Any other failure is reported to the model as a retryable error,
        # keeping the original exception as the cause for debugging.
        raise ModelRetry(f"Error searching phenopackets: {str(e)}") from e
|
55
|
+
|
56
|
+
|
57
|
+
async def lookup_phenopacket(ctx: RunContext[PhenopacketsDependencies], phenopacket_id: str) -> Dict:
    """
    Performs a lookup of an individual Phenopacket by its ID.

    IDs are typically of the form PMID_nnn_PatientNumber, but this should not be assumed.

    Args:
        ctx: The run context
        phenopacket_id: The ID of the Phenopacket to look up

    Returns:
        Dict: The phenopacket data

    Raises:
        ModelRetry: If no phenopacket has that ID, or if the lookup itself fails
    """
    print(f"LOOKUP PHENOPACKET: {phenopacket_id}")
    try:
        qr = ctx.deps.collection.find({"id": phenopacket_id})
        if not qr.rows:
            raise ModelRetry(f"Could not find phenopacket with ID {phenopacket_id}. The ID may be incorrect.")
        # Only the first matching row is returned.
        return qr.rows[0]
    except ModelRetry:
        # Our own "not found" signal: let it through unchanged.
        raise
    except Exception as e:
        # Any other failure is reported to the model as a retryable error,
        # keeping the original exception as the cause for debugging.
        raise ModelRetry(f"Error looking up phenopacket {phenopacket_id}: {str(e)}") from e
|
80
|
+
|
81
|
+
|
82
|
+
async def lookup_pmid(pmid: str) -> str:
    """
    Retrieve the text of a PubMed article given its PMID.

    A PMID should be of the form "PMID:nnnnnnn" (no underscores).

    NOTE: Phenopacket IDs are typically of the form PMID_nnn_PatientNumber,
    but this should not be assumed. To reliably get PMIDs for a phenopacket,
    use `lookup_phenopacket` to retrieve and examine the `externalReferences` field.

    Args:
        pmid: The PubMed ID to look up

    Returns:
        str: Full text if available, otherwise abstract
    """
    print(f"LOOKUP PMID FOR PHENOPACKET: {pmid}")
    # Delegates to the literature agent's PMID lookup.
    article_text = await literature_lookup_pmid(pmid)
    return article_text
|
100
|
+
|
101
|
+
|
102
|
+
async def search_web(query: str) -> str:
    """
    Run a free-text web search.

    Args:
        query: The search query

    Returns:
        str: Search results with summaries
    """
    print(f"PHENOPACKET WEB SEARCH: {query}")
    # Delegates to the literature agent's web search.
    search_results = await search_literature_web(query)
    return search_results
|
114
|
+
|
115
|
+
|
116
|
+
async def retrieve_web_page(url: str) -> str:
    """
    Download a web page and return its contents.

    Args:
        url: The URL to fetch

    Returns:
        str: The contents of the web page
    """
    print(f"FETCH WEB PAGE FOR PHENOPACKET: {url}")
    # Delegates to the literature agent's page retrieval.
    page_contents = await retrieve_literature_page(url)
    return page_contents
|
@@ -0,0 +1,40 @@
|
|
1
|
+
"""
|
2
|
+
RAG agent package for retrieval-augmented generation against document collections.
|
3
|
+
"""
|
4
|
+
|
5
|
+
# Constants
|
6
|
+
COLLECTION_NAME = "main"
|
7
|
+
|
8
|
+
# isort: skip_file
|
9
|
+
from .rag_agent import rag_agent # noqa: E402
|
10
|
+
from .rag_config import RagDependencies, get_config # noqa: E402
|
11
|
+
from .rag_gradio import chat # noqa: E402
|
12
|
+
from .rag_tools import ( # noqa: E402
|
13
|
+
search_documents,
|
14
|
+
inspect_document,
|
15
|
+
lookup_pmid,
|
16
|
+
search_web,
|
17
|
+
retrieve_web_page,
|
18
|
+
)
|
19
|
+
|
20
|
+
__all__ = [
|
21
|
+
# Constants
|
22
|
+
"COLLECTION_NAME",
|
23
|
+
|
24
|
+
# Agent
|
25
|
+
"rag_agent",
|
26
|
+
|
27
|
+
# Config
|
28
|
+
"RagDependencies",
|
29
|
+
"get_config",
|
30
|
+
|
31
|
+
# Tools
|
32
|
+
"search_documents",
|
33
|
+
"inspect_document",
|
34
|
+
"lookup_pmid",
|
35
|
+
"search_web",
|
36
|
+
"retrieve_web_page",
|
37
|
+
|
38
|
+
# Gradio
|
39
|
+
"chat",
|
40
|
+
]
|
@@ -0,0 +1,83 @@
|
|
1
|
+
"""
|
2
|
+
Agent for retrieval-augmented generation (RAG) against document collections.
|
3
|
+
"""
|
4
|
+
from pydantic_ai import Agent, RunContext
|
5
|
+
|
6
|
+
from .rag_config import RagDependencies
|
7
|
+
from .rag_tools import search_documents, inspect_document, lookup_pmid, search_web, retrieve_web_page
|
8
|
+
|
9
|
+
|
10
|
+
# Agent that answers questions over a document collection; the tools are
# registered below via `@rag_agent.tool`.
rag_agent = Agent(
    model="openai:gpt-4o",
    deps_type=RagDependencies,
    result_type=str,
    # The prompt is built from adjacent string literals, which Python joins
    # with no separator, so each fragment must carry its own leading space.
    system_prompt=(
        "You are an AI assistant that help explore a literature collection via RAG."
        " You can use different functions to access the store, for example:"
        " - `search_documents` to find documents by text query"
        " - `inspect_document` to retrieve a specific document (by title/name)"
        " You can also use `lookup_pmid` to retrieve the text of a PubMed ID, or `search_web` to search the web."
    ),
)
|
22
|
+
|
23
|
+
|
24
|
+
@rag_agent.tool
async def search_documents_tool(ctx: RunContext[RagDependencies], query: str):
    """
    Run a retrieval search over the RAG database.

    The query can be any text, such as name of a disease, phenotype, gene, etc.
    """
    # Thin adapter: the implementation lives in rag_tools.search_documents.
    matches = await search_documents(ctx, query)
    return matches
|
32
|
+
|
33
|
+
|
34
|
+
@rag_agent.tool
async def inspect_document_tool(ctx: RunContext[RagDependencies], query: str):
    """
    Return the content of a document.

    Args:
        query: E.g. title
    """
    # Thin adapter: the implementation lives in rag_tools.inspect_document.
    document = await inspect_document(ctx, query)
    return document
|
43
|
+
|
44
|
+
|
45
|
+
@rag_agent.tool
async def lookup_pmid_tool(ctx: RunContext[RagDependencies], pmid: str):
    """
    Lookup the text of a PubMed ID, using its PMID.

    A PMID should be of the form "PMID:nnnnnnn" (no underscores).

    NOTE: Phenopacket IDs are typically of the form PMID_nnn_PatientNumber,
    but this should not be assumed. To reliably get PMIDs for a phenopacket,
    use `lookup_phenopacket` to retrieve and examine the `externalReferences`
    field.

    Returns: full text if available, otherwise abstract
    """
    # NOTE(review): no `lookup_phenopacket` tool is registered on this agent in
    # this module; the note above looks carried over from the phenopackets
    # agent. Confirm whether it should stay.
    return await lookup_pmid(ctx, pmid)
|
60
|
+
|
61
|
+
|
62
|
+
@rag_agent.tool
async def search_web_tool(ctx: RunContext[RagDependencies], query: str):
    """
    Run a free-text web search.

    Note, this will not retrieve the full content, for that you
    should use `retrieve_web_page`.

    Returns: matching web pages plus summaries
    """
    # Thin adapter: the implementation lives in rag_tools.search_web.
    hits = await search_web(ctx, query)
    return hits
|
73
|
+
|
74
|
+
|
75
|
+
@rag_agent.tool
async def retrieve_web_page_tool(ctx: RunContext[RagDependencies], url: str):
    """
    Download a web page and return its contents.

    Returns:
        The contents of the web page.
    """
    # Thin adapter: the implementation lives in rag_tools.retrieve_web_page.
    page = await retrieve_web_page(ctx, url)
    return page
|
@@ -0,0 +1,80 @@
|
|
1
|
+
"""
|
2
|
+
Configuration for the RAG agent.
|
3
|
+
"""
|
4
|
+
from dataclasses import dataclass, field
|
5
|
+
import os
|
6
|
+
from typing import Optional
|
7
|
+
|
8
|
+
from linkml_store import Client
|
9
|
+
from linkml_store.api import Collection
|
10
|
+
|
11
|
+
from aurelian.dependencies.workdir import HasWorkdir, WorkDir
|
12
|
+
from . import COLLECTION_NAME
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass
class RagDependencies:
    """Dependencies and settings handed to the RAG agent and its tools."""

    # Path/handle of the database to attach (required).
    db_path: str

    # Name of the collection to fetch from the attached database.
    collection_name: str = COLLECTION_NAME
    # Upper bound on the number of search results.
    max_results: int = 10
    # Maximum length of document content to keep.
    max_content_len: int = 5000
    # Working directory; a default WorkDir is created when omitted.
    workdir: Optional[WorkDir] = None
    # Cached collection handle, filled in on first use of `collection`.
    _collection: Optional[Collection] = None

    def __post_init__(self):
        """Supply a default working directory when none was given."""
        if self.workdir is None:
            self.workdir = WorkDir()

    @property
    def collection(self) -> Collection:
        """Return the collection, attaching the database on first access."""
        if self._collection is not None:
            return self._collection

        path = self.db_path
        store = Client()
        store.attach_database(path)
        database = store.databases[path]
        self._collection = database.get_collection(self.collection_name)
        return self._collection
|
44
|
+
|
45
|
+
|
46
|
+
def get_config(db_path: Optional[str] = None, collection_name: Optional[str] = None) -> RagDependencies:
    """
    Build the RAG configuration from arguments, environment variables and defaults.

    Explicit arguments win over `AURELIAN_RAG_DB_PATH` and
    `AURELIAN_RAG_COLLECTION`. The working directory is taken from
    `AURELIAN_WORKDIR` when that variable is set.

    Args:
        db_path: The database path to use (overrides environment variable)
        collection_name: The collection name to use (overrides environment variable)

    Returns:
        A RagDependencies instance

    Raises:
        ValueError: If no database path is available and TESTING is not "1"
    """
    resolved_db_path = db_path or os.environ.get("AURELIAN_RAG_DB_PATH", None)
    resolved_collection = collection_name or os.environ.get("AURELIAN_RAG_COLLECTION", COLLECTION_NAME)

    if not resolved_db_path:
        # Without a path we only proceed in smoke-test mode, using a
        # placeholder test database path.
        if os.environ.get("TESTING", "0") != "1":
            raise ValueError("Database path must be provided either as parameter or via AURELIAN_RAG_DB_PATH environment variable")
        resolved_db_path = "memory://test"

    workdir = None
    workdir_location = os.environ.get("AURELIAN_WORKDIR", None)
    if workdir_location:
        workdir = WorkDir(location=workdir_location)

    return RagDependencies(
        db_path=resolved_db_path,
        collection_name=resolved_collection,
        workdir=workdir,
    )
|
@@ -0,0 +1,67 @@
|
|
1
|
+
"""
|
2
|
+
Gradio interface for the RAG agent.
|
3
|
+
"""
|
4
|
+
from typing import List, Optional
|
5
|
+
|
6
|
+
import gradio as gr
|
7
|
+
|
8
|
+
from .rag_agent import rag_agent
|
9
|
+
from .rag_config import RagDependencies, get_config
|
10
|
+
|
11
|
+
|
12
|
+
async def get_info(query: str, history: List[str], deps: RagDependencies, model: Optional[str] = None) -> str:
    """
    Process a query using the RAG agent.

    When there is prior conversation history it is appended to the query
    under a "## History" heading, one entry per line.

    Args:
        query: The user query
        history: The conversation history (each entry is interpolated as text)
        deps: The agent dependencies
        model: Optional model override

    Returns:
        The agent's response
    """
    print(f"QUERY: {query}")
    print(f"HISTORY: {history}")

    # Add history to the query if available
    if history:
        # Start the heading on its own line so it does not fuse with the
        # last word of the user's query.
        query += "\n## History" + "".join(f"\n{h}" for h in history)

    # Run the agent
    result = await rag_agent.run(query, deps=deps, model=model)
    return result.data
|
37
|
+
|
38
|
+
|
39
|
+
def chat(deps: Optional[RagDependencies] = None, model=None, **kwargs):
    """
    Create a Gradio chat interface for the RAG agent.

    Args:
        deps: Optional dependencies configuration
        model: Optional model override
        kwargs: Additional keyword arguments forwarded to `get_config`

    Returns:
        A Gradio ChatInterface

    Raises:
        ValueError: Propagated from `get_config` when `deps` is omitted and
            no database path can be resolved
    """
    # Initialize dependencies if needed. Always go through get_config:
    # RagDependencies has a required db_path, so constructing it with no
    # arguments can never succeed, whereas get_config can resolve the path
    # from the environment and reports a clear error otherwise.
    if deps is None:
        deps = get_config(**kwargs)

    def get_info_wrapper(query: str, history: List[str]) -> str:
        """Wrapper for the async get_info function."""
        import asyncio
        return asyncio.run(get_info(query, history, deps, model))

    return gr.ChatInterface(
        fn=get_info_wrapper,
        type="messages",
        title="RAG AI Assistant",
        examples=[
            ["What papers in collection are relevant to microbial nitrogen fixation?"],
        ],
    )
|
@@ -0,0 +1,107 @@
|
|
1
|
+
"""
|
2
|
+
MCP tools for retrieval-augmented generation (RAG) against document collections.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
from typing import Dict, List
|
6
|
+
|
7
|
+
from mcp.server.fastmcp import FastMCP
|
8
|
+
|
9
|
+
import aurelian.agents.rag.rag_tools as rt
|
10
|
+
from aurelian.agents.rag.rag_agent import rag_agent
|
11
|
+
from aurelian.agents.rag.rag_config import RagDependencies
|
12
|
+
from pydantic_ai import RunContext
|
13
|
+
|
14
|
+
# Initialize FastMCP server
|
15
|
+
mcp = FastMCP("rag", instructions=rag_agent.system_prompt)
|
16
|
+
|
17
|
+
|
18
|
+
from aurelian.dependencies.workdir import WorkDir
|
19
|
+
|
20
|
+
def deps() -> RagDependencies:
    """
    Build the dependencies used by the MCP tools.

    The database path and collection are resolved by `get_config` (from the
    environment); the working directory comes from `AURELIAN_WORKDIR`,
    falling back to "/tmp/aurelian".

    Returns:
        A configured RagDependencies instance

    Raises:
        ValueError: Propagated from `get_config` when no database path is set
    """
    # Imported locally because only RagDependencies is imported at module level.
    from aurelian.agents.rag.rag_config import get_config

    # RagDependencies requires db_path, so it cannot be built with no
    # arguments; get_config resolves the path or fails with a clear message.
    config = get_config()
    # Set the location from environment variable or default
    loc = os.getenv("AURELIAN_WORKDIR", "/tmp/aurelian")
    config.workdir = WorkDir(loc)
    return config
|
26
|
+
|
27
|
+
def ctx() -> RunContext[RagDependencies]:
    """Create a run context around fresh dependencies, with model, usage and prompt left unset."""
    return RunContext[RagDependencies](
        deps=deps(),
        model=None,
        usage=None,
        prompt=None,
    )
|
33
|
+
|
34
|
+
|
35
|
+
@mcp.tool()
async def search_documents(query: str) -> List[Dict]:
    """
    Run a retrieval search over the RAG database.

    Args:
        query: The search query (any text, such as name of a disease, phenotype, gene, etc.)

    Returns:
        A list of document objects matching the query with relevancy scores
    """
    # A new run context is built for every call.
    run_context = ctx()
    return await rt.search_documents(run_context, query)
|
47
|
+
|
48
|
+
|
49
|
+
@mcp.tool()
async def inspect_document(query: str) -> str:
    """
    Return the content of a document.

    Args:
        query: E.g. title

    Returns:
        The full content of the document
    """
    # A new run context is built for every call.
    run_context = ctx()
    return await rt.inspect_document(run_context, query)
|
61
|
+
|
62
|
+
|
63
|
+
@mcp.tool()
async def lookup_pmid(pmid: str) -> str:
    """
    Retrieve the text of a PubMed article given its PMID.

    Args:
        pmid: The PubMed ID to look up (format: "PMID:nnnnnnn")

    Returns:
        The full text if available, otherwise abstract
    """
    # A new run context is built for every call.
    run_context = ctx()
    return await rt.lookup_pmid(run_context, pmid)
|
75
|
+
|
76
|
+
|
77
|
+
@mcp.tool()
async def search_web(query: str) -> str:
    """
    Run a free-text web search.

    Args:
        query: The search query

    Returns:
        Matching web pages plus summaries
    """
    # A new run context is built for every call.
    run_context = ctx()
    return await rt.search_web(run_context, query)
|
89
|
+
|
90
|
+
|
91
|
+
@mcp.tool()
async def retrieve_web_page(url: str) -> str:
    """
    Download a web page and return its contents.

    Args:
        url: The URL to fetch

    Returns:
        The contents of the web page
    """
    # A new run context is built for every call.
    run_context = ctx()
    return await rt.retrieve_web_page(run_context, url)
|
103
|
+
|
104
|
+
|
105
|
+
if __name__ == "__main__":
|
106
|
+
# Initialize and run the server
|
107
|
+
mcp.run(transport='stdio')
|
@@ -0,0 +1,189 @@
|
|
1
|
+
"""
|
2
|
+
Tools for the RAG agent for retrieval-augmented generation.
|
3
|
+
"""
|
4
|
+
import asyncio
import sys
from typing import Dict, List

from pydantic_ai import RunContext, ModelRetry

from aurelian.utils.data_utils import flatten
from aurelian.utils.pubmed_utils import get_pmid_text
from aurelian.utils.search_utils import web_search, retrieve_web_page as fetch_web_page
from .rag_config import RagDependencies
|
13
|
+
|
14
|
+
|
15
|
+
async def search_documents(
    ctx: RunContext[RagDependencies],
    query: str
) -> List[Dict]:
    """
    Performs a retrieval search over the RAG database.

    Args:
        ctx: The run context
        query: The search query (any text, such as name of a disease, phenotype, gene, etc.)

    Returns:
        A list of document objects matching the query with relevancy scores

    Raises:
        ModelRetry: If the search fails or yields no results.
    """
    # Diagnostics go to stderr: when these tools are served over the MCP
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"SEARCH: {query}", file=sys.stderr)

    def _search():
        # Runs in a worker thread because the collection search may block.
        deps = ctx.deps
        qr = deps.collection.search(query, index_name="llm", limit=deps.max_results)
        objs = []
        for score, row in qr.ranked_rows:
            # Truncate the content in place before flattening the row.
            row["content"] = row["content"][:deps.max_content_len]
            obj = flatten(row)
            obj["relevancy_score"] = score
            objs.append(obj)
            print(f"RESULT: {obj}", file=sys.stderr)
        return objs

    try:
        objs = await asyncio.to_thread(_search)
    except ModelRetry:
        # Already in the form the agent framework expects; pass through.
        raise
    except Exception as e:
        # Surface any backend failure as a retryable error, keeping the cause.
        raise ModelRetry(f"Error searching documents: {str(e)}") from e

    if not objs:
        raise ModelRetry(f"No results found for query: {query}")

    return objs
|
54
|
+
|
55
|
+
|
56
|
+
async def inspect_document(
    ctx: RunContext[RagDependencies],
    query: str
) -> str:
    """
    Returns the content of a document.

    Args:
        ctx: The run context
        query: Identifying information for the document (e.g., title)

    Returns:
        The document content

    Raises:
        ModelRetry: If the search fails or no document content is found.
    """
    # NOTE(review): the return annotation was List[Dict], but the function
    # returns the "content" field of a single row; presumably a string, as the
    # MCP wrapper for this tool is annotated `-> str` -- confirm.

    # Diagnostics go to stderr: when these tools are served over the MCP
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"INSPECT DOCUMENT: {query}", file=sys.stderr)

    def _inspect():
        # Runs in a worker thread because the collection search may block.
        qr = ctx.deps.collection.search(query, index_name="llm", limit=ctx.deps.max_results)
        # Only the first ranked row is used.
        for _score, row in qr.ranked_rows:
            return row["content"]
        return None

    try:
        content = await asyncio.to_thread(_inspect)
    except ModelRetry:
        # Already in the form the agent framework expects; pass through.
        raise
    except Exception as e:
        # Surface any backend failure as a retryable error, keeping the cause.
        raise ModelRetry(f"Error inspecting document: {str(e)}") from e

    if not content:
        raise ModelRetry(f"No document found matching: {query}")

    return content
|
90
|
+
|
91
|
+
|
92
|
+
async def lookup_pmid(
    ctx: RunContext[RagDependencies],
    pmid: str
) -> str:
    """
    Lookup the text of a PubMed ID, using its PMID.

    Args:
        ctx: The run context
        pmid: The PubMed ID to look up (format: "PMID:nnnnnnn")

    Returns:
        The full text if available, otherwise abstract

    A PMID should be of the form "PMID:nnnnnnn" (no underscores).

    NOTE: Phenopacket IDs are typically of the form PMID_nnn_PatientNumber,
    but this should not be assumed. To reliably get PMIDs for a phenopacket,
    use `lookup_phenopacket` to retrieve and examine the `externalReferences`
    field.

    Raises:
        ModelRetry: If the lookup fails or returns no text.
    """
    # NOTE(review): `lookup_phenopacket` is not defined in this module; the
    # note above may have been carried over from another agent -- confirm.

    # Diagnostics go to stderr: when these tools are served over the MCP
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"LOOKUP PMID: {pmid}", file=sys.stderr)

    try:
        # Run in a worker thread because the lookup may block.
        text = await asyncio.to_thread(get_pmid_text, pmid)
    except ModelRetry:
        # Already in the form the agent framework expects; pass through.
        raise
    except Exception as e:
        # Surface any lookup failure as a retryable error, keeping the cause.
        raise ModelRetry(f"Error retrieving text from PMID: {str(e)}") from e

    if not text or not text.strip():
        raise ModelRetry(f"No text found for PMID: {pmid}")

    return text
|
127
|
+
|
128
|
+
|
129
|
+
async def search_web(
    ctx: RunContext[RagDependencies],
    query: str
) -> str:
    """
    Search the web using a text query.

    Args:
        ctx: The run context
        query: The search query

    Returns:
        Matching web pages plus summaries

    Note, this will not retrieve the full content, for that you
    should use `retrieve_web_page`.

    Raises:
        ModelRetry: If the search fails or returns nothing.
    """
    # Diagnostics go to stderr: when these tools are served over the MCP
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"Web Search: {query}", file=sys.stderr)

    try:
        # Run in a worker thread because the search may block.
        results = await asyncio.to_thread(web_search, query)
    except ModelRetry:
        # Already in the form the agent framework expects; pass through.
        raise
    except Exception as e:
        # Surface any search failure as a retryable error, keeping the cause.
        raise ModelRetry(f"Error searching web: {str(e)}") from e

    if not results or not results.strip():
        raise ModelRetry(f"No web search results found for query: {query}")

    return results
|
160
|
+
|
161
|
+
|
162
|
+
async def retrieve_web_page(
    ctx: RunContext[RagDependencies],
    url: str
) -> str:
    """
    Fetch the contents of a web page.

    Args:
        ctx: The run context
        url: The URL to fetch

    Returns:
        The contents of the web page

    Raises:
        ModelRetry: If the fetch fails or the page has no content.
    """
    # Diagnostics go to stderr: when these tools are served over the MCP
    # stdio transport, stdout carries the protocol stream and must stay clean.
    print(f"Fetch URL: {url}", file=sys.stderr)

    try:
        # Run in a worker thread because the fetch may block.
        content = await asyncio.to_thread(fetch_web_page, url)
    except ModelRetry:
        # Already in the form the agent framework expects; pass through.
        raise
    except Exception as e:
        # Surface any fetch failure as a retryable error, keeping the cause.
        raise ModelRetry(f"Error retrieving web page: {str(e)}") from e

    if not content or not content.strip():
        raise ModelRetry(f"No content found for URL: {url}")

    return content
|