aurelian 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aurelian/__init__.py +9 -0
- aurelian/agents/__init__.py +0 -0
- aurelian/agents/amigo/__init__.py +3 -0
- aurelian/agents/amigo/amigo_agent.py +77 -0
- aurelian/agents/amigo/amigo_config.py +85 -0
- aurelian/agents/amigo/amigo_evals.py +73 -0
- aurelian/agents/amigo/amigo_gradio.py +52 -0
- aurelian/agents/amigo/amigo_mcp.py +152 -0
- aurelian/agents/amigo/amigo_tools.py +152 -0
- aurelian/agents/biblio/__init__.py +42 -0
- aurelian/agents/biblio/biblio_agent.py +94 -0
- aurelian/agents/biblio/biblio_config.py +40 -0
- aurelian/agents/biblio/biblio_gradio.py +67 -0
- aurelian/agents/biblio/biblio_mcp.py +115 -0
- aurelian/agents/biblio/biblio_tools.py +164 -0
- aurelian/agents/biblio_agent.py +46 -0
- aurelian/agents/checklist/__init__.py +44 -0
- aurelian/agents/checklist/checklist_agent.py +85 -0
- aurelian/agents/checklist/checklist_config.py +28 -0
- aurelian/agents/checklist/checklist_gradio.py +70 -0
- aurelian/agents/checklist/checklist_mcp.py +86 -0
- aurelian/agents/checklist/checklist_tools.py +141 -0
- aurelian/agents/checklist/content/checklists.yaml +7 -0
- aurelian/agents/checklist/content/streams.csv +136 -0
- aurelian/agents/checklist_agent.py +40 -0
- aurelian/agents/chemistry/__init__.py +3 -0
- aurelian/agents/chemistry/chemistry_agent.py +46 -0
- aurelian/agents/chemistry/chemistry_config.py +71 -0
- aurelian/agents/chemistry/chemistry_evals.py +79 -0
- aurelian/agents/chemistry/chemistry_gradio.py +50 -0
- aurelian/agents/chemistry/chemistry_mcp.py +120 -0
- aurelian/agents/chemistry/chemistry_tools.py +121 -0
- aurelian/agents/chemistry/image_agent.py +15 -0
- aurelian/agents/d4d/__init__.py +30 -0
- aurelian/agents/d4d/d4d_agent.py +72 -0
- aurelian/agents/d4d/d4d_config.py +46 -0
- aurelian/agents/d4d/d4d_gradio.py +58 -0
- aurelian/agents/d4d/d4d_mcp.py +71 -0
- aurelian/agents/d4d/d4d_tools.py +157 -0
- aurelian/agents/d4d_agent.py +64 -0
- aurelian/agents/diagnosis/__init__.py +33 -0
- aurelian/agents/diagnosis/diagnosis_agent.py +53 -0
- aurelian/agents/diagnosis/diagnosis_config.py +48 -0
- aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
- aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
- aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
- aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
- aurelian/agents/diagnosis_agent.py +28 -0
- aurelian/agents/draw/__init__.py +3 -0
- aurelian/agents/draw/draw_agent.py +39 -0
- aurelian/agents/draw/draw_config.py +26 -0
- aurelian/agents/draw/draw_gradio.py +50 -0
- aurelian/agents/draw/draw_mcp.py +94 -0
- aurelian/agents/draw/draw_tools.py +100 -0
- aurelian/agents/draw/judge_agent.py +18 -0
- aurelian/agents/filesystem/__init__.py +0 -0
- aurelian/agents/filesystem/filesystem_config.py +27 -0
- aurelian/agents/filesystem/filesystem_gradio.py +49 -0
- aurelian/agents/filesystem/filesystem_mcp.py +89 -0
- aurelian/agents/filesystem/filesystem_tools.py +95 -0
- aurelian/agents/filesystem/py.typed +0 -0
- aurelian/agents/github/__init__.py +0 -0
- aurelian/agents/github/github_agent.py +83 -0
- aurelian/agents/github/github_cli.py +248 -0
- aurelian/agents/github/github_config.py +22 -0
- aurelian/agents/github/github_gradio.py +152 -0
- aurelian/agents/github/github_mcp.py +252 -0
- aurelian/agents/github/github_tools.py +408 -0
- aurelian/agents/github/github_tools.py.tmp +413 -0
- aurelian/agents/goann/__init__.py +13 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
- aurelian/agents/goann/goann_agent.py +90 -0
- aurelian/agents/goann/goann_config.py +90 -0
- aurelian/agents/goann/goann_evals.py +104 -0
- aurelian/agents/goann/goann_gradio.py +62 -0
- aurelian/agents/goann/goann_mcp.py +0 -0
- aurelian/agents/goann/goann_tools.py +65 -0
- aurelian/agents/gocam/__init__.py +43 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
- aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
- aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
- aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
- aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
- aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
- aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
- aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
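- aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/WIP - Regulation and Regulatory Processes in GO-CAM.pdf +0 -0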
- aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
- aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
- aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
- aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
- aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
- aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
- aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
- aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
- aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
- aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
- aurelian/agents/gocam/gocam_agent.py +240 -0
- aurelian/agents/gocam/gocam_config.py +85 -0
- aurelian/agents/gocam/gocam_curator_agent.py +46 -0
- aurelian/agents/gocam/gocam_evals.py +67 -0
- aurelian/agents/gocam/gocam_gradio.py +89 -0
- aurelian/agents/gocam/gocam_mcp.py +224 -0
- aurelian/agents/gocam/gocam_tools.py +294 -0
- aurelian/agents/linkml/__init__.py +0 -0
- aurelian/agents/linkml/linkml_agent.py +62 -0
- aurelian/agents/linkml/linkml_config.py +48 -0
- aurelian/agents/linkml/linkml_evals.py +66 -0
- aurelian/agents/linkml/linkml_gradio.py +45 -0
- aurelian/agents/linkml/linkml_mcp.py +186 -0
- aurelian/agents/linkml/linkml_tools.py +102 -0
- aurelian/agents/literature/__init__.py +3 -0
- aurelian/agents/literature/literature_agent.py +55 -0
- aurelian/agents/literature/literature_config.py +35 -0
- aurelian/agents/literature/literature_gradio.py +52 -0
- aurelian/agents/literature/literature_mcp.py +174 -0
- aurelian/agents/literature/literature_tools.py +182 -0
- aurelian/agents/monarch/__init__.py +25 -0
- aurelian/agents/monarch/monarch_agent.py +44 -0
- aurelian/agents/monarch/monarch_config.py +45 -0
- aurelian/agents/monarch/monarch_gradio.py +51 -0
- aurelian/agents/monarch/monarch_mcp.py +65 -0
- aurelian/agents/monarch/monarch_tools.py +113 -0
- aurelian/agents/oak/__init__.py +0 -0
- aurelian/agents/oak/oak_config.py +27 -0
- aurelian/agents/oak/oak_gradio.py +57 -0
- aurelian/agents/ontology_mapper/__init__.py +31 -0
- aurelian/agents/ontology_mapper/ontology_mapper_agent.py +56 -0
- aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
- aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
- aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
- aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
- aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
- aurelian/agents/phenopackets/__init__.py +3 -0
- aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
- aurelian/agents/phenopackets/phenopackets_config.py +72 -0
- aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
- aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
- aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
- aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
- aurelian/agents/rag/__init__.py +40 -0
- aurelian/agents/rag/rag_agent.py +83 -0
- aurelian/agents/rag/rag_config.py +80 -0
- aurelian/agents/rag/rag_gradio.py +67 -0
- aurelian/agents/rag/rag_mcp.py +107 -0
- aurelian/agents/rag/rag_tools.py +189 -0
- aurelian/agents/rag_agent.py +54 -0
- aurelian/agents/robot/__init__.py +0 -0
- aurelian/agents/robot/assets/__init__.py +3 -0
- aurelian/agents/robot/assets/template.md +384 -0
- aurelian/agents/robot/robot_config.py +25 -0
- aurelian/agents/robot/robot_gradio.py +46 -0
- aurelian/agents/robot/robot_mcp.py +100 -0
- aurelian/agents/robot/robot_ontology_agent.py +139 -0
- aurelian/agents/robot/robot_tools.py +50 -0
- aurelian/agents/talisman/__init__.py +3 -0
- aurelian/agents/talisman/talisman_agent.py +126 -0
- aurelian/agents/talisman/talisman_config.py +66 -0
- aurelian/agents/talisman/talisman_gradio.py +50 -0
- aurelian/agents/talisman/talisman_mcp.py +168 -0
- aurelian/agents/talisman/talisman_tools.py +720 -0
- aurelian/agents/ubergraph/__init__.py +40 -0
- aurelian/agents/ubergraph/ubergraph_agent.py +71 -0
- aurelian/agents/ubergraph/ubergraph_config.py +79 -0
- aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
- aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
- aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
- aurelian/agents/uniprot/__init__.py +37 -0
- aurelian/agents/uniprot/uniprot_agent.py +43 -0
- aurelian/agents/uniprot/uniprot_config.py +43 -0
- aurelian/agents/uniprot/uniprot_evals.py +99 -0
- aurelian/agents/uniprot/uniprot_gradio.py +48 -0
- aurelian/agents/uniprot/uniprot_mcp.py +168 -0
- aurelian/agents/uniprot/uniprot_tools.py +136 -0
- aurelian/agents/web/__init__.py +0 -0
- aurelian/agents/web/web_config.py +27 -0
- aurelian/agents/web/web_gradio.py +48 -0
- aurelian/agents/web/web_mcp.py +50 -0
- aurelian/agents/web/web_tools.py +108 -0
- aurelian/chat.py +23 -0
- aurelian/cli.py +800 -0
- aurelian/dependencies/__init__.py +0 -0
- aurelian/dependencies/workdir.py +78 -0
- aurelian/mcp/__init__.py +0 -0
- aurelian/mcp/amigo_mcp_test.py +86 -0
- aurelian/mcp/config_generator.py +123 -0
- aurelian/mcp/example_config.json +43 -0
- aurelian/mcp/generate_sample_config.py +37 -0
- aurelian/mcp/gocam_mcp_test.py +126 -0
- aurelian/mcp/linkml_mcp_tools.py +190 -0
- aurelian/mcp/mcp_discovery.py +87 -0
- aurelian/mcp/mcp_test.py +31 -0
- aurelian/mcp/phenopackets_mcp_test.py +103 -0
- aurelian/tools/__init__.py +0 -0
- aurelian/tools/web/__init__.py +0 -0
- aurelian/tools/web/url_download.py +51 -0
- aurelian/utils/__init__.py +0 -0
- aurelian/utils/async_utils.py +15 -0
- aurelian/utils/data_utils.py +32 -0
- aurelian/utils/documentation_manager.py +59 -0
- aurelian/utils/doi_fetcher.py +238 -0
- aurelian/utils/ontology_utils.py +68 -0
- aurelian/utils/pdf_fetcher.py +23 -0
- aurelian/utils/process_logs.py +100 -0
- aurelian/utils/pubmed_utils.py +238 -0
- aurelian/utils/pytest_report_to_markdown.py +67 -0
- aurelian/utils/robot_ontology_utils.py +112 -0
- aurelian/utils/search_utils.py +95 -0
- aurelian-0.3.2.dist-info/LICENSE +22 -0
- aurelian-0.3.2.dist-info/METADATA +105 -0
- aurelian-0.3.2.dist-info/RECORD +254 -0
- aurelian-0.3.2.dist-info/WHEEL +4 -0
- aurelian-0.3.2.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,720 @@
|
|
1
|
+
"""
|
2
|
+
Tools for retrieving gene information using the UniProt API and NCBI Entrez.
|
3
|
+
"""
|
4
|
+
from typing import Dict, List, Optional, Tuple, Any
|
5
|
+
import openai
|
6
|
+
import time
|
7
|
+
import threading
|
8
|
+
import json
|
9
|
+
import os
|
10
|
+
import datetime
|
11
|
+
import logging
|
12
|
+
|
13
|
+
from pydantic_ai import RunContext, ModelRetry
|
14
|
+
|
15
|
+
from .talisman_config import TalismanConfig, get_config
|
16
|
+
|
17
|
+
# Set up logging
|
18
|
+
# Configure the root logger when this module is imported: INFO level, with a
# timestamped "Talisman:"-tagged line format.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] Talisman: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
|
23
|
+
|
24
|
+
# Rate limiting implementation
|
25
|
+
class RateLimiter:
    """Thread-safe sliding-window rate limiter.

    At most ``max_calls`` calls to :meth:`wait` are allowed to proceed within
    the last ``period`` seconds; further callers block until a slot frees up.
    """

    def __init__(self, max_calls: int = 3, period: float = 1.0):
        """
        Initialize the rate limiter.

        Args:
            max_calls: Maximum number of calls allowed in the period (must be >= 1)
            period: Time period in seconds

        Raises:
            ValueError: If ``max_calls`` is less than 1.
        """
        if max_calls < 1:
            # Fail at construction instead of crashing later inside wait().
            raise ValueError("max_calls must be at least 1")
        self.max_calls = max_calls
        self.period = period
        # time.monotonic() timestamps of the calls still inside the window,
        # oldest first (entries are only ever appended under the lock).
        self.calls = []
        self.lock = threading.Lock()

    def wait(self):
        """
        Wait if necessary to respect the rate limit, then record this call.

        The lock is held while sleeping, so concurrent callers queue up behind
        the one that is currently waiting.
        """
        with self.lock:
            while True:
                # Monotonic clock: immune to wall-clock adjustments.
                now = time.monotonic()

                # Remove timestamps older than the period
                self.calls = [t for t in self.calls if now - t < self.period]

                if len(self.calls) < self.max_calls:
                    break

                # A slot frees up once the max_calls-th most recent call
                # leaves the window. Timestamps that are still inside the
                # window are kept (not reset), so the limit also holds for
                # the calls that follow this wait.
                blocking_call = self.calls[-self.max_calls]
                time.sleep(self.period - (now - blocking_call))

            # Add the current timestamp
            self.calls.append(time.monotonic())
|
63
|
+
|
64
|
+
# Create rate limiters for UniProt and NCBI
|
65
|
+
# Module-level limiter for UniProt requests: at most 3 calls per 1.0-second period.
uniprot_limiter = RateLimiter(max_calls=3, period=1.0)
|
66
|
+
# Module-level limiter for NCBI Entrez requests: at most 3 calls per 1.0-second period.
ncbi_limiter = RateLimiter(max_calls=3, period=1.0)
|
67
|
+
|
68
|
+
|
69
|
+
def normalize_gene_id(gene_id: str) -> str:
    """Strip any colon-delimited prefix from a gene ID.

    Everything up to and including the last ``:`` is dropped, so
    ``"UniProtKB:P12345"`` becomes ``"P12345"``. IDs without a colon are
    returned unchanged.

    Args:
        gene_id: The gene ID

    Returns:
        The text after the last colon, or the whole ID when there is none
    """
    # rpartition yields ('', '', gene_id) when no colon is present, so the
    # last element is the right answer in both cases.
    _prefix, _colon, bare_id = gene_id.rpartition(":")
    return bare_id
|
81
|
+
|
82
|
+
|
83
|
+
def is_uniprot_id(gene_id: str) -> bool:
    """Check if the gene ID appears to be a UniProt accession.

    The ID is matched against the accession format published by UniProt
    (6 or 10 characters, e.g. ``P12345`` or ``A0A024R161``), optionally
    followed by an isoform suffix such as ``-2``. Matching is case-sensitive.

    This is a format check only: gene symbols such as ``PAX6`` or ``POU5F1``
    are rejected, but a rare symbol that happens to share the accession
    layout (e.g. ``B3GAT1``) still matches.

    Args:
        gene_id: The gene ID to check

    Returns:
        True if it has the format of a UniProt accession, False otherwise
    """
    import re  # function-scope import keeps this block self-contained

    accession_pattern = (
        r"(?:[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})"
        r"(?:-[0-9]+)?"
    )
    return re.fullmatch(accession_pattern, gene_id) is not None
|
94
|
+
|
95
|
+
|
96
|
+
def lookup_uniprot_accession(ctx: RunContext[TalismanConfig], gene_symbol: str) -> str:
    """Resolve a gene symbol to a reviewed UniProt accession.

    The symbol is normalized first; if it already looks like a UniProt ID it
    is returned as-is. Otherwise a rate-limited UniProt search restricted to
    reviewed entries is run and the accession of the first hit is returned.
    Any error during the lookup is logged and the symbol is returned instead.

    Args:
        ctx: The run context with access to the config
        gene_symbol: The gene symbol to look up

    Returns:
        UniProt accession if found, or the (normalized) symbol if not found
    """
    logging.info(f"Looking up UniProt accession for: {gene_symbol}")

    cfg = ctx.deps or get_config()
    client = cfg.get_uniprot_client()

    try:
        gene_symbol = normalize_gene_id(gene_symbol)

        # Nothing to resolve when the input is already an accession.
        if is_uniprot_id(gene_symbol):
            logging.info(f"{gene_symbol} appears to be a UniProt ID already")
            return gene_symbol

        # Respect the UniProt rate limit before issuing the request.
        uniprot_limiter.wait()

        logging.info(f"Searching UniProt for gene symbol: {gene_symbol}")
        query = f'gene:{gene_symbol} AND reviewed:yes'
        tsv = client.search(query, frmt="tsv", columns="accession,gene_names")

        # Row 0 is the TSV header; the first data row (if any) holds the hit.
        body = tsv.strip() if tsv else ""
        data_rows = body.split('\n')[1:]
        if data_rows:
            accession = data_rows[0].split('\t')[0]
            logging.info(f"Found UniProt accession: {accession} for {gene_symbol}")
            return accession

        logging.info(f"No UniProt accession found for {gene_symbol}, using original symbol")
        return gene_symbol
    except Exception as e:
        # Best effort: fall back to the symbol when the lookup fails.
        logging.warning(f"Error looking up UniProt accession for {gene_symbol}: {str(e)}")
        return gene_symbol
|
141
|
+
|
142
|
+
|
143
|
+
def _decode_entrez_bytes(payload: Any) -> Any:
    """Decode an Entrez payload to ``str`` when it arrives as bytes; pass anything else through."""
    if isinstance(payload, bytes):
        # errors='replace' never raises, so no exception needs to be swallowed.
        return payload.decode('utf-8', errors='replace')
    return payload


def _search_entrez_ids(ncbi: Any, database: str, query: str) -> List[str]:
    """Run one rate-limited ESearch and return the matching ID list (possibly empty)."""
    ncbi_limiter.wait()
    search_results = ncbi.ESearch(database, query)
    return search_results.get('idlist', []) or []


def _fetch_entrez_gene_record(ncbi: Any, entrez_gene_id: str) -> str:
    """Fetch one record from the gene database (rate-limited) and format it for the caller."""
    ncbi_limiter.wait()
    gene_data = _decode_entrez_bytes(ncbi.EFetch("gene", id=entrez_gene_id))
    return f"NCBI Entrez Gene Information:\n{gene_data}"


def _format_protein_summary(summary_data: Any, protein_id: str) -> str:
    """Build the "Title/Organism" header from an ESummary response ('' when nothing usable)."""
    if not (isinstance(summary_data, dict) and summary_data):
        return ""

    # Response keyed directly by the protein ID.
    if protein_id in summary_data:
        details = summary_data[protein_id]
        title = details.get('title', 'No title available')
        source_organism = details.get('organism', 'Unknown organism')
        logging.info(f"Found protein: {title} ({source_organism})")
        return f"Title: {title}\nOrganism: {source_organism}\n\n"

    # Other layouts: take title/organism from whichever nested dicts carry them.
    title = None
    source_organism = None
    for value in summary_data.values():
        if isinstance(value, dict):
            if 'title' in value:
                title = value['title']
            if 'organism' in value:
                source_organism = value['organism']

    if not (title or source_organism):
        return ""
    if title:
        logging.info(f"Found protein: {title}")
    return f"Title: {title or 'Not available'}\nOrganism: {source_organism or 'Unknown'}\n\n"


def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism: Optional[str] = None) -> Optional[str]:
    """Look up gene information in NCBI Entrez.

    Databases are tried in this order, returning the first hit:

    1. ``gene``, restricted to each candidate organism in priority order,
       then once more without an organism restriction;
    2. ``protein`` (FASTA sequence plus a title/organism header);
    3. ``nuccore`` (GenBank text).

    Every Entrez request goes through the module's NCBI rate limiter.

    Args:
        ctx: The run context with access to the config
        gene_id: Gene ID or symbol to look up
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        Gene information from NCBI if found, or None if nothing was found or
        the lookup failed (failures are logged as warnings)
    """
    logging.info(f"Looking up NCBI information for: {gene_id}")

    config = ctx.deps or get_config()
    ncbi = config.get_ncbi_client()

    # Check if the gene looks like bacterial (common for Salmonella)
    bacterial_gene_patterns = ("inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg")
    is_likely_bacterial = gene_id.lower().startswith(bacterial_gene_patterns)

    # Organisms to try, in priority order. An explicit organism always wins;
    # otherwise guess from the gene name, defaulting to human.
    if organism:
        organisms_to_try = [organism]
    elif is_likely_bacterial:
        organisms_to_try = ["Salmonella", "Escherichia coli", "Bacteria"]
    else:
        organisms_to_try = ["Homo sapiens"]

    try:
        # --- gene database: every organism-constrained search first ...
        for org in organisms_to_try:
            logging.info(f"Searching NCBI gene database for: {gene_id} in organism: {org}")
            gene_ids = _search_entrez_ids(ncbi, "gene", f"{gene_id}[Gene Symbol] AND {org}[Organism]")
            if gene_ids:
                logging.info(f"Found gene ID: {gene_ids[0]} in {org}, fetching details")
                return _fetch_entrez_gene_record(ncbi, gene_ids[0])

        # ... and only then the unconstrained fallback, run exactly once so it
        # cannot pre-empt a lower-priority organism.
        logging.info(f"Trying gene symbol search without organism constraint for: {gene_id}")
        gene_ids = _search_entrez_ids(ncbi, "gene", f"{gene_id}[Gene Symbol]")
        if gene_ids:
            logging.info(f"Found gene ID: {gene_ids[0]}, fetching details")
            return _fetch_entrez_gene_record(ncbi, gene_ids[0])

        # --- protein database
        # For bacterial genes, search organism by organism; otherwise search
        # without an organism constraint.
        protein_ids = []
        if is_likely_bacterial:
            for org in organisms_to_try:
                logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
                protein_ids = _search_entrez_ids(ncbi, "protein", f"{gene_id} AND {org}[Organism]")
                if protein_ids:
                    logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
                    break
        else:
            logging.info(f"Searching NCBI protein database for: {gene_id}")
            protein_ids = _search_entrez_ids(ncbi, "protein", gene_id)

        if protein_ids:
            protein_id = protein_ids[0]
            logging.info(f"Found protein ID: {protein_id}, fetching sequence")
            ncbi_limiter.wait()
            protein_data = _decode_entrez_bytes(
                ncbi.EFetch("protein", id=protein_id, rettype="fasta", retmode="text")
            )
            # Some responses arrive as the *repr* of a bytes object ("b'...'");
            # strip the wrapper and restore the escaped newlines.
            if isinstance(protein_data, str) and protein_data.startswith('b\''):
                protein_data = protein_data[2:-1].replace('\\n', '\n')

            # Get additional details with esummary
            logging.info(f"Fetching protein summary for: {protein_id}")
            ncbi_limiter.wait()
            summary_data = ncbi.ESummary("protein", id=protein_id)
            protein_summary = _format_protein_summary(summary_data, protein_id)

            combined_data = f"{protein_summary}{protein_data}"
            return f"NCBI Entrez Protein Information:\n{combined_data}"

        # --- nucleotide database
        logging.info(f"No protein found, trying NCBI nucleotide database for: {gene_id}")
        nuccore_ids = _search_entrez_ids(ncbi, "nuccore", gene_id)
        if nuccore_ids:
            nuccore_id = nuccore_ids[0]
            logging.info(f"Found nucleotide ID: {nuccore_id}, fetching details")
            ncbi_limiter.wait()
            nuccore_data = _decode_entrez_bytes(
                ncbi.EFetch("nuccore", id=nuccore_id, rettype="gb", retmode="text")
            )
            return f"NCBI Entrez Nucleotide Information:\n{nuccore_data}"

        logging.info(f"No information found in NCBI for: {gene_id}")
        return None
    except Exception as e:
        # Return None if lookup fails, so callers that test the result for
        # truthiness do not mistake an error message for gene information.
        logging.warning(f"Error querying NCBI Entrez for {gene_id}: {str(e)}")
        return None
|
304
|
+
|
305
|
+
|
306
|
+
# Columns requested from UniProt for the tabular gene searches below.
_UNIPROT_SEARCH_COLUMNS = "accession,id,gene_names,organism,protein_name,function,cc_disease"


def _has_text(payload: Any) -> bool:
    """True when *payload* is a non-blank string (rejects None, numbers and other non-text values)."""
    return isinstance(payload, str) and payload.strip() != ""


def _has_tsv_rows(payload: Any) -> bool:
    """True when a TSV response holds at least one data row after its header line."""
    return isinstance(payload, str) and len(payload.strip().split('\n')) > 1


def _retrieve_uniprot_entry(u: Any, accession: str) -> Optional[str]:
    """Fetch the text-format UniProt entry for *accession*; None when missing or on error."""
    try:
        logging.info(f"Performing direct UniProt lookup for: {accession}")
        uniprot_limiter.wait()
        result = u.retrieve(accession, frmt="txt")
    except Exception as e:
        # Best effort: the caller continues with a search when this fails.
        logging.warning(f"Error in direct UniProt lookup: {str(e)}")
        return None

    if _has_text(result):
        logging.info(f"Found direct UniProt entry for: {accession}")
        return result
    logging.info(f"No direct UniProt entry found for: {accession}")
    return None


def _search_uniprot(u: Any, gene_id: str) -> Optional[str]:
    """Search UniProt for *gene_id*: a fielded query first, then a free-text query; None when no rows."""
    logging.info(f"Performing UniProt search for: {gene_id}")
    uniprot_limiter.wait()
    search_query = f'gene:{gene_id} OR accession:{gene_id} OR id:{gene_id}'
    results = u.search(search_query, frmt="tsv", columns=_UNIPROT_SEARCH_COLUMNS)
    if _has_tsv_rows(results):
        logging.info(f"Found UniProt entries in specific search for: {gene_id}")
        return results

    # Try a broader search if the specific one failed
    logging.info(f"No specific match found, trying broader UniProt search for: {gene_id}")
    uniprot_limiter.wait()
    results = u.search(gene_id, frmt="tsv", columns=_UNIPROT_SEARCH_COLUMNS)
    if _has_tsv_rows(results):
        logging.info(f"Found UniProt entries in broader search for: {gene_id}")
        return results

    logging.info(f"No UniProt entries found in broader search for: {gene_id}")
    return None


def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism: Optional[str] = None) -> str:
    """Get description for a single gene ID, using UniProt and falling back to NCBI Entrez.

    Lookup order:

    1. Normalize the ID (drop any ``prefix:``) and, if it is not already a
       UniProt accession, try to resolve it to one.
    2. Retrieve the UniProt entry directly when an accession is available.
    3. Otherwise search UniProt (fielded query, then free text).
    4. If UniProt yields nothing or fails, query NCBI Entrez with the gene
       symbol and the given or auto-detected organism.

    Args:
        ctx: The run context with access to the config
        gene_id: The gene identifier (UniProt ID, gene symbol, etc.)
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        The gene description: UniProt text/TSV when found, otherwise the NCBI record

    Raises:
        ModelRetry: If neither source has information, or an unexpected error occurs
    """
    logging.info(f"Getting description for gene: {gene_id}")
    config = ctx.deps or get_config()
    u = config.get_uniprot_client()

    try:
        # Normalize first so the bacterial-name check below sees the bare
        # symbol rather than a "prefix:" in front of it.
        gene_id = normalize_gene_id(gene_id)
        logging.info(f"Normalized gene ID: {gene_id}")

        # Auto-detect organism based on gene pattern
        bacterial_gene_patterns = ("inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che")
        if not organism and gene_id.lower().startswith(bacterial_gene_patterns):
            logging.info(f"Gene {gene_id} matches bacterial pattern, setting organism to Salmonella")
            organism = "Salmonella"

        # Resolve a gene symbol to a UniProt accession where possible. The
        # original symbol is kept in gene_id for the NCBI fallback, because an
        # accession would not match an Entrez "[Gene Symbol]" query.
        uniprot_id = gene_id
        if not is_uniprot_id(gene_id):
            logging.info(f"Not a UniProt ID, looking up accession for: {gene_id}")
            uniprot_id = lookup_uniprot_accession(ctx, gene_id)
            if uniprot_id != gene_id:
                logging.info(f"Using UniProt ID: {uniprot_id} instead of {gene_id}")

        # Direct lookup for UniProt IDs
        uniprot_info = None
        if is_uniprot_id(uniprot_id):
            uniprot_info = _retrieve_uniprot_entry(u, uniprot_id)

        # If we don't have UniProt info yet, try the search
        if not uniprot_info:
            try:
                uniprot_info = _search_uniprot(u, uniprot_id)
            except Exception as e:
                # A failed UniProt search must not prevent the NCBI fallback.
                logging.warning(f"Error in UniProt search: {str(e)}")

        if uniprot_info:
            logging.info(f"Returning UniProt information for: {gene_id}")
            return uniprot_info

        # Check NCBI Entrez if we couldn't find anything in UniProt
        logging.info(f"No UniProt information found, checking NCBI for: {gene_id}")
        # Pass the organism if we have one or auto-detected one
        ncbi_info = get_ncbi_gene_info(ctx, gene_id, organism)
        if ncbi_info:
            logging.info(f"Found NCBI information for: {gene_id}")
            logging.info(f"Returning NCBI information for: {gene_id}")
            return ncbi_info

        logging.warning(f"No NCBI information found for: {gene_id}")
        logging.error(f"No gene information found for: {gene_id} in either UniProt or NCBI")
        raise ModelRetry(f"No gene information found for: {gene_id} in either UniProt or NCBI Entrez")

    except ModelRetry:
        # Already in the form the agent expects; let it propagate untouched.
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene description for {gene_id}: {str(e)}")
        raise ModelRetry(f"Error retrieving gene description: {str(e)}") from e
|
417
|
+
|
418
|
+
|
419
|
+
def get_gene_descriptions(ctx: RunContext[TalismanConfig], gene_ids: List[str]) -> str:
    """Get descriptions for multiple gene IDs.

    Each gene is looked up independently with ``get_gene_description``. A
    failure for one gene is recorded as an ``Error:`` entry in the output
    rather than aborting the whole batch. The successful lookups are also
    stored on ``ctx.gene_info_dict`` (gene ID -> description text).

    Args:
        ctx: The run context with access to the config
        gene_ids: List of gene identifiers

    Returns:
        One ``## Gene: <id>`` markdown section per requested gene, joined by newlines

    Raises:
        ModelRetry: If no gene IDs were provided, if the lookup failed for
            every gene, or if an unexpected error occurred.
    """
    # Guard first so an empty (or None) argument never reaches len()/join below.
    if not gene_ids:
        logging.error("No gene IDs provided")
        raise ModelRetry("No gene IDs provided")

    total = len(gene_ids)
    logging.info(f"Retrieving descriptions for {total} genes: {', '.join(gene_ids)}")

    try:
        results = []
        gene_info_dict = {}

        for position, gene_id in enumerate(gene_ids, start=1):
            logging.info(f"Processing gene {position}/{total}: {gene_id}")
            try:
                gene_info = get_gene_description(ctx, gene_id)
            except Exception as e:
                # Per-gene failures are reported inline so the remaining genes are still processed.
                logging.warning(f"Error retrieving information for {gene_id}: {str(e)}")
                results.append(f"## Gene: {gene_id}\nError: {str(e)}\n")
                continue

            results.append(f"## Gene: {gene_id}\n{gene_info}\n")
            gene_info_dict[gene_id] = gene_info
            logging.info(f"Successfully retrieved information for {gene_id}")

        # `results` always holds one entry per gene (success or error), so the
        # "nothing found" condition has to be judged on the successful lookups.
        if not gene_info_dict:
            logging.error("No gene information found for any of the provided IDs")
            raise ModelRetry("No gene information found for any of the provided IDs")

        # Expose the successful lookups on the context for later steps
        # (state only available in test context).
        ctx.gene_info_dict = gene_info_dict
        logging.info(f"Successfully retrieved information for {len(gene_info_dict)} genes")

        return "\n".join(results)
    except ModelRetry:
        # Already in the form the caller expects; re-raise with its traceback intact.
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene descriptions: {str(e)}")
        raise ModelRetry(f"Error retrieving gene descriptions: {str(e)}") from e
|
471
|
+
|
472
|
+
|
473
|
+
def parse_gene_list(gene_list: str) -> List[str]:
    """Parse a string containing gene IDs or symbols into a list.

    Commas, semicolons and any run of whitespace (spaces, tabs, ``\\n``,
    ``\\r``, ``\\r\\n``, ...) are all treated as separators. Order and
    duplicates are preserved.

    Args:
        gene_list: String of gene identifiers separated by commas, spaces, semicolons, or newlines

    Returns:
        List of gene identifiers; empty if the input is empty or contains only separators
    """
    if not gene_list:
        return []

    # Turn the punctuation separators into spaces, then let the no-argument
    # str.split() handle every kind of whitespace and drop empty fields.
    normalized = gene_list.replace(",", " ").replace(";", " ")
    return normalized.split()
|
492
|
+
|
493
|
+
|
494
|
+
def get_genes_from_list(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Parse a delimited string of gene IDs and fetch a description for each one.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        The output of ``get_gene_descriptions`` for the parsed identifiers

    Raises:
        ModelRetry: If no gene identifier could be parsed from ``gene_list``
    """
    logging.info(f"Parsing gene list: {gene_list}")
    parsed_ids = parse_gene_list(gene_list)

    if parsed_ids:
        logging.info(f"Parsed {len(parsed_ids)} gene IDs: {', '.join(parsed_ids)}")
        return get_gene_descriptions(ctx, parsed_ids)

    # Nothing usable in the input: report it and ask for a retry.
    failure = "No gene IDs could be parsed from the input string"
    logging.error(failure)
    raise ModelRetry(failure)
|
513
|
+
|
514
|
+
|
515
|
+
def _save_gene_set_analysis(gene_ids, model_name, response, result):
    """Write one analysis run to ~/talisman_results as timestamped JSON and return the file path."""
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # Create a directory for analysis results if it doesn't exist
    results_dir = os.path.join(os.path.expanduser("~"), "talisman_results")
    os.makedirs(results_dir, exist_ok=True)

    file_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.json")
    logging.info(f"Saving analysis results to: {file_path}")

    # Build the payload before opening the file so a serialisation problem
    # does not leave an empty file behind.
    output_data = {
        "timestamp": timestamp,
        "genes_analyzed": gene_ids,
        "model": model_name,
        "raw_response": response.model_dump(),
        "analysis_result": result,
    }
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)

    return file_path


def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Analyze a set of genes and generate a biological summary of their properties and relationships.

    Steps:
      1. Parse ``gene_list`` and guess an organism from gene-name prefixes.
      2. Look up every gene once via ``get_gene_description``, passing the
         guessed organism. A failed lookup is recorded as an ``Error:`` entry
         and does not stop the remaining lookups.
      3. Send the collected descriptions to the OpenAI chat completions API.
      4. Save the response as JSON under ``~/talisman_results`` (best effort)
         and return the generated text.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        A structured biological summary of the gene set

    Raises:
        ModelRetry: If no gene ID can be parsed, if no gene could be described,
            or if the OpenAI request fails.
    """
    logging.info(f"Starting gene set analysis for: {gene_list}")

    gene_ids_list = parse_gene_list(gene_list)
    if not gene_ids_list:
        logging.error("No gene IDs could be parsed from the input string")
        raise ModelRetry("No gene IDs could be parsed from the input string")

    # Detect if these look like bacterial genes. All prefixes are lower-case
    # because they are compared against lower-cased IDs; an upper-case "DVU"
    # pattern could never match.
    # NOTE(review): matching is case-insensitive, so human symbols such as
    # SPARC also match "spa" -- confirm this heuristic is acceptable.
    salmonella_prefixes = ("inv", "sip", "sop", "sic", "spa")
    desulfovibrio_prefixes = ("dvu",)
    other_bacterial_prefixes = ("ssa", "sse", "prg", "flh", "fli", "che")
    bacterial_prefixes = salmonella_prefixes + desulfovibrio_prefixes + other_bacterial_prefixes
    lowered_ids = [gene_id.lower() for gene_id in gene_ids_list]

    # Set organism based on pattern detection
    organism = None
    if any(lowered.startswith(bacterial_prefixes) for lowered in lowered_ids):
        logging.info(f"Detected likely bacterial genes: {gene_list}")
        if any(lowered.startswith(salmonella_prefixes) for lowered in lowered_ids):
            organism = "Salmonella"
            logging.info("Setting organism to Salmonella based on gene patterns")
        elif any(lowered.startswith(desulfovibrio_prefixes) for lowered in lowered_ids):
            organism = "Desulfovibrio"
            logging.info("Setting organism to Desulfovibrio based on gene patterns")

    # Get detailed information about each gene in a single organism-aware pass.
    logging.info("Retrieving gene descriptions...")
    sections = []
    gene_info_dict = {}
    for gene_id in gene_ids_list:
        logging.info(f"Processing {gene_id} with organism context: {organism}")
        try:
            gene_info = get_gene_description(ctx, gene_id, organism)
        except Exception as e:
            # One unknown gene should not sink the whole analysis.
            logging.warning(f"Error retrieving information for {gene_id}: {str(e)}")
            sections.append(f"## Gene: {gene_id}\nError: {str(e)}\n")
            continue
        sections.append(f"## Gene: {gene_id}\n{gene_info}\n")
        gene_info_dict[gene_id] = gene_info

    if not gene_info_dict:
        logging.error("No gene information was found to analyze")
        raise ModelRetry("No gene information was found to analyze")

    # Keep the successful lookups available on the context, as get_gene_descriptions does.
    ctx.gene_info_dict = gene_info_dict
    gene_descriptions = "\n".join(sections)
    logging.info("Gene descriptions retrieved successfully")

    gene_ids = list(gene_info_dict)
    gene_id_summary = ', '.join(gene_ids)
    logging.info(f"Analyzing relationships between {len(gene_ids)} genes: {gene_id_summary}")

    # Extract organism information from the gene descriptions if possible:
    # the first keyword found in the first description that contains one wins.
    detected_organism = None
    organism_keywords = ["Salmonella", "Escherichia", "Desulfovibrio", "Homo sapiens", "human"]
    for gene_info in gene_info_dict.values():
        lowered_info = gene_info.lower()
        for keyword in organism_keywords:
            if keyword.lower() in lowered_info:
                detected_organism = keyword
                break
        if detected_organism:
            break

    if detected_organism:
        logging.info(f"Detected organism from gene descriptions: {detected_organism}")

    # An organism found in the descriptions takes precedence over the prefix guess.
    organism_context = detected_organism or organism
    organism_note = (
        f"IMPORTANT: These genes are from {organism_context}. Make sure your analysis reflects the correct organism context."
        if organism_context
        else ""
    )

    # Prepare a prompt for the LLM
    prompt = f"""Analyze the following set of genes and provide a detailed biological summary:

Gene IDs/Symbols: {gene_id_summary}

Gene Information:
{gene_descriptions}

{organism_note}

Based on this information, provide a structured analysis covering:
1. Shared biological processes these genes may participate in
2. Potential protein-protein interactions or functional relationships
3. Common cellular localization patterns
4. Involvement in similar pathways
5. Coordinated activities or cooperative functions
6. Any disease associations that multiple genes in this set share

Focus particularly on identifying relationships between at least a pair of these genes.
If the genes appear unrelated, note this but try to identify any subtle connections based on their function.

Your analysis should include multiple kinds of relationships:
- Functional relationships
- Pathway relationships
- Regulatory relationships
- Localization patterns
- Physical interactions
- Genetic interactions

Format the response with appropriate markdown headings and bullet points.

IMPORTANT: You MUST include ALL of the following sections in your response:

1. First provide your detailed analysis with appropriate headings for each section.

2. After your analysis, include a distinct section titled "## Terms"
that contains a semicolon-delimited list of functional terms relevant to the gene set,
ordered by relevance. These terms should include:
- Gene Ontology biological process terms (e.g., DNA repair, oxidative phosphorylation, signal transduction)
- Molecular function terms (e.g., kinase activity, DNA binding, transporter activity)
- Cellular component/localization terms (e.g., nucleus, plasma membrane, mitochondria)
- Pathway names (e.g., glycolysis, TCA cycle, MAPK signaling)
- Co-regulation terms (e.g., stress response regulon, heat shock response)
- Interaction networks (e.g., protein complex formation, signaling cascade)
- Metabolic process terms (e.g., fatty acid synthesis, amino acid metabolism)
- Regulatory mechanisms (e.g., transcriptional regulation, post-translational modification)
- Disease associations (if relevant, e.g., virulence, pathogenesis, antibiotic resistance)
- Structural and functional domains/motifs (e.g., helix-turn-helix, zinc finger)

Example of Terms section:
## Terms
DNA damage response; p53 signaling pathway; apoptosis; cell cycle regulation; tumor suppression; DNA repair; protein ubiquitination; transcriptional regulation; nuclear localization; cancer predisposition

3. After the Terms section, include a summary table of the genes analyzed titled "## Gene Summary Table"
Format it as a markdown table with the following columns in this exact order:
- ID: The gene identifier (same as Gene Symbol)
- Annotation: Genomic coordinates or accession with position information
- Genomic Context: Information about the genomic location (chromosome, plasmid, etc.)
- Organism: The organism the gene belongs to
- Description: The protein/gene function description

Make sure the information is accurate based on the gene information provided and do not conflate with similarly named genes from different organisms.

Example:

## Gene Summary Table
| ID | Annotation | Genomic Context | Organism | Description |
|-------------|-------------|----------|----------------|------------|
| BRCA1 | NC_000017.11 (43044295..43125483) | Chromosome 17 | Homo sapiens | Breast cancer type 1 susceptibility protein |
| TP53 | NC_000017.11 (7668402..7687550) | Chromosome 17 | Homo sapiens | Tumor suppressor protein |

For bacterial genes, the table should look like:

## Gene Summary Table
| ID | Annotation | Genomic Context | Organism | Description |
|-------------|-------------|----------|----------------|------------|
| invA | NC_003197.2 (3038407..3040471, complement) | Chromosome | Salmonella enterica | Invasion protein |
| DVUA0001 | NC_005863.1 (699..872, complement) | Plasmid pDV | Desulfovibrio vulgaris str. Hildenborough | Hypothetical protein |

REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summary Table.
"""

    # Access OpenAI API to generate the analysis
    try:
        # Use the configured model name / API key if available
        model_name = getattr(ctx.deps, "model_name", "gpt-4o") if ctx.deps else "gpt-4o"
        api_key = getattr(ctx.deps, "openai_api_key", None) if ctx.deps else None

        logging.info(f"Generating biological analysis using model: {model_name}")

        if api_key:
            # Sets the module-level key of the openai package.
            openai.api_key = api_key

        # Create the completion using OpenAI API
        logging.info("Sending request to OpenAI API...")
        response = openai.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a biology expert analyzing gene sets to identify functional relationships. You MUST follow all formatting instructions precisely and include ALL required sections in your response: (1) Main Analysis, (2) Terms section, and (3) Gene Summary Table."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=4000,
        )
        logging.info("Received response from OpenAI API")

        # Extract the response content
        result = response.choices[0].message.content
    except Exception as e:
        logging.error(f"Error generating gene set analysis: {str(e)}")
        raise ModelRetry(f"Error generating gene set analysis: {str(e)}") from e

    # Persisting the run is best effort: a disk or serialisation problem must
    # not throw away an analysis that was already generated.
    try:
        file_path = _save_gene_set_analysis(gene_ids, model_name, response, result)
    except Exception as e:
        logging.warning(f"Analysis complete, but results could not be saved: {str(e)}")
    else:
        logging.info(f"Analysis complete. Results saved to: {file_path}")

    return result
|