aurelian 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aurelian/__init__.py +9 -0
- aurelian/agents/__init__.py +0 -0
- aurelian/agents/amigo/__init__.py +3 -0
- aurelian/agents/amigo/amigo_agent.py +77 -0
- aurelian/agents/amigo/amigo_config.py +85 -0
- aurelian/agents/amigo/amigo_evals.py +73 -0
- aurelian/agents/amigo/amigo_gradio.py +52 -0
- aurelian/agents/amigo/amigo_mcp.py +152 -0
- aurelian/agents/amigo/amigo_tools.py +152 -0
- aurelian/agents/biblio/__init__.py +42 -0
- aurelian/agents/biblio/biblio_agent.py +95 -0
- aurelian/agents/biblio/biblio_config.py +40 -0
- aurelian/agents/biblio/biblio_gradio.py +67 -0
- aurelian/agents/biblio/biblio_mcp.py +115 -0
- aurelian/agents/biblio/biblio_tools.py +164 -0
- aurelian/agents/biblio_agent.py +46 -0
- aurelian/agents/checklist/__init__.py +44 -0
- aurelian/agents/checklist/checklist_agent.py +86 -0
- aurelian/agents/checklist/checklist_config.py +28 -0
- aurelian/agents/checklist/checklist_gradio.py +70 -0
- aurelian/agents/checklist/checklist_mcp.py +86 -0
- aurelian/agents/checklist/checklist_tools.py +141 -0
- aurelian/agents/checklist/content/checklists.yaml +7 -0
- aurelian/agents/checklist/content/streams.csv +136 -0
- aurelian/agents/checklist_agent.py +40 -0
- aurelian/agents/chemistry/__init__.py +3 -0
- aurelian/agents/chemistry/chemistry_agent.py +47 -0
- aurelian/agents/chemistry/chemistry_config.py +71 -0
- aurelian/agents/chemistry/chemistry_evals.py +79 -0
- aurelian/agents/chemistry/chemistry_gradio.py +50 -0
- aurelian/agents/chemistry/chemistry_mcp.py +120 -0
- aurelian/agents/chemistry/chemistry_tools.py +121 -0
- aurelian/agents/chemistry/image_agent.py +15 -0
- aurelian/agents/d4d/__init__.py +30 -0
- aurelian/agents/d4d/d4d_agent.py +73 -0
- aurelian/agents/d4d/d4d_config.py +46 -0
- aurelian/agents/d4d/d4d_gradio.py +58 -0
- aurelian/agents/d4d/d4d_mcp.py +71 -0
- aurelian/agents/d4d/d4d_tools.py +157 -0
- aurelian/agents/d4d_agent.py +64 -0
- aurelian/agents/diagnosis/__init__.py +33 -0
- aurelian/agents/diagnosis/diagnosis_agent.py +54 -0
- aurelian/agents/diagnosis/diagnosis_config.py +48 -0
- aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
- aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
- aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
- aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
- aurelian/agents/diagnosis_agent.py +28 -0
- aurelian/agents/draw/__init__.py +3 -0
- aurelian/agents/draw/draw_agent.py +39 -0
- aurelian/agents/draw/draw_config.py +26 -0
- aurelian/agents/draw/draw_gradio.py +50 -0
- aurelian/agents/draw/draw_mcp.py +94 -0
- aurelian/agents/draw/draw_tools.py +100 -0
- aurelian/agents/draw/judge_agent.py +18 -0
- aurelian/agents/filesystem/__init__.py +0 -0
- aurelian/agents/filesystem/filesystem_config.py +27 -0
- aurelian/agents/filesystem/filesystem_gradio.py +49 -0
- aurelian/agents/filesystem/filesystem_mcp.py +89 -0
- aurelian/agents/filesystem/filesystem_tools.py +95 -0
- aurelian/agents/filesystem/py.typed +0 -0
- aurelian/agents/github/__init__.py +0 -0
- aurelian/agents/github/github_agent.py +83 -0
- aurelian/agents/github/github_cli.py +248 -0
- aurelian/agents/github/github_config.py +22 -0
- aurelian/agents/github/github_gradio.py +152 -0
- aurelian/agents/github/github_mcp.py +252 -0
- aurelian/agents/github/github_tools.py +408 -0
- aurelian/agents/github/github_tools.py.tmp +413 -0
- aurelian/agents/goann/__init__.py +13 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
- aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
- aurelian/agents/goann/goann_agent.py +90 -0
- aurelian/agents/goann/goann_config.py +90 -0
- aurelian/agents/goann/goann_evals.py +104 -0
- aurelian/agents/goann/goann_gradio.py +62 -0
- aurelian/agents/goann/goann_mcp.py +0 -0
- aurelian/agents/goann/goann_tools.py +65 -0
- aurelian/agents/gocam/__init__.py +52 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
- aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
- aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
- aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
- aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
- aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
- aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
- aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
- aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
- aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
- aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
- aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
- aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
- aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
- aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
- aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
- aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
- aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
- aurelian/agents/gocam/documents/Regulatory Processes in GO-CAM.docx +0 -0
- aurelian/agents/gocam/documents/Regulatory Processes in GO-CAM.pdf +0 -0
- aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
- aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
- aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
- aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
- aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
- aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
- aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
- aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
- aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
- aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
- aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
- aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
- aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
- aurelian/agents/gocam/gocam_agent.py +243 -0
- aurelian/agents/gocam/gocam_config.py +85 -0
- aurelian/agents/gocam/gocam_curator_agent.py +46 -0
- aurelian/agents/gocam/gocam_evals.py +64 -0
- aurelian/agents/gocam/gocam_gradio.py +89 -0
- aurelian/agents/gocam/gocam_mcp.py +224 -0
- aurelian/agents/gocam/gocam_tools.py +294 -0
- aurelian/agents/linkml/__init__.py +0 -0
- aurelian/agents/linkml/linkml_agent.py +62 -0
- aurelian/agents/linkml/linkml_config.py +48 -0
- aurelian/agents/linkml/linkml_evals.py +66 -0
- aurelian/agents/linkml/linkml_gradio.py +45 -0
- aurelian/agents/linkml/linkml_mcp.py +181 -0
- aurelian/agents/linkml/linkml_tools.py +102 -0
- aurelian/agents/literature/__init__.py +3 -0
- aurelian/agents/literature/literature_agent.py +75 -0
- aurelian/agents/literature/literature_config.py +35 -0
- aurelian/agents/literature/literature_gradio.py +52 -0
- aurelian/agents/literature/literature_mcp.py +174 -0
- aurelian/agents/literature/literature_tools.py +182 -0
- aurelian/agents/monarch/__init__.py +0 -0
- aurelian/agents/monarch/monarch_agent.py +45 -0
- aurelian/agents/monarch/monarch_config.py +45 -0
- aurelian/agents/monarch/monarch_gradio.py +51 -0
- aurelian/agents/monarch/monarch_mcp.py +65 -0
- aurelian/agents/monarch/monarch_tools.py +112 -0
- aurelian/agents/oak/__init__.py +0 -0
- aurelian/agents/oak/oak_config.py +27 -0
- aurelian/agents/oak/oak_gradio.py +57 -0
- aurelian/agents/ontology_mapper/__init__.py +31 -0
- aurelian/agents/ontology_mapper/ontology_mapper_agent.py +57 -0
- aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
- aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
- aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
- aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
- aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
- aurelian/agents/paperqa/__init__.py +27 -0
- aurelian/agents/paperqa/paperqa_agent.py +66 -0
- aurelian/agents/paperqa/paperqa_cli.py +305 -0
- aurelian/agents/paperqa/paperqa_config.py +142 -0
- aurelian/agents/paperqa/paperqa_gradio.py +90 -0
- aurelian/agents/paperqa/paperqa_mcp.py +155 -0
- aurelian/agents/paperqa/paperqa_tools.py +566 -0
- aurelian/agents/phenopackets/__init__.py +3 -0
- aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
- aurelian/agents/phenopackets/phenopackets_config.py +72 -0
- aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
- aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
- aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
- aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
- aurelian/agents/rag/__init__.py +40 -0
- aurelian/agents/rag/rag_agent.py +84 -0
- aurelian/agents/rag/rag_config.py +80 -0
- aurelian/agents/rag/rag_gradio.py +67 -0
- aurelian/agents/rag/rag_mcp.py +107 -0
- aurelian/agents/rag/rag_tools.py +189 -0
- aurelian/agents/rag_agent.py +54 -0
- aurelian/agents/robot/__init__.py +0 -0
- aurelian/agents/robot/assets/__init__.py +3 -0
- aurelian/agents/robot/assets/template.md +384 -0
- aurelian/agents/robot/robot_config.py +25 -0
- aurelian/agents/robot/robot_gradio.py +46 -0
- aurelian/agents/robot/robot_mcp.py +100 -0
- aurelian/agents/robot/robot_ontology_agent.py +139 -0
- aurelian/agents/robot/robot_tools.py +50 -0
- aurelian/agents/talisman/__init__.py +3 -0
- aurelian/agents/talisman/__main__.py +17 -0
- aurelian/agents/talisman/cli.py +70 -0
- aurelian/agents/talisman/run_talisman.py +18 -0
- aurelian/agents/talisman/talisman_agent.py +143 -0
- aurelian/agents/talisman/talisman_config.py +66 -0
- aurelian/agents/talisman/talisman_gradio.py +50 -0
- aurelian/agents/talisman/talisman_mcp.py +75 -0
- aurelian/agents/talisman/talisman_tools.py +962 -0
- aurelian/agents/ubergraph/__init__.py +40 -0
- aurelian/agents/ubergraph/ubergraph_agent.py +72 -0
- aurelian/agents/ubergraph/ubergraph_config.py +79 -0
- aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
- aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
- aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
- aurelian/agents/uniprot/__init__.py +0 -0
- aurelian/agents/uniprot/uniprot_agent.py +43 -0
- aurelian/agents/uniprot/uniprot_config.py +43 -0
- aurelian/agents/uniprot/uniprot_evals.py +99 -0
- aurelian/agents/uniprot/uniprot_gradio.py +48 -0
- aurelian/agents/uniprot/uniprot_mcp.py +168 -0
- aurelian/agents/uniprot/uniprot_tools.py +136 -0
- aurelian/agents/web/__init__.py +0 -0
- aurelian/agents/web/web_config.py +27 -0
- aurelian/agents/web/web_gradio.py +48 -0
- aurelian/agents/web/web_mcp.py +50 -0
- aurelian/agents/web/web_tools.py +121 -0
- aurelian/chat.py +23 -0
- aurelian/cli.py +1004 -0
- aurelian/dependencies/__init__.py +0 -0
- aurelian/dependencies/workdir.py +78 -0
- aurelian/evaluators/model.py +9 -0
- aurelian/evaluators/substring_evaluator.py +30 -0
- aurelian/mcp/__init__.py +0 -0
- aurelian/mcp/amigo_mcp_test.py +86 -0
- aurelian/mcp/config_generator.py +123 -0
- aurelian/mcp/example_config.json +43 -0
- aurelian/mcp/generate_sample_config.py +37 -0
- aurelian/mcp/gocam_mcp_test.py +126 -0
- aurelian/mcp/linkml_mcp_tools.py +190 -0
- aurelian/mcp/mcp_discovery.py +87 -0
- aurelian/mcp/mcp_test.py +31 -0
- aurelian/mcp/phenopackets_mcp_test.py +103 -0
- aurelian/tools/__init__.py +0 -0
- aurelian/tools/web/__init__.py +0 -0
- aurelian/tools/web/url_download.py +51 -0
- aurelian/utils/__init__.py +0 -0
- aurelian/utils/async_utils.py +18 -0
- aurelian/utils/data_utils.py +32 -0
- aurelian/utils/documentation_manager.py +59 -0
- aurelian/utils/doi_fetcher.py +238 -0
- aurelian/utils/ontology_utils.py +68 -0
- aurelian/utils/pdf_fetcher.py +23 -0
- aurelian/utils/process_logs.py +100 -0
- aurelian/utils/pubmed_utils.py +238 -0
- aurelian/utils/pytest_report_to_markdown.py +67 -0
- aurelian/utils/robot_ontology_utils.py +112 -0
- aurelian/utils/search_utils.py +95 -0
- aurelian-0.1.0.dist-info/LICENSE +22 -0
- aurelian-0.1.0.dist-info/METADATA +109 -0
- aurelian-0.1.0.dist-info/RECORD +266 -0
- aurelian-0.1.0.dist-info/WHEEL +4 -0
- aurelian-0.1.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,962 @@
|
|
1
|
+
"""
|
2
|
+
Tools for retrieving gene information using the UniProt API and NCBI Entrez.
|
3
|
+
"""
|
4
|
+
from typing import Dict, List, Optional, Tuple, Any
|
5
|
+
from pydantic import BaseModel, Field
|
6
|
+
import re
|
7
|
+
import openai
|
8
|
+
import time
|
9
|
+
import threading
|
10
|
+
import json
|
11
|
+
import os
|
12
|
+
import datetime
|
13
|
+
import logging
|
14
|
+
|
15
|
+
from pydantic_ai import RunContext, ModelRetry
|
16
|
+
|
17
|
+
from .talisman_config import TalismanConfig, get_config
|
18
|
+
|
19
|
+
# Define data models for structured output
|
20
|
+
# Structured-output model: one enriched term and the genes it covers.
class FunctionalTerm(BaseModel):
    """A functional term associated with genes."""

    # Field(...) makes every attribute required; the description strings are
    # carried into the generated schema for structured output.
    term: str = Field(..., description="The biological term or concept")
    genes: List[str] = Field(..., description="List of genes associated with this term")
    source: str = Field(..., description="The source database or ontology (GO-BP, KEGG, Reactome, etc.)")
|
25
|
+
|
26
|
+
# Structured-output model: per-gene row of the gene summary table.
class GeneSummary(BaseModel):
    """Summary information for a gene."""

    # All fields are required (Field(...)); descriptions feed the schema
    # used when the agent emits structured output.
    id: str = Field(..., description="The gene identifier (Gene Symbol)")
    annotation: str = Field(..., description="Genomic coordinates or accession with position")
    genomic_context: str = Field(..., description="Information about genomic location (chromosome, etc.)")
    organism: str = Field(..., description="The organism the gene belongs to")
    description: str = Field(..., description="The protein/gene function description")
|
33
|
+
|
34
|
+
# Top-level structured-output model aggregating the whole analysis.
class GeneSetAnalysis(BaseModel):
    """Complete analysis of a gene set."""

    # Every field has a safe default so a partially populated analysis still
    # validates; default_factory=list avoids a shared mutable default.
    input_species: str = Field(default="", description="The species provided by the user")
    inferred_species: str = Field(default="", description="The species inferred from the gene data")
    narrative: str = Field(default="No narrative information available for these genes.",
                           description="Explanation of functional and categorical relationships between genes")
    functional_terms: List[FunctionalTerm] = Field(default_factory=list,
                                                   description="Functional terms associated with the gene set")
    gene_summaries: List[GeneSummary] = Field(default_factory=list,
                                              description="Summary information for each gene")
|
44
|
+
|
45
|
+
# Set up logging
# Module-wide configuration: INFO level, every record tagged "Talisman:".
# NOTE(review): basicConfig at import time affects the root logger of the
# whole process — confirm this is intended for library use.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] Talisman: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
|
51
|
+
|
52
|
+
# Rate limiting implementation
|
53
|
+
class RateLimiter:
    """Simple rate limiter to ensure we don't exceed API rate limits."""

    def __init__(self, max_calls: int = 3, period: float = 1.0):
        """
        Initialize the rate limiter.

        Args:
            max_calls: Maximum number of calls allowed in the period
            period: Time period in seconds
        """
        self.max_calls = max_calls
        self.period = period
        self.calls = []  # timestamps of calls made inside the current window
        self.lock = threading.Lock()

    def wait(self):
        """
        Wait if necessary to respect the rate limit.
        """
        with self.lock:
            moment = time.time()

            # Keep only the timestamps still inside the rolling window.
            self.calls = [stamp for stamp in self.calls if moment - stamp < self.period]

            # At capacity: sleep until the oldest call ages out of the window.
            if len(self.calls) >= self.max_calls:
                pause = self.period - (moment - min(self.calls))
                if pause > 0:
                    time.sleep(pause)
                    # Start a fresh window once the pause is over.
                    self.calls = []

            # Record this call.
            self.calls.append(time.time())
|
91
|
+
|
92
|
+
# Create rate limiters for UniProt and NCBI
# Shared module-level limiters: at most 3 requests per 1-second window to
# each service, across all callers in this module.
uniprot_limiter = RateLimiter(max_calls=3, period=1.0)
ncbi_limiter = RateLimiter(max_calls=3, period=1.0)
|
95
|
+
|
96
|
+
|
97
|
+
def normalize_gene_id(gene_id: str) -> str:
    """Normalize a gene ID by removing any version number or prefix.

    Everything up to and including the last ':' is discarded, so a
    CURIE-style identifier such as 'HGNC:1100' becomes '1100'.

    Args:
        gene_id: The gene ID

    Returns:
        The normalized gene ID
    """
    _, separator, local_part = gene_id.rpartition(":")
    return local_part if separator else gene_id
|
109
|
+
|
110
|
+
|
111
|
+
def is_uniprot_id(gene_id: str) -> bool:
    """Check if the gene ID appears to be a UniProt accession.

    Heuristic only: accepts any identifier that begins with 'P', 'Q' or 'O'
    and contains at least one digit.

    Args:
        gene_id: The gene ID to check

    Returns:
        True if it appears to be a UniProt ID, False otherwise
    """
    # UniProt IDs typically start with O, P, Q and contain numbers
    if not gene_id.startswith(("P", "Q", "O")):
        return False
    return any(character.isdigit() for character in gene_id)
|
122
|
+
|
123
|
+
|
124
|
+
def lookup_uniprot_accession(ctx: RunContext[TalismanConfig], gene_symbol: str) -> str:
    """Look up UniProt accession for a gene symbol.

    Args:
        ctx: The run context with access to the config
        gene_symbol: The gene symbol to look up

    Returns:
        UniProt accession if found, or the original symbol if not found
    """
    logging.info(f"Looking up UniProt accession for: {gene_symbol}")

    config = ctx.deps or get_config()
    client = config.get_uniprot_client()

    try:
        gene_symbol = normalize_gene_id(gene_symbol)

        # Already an accession: nothing to resolve.
        if is_uniprot_id(gene_symbol):
            logging.info(f"{gene_symbol} appears to be a UniProt ID already")
            return gene_symbol

        # Respect the shared UniProt rate limit before hitting the API.
        uniprot_limiter.wait()

        logging.info(f"Searching UniProt for gene symbol: {gene_symbol}")
        search_query = f'gene:{gene_symbol} AND reviewed:yes'
        tsv = client.search(search_query, frmt="tsv", columns="accession,gene_names")

        if tsv and tsv.strip() != "":
            # Row 0 is the TSV header; the accession is column 1 of row 1.
            rows = tsv.strip().split('\n')
            if len(rows) > 1:
                accession = rows[1].split('\t')[0]
                logging.info(f"Found UniProt accession: {accession} for {gene_symbol}")
                return accession

        logging.info(f"No UniProt accession found for {gene_symbol}, using original symbol")
        return gene_symbol
    except Exception as e:
        # Best effort: fall back to the caller's symbol on any failure.
        logging.warning(f"Error looking up UniProt accession for {gene_symbol}: {str(e)}")
        return gene_symbol
|
169
|
+
|
170
|
+
|
171
|
+
def _entrez_payload_to_text(payload: Any) -> Any:
    """Best-effort normalization of an Entrez EFetch payload to text.

    The Entrez client may return raw ``bytes``, or a ``str`` that still
    carries a printed ``b'...'`` prefix; both forms are converted to plain
    text. On any failure the payload is returned unchanged (best effort).
    Previously this logic was duplicated inline with bare ``except:``
    clauses; it is now shared and narrowed to ``except Exception`` so
    SystemExit/KeyboardInterrupt propagate.
    """
    try:
        if isinstance(payload, bytes):
            return payload.decode('utf-8')
        if isinstance(payload, str) and payload.startswith('b\''):
            # Strip the literal b'...' wrapper and unescape newlines.
            return payload[2:-1].replace('\\n', '\n')
    except Exception:
        pass
    return payload


def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism: Optional[str] = None) -> Optional[str]:
    """Look up gene information in NCBI Entrez.

    Databases are tried in order: gene (with, then without, an organism
    constraint), then protein (sequence + summary), then nucleotide.

    Args:
        ctx: The run context with access to the config
        gene_id: Gene ID or symbol to look up
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        Gene information from NCBI if found, None if nothing was found, or an
        "Error querying NCBI Entrez: ..." string if the query itself failed.
    """
    logging.info(f"Looking up NCBI information for: {gene_id}")

    config = ctx.deps or get_config()
    ncbi = config.get_ncbi_client()

    # Use the caller's organism if given, else search unconstrained.
    organisms_to_try = [organism] if organism else [None]

    gene_results = None

    try:
        # --- 1. Gene database ---------------------------------------------
        for org in organisms_to_try:
            # First try to find the gene with an organism constraint.
            if org:
                logging.info(f"Searching NCBI gene database for: {gene_id} in organism: {org}")
                ncbi_limiter.wait()
                search_query = f"{gene_id}[Gene Symbol] AND {org}[Organism]"
                search_results = ncbi.ESearch("gene", search_query)
                gene_ids = search_results.get('idlist', [])

                if gene_ids:
                    gene_id_found = gene_ids[0]
                    logging.info(f"Found gene ID: {gene_id_found} in {org}, fetching details")
                    ncbi_limiter.wait()
                    gene_data = ncbi.EFetch("gene", id=gene_id_found)
                    gene_results = f"NCBI Entrez Gene Information:\n{gene_data}"
                    break

            # Fallback: gene-symbol search without an organism constraint.
            if not gene_results:
                logging.info(f"Trying gene symbol search without organism constraint for: {gene_id}")
                ncbi_limiter.wait()
                search_results = ncbi.ESearch("gene", f"{gene_id}[Gene Symbol]")
                gene_ids = search_results.get('idlist', [])

                if gene_ids:
                    gene_id_found = gene_ids[0]
                    logging.info(f"Found gene ID: {gene_id_found}, fetching details")
                    ncbi_limiter.wait()
                    gene_data = ncbi.EFetch("gene", id=gene_id_found)
                    gene_results = f"NCBI Entrez Gene Information:\n{gene_data}"
                    break

        if gene_results:
            return gene_results

        # --- 2. Protein database ------------------------------------------
        protein_ids = []
        for org in organisms_to_try:
            if org:
                logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
                ncbi_limiter.wait()
                search_query = f"{gene_id} AND {org}[Organism]"
                search_results = ncbi.ESearch("protein", search_query)
                protein_ids = search_results.get('idlist', [])

                if protein_ids:
                    logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
                    break

        # If no results with organism constraint, try without.
        if not protein_ids:
            logging.info(f"Searching NCBI protein database for: {gene_id}")
            ncbi_limiter.wait()
            search_results = ncbi.ESearch("protein", gene_id)
            protein_ids = search_results.get('idlist', [])

        if protein_ids:
            protein_id = protein_ids[0]
            logging.info(f"Found protein ID: {protein_id}, fetching sequence")
            ncbi_limiter.wait()
            protein_data = _entrez_payload_to_text(
                ncbi.EFetch("protein", id=protein_id, rettype="fasta", retmode="text"))

            # Get additional details with esummary.
            logging.info(f"Fetching protein summary for: {protein_id}")
            ncbi_limiter.wait()
            summary_data = ncbi.ESummary("protein", id=protein_id)

            # Extract and format useful summary information. Locals renamed
            # (organism_name) so the `organism` parameter is no longer shadowed.
            protein_summary = ""
            if isinstance(summary_data, dict) and summary_data:
                if protein_id in summary_data:
                    # Newer bioservices versions key the summary by ID.
                    details = summary_data[protein_id]
                    title = details.get('title', 'No title available')
                    organism_name = details.get('organism', 'Unknown organism')
                    protein_summary = f"Title: {title}\nOrganism: {organism_name}\n\n"
                    logging.info(f"Found protein: {title} ({organism_name})")
                else:
                    # Other ESummary layouts: scan nested dicts for the fields.
                    title = None
                    organism_name = None

                    for value in summary_data.values():
                        if isinstance(value, dict):
                            if 'title' in value:
                                title = value['title']
                            if 'organism' in value:
                                organism_name = value['organism']

                    if title or organism_name:
                        protein_summary = f"Title: {title or 'Not available'}\nOrganism: {organism_name or 'Unknown'}\n\n"
                        if title:
                            logging.info(f"Found protein: {title}")

            combined_data = f"{protein_summary}{protein_data}"
            return f"NCBI Entrez Protein Information:\n{combined_data}"

        # --- 3. Nucleotide database ---------------------------------------
        logging.info(f"No protein found, trying NCBI nucleotide database for: {gene_id}")
        ncbi_limiter.wait()
        search_results = ncbi.ESearch("nuccore", gene_id)
        nuccore_ids = search_results.get('idlist', [])

        if nuccore_ids:
            nuccore_id = nuccore_ids[0]
            logging.info(f"Found nucleotide ID: {nuccore_id}, fetching details")
            ncbi_limiter.wait()
            # Shared decode helper also covers the b'...' string case that the
            # nucleotide path previously missed (consistency with the protein path).
            nuccore_data = _entrez_payload_to_text(
                ncbi.EFetch("nuccore", id=nuccore_id, rettype="gb", retmode="text"))
            return f"NCBI Entrez Nucleotide Information:\n{nuccore_data}"

        logging.info(f"No information found in NCBI for: {gene_id}")
        return None
    except Exception as e:
        # Surface the failure as text rather than raising.
        logging.warning(f"Error querying NCBI Entrez for {gene_id}: {str(e)}")
        return f"Error querying NCBI Entrez: {str(e)}"
|
327
|
+
|
328
|
+
|
329
|
+
def _extract_section_body(markdown: str, pattern: str) -> Optional[str]:
    """Return the stripped body of the section matched by *pattern*, or None.

    *pattern* must contain exactly one capture group for the section body.
    Returns None when the pattern does not match or the body is blank.
    """
    match = re.search(pattern, markdown, re.MULTILINE | re.DOTALL)
    if match and match.group(1).strip():
        return match.group(1).strip()
    return None


def _table_rows_from_text(section_body: str) -> str:
    """Re-emit only the lines of *section_body* that look like table rows."""
    rows = ""
    for line in section_body.split("\n"):
        if line.strip() and "|" in line:
            rows += line + "\n"
    return rows


def _functional_terms_rows(gene_set_analysis: GeneSetAnalysis) -> str:
    """Render functional-term table rows from the model, with defaults."""
    if gene_set_analysis.functional_terms:
        rows = ""
        for term in gene_set_analysis.functional_terms:
            genes_str = ", ".join(term.genes)
            rows += f"| {term.term} | {genes_str} | {term.source} |\n"
        return rows
    # No terms in the model: fall back to a single row over all gene IDs.
    gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
    if gene_ids:
        return f"| Gene set | {', '.join(gene_ids)} | Analysis |\n"
    return "| No terms available | - | - |\n"


def _gene_summary_rows(gene_set_analysis: GeneSetAnalysis) -> str:
    """Render gene-summary table rows from the model, with a default row."""
    if gene_set_analysis.gene_summaries:
        rows = ""
        for gene in gene_set_analysis.gene_summaries:
            rows += f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |\n"
        return rows
    return "| No gene information available | - | - | - | - |\n"


def ensure_complete_output(markdown_result: str, gene_set_analysis: GeneSetAnalysis) -> str:
    """Ensures that the markdown output has all required sections.

    The output is always rebuilt from scratch: Species (if known), then
    Narrative, Functional Terms Table and Gene Summary Table. For each
    section, content already present in *markdown_result* is preserved;
    otherwise it is regenerated from the structured model. (The previously
    duplicated present/absent branches are unified via helpers, and the
    unused ``has_species`` probe was removed.)

    Args:
        markdown_result: The original markdown result
        gene_set_analysis: The structured data model

    Returns:
        A complete markdown output with all required sections
    """
    logging.info("Post-processing output to ensure all sections are present")

    result = ""

    # Species section — only when the model carries species information.
    if gene_set_analysis.input_species or gene_set_analysis.inferred_species:
        result += "# Species\n"
        if gene_set_analysis.input_species:
            result += f"Input: {gene_set_analysis.input_species}\n"
        if gene_set_analysis.inferred_species:
            result += f"Inferred: {gene_set_analysis.inferred_species}\n"
        result += "\n"

    # Main header.
    result += "# Gene Set Analysis\n\n"

    # Narrative section — reuse existing text when present, else use the model.
    result += "## Narrative\n"
    narrative_text = None
    if re.search(r'^\s*##\s*Narrative', markdown_result, re.MULTILINE):
        narrative_text = _extract_section_body(
            markdown_result, r'##\s*Narrative\s*\n(.*?)(?=^\s*##|\Z)')
    if narrative_text is not None:
        result += narrative_text + "\n\n"
    else:
        result += f"{gene_set_analysis.narrative}\n\n"

    # Functional terms table — always emitted.
    result += "## Functional Terms Table\n"
    result += "| Functional Term | Genes | Source |\n"
    result += "|-----------------|-------|--------|\n"
    ft_body = None
    if re.search(r'^\s*##\s*Functional Terms Table', markdown_result, re.MULTILINE):
        ft_body = _extract_section_body(
            markdown_result,
            r'##\s*Functional Terms Table\s*\n\|.*\|\s*\n\|[-\s|]*\|\s*\n(.*?)(?=^\s*##|\Z)')
    if ft_body is not None:
        result += _table_rows_from_text(ft_body)
    else:
        result += _functional_terms_rows(gene_set_analysis)
    result += "\n"

    # Gene summary table — always emitted.
    result += "## Gene Summary Table\n"
    result += "| ID | Annotation | Genomic Context | Organism | Description |\n"
    result += "|-------------|-------------|----------|----------------|------------|\n"
    gs_body = None
    if re.search(r'^\s*##\s*Gene Summary Table', markdown_result, re.MULTILINE):
        gs_body = _extract_section_body(
            markdown_result,
            r'##\s*Gene Summary Table\s*\n\|.*\|\s*\n\|[-\s|]*\|\s*\n(.*?)(?=^\s*##|\Z)')
    if gs_body is not None:
        result += _table_rows_from_text(gs_body)
    else:
        result += _gene_summary_rows(gene_set_analysis)

    logging.info("Successfully enforced all required sections in the output")
    return result
|
450
|
+
|
451
|
+
|
452
|
+
def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism: str = None) -> str:
    """Get description for a single gene ID, using UniProt and falling back to NCBI Entrez.

    Args:
        ctx: The run context with access to the config
        gene_id: The gene identifier (UniProt ID, gene symbol, etc.)
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        The gene description in a structured format

    Raises:
        ModelRetry: If no information is found in either UniProt or NCBI, or if
            an unexpected error occurs during retrieval.
    """
    logging.info(f"Getting description for gene: {gene_id}")
    config = ctx.deps or get_config()
    u = config.get_uniprot_client()

    try:
        # Normalize the gene ID
        gene_id = normalize_gene_id(gene_id)
        logging.info(f"Normalized gene ID: {gene_id}")
        uniprot_info = None
        ncbi_info = None

        # First try to look up UniProt accession if it looks like a gene symbol
        if not is_uniprot_id(gene_id):
            logging.info(f"Not a UniProt ID, looking up accession for: {gene_id}")
            uniprot_id = lookup_uniprot_accession(ctx, gene_id)
            # If lookup succeeded (returned a different ID), use that for retrieval
            if uniprot_id != gene_id:
                logging.info(f"Using UniProt ID: {uniprot_id} instead of {gene_id}")
                gene_id = uniprot_id

        # Direct lookup for UniProt IDs
        if is_uniprot_id(gene_id):
            try:
                logging.info(f"Performing direct UniProt lookup for: {gene_id}")
                # Apply rate limiting before hitting the remote API
                uniprot_limiter.wait()
                result = u.retrieve(gene_id, frmt="txt")
                if result and result.strip() != "":
                    logging.info(f"Found direct UniProt entry for: {gene_id}")
                    uniprot_info = result
                else:
                    logging.info(f"No direct UniProt entry found for: {gene_id}")
            except Exception as e:
                # If direct lookup fails, fall through to the search path below
                logging.warning(f"Error in direct UniProt lookup: {str(e)}")

        # If we don't have UniProt info yet, try the search
        if not uniprot_info:
            # Search for the gene with a targeted query first
            logging.info(f"Performing UniProt search for: {gene_id}")
            uniprot_limiter.wait()
            search_query = f'gene:{gene_id} OR accession:{gene_id} OR id:{gene_id}'
            results = u.search(search_query, frmt="tsv",
                               columns="accession,id,gene_names,organism,protein_name,function,cc_disease")

            if not results or results.strip() == "":
                # Try a broader search if the specific one failed
                logging.info(f"No specific match found, trying broader UniProt search for: {gene_id}")
                uniprot_limiter.wait()
                results = u.search(gene_id, frmt="tsv",
                                   columns="accession,id,gene_names,organism,protein_name,function,cc_disease")

                if results and results.strip() != "":
                    logging.info(f"Found UniProt entries in broader search for: {gene_id}")
                    uniprot_info = results
                else:
                    logging.info(f"No UniProt entries found in broader search for: {gene_id}")
            else:
                logging.info(f"Found UniProt entries in specific search for: {gene_id}")
                uniprot_info = results

        # Check NCBI Entrez if we couldn't find anything in UniProt
        if not uniprot_info or uniprot_info.strip() == "":
            logging.info(f"No UniProt information found, checking NCBI for: {gene_id}")
            # Pass the organism if we have one or auto-detected one
            ncbi_info = get_ncbi_gene_info(ctx, gene_id, organism)
            if ncbi_info:
                logging.info(f"Found NCBI information for: {gene_id}")
            else:
                logging.warning(f"No NCBI information found for: {gene_id}")

        # Combine results or use whichever source had information
        if uniprot_info and ncbi_info:
            logging.info(f"Returning combined UniProt and NCBI information for: {gene_id}")
            return f"## UniProt Information\n{uniprot_info}\n\n## NCBI Information\n{ncbi_info}"
        elif uniprot_info:
            logging.info(f"Returning UniProt information for: {gene_id}")
            return uniprot_info
        elif ncbi_info:
            logging.info(f"Returning NCBI information for: {gene_id}")
            return ncbi_info
        else:
            logging.error(f"No gene information found for: {gene_id} in either UniProt or NCBI")
            raise ModelRetry(f"No gene information found for: {gene_id} in either UniProt or NCBI Entrez")

    except ModelRetry:
        # Propagate retry signals untouched. (Previously this was detected with
        # a fragile string match on the exception's type name.)
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene description for {gene_id}: {str(e)}")
        raise ModelRetry(f"Error retrieving gene description: {str(e)}")
|
554
|
+
|
555
|
+
|
556
|
+
def get_gene_descriptions(ctx: RunContext[TalismanConfig], gene_ids: List[str]) -> str:
    """Get descriptions for multiple gene IDs.

    Per-gene failures are recorded in the output rather than aborting the
    whole batch.

    Args:
        ctx: The run context with access to the config
        gene_ids: List of gene identifiers

    Returns:
        The gene descriptions in a structured tabular format

    Raises:
        ModelRetry: If no gene IDs are provided, or if an unexpected error
            occurs while assembling the results.
    """
    logging.info(f"Retrieving descriptions for {len(gene_ids)} genes: {', '.join(gene_ids)}")
    config = ctx.deps or get_config()

    try:
        if not gene_ids:
            logging.error("No gene IDs provided")
            raise ModelRetry("No gene IDs provided")

        results = []
        gene_info_dict = {}

        for i, gene_id in enumerate(gene_ids):
            logging.info(f"Processing gene {i+1}/{len(gene_ids)}: {gene_id}")
            try:
                gene_info = get_gene_description(ctx, gene_id)
                results.append(f"## Gene: {gene_id}\n{gene_info}\n")
                gene_info_dict[gene_id] = gene_info
                logging.info(f"Successfully retrieved information for {gene_id}")
            except Exception as e:
                # Record the per-gene failure but keep processing the rest
                logging.warning(f"Error retrieving information for {gene_id}: {str(e)}")
                results.append(f"## Gene: {gene_id}\nError: {str(e)}\n")

        if not results:
            logging.error("No gene information found for any of the provided IDs")
            raise ModelRetry("No gene information found for any of the provided IDs")

        # Stash the gene info dictionary directly on the context for downstream
        # steps (state is only available in a test context). The previous
        # hasattr/setattr dance was redundant: the plain assignment below both
        # creates and sets the attribute.
        ctx.gene_info_dict = gene_info_dict
        logging.info(f"Successfully retrieved information for {len(gene_info_dict)} genes")

        return "\n".join(results)
    except ModelRetry:
        # Propagate retry signals untouched. (Previously this was detected with
        # a fragile string match on the exception's type name.)
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene descriptions: {str(e)}")
        raise ModelRetry(f"Error retrieving gene descriptions: {str(e)}")
|
608
|
+
|
609
|
+
|
610
|
+
def parse_gene_list(gene_list: str) -> List[str]:
    """Split a free-form string of gene identifiers into a list.

    Identifiers may be separated by commas, semicolons, tabs, newlines, or
    spaces, in any combination.

    Args:
        gene_list: String of gene identifiers separated by commas, spaces, semicolons, or newlines

    Returns:
        List of gene identifiers (empty for empty/None input)
    """
    if not gene_list:
        return []

    # Normalize every supported separator to a single space, then tokenize.
    normalized = gene_list
    for separator in (',', ';', '\n', '\t'):
        normalized = normalized.replace(separator, ' ')

    stripped_tokens = (token.strip() for token in normalized.split(' '))
    return [token for token in stripped_tokens if token]
|
629
|
+
|
630
|
+
|
631
|
+
def get_genes_from_list(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Get descriptions for multiple gene IDs provided as a string.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        The gene descriptions in a structured tabular format
    """
    logging.info(f"Parsing gene list: {gene_list}")
    parsed_ids = parse_gene_list(gene_list)

    # Bail out early if nothing usable was found in the input string.
    if not parsed_ids:
        logging.error("No gene IDs could be parsed from the input string")
        raise ModelRetry("No gene IDs could be parsed from the input string")

    logging.info(f"Parsed {len(parsed_ids)} gene IDs: {', '.join(parsed_ids)}")
    return get_gene_descriptions(ctx, parsed_ids)
|
650
|
+
|
651
|
+
|
652
|
+
def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Analyze a set of genes and generate a biological summary of their properties and relationships.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        A structured biological summary of the gene set with Narrative, Functional Terms Table, and Gene Summary Table

    Raises:
        ModelRetry: If no gene information can be gathered, or if the analysis
            generation fails for any reason (the broad handler at the bottom
            wraps every error, including re-raised ModelRetry, in a new one).
    """
    logging.info(f"Starting gene set analysis for: {gene_list}")

    # Parse the gene list
    gene_ids_list = parse_gene_list(gene_list)
    organism = None  # Let the gene lookup systems determine the organism

    # First, get detailed information about each gene
    logging.info("Retrieving gene descriptions...")
    # Pass organism information to each gene lookup.
    # NOTE(review): these results are discarded; get_genes_from_list below
    # looks each gene up again — presumably this pre-pass warms caches or
    # validates IDs. Confirm whether it is still needed.
    for gene_id in gene_ids_list:
        logging.info(f"Processing {gene_id} with organism context: {organism}")
        get_gene_description(ctx, gene_id, organism)

    # Now get all gene descriptions
    gene_descriptions = get_genes_from_list(ctx, gene_list)
    logging.info("Gene descriptions retrieved successfully")

    # Get the gene info dictionary from the context
    # (populated as a side effect of get_genes_from_list)
    gene_info_dict = getattr(ctx, "gene_info_dict", {})

    if not gene_info_dict:
        logging.error("No gene information was found to analyze")
        raise ModelRetry("No gene information was found to analyze")

    gene_ids = list(gene_info_dict.keys())
    logging.info(f"Analyzing relationships between {len(gene_ids)} genes: {', '.join(gene_ids)}")

    # Extract organism information from the gene descriptions if possible.
    # First keyword found anywhere in any description wins.
    detected_organism = None
    organism_keywords = ["Salmonella", "Escherichia", "Desulfovibrio", "Homo sapiens", "human"]
    for gene_info in gene_info_dict.values():
        for keyword in organism_keywords:
            if keyword.lower() in gene_info.lower():
                detected_organism = keyword
                break
        if detected_organism:
            break

    if detected_organism:
        logging.info(f"Detected organism from gene descriptions: {detected_organism}")

    # Prepare a prompt for the LLM with minimal instructions (main instructions are in the agent system prompt)
    prompt = f"""Analyze the following set of genes:

Gene IDs/Symbols: {', '.join(gene_ids)}

Gene Information:
{gene_descriptions}

{f"IMPORTANT: These genes are from {detected_organism or organism}. Make sure your analysis reflects the correct organism context." if detected_organism or organism else ""}

Please provide a comprehensive analysis of the genes."""

    # Access OpenAI API to generate the analysis
    try:
        # Use the configured model name if available
        model_name = getattr(ctx.deps, "model_name", "gpt-4o") if ctx.deps else "gpt-4o"
        # Use the configured API key if available
        api_key = getattr(ctx.deps, "openai_api_key", None) if ctx.deps else None

        logging.info(f"Generating biological analysis using model: {model_name}")

        if api_key:
            # NOTE(review): mutates the module-level openai client key —
            # confirm this is safe if multiple configs are in play.
            openai.api_key = api_key

        # Create the completion using OpenAI API
        system_prompt = """
You are a biology expert analyzing gene sets. You must provide a comprehensive analysis in JSON format.

Your response must be in this structured format:
{
  "narrative": "Detailed explanation of functional relationships between genes, emphasizing shared functions",
  "functional_terms": [
    {"term": "DNA damage response", "genes": ["BRCA1", "BRCA2", "ATM"], "source": "GO-BP"},
    {"term": "Homologous recombination", "genes": ["BRCA1", "BRCA2"], "source": "Reactome"},
    etc.
  ],
  "gene_summaries": [
    {
      "id": "BRCA1",
      "annotation": "NC_000017.11 (43044295..43170327, complement)",
      "genomic_context": "Chromosome 17",
      "organism": "Homo sapiens",
      "description": "Breast cancer type 1 susceptibility protein"
    },
    etc.
  ]
}

Your output MUST be valid JSON with these three fields. Do not include any text before or after the JSON.
"""

        logging.info("Sending request to OpenAI API...")
        response = openai.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=4000,
            response_format={"type": "json_object"}
        )
        logging.info("Received response from OpenAI API")

        # Extract the response content
        response_content = response.choices[0].message.content

        try:
            # Try to parse the JSON response into our Pydantic model
            gene_set_analysis = GeneSetAnalysis.model_validate_json(response_content)
            json_result = response_content
            # NOTE(review): is_structured is set in both branches but never
            # read afterwards in this function — candidate for removal.
            is_structured = True
            logging.info("Successfully parsed structured JSON response")
        except Exception as parse_error:
            # If JSON parsing fails, handle the unstructured text response
            logging.warning(f"Failed to parse JSON response: {str(parse_error)}. Creating structured format from text.")
            is_structured = False

            # Parse the unstructured text to extract information - look for Gene Summary Table section
            lines = response_content.split('\n')

            # Extract gene IDs from the table if present
            gene_ids_found = []
            description_map = {}
            organism_map = {}
            annotation_map = {}
            genomic_context_map = {}

            in_table = False
            for i, line in enumerate(lines):
                if "## Gene Summary Table" in line:
                    in_table = True
                    continue
                if in_table and '|' in line:
                    # Skip the header and separator lines
                    if "---" in line or "ID" in line:
                        continue

                    # Parse the table row
                    parts = [p.strip() for p in line.split('|')]
                    if len(parts) >= 6:  # Should have 6 parts with empty first and last elements
                        gene_id = parts[1].strip()
                        if gene_id:
                            gene_ids_found.append(gene_id)
                            description_map[gene_id] = parts[5].strip()
                            organism_map[gene_id] = parts[4].strip()
                            annotation_map[gene_id] = parts[2].strip()
                            genomic_context_map[gene_id] = parts[3].strip()

            # Extract any existing narrative from the output.
            # NOTE(review): `in_table` below holds its FINAL value from the loop
            # above, not a per-line state — so when a Gene Summary Table was
            # seen, every line containing '|' is excluded from the narrative,
            # table or not. Confirm this is the intended filtering.
            existing_narrative = "\n".join(
                [l for l in lines if not (
                    "## Gene Summary Table" in l or
                    "## Functional Terms Table" in l or
                    "## Terms" in l or
                    (in_table and '|' in l)
                )]
            ).strip()

            # Use existing narrative if it exists and is substantial
            if existing_narrative and len(existing_narrative.split()) > 10:
                narrative = existing_narrative
            # Otherwise create a generic narrative from the gene info we have
            elif len(gene_ids_found) > 0:
                gene_ids_str = ", ".join(gene_ids_found)
                descriptions = [f"{g}: {description_map.get(g, 'Unknown function')}" for g in gene_ids_found]
                # Arbitrary first organism from the set (order not guaranteed)
                common_organism = next(iter(set(organism_map.values())), "Unknown organism")

                narrative = f"""The genes {gene_ids_str} are from {common_organism}.

Gene functions: {'; '.join(descriptions)}.

Based on their annotations and genomic context, these genes may be functionally related and potentially participate in shared biological pathways or cellular processes."""
            else:
                narrative = "No gene information available."

            # Create generic functional terms based on gene descriptions
            functional_terms = []

            # If we have gene IDs and descriptions, create a basic functional term
            if gene_ids_found:
                # Create a default functional term with all genes
                functional_terms.append({
                    "term": "Gene set",
                    "genes": gene_ids_found,
                    "source": "Analysis"
                })

                # Only extract functional terms from descriptions, without hardcoded knowledge
                for gene_id in gene_ids_found:
                    description = description_map.get(gene_id, "").lower()
                    if description and len(description) > 3:
                        functional_terms.append({
                            "term": f"{gene_id} function",
                            "genes": [gene_id],
                            "source": "Annotation"
                        })

            # Create gene summaries
            gene_summaries = []
            for gene_id in gene_ids_found:
                gene_summaries.append({
                    "id": gene_id,
                    "annotation": annotation_map.get(gene_id, "Unknown"),
                    "genomic_context": genomic_context_map.get(gene_id, "Unknown"),
                    "organism": organism_map.get(gene_id, "Unknown"),
                    "description": description_map.get(gene_id, "Unknown")
                })

            # Create a structured response
            structured_data = {
                "narrative": narrative,
                "functional_terms": functional_terms,
                "gene_summaries": gene_summaries
            }

            # Convert to JSON
            json_result = json.dumps(structured_data, indent=2)

            # Create the Pydantic model
            gene_set_analysis = GeneSetAnalysis.model_validate(structured_data)

        # Format the results in markdown for display
        markdown_result = "# Gene Set Analysis\n\n"

        # Add narrative section (always include this)
        narrative = gene_set_analysis.narrative.strip()
        if narrative:
            markdown_result += f"## Narrative\n{narrative}\n\n"
        else:
            # Create a generic narrative based on gene data without domain-specific information
            gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
            gene_descs = [f"{g.id}: {g.description}" for g in gene_set_analysis.gene_summaries]
            organisms = list(set([g.organism for g in gene_set_analysis.gene_summaries]))

            if gene_set_analysis.gene_summaries:
                organism_str = organisms[0] if organisms else "Unknown organism"
                markdown_result += f"""## Narrative
The genes {', '.join(gene_ids)} are from {organism_str}.

Gene functions: {'; '.join(gene_descs)}.

Based on their annotations and genomic context, these genes may be functionally related and could potentially participate in shared biological pathways or cellular processes.
\n\n"""
            else:
                markdown_result += f"""## Narrative
No gene information available.
\n\n"""

        # Add functional terms table
        markdown_result += "## Functional Terms Table\n"
        markdown_result += "| Functional Term | Genes | Source |\n"
        markdown_result += "|-----------------|-------|--------|\n"

        # Add functional terms rows
        if gene_set_analysis.functional_terms:
            for term in gene_set_analysis.functional_terms:
                genes_str = ", ".join(term.genes)
                markdown_result += f"| {term.term} | {genes_str} | {term.source} |\n"
        else:
            # Add default terms if none exist
            gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
            markdown_result += f"| Protein function | {', '.join(gene_ids)} | Literature |\n"

        # Add gene summary table
        markdown_result += "\n## Gene Summary Table\n"
        markdown_result += "| ID | Annotation | Genomic Context | Organism | Description |\n"
        markdown_result += "|-------------|-------------|----------|----------------|------------|\n"

        # Add gene summary rows
        for gene in gene_set_analysis.gene_summaries:
            markdown_result += f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |\n"

        # Save the results
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # Create both JSON and markdown files under the user's home directory
        results_dir = os.path.join(os.path.expanduser("~"), "talisman_results")
        os.makedirs(results_dir, exist_ok=True)

        # Save the JSON response
        json_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.json")
        with open(json_path, 'w') as f:
            f.write(json_result)

        # Save the markdown formatted response
        md_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.md")
        with open(md_path, 'w') as f:
            f.write(markdown_result)

        logging.info(f"Analysis complete. Results saved to: {json_path} and {md_path}")

        # Ensure all required sections are present in the markdown output
        final_output = ensure_complete_output(markdown_result, gene_set_analysis)

        # Return the post-processed markdown-formatted result for display
        return final_output
    except Exception as e:
        logging.error(f"Error generating gene set analysis: {str(e)}")
        raise ModelRetry(f"Error generating gene set analysis: {str(e)}")
|