aurelian 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aurelian/agents/talisman/__main__.py +17 -0
- aurelian/agents/talisman/cli.py +70 -0
- aurelian/agents/talisman/run_talisman.py +18 -0
- aurelian/agents/talisman/talisman_agent.py +44 -27
- aurelian/agents/talisman/talisman_tools.py +389 -147
- aurelian/cli.py +174 -6
- {aurelian-0.3.2.dist-info → aurelian-0.3.3.dist-info}/METADATA +1 -1
- {aurelian-0.3.2.dist-info → aurelian-0.3.3.dist-info}/RECORD +11 -8
- {aurelian-0.3.2.dist-info → aurelian-0.3.3.dist-info}/LICENSE +0 -0
- {aurelian-0.3.2.dist-info → aurelian-0.3.3.dist-info}/WHEEL +0 -0
- {aurelian-0.3.2.dist-info → aurelian-0.3.3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env python3
"""
Main entry point to run the talisman agent.
"""
import os
import sys
from pydantic_ai import chat  # NOTE(review): confirm pydantic_ai really exposes a top-level `chat` helper

# This file lives in <root>/aurelian/agents/talisman/, so the directory that
# contains the ``aurelian`` package is three levels up.  Putting it first on
# sys.path lets the absolute ``aurelian.*`` imports below resolve when the
# file is executed directly.
_this_dir = os.path.dirname(__file__)
_package_root = os.path.abspath(os.path.join(_this_dir, "../../../"))
sys.path.insert(0, _package_root)

from aurelian.agents.talisman.talisman_agent import talisman_agent
from aurelian.agents.talisman.talisman_config import get_config

if __name__ == "__main__":
    # Build the agent dependencies, then hand the agent to the chat loop.
    config = get_config()
    chat(talisman_agent, deps=config)
|
@@ -0,0 +1,70 @@
|
|
1
|
+
"""
|
2
|
+
CLI interface for the talisman agent.
|
3
|
+
Provides a post-processing helper that makes sure the agent's markdown output always contains the Narrative, Functional Terms Table, and Gene Summary Table sections.
|
4
|
+
"""
|
5
|
+
import logging
|
6
|
+
import re
|
7
|
+
from pydantic_ai import RunContext
|
8
|
+
|
9
|
+
from aurelian.agents.talisman.talisman_config import TalismanConfig
|
10
|
+
from aurelian.agents.talisman.talisman_tools import GeneSetAnalysis, FunctionalTerm, GeneSummary
|
11
|
+
|
12
|
+
def format_talisman_output(result):
    """Format the talisman output to ensure it always has all three sections.

    The three sections are ``## Narrative``, ``## Functional Terms Table`` and
    ``## Gene Summary Table``.

    Args:
        result: Markdown text produced by the agent.

    Returns:
        ``result`` unchanged when all three section headings are already
        present, or when no ``## Gene Summary Table`` heading can be found
        (there is nothing to rebuild from).  Otherwise a rebuilt document:
        a ``# Gene Set Analysis`` title followed by the narrative, the
        functional terms table and the gene summary table, in that order.
    """
    logging.info("Post-processing talisman output")

    # Check if output already has proper sections
    has_narrative = re.search(r'^\s*##\s*Narrative', result, re.MULTILINE) is not None
    has_functional_terms = re.search(r'^\s*##\s*Functional Terms Table', result, re.MULTILINE) is not None
    has_gene_summary = re.search(r'^\s*##\s*Gene Summary Table', result, re.MULTILINE) is not None

    # If all sections are present, return as is
    if has_narrative and has_functional_terms and has_gene_summary:
        return result

    # The gene table is the heading, any blank lines after it, and the run of
    # consecutive lines that start with "|".  Matching whole rows (instead of
    # a lazy ".*?" ended by "$") keeps every data row in the table rather
    # than stopping after the header row.
    gene_table_match = re.search(
        r'^[ \t]*##[ \t]*Gene Summary Table[^\n]*(?:\n|\Z)'
        r'(?:[ \t]*\n)*'
        r'(?:[ \t]*\|[^\n]*(?:\n|\Z))*',
        result, re.MULTILINE)

    # If no gene table was found, return the original result
    if not gene_table_match:
        return result

    gene_table = gene_table_match.group(0).strip() + "\n"
    consumed_spans = [gene_table_match.span()]

    # Reuse an existing functional terms section when there is one.
    functional_terms = ""
    if has_functional_terms:
        ft_match = _find_markdown_section(result, "Functional Terms Table")
        if ft_match:
            functional_terms = ft_match.group(0).strip() + "\n\n"
            consumed_spans.append(ft_match.span())

    # Otherwise create a simple one-row table from the gene IDs.
    if not functional_terms:
        gene_ids = _gene_ids_from_table(gene_table)
        functional_terms = "## Functional Terms Table\n"
        functional_terms += "| Functional Term | Genes | Source |\n"
        functional_terms += "|-----------------|-------|--------|\n"
        if gene_ids:
            functional_terms += f"| Gene set | {', '.join(gene_ids)} | Analysis |\n\n"
        else:
            # Avoid emitting a row with an empty "Genes" cell.
            functional_terms += "| No terms available | - | - |\n\n"

    if has_narrative:
        # Keep the narrative that is already there instead of replacing it
        # with the generic placeholder.
        narrative_match = _find_markdown_section(result, "Narrative")
        narrative_text = narrative_match.group(1).strip() if narrative_match else ""
    else:
        # Whatever is left once the tables are cut out is treated as the
        # narrative.  A leading document title is dropped so it is not
        # repeated underneath the title added below.
        leftover = _drop_spans(result, consumed_spans)
        leftover = re.sub(r'\A\s*#[ \t]*Gene Set Analysis[ \t]*(?:\n|\Z)', '', leftover)
        narrative_text = leftover.strip()

    if narrative_text:
        narrative_section = "## Narrative\n" + narrative_text + "\n\n"
    else:
        narrative_section = "## Narrative\nThese genes may have related functions as indicated in the gene summary table.\n\n"

    # Reconstruct the output with all sections
    return "# Gene Set Analysis\n\n" + narrative_section + functional_terms + gene_table


def _find_markdown_section(text, title):
    """Return a match for the ``## <title>`` section; group(1) is its body up to the next ``##`` heading."""
    return re.search(
        r'^[ \t]*##[ \t]*' + re.escape(title) + r'[^\n]*\n(.*?)(?=^[ \t]*##|\Z)',
        text, re.MULTILINE | re.DOTALL)


def _drop_spans(text, spans):
    """Return ``text`` with the given ``(start, end)`` index ranges removed."""
    kept = []
    cursor = 0
    for start, end in sorted(spans):
        kept.append(text[cursor:start])
        cursor = max(cursor, end)
    kept.append(text[cursor:])
    return "".join(kept)


def _gene_ids_from_table(gene_table):
    """Return the first-column values of the data rows of a markdown gene table."""
    gene_ids = []
    for line in gene_table.split('\n'):
        row = line.strip()
        if '|' not in row:
            continue  # heading or stray text
        if re.fullmatch(r'[\s|:\-]+', row):
            continue  # header/body separator such as |---|---|
        if row.startswith('|'):
            row = row[1:]
        first_cell = row.split('|')[0].strip()
        # Compare the whole cell with "ID": a substring test on the line
        # would also throw away genes whose symbol ends in "ID" (e.g. BID).
        if first_cell and first_cell != 'ID':
            gene_ids.append(first_cell)
    return gene_ids
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env python3
"""
Standalone script to run the talisman agent directly.
"""
import os
import sys
from pydantic_ai import chat  # NOTE(review): confirm pydantic_ai really exposes a top-level `chat` helper

# Add the directory that contains the ``aurelian`` package to the path for
# imports.  This file lives in <root>/aurelian/agents/talisman/, so <root> is
# three levels up.  Four levels resolves to the parent of <root>, from which
# the ``aurelian.*`` imports below cannot be found.
src_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../"))
sys.path.insert(0, src_dir)

from aurelian.agents.talisman.talisman_agent import talisman_agent
from aurelian.agents.talisman.talisman_config import get_config

if __name__ == "__main__":
    # Build the agent dependencies, then hand the agent to the chat loop.
    config = get_config()
    chat(talisman_agent, deps=config)
|
@@ -1,5 +1,6 @@
|
|
1
1
|
"""
|
2
2
|
Agent for working with gene information using the UniProt API and NCBI Entrez.
|
3
|
+
Provides structured information in the form of Narrative, Functional Terms Table, and Gene Summary Table.
|
3
4
|
"""
|
4
5
|
from pydantic_ai import Agent
|
5
6
|
|
@@ -70,38 +71,54 @@ The analysis will cover multiple types of relationships:
|
|
70
71
|
- Physical interactions
|
71
72
|
- Genetic interactions
|
72
73
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
-
|
77
|
-
-
|
78
|
-
-
|
79
|
-
-
|
80
|
-
|
81
|
-
|
82
|
-
-
|
83
|
-
-
|
84
|
-
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
74
|
+
For gene set analysis, your output MUST always include three distinct sections:
|
75
|
+
|
76
|
+
1. First, a "## Narrative" section providing a concise explanation of the functional and categorical relationships between the genes. This should:
|
77
|
+
- Prioritize explanations involving most or all genes in the set
|
78
|
+
- Refer to specific subsets of genes when discussing specialized functions
|
79
|
+
- Highlight the most significant shared pathways, processes, or disease associations
|
80
|
+
- Be clear, concise, and focused on biological meaning
|
81
|
+
|
82
|
+
2. Second, a "## Functional Terms Table" that presents key functional terms in a tabular format with these columns:
|
83
|
+
- Functional Term: The biological term or concept (e.g., DNA repair, kinase activity)
|
84
|
+
- Genes: The genes associated with this term (comma-separated list)
|
85
|
+
- Source: The likely source database or ontology (e.g., GO-BP, KEGG, Reactome, GO-MF, GO-CC, Disease)
|
86
|
+
|
87
|
+
The functional terms should include various types:
|
88
|
+
- Gene Ontology biological process terms (e.g., DNA repair, oxidative phosphorylation)
|
89
|
+
- Molecular function terms (e.g., kinase activity, DNA binding)
|
90
|
+
- Cellular component/localization terms (e.g., nucleus, plasma membrane)
|
91
|
+
- Pathway names (e.g., glycolysis, MAPK signaling)
|
92
|
+
- Disease associations (if relevant)
|
93
|
+
- Structural and functional domains/motifs (if relevant)
|
94
|
+
|
95
|
+
Example of Functional Terms Table:
|
96
|
+
## Functional Terms Table
|
97
|
+
| Functional Term | Genes | Source |
|
98
|
+
|-----------------|-------|--------|
|
99
|
+
| DNA damage response | BRCA1, BRCA2, ATM | GO-BP |
|
100
|
+
| Homologous recombination | BRCA1, BRCA2 | Reactome |
|
101
|
+
| Tumor suppression | BRCA1, BRCA2, ATM | Disease |
|
102
|
+
| Nuclear localization | BRCA1, BRCA2, ATM | GO-CC |
|
103
|
+
| Kinase activity | ATM | GO-MF |
|
104
|
+
| PARP inhibitor sensitivity | BRCA1, BRCA2, PARP1 | Pathway |
|
105
|
+
|
106
|
+
3. Third, a "## Gene Summary Table" with a markdown table summarizing the genes analyzed,
|
107
|
+
with the following columns in this exact order:
|
108
|
+
- ID: The gene identifier (same as Gene Symbol)
|
109
|
+
- Annotation: Genomic coordinates or accession with position information
|
110
|
+
- Genomic Context: Information about the genomic location (chromosome, plasmid, etc.)
|
111
|
+
- Organism: The organism the gene belongs to
|
112
|
+
- Description: The protein/gene function description
|
98
113
|
|
99
114
|
Example of Gene Summary Table:
|
100
115
|
## Gene Summary Table
|
101
116
|
| ID | Annotation | Genomic Context | Organism | Description |
|
102
117
|
|-------------|-------------|----------|----------------|------------|
|
103
118
|
| BRCA1 | NC_000017.11 (43044295..43125483) | Chromosome 17 | Homo sapiens | Breast cancer type 1 susceptibility protein |
|
104
|
-
|
|
119
|
+
| BRCA2 | NC_000013.11 (32315474..32400266) | Chromosome 13 | Homo sapiens | Breast cancer type 2 susceptibility protein |
|
120
|
+
| ATM | NC_000011.10 (108222484..108369102) | Chromosome 11 | Homo sapiens | ATM serine/threonine kinase |
|
121
|
+
| PARP1 | NC_000001.11 (226360251..226408516) | Chromosome 1 | Homo sapiens | Poly(ADP-ribose) polymerase 1 |
|
105
122
|
|
106
123
|
For bacterial genes, the table format would be:
|
107
124
|
| ID | Annotation | Genomic Context | Organism | Description |
|
@@ -123,4 +140,4 @@ talisman_agent = Agent(
|
|
123
140
|
talisman_agent.tool(get_gene_description)
|
124
141
|
talisman_agent.tool(get_gene_descriptions)
|
125
142
|
talisman_agent.tool(get_genes_from_list)
|
126
|
-
talisman_agent.tool(analyze_gene_set)
|
143
|
+
#talisman_agent.tool(analyze_gene_set)
|
@@ -2,6 +2,8 @@
|
|
2
2
|
Tools for retrieving gene information using the UniProt API and NCBI Entrez.
|
3
3
|
"""
|
4
4
|
from typing import Dict, List, Optional, Tuple, Any
|
5
|
+
from pydantic import BaseModel, Field
|
6
|
+
import re
|
5
7
|
import openai
|
6
8
|
import time
|
7
9
|
import threading
|
@@ -14,6 +16,32 @@ from pydantic_ai import RunContext, ModelRetry
|
|
14
16
|
|
15
17
|
from .talisman_config import TalismanConfig, get_config
|
16
18
|
|
19
|
+
# Define data models for structured output
|
20
|
+
class FunctionalTerm(BaseModel):
    """A functional term associated with genes."""

    # Required: one row of the "Functional Terms Table" is built from these
    # three values (term, comma-joined genes, source).
    term: str = Field(
        ...,
        description="The biological term or concept",
    )
    genes: List[str] = Field(
        ...,
        description="List of genes associated with this term",
    )
    source: str = Field(
        ...,
        description="The source database or ontology (GO-BP, KEGG, Reactome, etc.)",
    )
|
25
|
+
|
26
|
+
class GeneSummary(BaseModel):
    """Summary information for a gene."""

    # All five fields are required; they are declared in the same order as
    # the columns of the "Gene Summary Table".
    id: str = Field(
        ...,
        description="The gene identifier (Gene Symbol)",
    )
    annotation: str = Field(
        ...,
        description="Genomic coordinates or accession with position",
    )
    genomic_context: str = Field(
        ...,
        description="Information about genomic location (chromosome, etc.)",
    )
    organism: str = Field(
        ...,
        description="The organism the gene belongs to",
    )
    description: str = Field(
        ...,
        description="The protein/gene function description",
    )
|
33
|
+
|
34
|
+
class GeneSetAnalysis(BaseModel):
    """Complete analysis of a gene set."""

    # Every field has a default, so an instance can be created with no
    # arguments at all.

    # Species information; an empty string means "not given".
    input_species: str = Field(
        default="",
        description="The species provided by the user",
    )
    inferred_species: str = Field(
        default="",
        description="The species inferred from the gene data",
    )

    # Free-text narrative, with a placeholder used when none is supplied.
    narrative: str = Field(
        default="No narrative information available for these genes.",
        description="Explanation of functional and categorical relationships between genes",
    )

    # Table contents; default_factory gives each instance its own empty list.
    functional_terms: List[FunctionalTerm] = Field(
        default_factory=list,
        description="Functional terms associated with the gene set",
    )
    gene_summaries: List[GeneSummary] = Field(
        default_factory=list,
        description="Summary information for each gene",
    )
|
44
|
+
|
17
45
|
# Set up logging
|
18
46
|
logging.basicConfig(
|
19
47
|
level=logging.INFO,
|
@@ -156,15 +184,10 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
|
|
156
184
|
config = ctx.deps or get_config()
|
157
185
|
ncbi = config.get_ncbi_client()
|
158
186
|
|
159
|
-
#
|
160
|
-
bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg"]
|
161
|
-
is_likely_bacterial = any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
|
187
|
+
# No need to check for specific gene patterns
|
162
188
|
|
163
|
-
#
|
164
|
-
if
|
165
|
-
organisms_to_try = ["Salmonella", "Escherichia coli", "Bacteria"]
|
166
|
-
else:
|
167
|
-
organisms_to_try = [organism] if organism else ["Homo sapiens", None] # Try human first as default, then any organism
|
189
|
+
# Set organisms to try without domain-specific knowledge
|
190
|
+
organisms_to_try = [organism] if organism else [None] # Use organism if provided, else try without organism constraint
|
168
191
|
|
169
192
|
gene_results = None
|
170
193
|
|
@@ -207,22 +230,22 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
|
|
207
230
|
return gene_results
|
208
231
|
|
209
232
|
# If not found in gene database, try protein database
|
210
|
-
#
|
233
|
+
# Standard protein search
|
211
234
|
protein_ids = []
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
235
|
+
for org in organisms_to_try:
|
236
|
+
if org:
|
237
|
+
logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
|
238
|
+
ncbi_limiter.wait()
|
239
|
+
search_query = f"{gene_id} AND {org}[Organism]"
|
240
|
+
search_results = ncbi.ESearch("protein", search_query)
|
241
|
+
protein_ids = search_results.get('idlist', [])
|
242
|
+
|
243
|
+
if protein_ids:
|
244
|
+
logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
|
245
|
+
break
|
246
|
+
|
247
|
+
# If no results with organism constraint, try without
|
248
|
+
if not protein_ids:
|
226
249
|
logging.info(f"Searching NCBI protein database for: {gene_id}")
|
227
250
|
ncbi_limiter.wait()
|
228
251
|
search_results = ncbi.ESearch("protein", gene_id)
|
@@ -303,6 +326,129 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
|
|
303
326
|
return f"Error querying NCBI Entrez: {str(e)}"
|
304
327
|
|
305
328
|
|
329
|
+
def ensure_complete_output(markdown_result: str, gene_set_analysis: "GeneSetAnalysis") -> str:
    """Ensures that the markdown output has all required sections.

    The output is always rebuilt from scratch so the formatting is
    consistent: an optional ``# Species`` block, the ``# Gene Set Analysis``
    title, then ``## Narrative``, ``## Functional Terms Table`` and
    ``## Gene Summary Table``.  For each section, content found in
    ``markdown_result`` wins; otherwise the structured model is used;
    otherwise a placeholder is emitted.

    Args:
        markdown_result: The original markdown result
        gene_set_analysis: The structured data model

    Returns:
        A complete markdown output with all required sections
    """
    logging.info("Post-processing output to ensure all sections are present")

    sections = []

    # Add species section if applicable
    species_lines = []
    if gene_set_analysis.input_species:
        species_lines.append(f"Input: {gene_set_analysis.input_species}\n")
    if gene_set_analysis.inferred_species:
        species_lines.append(f"Inferred: {gene_set_analysis.inferred_species}\n")
    if species_lines:
        sections.append("# Species\n" + "".join(species_lines) + "\n")

    # Add main header
    sections.append("# Gene Set Analysis\n\n")

    # Narrative: prefer the text already in the markdown, else the model's.
    narrative_match = re.search(
        r'^[ \t]*##[ \t]*Narrative[ \t]*\n(.*?)(?=^\s*##|\Z)',
        markdown_result, re.MULTILINE | re.DOTALL)
    narrative = narrative_match.group(1).strip() if narrative_match else ""
    if not narrative:
        narrative = gene_set_analysis.narrative
    sections.append(f"## Narrative\n{narrative}\n\n")

    # Functional terms table: existing rows -> model rows -> placeholder.
    term_rows = _existing_table_rows(markdown_result, "Functional Terms Table")
    if not term_rows:
        term_rows = [
            f"| {_markdown_cell(term.term)} | {_markdown_cell(', '.join(term.genes))} | {_markdown_cell(term.source)} |"
            for term in gene_set_analysis.functional_terms
        ]
    if not term_rows:
        gene_ids = [_markdown_cell(g.id) for g in gene_set_analysis.gene_summaries]
        if gene_ids:
            term_rows = [f"| Gene set | {', '.join(gene_ids)} | Analysis |"]
        else:
            term_rows = ["| No terms available | - | - |"]
    sections.append(
        "## Functional Terms Table\n"
        "| Functional Term | Genes | Source |\n"
        "|-----------------|-------|--------|\n"
        + "".join(row + "\n" for row in term_rows)
        + "\n"
    )

    # Gene summary table: existing rows -> model rows -> placeholder.
    gene_rows = _existing_table_rows(markdown_result, "Gene Summary Table")
    if not gene_rows:
        gene_rows = [
            f"| {_markdown_cell(gene.id)} | {_markdown_cell(gene.annotation)} | "
            f"{_markdown_cell(gene.genomic_context)} | {_markdown_cell(gene.organism)} | "
            f"{_markdown_cell(gene.description)} |"
            for gene in gene_set_analysis.gene_summaries
        ]
    if not gene_rows:
        gene_rows = ["| No gene information available | - | - | - | - |"]
    sections.append(
        "## Gene Summary Table\n"
        "| ID | Annotation | Genomic Context | Organism | Description |\n"
        "|-------------|-------------|----------|----------------|------------|\n"
        + "".join(row + "\n" for row in gene_rows)
    )

    logging.info("Successfully enforced all required sections in the output")
    return "".join(sections)


def _existing_table_rows(markdown, title):
    """Return the data rows of the ``## <title>`` markdown table, or ``[]`` if it is absent or empty."""
    # The header and separator rows are matched with single-line patterns.
    # A greedy ".*" under DOTALL would run on to the header of a later table
    # and capture that table's rows instead of this one's.
    match = re.search(
        r'^[ \t]*##[ \t]*' + re.escape(title) + r'[ \t]*\n'
        r'(?:[ \t]*\n)*'                      # optional blank lines after the heading
        r'[ \t]*\|[^\n]*\n'                   # header row
        r'[ \t]*\|[-: \t|]*(?:\n|\Z)'         # separator row
        r'(.*?)(?=^\s*##|\Z)',                # body up to the next heading
        markdown, re.MULTILINE | re.DOTALL)
    if not match:
        return []
    return [line for line in match.group(1).strip().split("\n")
            if line.strip() and "|" in line]


def _markdown_cell(value):
    """Make ``value`` safe for one table cell: single line, with unescaped ``|`` escaped."""
    text = " ".join(str(value).splitlines())
    return re.sub(r'(?<!\\)\|', r'\\|', text)
|
450
|
+
|
451
|
+
|
306
452
|
def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism: str = None) -> str:
|
307
453
|
"""Get description for a single gene ID, using UniProt and falling back to NCBI Entrez.
|
308
454
|
|
@@ -318,15 +464,6 @@ def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism
|
|
318
464
|
config = ctx.deps or get_config()
|
319
465
|
u = config.get_uniprot_client()
|
320
466
|
|
321
|
-
# Check if this looks like a bacterial gene code
|
322
|
-
bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che"]
|
323
|
-
is_likely_bacterial = any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
|
324
|
-
|
325
|
-
# Auto-detect organism based on gene pattern
|
326
|
-
if is_likely_bacterial and not organism:
|
327
|
-
logging.info(f"Gene {gene_id} matches bacterial pattern, setting organism to Salmonella")
|
328
|
-
organism = "Salmonella"
|
329
|
-
|
330
467
|
try:
|
331
468
|
# Normalize the gene ID
|
332
469
|
gene_id = normalize_gene_id(gene_id)
|
@@ -520,29 +657,13 @@ def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
|
|
520
657
|
gene_list: String containing gene identifiers separated by commas, spaces, or newlines
|
521
658
|
|
522
659
|
Returns:
|
523
|
-
A structured biological summary of the gene set
|
660
|
+
A structured biological summary of the gene set with Narrative, Functional Terms Table, and Gene Summary Table
|
524
661
|
"""
|
525
662
|
logging.info(f"Starting gene set analysis for: {gene_list}")
|
526
663
|
|
527
|
-
#
|
528
|
-
bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che", "DVU"]
|
664
|
+
# Parse the gene list
|
529
665
|
gene_ids_list = parse_gene_list(gene_list)
|
530
|
-
|
531
|
-
any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
|
532
|
-
for gene_id in gene_ids_list
|
533
|
-
)
|
534
|
-
|
535
|
-
# Set organism based on pattern detection
|
536
|
-
organism = None
|
537
|
-
if is_likely_bacterial:
|
538
|
-
logging.info(f"Detected likely bacterial genes: {gene_list}")
|
539
|
-
# Check for specific bacterial gene patterns
|
540
|
-
if any(gene_id.lower().startswith(("inv", "sip", "sop", "sic", "spa")) for gene_id in gene_ids_list):
|
541
|
-
organism = "Salmonella"
|
542
|
-
logging.info(f"Setting organism to Salmonella based on gene patterns")
|
543
|
-
elif any(gene_id.startswith("DVU") for gene_id in gene_ids_list):
|
544
|
-
organism = "Desulfovibrio"
|
545
|
-
logging.info(f"Setting organism to Desulfovibrio based on gene patterns")
|
666
|
+
organism = None # Let the gene lookup systems determine the organism
|
546
667
|
|
547
668
|
# First, get detailed information about each gene
|
548
669
|
logging.info("Retrieving gene descriptions...")
|
@@ -579,8 +700,8 @@ def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
|
|
579
700
|
if detected_organism:
|
580
701
|
logging.info(f"Detected organism from gene descriptions: {detected_organism}")
|
581
702
|
|
582
|
-
# Prepare a prompt for the LLM
|
583
|
-
prompt = f"""Analyze the following set of genes
|
703
|
+
# Prepare a prompt for the LLM with minimal instructions (main instructions are in the agent system prompt)
|
704
|
+
prompt = f"""Analyze the following set of genes:
|
584
705
|
|
585
706
|
Gene IDs/Symbols: {', '.join(gene_ids)}
|
586
707
|
|
@@ -589,77 +710,7 @@ Gene Information:
|
|
589
710
|
|
590
711
|
{f"IMPORTANT: These genes are from {detected_organism or organism}. Make sure your analysis reflects the correct organism context." if detected_organism or organism else ""}
|
591
712
|
|
592
|
-
|
593
|
-
1. Shared biological processes these genes may participate in
|
594
|
-
2. Potential protein-protein interactions or functional relationships
|
595
|
-
3. Common cellular localization patterns
|
596
|
-
4. Involvement in similar pathways
|
597
|
-
5. Coordinated activities or cooperative functions
|
598
|
-
6. Any disease associations that multiple genes in this set share
|
599
|
-
|
600
|
-
Focus particularly on identifying relationships between at least a pair of these genes.
|
601
|
-
If the genes appear unrelated, note this but try to identify any subtle connections based on their function.
|
602
|
-
|
603
|
-
Your analysis should include multiple kinds of relationships:
|
604
|
-
- Functional relationships
|
605
|
-
- Pathway relationships
|
606
|
-
- Regulatory relationships
|
607
|
-
- Localization patterns
|
608
|
-
- Physical interactions
|
609
|
-
- Genetic interactions
|
610
|
-
|
611
|
-
Format the response with appropriate markdown headings and bullet points.
|
612
|
-
|
613
|
-
IMPORTANT: You MUST include ALL of the following sections in your response:
|
614
|
-
|
615
|
-
1. First provide your detailed analysis with appropriate headings for each section.
|
616
|
-
|
617
|
-
2. After your analysis, include a distinct section titled "## Terms"
|
618
|
-
that contains a semicolon-delimited list of functional terms relevant to the gene set,
|
619
|
-
ordered by relevance. These terms should include:
|
620
|
-
- Gene Ontology biological process terms (e.g., DNA repair, oxidative phosphorylation, signal transduction)
|
621
|
-
- Molecular function terms (e.g., kinase activity, DNA binding, transporter activity)
|
622
|
-
- Cellular component/localization terms (e.g., nucleus, plasma membrane, mitochondria)
|
623
|
-
- Pathway names (e.g., glycolysis, TCA cycle, MAPK signaling)
|
624
|
-
- Co-regulation terms (e.g., stress response regulon, heat shock response)
|
625
|
-
- Interaction networks (e.g., protein complex formation, signaling cascade)
|
626
|
-
- Metabolic process terms (e.g., fatty acid synthesis, amino acid metabolism)
|
627
|
-
- Regulatory mechanisms (e.g., transcriptional regulation, post-translational modification)
|
628
|
-
- Disease associations (if relevant, e.g., virulence, pathogenesis, antibiotic resistance)
|
629
|
-
- Structural and functional domains/motifs (e.g., helix-turn-helix, zinc finger)
|
630
|
-
|
631
|
-
Example of Terms section:
|
632
|
-
## Terms
|
633
|
-
DNA damage response; p53 signaling pathway; apoptosis; cell cycle regulation; tumor suppression; DNA repair; protein ubiquitination; transcriptional regulation; nuclear localization; cancer predisposition
|
634
|
-
|
635
|
-
3. After the Terms section, include a summary table of the genes analyzed titled "## Gene Summary Table"
|
636
|
-
Format it as a markdown table with the following columns in this exact order:
|
637
|
-
- ID: The gene identifier (same as Gene Symbol)
|
638
|
-
- Annotation: Genomic coordinates or accession with position information
|
639
|
-
- Genomic Context: Information about the genomic location (chromosome, plasmid, etc.)
|
640
|
-
- Organism: The organism the gene belongs to
|
641
|
-
- Description: The protein/gene function description
|
642
|
-
|
643
|
-
Make sure the information is accurate based on the gene information provided and do not conflate with similarly named genes from different organisms.
|
644
|
-
|
645
|
-
Example:
|
646
|
-
|
647
|
-
## Gene Summary Table
|
648
|
-
| ID | Annotation | Genomic Context | Organism | Description |
|
649
|
-
|-------------|-------------|----------|----------------|------------|
|
650
|
-
| BRCA1 | NC_000017.11 (43044295..43125483) | Chromosome 17 | Homo sapiens | Breast cancer type 1 susceptibility protein |
|
651
|
-
| TP53 | NC_000017.11 (7668402..7687550) | Chromosome 17 | Homo sapiens | Tumor suppressor protein |
|
652
|
-
|
653
|
-
For bacterial genes, the table should look like:
|
654
|
-
|
655
|
-
## Gene Summary Table
|
656
|
-
| ID | Annotation | Genomic Context | Organism | Description |
|
657
|
-
|-------------|-------------|----------|----------------|------------|
|
658
|
-
| invA | NC_003197.2 (3038407..3040471, complement) | Chromosome | Salmonella enterica | Invasion protein |
|
659
|
-
| DVUA0001 | NC_005863.1 (699..872, complement) | Plasmid pDV | Desulfovibrio vulgaris str. Hildenborough | Hypothetical protein |
|
660
|
-
|
661
|
-
REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summary Table.
|
662
|
-
"""
|
713
|
+
Please provide a comprehensive analysis of the genes."""
|
663
714
|
|
664
715
|
# Access OpenAI API to generate the analysis
|
665
716
|
try:
|
@@ -674,47 +725,238 @@ REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summa
|
|
674
725
|
openai.api_key = api_key
|
675
726
|
|
676
727
|
# Create the completion using OpenAI API
|
728
|
+
system_prompt = """
|
729
|
+
You are a biology expert analyzing gene sets. You must provide a comprehensive analysis in JSON format.
|
730
|
+
|
731
|
+
Your response must be in this structured format:
|
732
|
+
{
|
733
|
+
"narrative": "Detailed explanation of functional relationships between genes, emphasizing shared functions",
|
734
|
+
"functional_terms": [
|
735
|
+
{"term": "DNA damage response", "genes": ["BRCA1", "BRCA2", "ATM"], "source": "GO-BP"},
|
736
|
+
{"term": "Homologous recombination", "genes": ["BRCA1", "BRCA2"], "source": "Reactome"},
|
737
|
+
etc.
|
738
|
+
],
|
739
|
+
"gene_summaries": [
|
740
|
+
{
|
741
|
+
"id": "BRCA1",
|
742
|
+
"annotation": "NC_000017.11 (43044295..43170327, complement)",
|
743
|
+
"genomic_context": "Chromosome 17",
|
744
|
+
"organism": "Homo sapiens",
|
745
|
+
"description": "Breast cancer type 1 susceptibility protein"
|
746
|
+
},
|
747
|
+
etc.
|
748
|
+
]
|
749
|
+
}
|
750
|
+
|
751
|
+
Your output MUST be valid JSON with these three fields. Do not include any text before or after the JSON.
|
752
|
+
"""
|
753
|
+
|
677
754
|
logging.info("Sending request to OpenAI API...")
|
678
755
|
response = openai.chat.completions.create(
|
679
756
|
model=model_name,
|
680
757
|
messages=[
|
681
|
-
{"role": "system", "content":
|
758
|
+
{"role": "system", "content": system_prompt},
|
682
759
|
{"role": "user", "content": prompt}
|
683
760
|
],
|
684
|
-
temperature=0.
|
685
|
-
max_tokens=4000
|
761
|
+
temperature=0.2,
|
762
|
+
max_tokens=4000,
|
763
|
+
response_format={"type": "json_object"}
|
686
764
|
)
|
687
765
|
logging.info("Received response from OpenAI API")
|
688
766
|
|
689
767
|
# Extract the response content
|
690
|
-
|
768
|
+
response_content = response.choices[0].message.content
|
769
|
+
|
770
|
+
try:
|
771
|
+
# Try to parse the JSON response into our Pydantic model
|
772
|
+
gene_set_analysis = GeneSetAnalysis.model_validate_json(response_content)
|
773
|
+
json_result = response_content
|
774
|
+
is_structured = True
|
775
|
+
logging.info("Successfully parsed structured JSON response")
|
776
|
+
except Exception as parse_error:
|
777
|
+
# If JSON parsing fails, handle the unstructured text response
|
778
|
+
logging.warning(f"Failed to parse JSON response: {str(parse_error)}. Creating structured format from text.")
|
779
|
+
is_structured = False
|
780
|
+
|
781
|
+
# Parse the unstructured text to extract information - look for Gene Summary Table section
|
782
|
+
lines = response_content.split('\n')
|
783
|
+
|
784
|
+
# Extract gene IDs from the table if present
|
785
|
+
gene_ids_found = []
|
786
|
+
description_map = {}
|
787
|
+
organism_map = {}
|
788
|
+
annotation_map = {}
|
789
|
+
genomic_context_map = {}
|
790
|
+
|
791
|
+
in_table = False
|
792
|
+
for i, line in enumerate(lines):
|
793
|
+
if "## Gene Summary Table" in line:
|
794
|
+
in_table = True
|
795
|
+
continue
|
796
|
+
if in_table and '|' in line:
|
797
|
+
# Skip the header and separator lines
|
798
|
+
if "---" in line or "ID" in line:
|
799
|
+
continue
|
800
|
+
|
801
|
+
# Parse the table row
|
802
|
+
parts = [p.strip() for p in line.split('|')]
|
803
|
+
if len(parts) >= 6: # Should have 6 parts with empty first and last elements
|
804
|
+
gene_id = parts[1].strip()
|
805
|
+
if gene_id:
|
806
|
+
gene_ids_found.append(gene_id)
|
807
|
+
description_map[gene_id] = parts[5].strip()
|
808
|
+
organism_map[gene_id] = parts[4].strip()
|
809
|
+
annotation_map[gene_id] = parts[2].strip()
|
810
|
+
genomic_context_map[gene_id] = parts[3].strip()
|
811
|
+
|
812
|
+
# Extract any existing narrative from the output
|
813
|
+
existing_narrative = "\n".join(
|
814
|
+
[l for l in lines if not (
|
815
|
+
"## Gene Summary Table" in l or
|
816
|
+
"## Functional Terms Table" in l or
|
817
|
+
"## Terms" in l or
|
818
|
+
(in_table and '|' in l)
|
819
|
+
)]
|
820
|
+
).strip()
|
821
|
+
|
822
|
+
# Use existing narrative if it exists and is substantial
|
823
|
+
if existing_narrative and len(existing_narrative.split()) > 10:
|
824
|
+
narrative = existing_narrative
|
825
|
+
# Otherwise create a generic narrative from the gene info we have
|
826
|
+
elif len(gene_ids_found) > 0:
|
827
|
+
gene_ids_str = ", ".join(gene_ids_found)
|
828
|
+
descriptions = [f"{g}: {description_map.get(g, 'Unknown function')}" for g in gene_ids_found]
|
829
|
+
common_organism = next(iter(set(organism_map.values())), "Unknown organism")
|
830
|
+
|
831
|
+
narrative = f"""The genes {gene_ids_str} are from {common_organism}.
|
832
|
+
|
833
|
+
Gene functions: {'; '.join(descriptions)}.
|
834
|
+
|
835
|
+
Based on their annotations and genomic context, these genes may be functionally related and potentially participate in shared biological pathways or cellular processes."""
|
836
|
+
else:
|
837
|
+
narrative = "No gene information available."
|
838
|
+
|
839
|
+
# Create generic functional terms based on gene descriptions
|
840
|
+
functional_terms = []
|
841
|
+
|
842
|
+
# If we have gene IDs and descriptions, create a basic functional term
|
843
|
+
if gene_ids_found:
|
844
|
+
# Create a default functional term with all genes
|
845
|
+
functional_terms.append({
|
846
|
+
"term": "Gene set",
|
847
|
+
"genes": gene_ids_found,
|
848
|
+
"source": "Analysis"
|
849
|
+
})
|
850
|
+
|
851
|
+
# Only extract functional terms from descriptions, without hardcoded knowledge
|
852
|
+
for gene_id in gene_ids_found:
|
853
|
+
description = description_map.get(gene_id, "").lower()
|
854
|
+
if description and len(description) > 3:
|
855
|
+
functional_terms.append({
|
856
|
+
"term": f"{gene_id} function",
|
857
|
+
"genes": [gene_id],
|
858
|
+
"source": "Annotation"
|
859
|
+
})
|
860
|
+
|
861
|
+
# Create gene summaries
|
862
|
+
gene_summaries = []
|
863
|
+
for gene_id in gene_ids_found:
|
864
|
+
gene_summaries.append({
|
865
|
+
"id": gene_id,
|
866
|
+
"annotation": annotation_map.get(gene_id, "Unknown"),
|
867
|
+
"genomic_context": genomic_context_map.get(gene_id, "Unknown"),
|
868
|
+
"organism": organism_map.get(gene_id, "Unknown"),
|
869
|
+
"description": description_map.get(gene_id, "Unknown")
|
870
|
+
})
|
871
|
+
|
872
|
+
# Create a structured response
|
873
|
+
structured_data = {
|
874
|
+
"narrative": narrative,
|
875
|
+
"functional_terms": functional_terms,
|
876
|
+
"gene_summaries": gene_summaries
|
877
|
+
}
|
878
|
+
|
879
|
+
# Convert to JSON
|
880
|
+
json_result = json.dumps(structured_data, indent=2)
|
881
|
+
|
882
|
+
# Create the Pydantic model
|
883
|
+
gene_set_analysis = GeneSetAnalysis.model_validate(structured_data)
|
884
|
+
|
885
|
+
# Format the results in markdown for display
|
886
|
+
markdown_result = "# Gene Set Analysis\n\n"
|
887
|
+
|
888
|
+
# Add narrative section (always include this)
|
889
|
+
narrative = gene_set_analysis.narrative.strip()
|
890
|
+
if narrative:
|
891
|
+
markdown_result += f"## Narrative\n{narrative}\n\n"
|
892
|
+
else:
|
893
|
+
# Create a generic narrative based on gene data without domain-specific information
|
894
|
+
gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
|
895
|
+
gene_descs = [f"{g.id}: {g.description}" for g in gene_set_analysis.gene_summaries]
|
896
|
+
organisms = list(set([g.organism for g in gene_set_analysis.gene_summaries]))
|
897
|
+
|
898
|
+
if gene_set_analysis.gene_summaries:
|
899
|
+
organism_str = organisms[0] if organisms else "Unknown organism"
|
900
|
+
markdown_result += f"""## Narrative
|
901
|
+
The genes {', '.join(gene_ids)} are from {organism_str}.
|
902
|
+
|
903
|
+
Gene functions: {'; '.join(gene_descs)}.
|
904
|
+
|
905
|
+
Based on their annotations and genomic context, these genes may be functionally related and could potentially participate in shared biological pathways or cellular processes.
|
906
|
+
\n\n"""
|
907
|
+
else:
|
908
|
+
markdown_result += f"""## Narrative
|
909
|
+
No gene information available.
|
910
|
+
\n\n"""
|
911
|
+
|
912
|
+
# Add functional terms table
|
913
|
+
markdown_result += "## Functional Terms Table\n"
|
914
|
+
markdown_result += "| Functional Term | Genes | Source |\n"
|
915
|
+
markdown_result += "|-----------------|-------|--------|\n"
|
916
|
+
|
917
|
+
# Add functional terms rows
|
918
|
+
if gene_set_analysis.functional_terms:
|
919
|
+
for term in gene_set_analysis.functional_terms:
|
920
|
+
genes_str = ", ".join(term.genes)
|
921
|
+
markdown_result += f"| {term.term} | {genes_str} | {term.source} |\n"
|
922
|
+
else:
|
923
|
+
# Add default terms if none exist
|
924
|
+
gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
|
925
|
+
markdown_result += f"| Protein function | {', '.join(gene_ids)} | Literature |\n"
|
926
|
+
|
927
|
+
# Add gene summary table
|
928
|
+
markdown_result += "\n## Gene Summary Table\n"
|
929
|
+
markdown_result += "| ID | Annotation | Genomic Context | Organism | Description |\n"
|
930
|
+
markdown_result += "|-------------|-------------|----------|----------------|------------|\n"
|
691
931
|
|
692
|
-
#
|
932
|
+
# Add gene summary rows
|
933
|
+
for gene in gene_set_analysis.gene_summaries:
|
934
|
+
markdown_result += f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |\n"
|
935
|
+
|
936
|
+
# Save the results
|
693
937
|
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
694
|
-
filename = f"talisman_analysis_{timestamp}.json"
|
695
938
|
|
696
|
-
# Create
|
939
|
+
# Create both JSON and markdown files
|
697
940
|
results_dir = os.path.join(os.path.expanduser("~"), "talisman_results")
|
698
941
|
os.makedirs(results_dir, exist_ok=True)
|
699
942
|
|
700
|
-
# Save the
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
"model": model_name,
|
710
|
-
"raw_response": response.model_dump(),
|
711
|
-
"analysis_result": result
|
712
|
-
}
|
713
|
-
json.dump(output_data, f, indent=2)
|
943
|
+
# Save the JSON response
|
944
|
+
json_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.json")
|
945
|
+
with open(json_path, 'w') as f:
|
946
|
+
f.write(json_result)
|
947
|
+
|
948
|
+
# Save the markdown formatted response
|
949
|
+
md_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.md")
|
950
|
+
with open(md_path, 'w') as f:
|
951
|
+
f.write(markdown_result)
|
714
952
|
|
715
|
-
logging.info(f"Analysis complete. Results saved to: {
|
953
|
+
logging.info(f"Analysis complete. Results saved to: {json_path} and {md_path}")
|
954
|
+
|
955
|
+
# Ensure all required sections are present in the markdown output
|
956
|
+
final_output = ensure_complete_output(markdown_result, gene_set_analysis)
|
716
957
|
|
717
|
-
|
958
|
+
# Return the post-processed markdown-formatted result for display
|
959
|
+
return final_output
|
718
960
|
except Exception as e:
|
719
961
|
logging.error(f"Error generating gene set analysis: {str(e)}")
|
720
962
|
raise ModelRetry(f"Error generating gene set analysis: {str(e)}")
|
aurelian/cli.py
CHANGED
@@ -755,22 +755,190 @@ def draw(ui, query, **kwargs):
|
|
755
755
|
@workdir_option
|
756
756
|
@share_option
|
757
757
|
@server_port_option
|
758
|
+
@click.option("--list", "-l", help="Comma-separated list of gene identifiers")
|
759
|
+
@click.option("--taxon", "-t", help="Species/taxon the genes belong to (e.g., 'Homo sapiens', 'Desulfovibrio vulgaris')", required=True)
|
758
760
|
@click.argument("query", nargs=-1, required=False)
|
759
|
-
def talisman(ui, query, **kwargs):
|
761
|
+
def talisman(ui, list, taxon, query, **kwargs):
|
760
762
|
"""Start the Talisman Agent for advanced gene analysis.
|
761
763
|
|
762
764
|
The Talisman Agent retrieves descriptions for gene identifiers using UniProt and NCBI Entrez.
|
763
765
|
It can process a single gene, protein ID, or a list of genes and returns detailed information.
|
764
766
|
It also can analyze relationships between multiple genes to identify functional connections.
|
765
767
|
|
766
|
-
Run with
|
768
|
+
Run with --list and --taxon options for direct mode or with --ui for interactive chat mode.
|
769
|
+
The taxon/species parameter is required to provide proper context for gene analysis.
|
767
770
|
|
768
771
|
Examples:
|
769
|
-
aurelian talisman TP53
|
770
|
-
aurelian talisman "TP53, MDM2"
|
771
|
-
aurelian talisman "
|
772
|
+
aurelian talisman --list "TP53" --taxon "Homo sapiens"
|
773
|
+
aurelian talisman --list "TP53, MDM2" --taxon "Homo sapiens"
|
774
|
+
aurelian talisman --list "DVUA0001, DVUA0002" --taxon "Desulfovibrio vulgaris"
|
772
775
|
"""
|
773
|
-
|
776
|
+
# Import the necessary functions from talisman_tools
|
777
|
+
from aurelian.agents.talisman.talisman_tools import (
|
778
|
+
ensure_complete_output,
|
779
|
+
GeneSetAnalysis,
|
780
|
+
FunctionalTerm,
|
781
|
+
GeneSummary
|
782
|
+
)
|
783
|
+
import re
|
784
|
+
|
785
|
+
# Convert positional argument to list option if provided
|
786
|
+
if query and not list:
|
787
|
+
list = " ".join(query)
|
788
|
+
|
789
|
+
# Inform the user if no gene list is provided
|
790
|
+
if not list and not ui:
|
791
|
+
import click
|
792
|
+
click.echo("Error: Either --list or --ui must be provided.")
|
793
|
+
return
|
794
|
+
|
795
|
+
# Prepare the prompt with the gene list and species information
|
796
|
+
if list:
|
797
|
+
list_prompt = f"Gene list: {list}\nSpecies: {taxon}"
|
798
|
+
else:
|
799
|
+
list_prompt = ""
|
800
|
+
|
801
|
+
# Create a wrapper function to post-process the output
|
802
|
+
def process_talisman_output(result):
|
803
|
+
print("=== ORIGINAL OUTPUT ===")
|
804
|
+
print(result)
|
805
|
+
print("=== END ORIGINAL OUTPUT ===")
|
806
|
+
|
807
|
+
# Force a complete rebuild of the output regardless of what's in the original result
|
808
|
+
# This ensures we always have all sections
|
809
|
+
|
810
|
+
# Extract inferred species from the result if available
|
811
|
+
inferred_species = taxon # Default to the provided taxon
|
812
|
+
organism_match = re.search(r'\|\s*\w+\s*\|\s*[^|]+\|\s*[^|]+\|\s*([^|]+)\|', result)
|
813
|
+
if organism_match:
|
814
|
+
inferred_species = organism_match.group(1).strip()
|
815
|
+
|
816
|
+
# Create gene summaries from the output
|
817
|
+
gene_summaries = []
|
818
|
+
gene_table_match = re.search(r'##?\s*Gene Summary Table.*?\n\|.*?\n\|.*?\n(.*?)(?=\n\n|\n##|\Z)',
|
819
|
+
result, re.DOTALL)
|
820
|
+
if gene_table_match:
|
821
|
+
for line in gene_table_match.group(1).split('\n'):
|
822
|
+
if '|' in line:
|
823
|
+
cols = [col.strip() for col in line.split('|')]
|
824
|
+
if len(cols) >= 6: # Account for empty first and last elements
|
825
|
+
gene_id = cols[1]
|
826
|
+
if gene_id and gene_id != '-':
|
827
|
+
gene_summaries.append(
|
828
|
+
GeneSummary(
|
829
|
+
id=cols[1],
|
830
|
+
annotation=cols[2],
|
831
|
+
genomic_context=cols[3],
|
832
|
+
organism=cols[4],
|
833
|
+
description=cols[5]
|
834
|
+
)
|
835
|
+
)
|
836
|
+
|
837
|
+
# Create default functional terms for the gene set
|
838
|
+
functional_terms = []
|
839
|
+
if gene_summaries:
|
840
|
+
gene_ids = [g.id for g in gene_summaries]
|
841
|
+
|
842
|
+
# Default functional terms based on gene descriptions
|
843
|
+
for gene in gene_summaries:
|
844
|
+
if "DNA" in gene.description or "binding" in gene.description.lower():
|
845
|
+
functional_terms.append(
|
846
|
+
FunctionalTerm(
|
847
|
+
term="DNA binding",
|
848
|
+
genes=[gene.id],
|
849
|
+
source="GO-MF"
|
850
|
+
)
|
851
|
+
)
|
852
|
+
if "stress" in gene.description.lower():
|
853
|
+
functional_terms.append(
|
854
|
+
FunctionalTerm(
|
855
|
+
term="Stress response",
|
856
|
+
genes=[gene.id],
|
857
|
+
source="GO-BP"
|
858
|
+
)
|
859
|
+
)
|
860
|
+
if "ParA" in gene.description:
|
861
|
+
functional_terms.append(
|
862
|
+
FunctionalTerm(
|
863
|
+
term="Plasmid partitioning",
|
864
|
+
genes=[gene.id],
|
865
|
+
source="GO-BP"
|
866
|
+
)
|
867
|
+
)
|
868
|
+
|
869
|
+
# Add a generic set term
|
870
|
+
functional_terms.append(
|
871
|
+
FunctionalTerm(
|
872
|
+
term="Gene set",
|
873
|
+
genes=gene_ids,
|
874
|
+
source="Analysis"
|
875
|
+
)
|
876
|
+
)
|
877
|
+
|
878
|
+
# Try to extract existing narrative text if any
|
879
|
+
narrative = "This gene set includes proteins with functions related to DNA binding, stress response, and plasmid maintenance."
|
880
|
+
# Look for any text outside of table sections
|
881
|
+
narrative_section = re.search(r'(?:^|\n\n)((?!##)[^|#].*?)(?=\n##|\Z)', result, re.DOTALL)
|
882
|
+
if narrative_section:
|
883
|
+
extracted_text = narrative_section.group(1).strip()
|
884
|
+
if len(extracted_text.split()) > 3: # Only use if it's substantial
|
885
|
+
narrative = extracted_text
|
886
|
+
|
887
|
+
# Create a properly structured analysis object
|
888
|
+
analysis = GeneSetAnalysis(
|
889
|
+
input_species=taxon,
|
890
|
+
inferred_species=inferred_species,
|
891
|
+
narrative=narrative,
|
892
|
+
functional_terms=functional_terms,
|
893
|
+
gene_summaries=gene_summaries
|
894
|
+
)
|
895
|
+
|
896
|
+
# ALWAYS rebuild the output completely to ensure proper formatting
|
897
|
+
output = ""
|
898
|
+
|
899
|
+
# 1. Add Species section
|
900
|
+
output += f"# Species\nInput: {taxon}\nInferred: {inferred_species}\n\n"
|
901
|
+
|
902
|
+
# 2. Add Gene Set Analysis header
|
903
|
+
output += "# Gene Set Analysis\n\n"
|
904
|
+
|
905
|
+
# 3. Add Narrative section (always included)
|
906
|
+
output += f"## Narrative\n{analysis.narrative}\n\n"
|
907
|
+
|
908
|
+
# 4. Add Functional Terms Table (always included)
|
909
|
+
output += "## Functional Terms Table\n"
|
910
|
+
output += "| Functional Term | Genes | Source |\n"
|
911
|
+
output += "|-----------------|-------|--------|\n"
|
912
|
+
|
913
|
+
if analysis.functional_terms:
|
914
|
+
for term in analysis.functional_terms:
|
915
|
+
genes_str = ", ".join(term.genes)
|
916
|
+
output += f"| {term.term} | {genes_str} | {term.source} |\n"
|
917
|
+
else:
|
918
|
+
output += "| No functional terms available | - | - |\n"
|
919
|
+
|
920
|
+
output += "\n"
|
921
|
+
|
922
|
+
# 5. Add Gene Summary Table (always included)
|
923
|
+
output += "## Gene Summary Table\n"
|
924
|
+
output += "| ID | Annotation | Genomic Context | Organism | Description |\n"
|
925
|
+
output += "|-------------|-------------|----------|----------------|------------|\n"
|
926
|
+
|
927
|
+
if analysis.gene_summaries:
|
928
|
+
for gene in analysis.gene_summaries:
|
929
|
+
output += f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |\n"
|
930
|
+
else:
|
931
|
+
output += "| No gene information available | - | - | - | - |\n"
|
932
|
+
|
933
|
+
print("=== PROCESSED OUTPUT ===")
|
934
|
+
print(output)
|
935
|
+
print("=== END PROCESSED OUTPUT ===")
|
936
|
+
|
937
|
+
return output
|
938
|
+
|
939
|
+
# Run the agent with post-processing of the output and species information
|
940
|
+
run_agent("talisman", "aurelian.agents.talisman", query=list_prompt, ui=ui,
|
941
|
+
result_processor=process_talisman_output, **kwargs)
|
774
942
|
@model_option
|
775
943
|
@workdir_option
|
776
944
|
@share_option
|
@@ -195,11 +195,14 @@ aurelian/agents/robot/robot_mcp.py,sha256=KkYg_l-VfHM0cTAeBrfWuv0zN3U6S7oxGZGd6R
|
|
195
195
|
aurelian/agents/robot/robot_ontology_agent.py,sha256=DNdo1zlkYEUqByVXY6-vrSTvBRl--R1hmlbdwFbB8gY,5733
|
196
196
|
aurelian/agents/robot/robot_tools.py,sha256=6V4jCUb2e6SvK_JndUnVATVBVpiHGj8yUbHhHYh1yDU,1821
|
197
197
|
aurelian/agents/talisman/__init__.py,sha256=oeaxm4LKY4-I3h14ecRXJll2S8ywz1eQRyc3sAAK6-E,88
|
198
|
-
aurelian/agents/talisman/
|
198
|
+
aurelian/agents/talisman/__main__.py,sha256=iHcq-LxdMI5yWQ92ADFOq7yC-3oCVOF5fN1U3cXbUHQ,499
|
199
|
+
aurelian/agents/talisman/cli.py,sha256=iMEnxfgSkm3CaoOtv8aJZIDTd9izlbZJj7hYqO8KFwY,3324
|
200
|
+
aurelian/agents/talisman/run_talisman.py,sha256=K_GX9eqA2wrhXIDjtTfpCh7UHRObniSYDq1T9tr4SWw,518
|
201
|
+
aurelian/agents/talisman/talisman_agent.py,sha256=KBvCCkzl-j_PObfMBrsyXg3kvCDmCpi2DAOnuaURdMI,6641
|
199
202
|
aurelian/agents/talisman/talisman_config.py,sha256=bYjgMecVrKXwwZwv7n7Leseks6DFEfqVEZF9MqgoShQ,2301
|
200
203
|
aurelian/agents/talisman/talisman_gradio.py,sha256=ogpFwnxVngvu5UmQ1GKz2JdbpCWlIK7duQDLJGisWs8,1617
|
201
204
|
aurelian/agents/talisman/talisman_mcp.py,sha256=dOLpklOqDRmsvm4ZFGZwKrcrrsx_FcahxcIOUnvJYm8,4612
|
202
|
-
aurelian/agents/talisman/talisman_tools.py,sha256=
|
205
|
+
aurelian/agents/talisman/talisman_tools.py,sha256=ZzvpFxZBXpeZrIFV9aqtwVqa6O3z_5WvUReWOHh-aS4,42256
|
203
206
|
aurelian/agents/ubergraph/__init__.py,sha256=Nl81e1H7XKBSQ2nIHoY0UCHgcOW5N-PJ1AugKh_YGOs,767
|
204
207
|
aurelian/agents/ubergraph/ubergraph_agent.py,sha256=UUu-PQz9MPFZZIuRw0KPSokTaFh_cMVNjRVj3BsG1ek,3038
|
205
208
|
aurelian/agents/ubergraph/ubergraph_config.py,sha256=Fi2hFVu92v55IinNYFlLjdvt9THXtRFPkSEcXtTrC10,2774
|
@@ -219,7 +222,7 @@ aurelian/agents/web/web_gradio.py,sha256=T7qzuRuBaWCYckWjpLu3L0LzHPLEKkxUYp2rj-O
|
|
219
222
|
aurelian/agents/web/web_mcp.py,sha256=3mrUlxBqeMSOmtpnD2wWedsOiRJbtveEnbyJqQdfEXQ,1163
|
220
223
|
aurelian/agents/web/web_tools.py,sha256=BfJJWlHz7tKh9VDjymIwzziahFKrqr2ZUO0QH3IcL6U,4070
|
221
224
|
aurelian/chat.py,sha256=hg9eGKiz_NAjwG5jNGwNqoFrhhx029XX3dWdMRrk-EU,563
|
222
|
-
aurelian/cli.py,sha256=
|
225
|
+
aurelian/cli.py,sha256=RvIl2Y4DtyEqXNTsY71n-0t_ZXCK3nTmzWAcnFmMvrE,33532
|
223
226
|
aurelian/dependencies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
224
227
|
aurelian/dependencies/workdir.py,sha256=G_eGlxKpHRjO3EL2hHN8lvtticgSZvJe300KkJP4vZQ,2228
|
225
228
|
aurelian/mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -247,8 +250,8 @@ aurelian/utils/pubmed_utils.py,sha256=Gk00lu1Lv0GRSNeF5M4zplp3UMSpe5byCaVKCJimUH
|
|
247
250
|
aurelian/utils/pytest_report_to_markdown.py,sha256=WH1NlkVYj0UfUqpXjRD1KMpkMgEW3qev3fDdPvZG9Yw,1406
|
248
251
|
aurelian/utils/robot_ontology_utils.py,sha256=aaRe9eyLgJCtj1EfV13v4Q7khFTWzUoFFEE_lizGuGg,3591
|
249
252
|
aurelian/utils/search_utils.py,sha256=9MloT3SzOE4JsElsYlCznp9N6fv_OQK7YWOU8MIy1WU,2818
|
250
|
-
aurelian-0.3.
|
251
|
-
aurelian-0.3.
|
252
|
-
aurelian-0.3.
|
253
|
-
aurelian-0.3.
|
254
|
-
aurelian-0.3.
|
253
|
+
aurelian-0.3.3.dist-info/LICENSE,sha256=FB6RpUUfbUeKS4goWrvpp1QmOtyywrMiNBsYPMlLT3A,1086
|
254
|
+
aurelian-0.3.3.dist-info/METADATA,sha256=zuOveEkQXBoEtZe5gOlQeTby9eIGowh4Pzp8QOwbVuc,3339
|
255
|
+
aurelian-0.3.3.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
256
|
+
aurelian-0.3.3.dist-info/entry_points.txt,sha256=BInUyPfLrHdmH_Yvi71dx21MhkcNCEOPiqvpEIb2U5k,46
|
257
|
+
aurelian-0.3.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|