aurelian 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (254) hide show
  1. aurelian/__init__.py +9 -0
  2. aurelian/agents/__init__.py +0 -0
  3. aurelian/agents/amigo/__init__.py +3 -0
  4. aurelian/agents/amigo/amigo_agent.py +77 -0
  5. aurelian/agents/amigo/amigo_config.py +85 -0
  6. aurelian/agents/amigo/amigo_evals.py +73 -0
  7. aurelian/agents/amigo/amigo_gradio.py +52 -0
  8. aurelian/agents/amigo/amigo_mcp.py +152 -0
  9. aurelian/agents/amigo/amigo_tools.py +152 -0
  10. aurelian/agents/biblio/__init__.py +42 -0
  11. aurelian/agents/biblio/biblio_agent.py +94 -0
  12. aurelian/agents/biblio/biblio_config.py +40 -0
  13. aurelian/agents/biblio/biblio_gradio.py +67 -0
  14. aurelian/agents/biblio/biblio_mcp.py +115 -0
  15. aurelian/agents/biblio/biblio_tools.py +164 -0
  16. aurelian/agents/biblio_agent.py +46 -0
  17. aurelian/agents/checklist/__init__.py +44 -0
  18. aurelian/agents/checklist/checklist_agent.py +85 -0
  19. aurelian/agents/checklist/checklist_config.py +28 -0
  20. aurelian/agents/checklist/checklist_gradio.py +70 -0
  21. aurelian/agents/checklist/checklist_mcp.py +86 -0
  22. aurelian/agents/checklist/checklist_tools.py +141 -0
  23. aurelian/agents/checklist/content/checklists.yaml +7 -0
  24. aurelian/agents/checklist/content/streams.csv +136 -0
  25. aurelian/agents/checklist_agent.py +40 -0
  26. aurelian/agents/chemistry/__init__.py +3 -0
  27. aurelian/agents/chemistry/chemistry_agent.py +46 -0
  28. aurelian/agents/chemistry/chemistry_config.py +71 -0
  29. aurelian/agents/chemistry/chemistry_evals.py +79 -0
  30. aurelian/agents/chemistry/chemistry_gradio.py +50 -0
  31. aurelian/agents/chemistry/chemistry_mcp.py +120 -0
  32. aurelian/agents/chemistry/chemistry_tools.py +121 -0
  33. aurelian/agents/chemistry/image_agent.py +15 -0
  34. aurelian/agents/d4d/__init__.py +30 -0
  35. aurelian/agents/d4d/d4d_agent.py +72 -0
  36. aurelian/agents/d4d/d4d_config.py +46 -0
  37. aurelian/agents/d4d/d4d_gradio.py +58 -0
  38. aurelian/agents/d4d/d4d_mcp.py +71 -0
  39. aurelian/agents/d4d/d4d_tools.py +157 -0
  40. aurelian/agents/d4d_agent.py +64 -0
  41. aurelian/agents/diagnosis/__init__.py +33 -0
  42. aurelian/agents/diagnosis/diagnosis_agent.py +53 -0
  43. aurelian/agents/diagnosis/diagnosis_config.py +48 -0
  44. aurelian/agents/diagnosis/diagnosis_evals.py +76 -0
  45. aurelian/agents/diagnosis/diagnosis_gradio.py +52 -0
  46. aurelian/agents/diagnosis/diagnosis_mcp.py +141 -0
  47. aurelian/agents/diagnosis/diagnosis_tools.py +204 -0
  48. aurelian/agents/diagnosis_agent.py +28 -0
  49. aurelian/agents/draw/__init__.py +3 -0
  50. aurelian/agents/draw/draw_agent.py +39 -0
  51. aurelian/agents/draw/draw_config.py +26 -0
  52. aurelian/agents/draw/draw_gradio.py +50 -0
  53. aurelian/agents/draw/draw_mcp.py +94 -0
  54. aurelian/agents/draw/draw_tools.py +100 -0
  55. aurelian/agents/draw/judge_agent.py +18 -0
  56. aurelian/agents/filesystem/__init__.py +0 -0
  57. aurelian/agents/filesystem/filesystem_config.py +27 -0
  58. aurelian/agents/filesystem/filesystem_gradio.py +49 -0
  59. aurelian/agents/filesystem/filesystem_mcp.py +89 -0
  60. aurelian/agents/filesystem/filesystem_tools.py +95 -0
  61. aurelian/agents/filesystem/py.typed +0 -0
  62. aurelian/agents/github/__init__.py +0 -0
  63. aurelian/agents/github/github_agent.py +83 -0
  64. aurelian/agents/github/github_cli.py +248 -0
  65. aurelian/agents/github/github_config.py +22 -0
  66. aurelian/agents/github/github_gradio.py +152 -0
  67. aurelian/agents/github/github_mcp.py +252 -0
  68. aurelian/agents/github/github_tools.py +408 -0
  69. aurelian/agents/github/github_tools.py.tmp +413 -0
  70. aurelian/agents/goann/__init__.py +13 -0
  71. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.md +1000 -0
  72. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines.pdf +0 -0
  73. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.md +693 -0
  74. aurelian/agents/goann/documents/Transcription_Factors_Annotation_Guidelines_Paper.pdf +0 -0
  75. aurelian/agents/goann/goann_agent.py +90 -0
  76. aurelian/agents/goann/goann_config.py +90 -0
  77. aurelian/agents/goann/goann_evals.py +104 -0
  78. aurelian/agents/goann/goann_gradio.py +62 -0
  79. aurelian/agents/goann/goann_mcp.py +0 -0
  80. aurelian/agents/goann/goann_tools.py +65 -0
  81. aurelian/agents/gocam/__init__.py +43 -0
  82. aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.docx +0 -0
  83. aurelian/agents/gocam/documents/DNA-binding transcription factor activity annotation guidelines.pdf +0 -0
  84. aurelian/agents/gocam/documents/DNA-binding_transcription_factor_activity_annotation_guidelines.md +100 -0
  85. aurelian/agents/gocam/documents/E3 ubiquitin ligases.docx +0 -0
  86. aurelian/agents/gocam/documents/E3 ubiquitin ligases.pdf +0 -0
  87. aurelian/agents/gocam/documents/E3_ubiquitin_ligases.md +134 -0
  88. aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.docx +0 -0
  89. aurelian/agents/gocam/documents/GO-CAM annotation guidelines README.pdf +0 -0
  90. aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.docx +0 -0
  91. aurelian/agents/gocam/documents/GO-CAM modelling guidelines TO DO.pdf +0 -0
  92. aurelian/agents/gocam/documents/GO-CAM_annotation_guidelines_README.md +1 -0
  93. aurelian/agents/gocam/documents/GO-CAM_modelling_guidelines_TO_DO.md +3 -0
  94. aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.docx +0 -0
  95. aurelian/agents/gocam/documents/How to annotate complexes in GO-CAM.pdf +0 -0
  96. aurelian/agents/gocam/documents/How to annotate molecular adaptors.docx +0 -0
  97. aurelian/agents/gocam/documents/How to annotate molecular adaptors.pdf +0 -0
  98. aurelian/agents/gocam/documents/How to annotate sequestering proteins.docx +0 -0
  99. aurelian/agents/gocam/documents/How to annotate sequestering proteins.pdf +0 -0
  100. aurelian/agents/gocam/documents/How_to_annotate_complexes_in_GO-CAM.md +29 -0
  101. aurelian/agents/gocam/documents/How_to_annotate_molecular_adaptors.md +31 -0
  102. aurelian/agents/gocam/documents/How_to_annotate_sequestering_proteins.md +42 -0
  103. aurelian/agents/gocam/documents/Molecular adaptor activity.docx +0 -0
  104. aurelian/agents/gocam/documents/Molecular adaptor activity.pdf +0 -0
  105. aurelian/agents/gocam/documents/Molecular carrier activity.docx +0 -0
  106. aurelian/agents/gocam/documents/Molecular carrier activity.pdf +0 -0
  107. aurelian/agents/gocam/documents/Molecular_adaptor_activity.md +51 -0
  108. aurelian/agents/gocam/documents/Molecular_carrier_activity.md +41 -0
  109. aurelian/agents/gocam/documents/Protein sequestering activity.docx +0 -0
  110. aurelian/agents/gocam/documents/Protein sequestering activity.pdf +0 -0
  111. aurelian/agents/gocam/documents/Protein_sequestering_activity.md +50 -0
  112. aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.docx +0 -0
  113. aurelian/agents/gocam/documents/Signaling receptor activity annotation guidelines.pdf +0 -0
  114. aurelian/agents/gocam/documents/Signaling_receptor_activity_annotation_guidelines.md +187 -0
  115. aurelian/agents/gocam/documents/Transcription coregulator activity.docx +0 -0
  116. aurelian/agents/gocam/documents/Transcription coregulator activity.pdf +0 -0
  117. aurelian/agents/gocam/documents/Transcription_coregulator_activity.md +36 -0
  118. aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.docx +0 -0
  119. aurelian/agents/gocam/documents/Transporter activity annotation annotation guidelines.pdf +0 -0
  120. aurelian/agents/gocam/documents/Transporter_activity_annotation_annotation_guidelines.md +43 -0
  121. Regulatory Processes in GO-CAM.docx +0 -0
  122. Regulatory Processes in GO-CAM.pdf +0 -0
  123. aurelian/agents/gocam/documents/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +31 -0
  124. aurelian/agents/gocam/documents/md/DNA-binding_transcription_factor_activity_annotation_guidelines.md +131 -0
  125. aurelian/agents/gocam/documents/md/E3_ubiquitin_ligases.md +166 -0
  126. aurelian/agents/gocam/documents/md/GO-CAM_annotation_guidelines_README.md +1 -0
  127. aurelian/agents/gocam/documents/md/GO-CAM_modelling_guidelines_TO_DO.md +5 -0
  128. aurelian/agents/gocam/documents/md/How_to_annotate_complexes_in_GO-CAM.md +28 -0
  129. aurelian/agents/gocam/documents/md/How_to_annotate_molecular_adaptors.md +19 -0
  130. aurelian/agents/gocam/documents/md/How_to_annotate_sequestering_proteins.md +38 -0
  131. aurelian/agents/gocam/documents/md/Molecular_adaptor_activity.md +52 -0
  132. aurelian/agents/gocam/documents/md/Molecular_carrier_activity.md +59 -0
  133. aurelian/agents/gocam/documents/md/Protein_sequestering_activity.md +52 -0
  134. aurelian/agents/gocam/documents/md/Signaling_receptor_activity_annotation_guidelines.md +271 -0
  135. aurelian/agents/gocam/documents/md/Transcription_coregulator_activity.md +54 -0
  136. aurelian/agents/gocam/documents/md/Transporter_activity_annotation_annotation_guidelines.md +38 -0
  137. aurelian/agents/gocam/documents/md/WIP_-_Regulation_and_Regulatory_Processes_in_GO-CAM.md +39 -0
  138. aurelian/agents/gocam/documents/pandoc_md/Signaling_receptor_activity_annotation_guidelines.md +334 -0
  139. aurelian/agents/gocam/gocam_agent.py +240 -0
  140. aurelian/agents/gocam/gocam_config.py +85 -0
  141. aurelian/agents/gocam/gocam_curator_agent.py +46 -0
  142. aurelian/agents/gocam/gocam_evals.py +67 -0
  143. aurelian/agents/gocam/gocam_gradio.py +89 -0
  144. aurelian/agents/gocam/gocam_mcp.py +224 -0
  145. aurelian/agents/gocam/gocam_tools.py +294 -0
  146. aurelian/agents/linkml/__init__.py +0 -0
  147. aurelian/agents/linkml/linkml_agent.py +62 -0
  148. aurelian/agents/linkml/linkml_config.py +48 -0
  149. aurelian/agents/linkml/linkml_evals.py +66 -0
  150. aurelian/agents/linkml/linkml_gradio.py +45 -0
  151. aurelian/agents/linkml/linkml_mcp.py +186 -0
  152. aurelian/agents/linkml/linkml_tools.py +102 -0
  153. aurelian/agents/literature/__init__.py +3 -0
  154. aurelian/agents/literature/literature_agent.py +55 -0
  155. aurelian/agents/literature/literature_config.py +35 -0
  156. aurelian/agents/literature/literature_gradio.py +52 -0
  157. aurelian/agents/literature/literature_mcp.py +174 -0
  158. aurelian/agents/literature/literature_tools.py +182 -0
  159. aurelian/agents/monarch/__init__.py +25 -0
  160. aurelian/agents/monarch/monarch_agent.py +44 -0
  161. aurelian/agents/monarch/monarch_config.py +45 -0
  162. aurelian/agents/monarch/monarch_gradio.py +51 -0
  163. aurelian/agents/monarch/monarch_mcp.py +65 -0
  164. aurelian/agents/monarch/monarch_tools.py +113 -0
  165. aurelian/agents/oak/__init__.py +0 -0
  166. aurelian/agents/oak/oak_config.py +27 -0
  167. aurelian/agents/oak/oak_gradio.py +57 -0
  168. aurelian/agents/ontology_mapper/__init__.py +31 -0
  169. aurelian/agents/ontology_mapper/ontology_mapper_agent.py +56 -0
  170. aurelian/agents/ontology_mapper/ontology_mapper_config.py +50 -0
  171. aurelian/agents/ontology_mapper/ontology_mapper_evals.py +108 -0
  172. aurelian/agents/ontology_mapper/ontology_mapper_gradio.py +58 -0
  173. aurelian/agents/ontology_mapper/ontology_mapper_mcp.py +81 -0
  174. aurelian/agents/ontology_mapper/ontology_mapper_tools.py +147 -0
  175. aurelian/agents/phenopackets/__init__.py +3 -0
  176. aurelian/agents/phenopackets/phenopackets_agent.py +58 -0
  177. aurelian/agents/phenopackets/phenopackets_config.py +72 -0
  178. aurelian/agents/phenopackets/phenopackets_evals.py +99 -0
  179. aurelian/agents/phenopackets/phenopackets_gradio.py +55 -0
  180. aurelian/agents/phenopackets/phenopackets_mcp.py +178 -0
  181. aurelian/agents/phenopackets/phenopackets_tools.py +127 -0
  182. aurelian/agents/rag/__init__.py +40 -0
  183. aurelian/agents/rag/rag_agent.py +83 -0
  184. aurelian/agents/rag/rag_config.py +80 -0
  185. aurelian/agents/rag/rag_gradio.py +67 -0
  186. aurelian/agents/rag/rag_mcp.py +107 -0
  187. aurelian/agents/rag/rag_tools.py +189 -0
  188. aurelian/agents/rag_agent.py +54 -0
  189. aurelian/agents/robot/__init__.py +0 -0
  190. aurelian/agents/robot/assets/__init__.py +3 -0
  191. aurelian/agents/robot/assets/template.md +384 -0
  192. aurelian/agents/robot/robot_config.py +25 -0
  193. aurelian/agents/robot/robot_gradio.py +46 -0
  194. aurelian/agents/robot/robot_mcp.py +100 -0
  195. aurelian/agents/robot/robot_ontology_agent.py +139 -0
  196. aurelian/agents/robot/robot_tools.py +50 -0
  197. aurelian/agents/talisman/__init__.py +3 -0
  198. aurelian/agents/talisman/talisman_agent.py +126 -0
  199. aurelian/agents/talisman/talisman_config.py +66 -0
  200. aurelian/agents/talisman/talisman_gradio.py +50 -0
  201. aurelian/agents/talisman/talisman_mcp.py +168 -0
  202. aurelian/agents/talisman/talisman_tools.py +720 -0
  203. aurelian/agents/ubergraph/__init__.py +40 -0
  204. aurelian/agents/ubergraph/ubergraph_agent.py +71 -0
  205. aurelian/agents/ubergraph/ubergraph_config.py +79 -0
  206. aurelian/agents/ubergraph/ubergraph_gradio.py +48 -0
  207. aurelian/agents/ubergraph/ubergraph_mcp.py +69 -0
  208. aurelian/agents/ubergraph/ubergraph_tools.py +118 -0
  209. aurelian/agents/uniprot/__init__.py +37 -0
  210. aurelian/agents/uniprot/uniprot_agent.py +43 -0
  211. aurelian/agents/uniprot/uniprot_config.py +43 -0
  212. aurelian/agents/uniprot/uniprot_evals.py +99 -0
  213. aurelian/agents/uniprot/uniprot_gradio.py +48 -0
  214. aurelian/agents/uniprot/uniprot_mcp.py +168 -0
  215. aurelian/agents/uniprot/uniprot_tools.py +136 -0
  216. aurelian/agents/web/__init__.py +0 -0
  217. aurelian/agents/web/web_config.py +27 -0
  218. aurelian/agents/web/web_gradio.py +48 -0
  219. aurelian/agents/web/web_mcp.py +50 -0
  220. aurelian/agents/web/web_tools.py +108 -0
  221. aurelian/chat.py +23 -0
  222. aurelian/cli.py +800 -0
  223. aurelian/dependencies/__init__.py +0 -0
  224. aurelian/dependencies/workdir.py +78 -0
  225. aurelian/mcp/__init__.py +0 -0
  226. aurelian/mcp/amigo_mcp_test.py +86 -0
  227. aurelian/mcp/config_generator.py +123 -0
  228. aurelian/mcp/example_config.json +43 -0
  229. aurelian/mcp/generate_sample_config.py +37 -0
  230. aurelian/mcp/gocam_mcp_test.py +126 -0
  231. aurelian/mcp/linkml_mcp_tools.py +190 -0
  232. aurelian/mcp/mcp_discovery.py +87 -0
  233. aurelian/mcp/mcp_test.py +31 -0
  234. aurelian/mcp/phenopackets_mcp_test.py +103 -0
  235. aurelian/tools/__init__.py +0 -0
  236. aurelian/tools/web/__init__.py +0 -0
  237. aurelian/tools/web/url_download.py +51 -0
  238. aurelian/utils/__init__.py +0 -0
  239. aurelian/utils/async_utils.py +15 -0
  240. aurelian/utils/data_utils.py +32 -0
  241. aurelian/utils/documentation_manager.py +59 -0
  242. aurelian/utils/doi_fetcher.py +238 -0
  243. aurelian/utils/ontology_utils.py +68 -0
  244. aurelian/utils/pdf_fetcher.py +23 -0
  245. aurelian/utils/process_logs.py +100 -0
  246. aurelian/utils/pubmed_utils.py +238 -0
  247. aurelian/utils/pytest_report_to_markdown.py +67 -0
  248. aurelian/utils/robot_ontology_utils.py +112 -0
  249. aurelian/utils/search_utils.py +95 -0
  250. aurelian-0.3.2.dist-info/LICENSE +22 -0
  251. aurelian-0.3.2.dist-info/METADATA +105 -0
  252. aurelian-0.3.2.dist-info/RECORD +254 -0
  253. aurelian-0.3.2.dist-info/WHEEL +4 -0
  254. aurelian-0.3.2.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,720 @@
1
+ """
2
+ Tools for retrieving gene information using the UniProt API and NCBI Entrez.
3
+ """
4
+ from typing import Dict, List, Optional, Tuple, Any
5
+ import openai
6
+ import time
7
+ import threading
8
+ import json
9
+ import os
10
+ import datetime
11
+ import logging
12
+
13
+ from pydantic_ai import RunContext, ModelRetry
14
+
15
+ from .talisman_config import TalismanConfig, get_config
16
+
17
+ # Set up logging: calls logging.basicConfig at import time (INFO level, "Talisman:" tag in every record). NOTE(review): this is a module-level side effect on the root logger; confirm host applications expect it.
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s [%(levelname)s] Talisman: %(message)s',
21
+ datefmt='%Y-%m-%d %H:%M:%S'
22
+ )
23
+
24
+ # Rate limiting implementation
25
class RateLimiter:
    """Blocking sliding-window rate limiter that can be shared between threads.

    At most ``max_calls`` calls to :meth:`wait` are allowed to return within
    any ``period``-second window. A caller that would exceed the budget
    sleeps until the oldest recorded call has left the window.
    """

    def __init__(self, max_calls: int = 3, period: float = 1.0):
        """
        Initialize the rate limiter.

        Args:
            max_calls: Maximum number of calls allowed in the period
            period: Time period in seconds
        """
        self.max_calls = max_calls
        self.period = period
        # Monotonic timestamps of the calls still inside the current window.
        self.calls = []
        self.lock = threading.Lock()

    def _prune(self, now: float) -> None:
        """Forget every recorded call that is at least ``period`` seconds old."""
        self.calls = [t for t in self.calls if now - t < self.period]

    def wait(self):
        """
        Wait if necessary to respect the rate limit, then record this call.

        The lock is held while sleeping, so concurrent callers queue up
        behind the sleeping one instead of racing past the limit.
        """
        with self.lock:
            # A monotonic clock keeps the window correct even if the system
            # wall clock is adjusted while the process is running.
            now = time.monotonic()
            self._prune(now)

            # ``self.calls`` is checked for emptiness so that a non-positive
            # ``max_calls`` cannot index an empty list or loop forever.
            while self.calls and len(self.calls) >= self.max_calls:
                wait_time = self.period - (now - min(self.calls))
                if wait_time > 0:
                    time.sleep(wait_time)
                now = time.monotonic()
                # Drop only the calls that have actually expired. Clearing
                # the whole list here would forget calls that are still
                # inside the window and let a burst exceed the limit.
                self._prune(now)

            self.calls.append(now)
63
+
64
+ # Module-level limiters, one for UniProt and one for NCBI, each allowing at most 3 calls per 1.0 second
65
+ uniprot_limiter = RateLimiter(max_calls=3, period=1.0)
66
+ ncbi_limiter = RateLimiter(max_calls=3, period=1.0)
67
+
68
+
69
def normalize_gene_id(gene_id: str) -> str:
    """Normalize a gene ID by removing any CURIE-style prefix and whitespace.

    Everything up to and including the last ``:`` is dropped, so
    ``"UniProtKB:P12345"`` becomes ``"P12345"``. Surrounding whitespace is
    stripped as well, so ``"HGNC: 11998"`` yields ``"11998"`` rather than
    ``" 11998"``. Version suffixes such as ``".2"`` are left untouched.

    Args:
        gene_id: The gene ID

    Returns:
        The normalized gene ID
    """
    # rpartition returns the whole string as the last element when there is
    # no ":" at all, so un-prefixed IDs pass through unchanged.
    return gene_id.rpartition(":")[2].strip()
81
+
82
+
83
def is_uniprot_id(gene_id: str) -> bool:
    """Check if the gene ID has the format of a UniProt accession.

    The check uses the accession pattern published by UniProt (6 or 10
    upper-case alphanumeric characters in a fixed layout), optionally
    followed by an isoform suffix such as ``-2``. Matching on the full
    pattern avoids treating gene symbols like ``PAX6`` or ``OCT4`` as
    accessions, and accepts accessions that do not start with O, P or Q
    (for example ``A0A024R161``).

    A few gene symbols coincide with the accession layout (for example
    ``P2RY12``); the format alone cannot tell those apart.

    Args:
        gene_id: The gene ID to check

    Returns:
        True if it is formatted like a UniProt accession, False otherwise
    """
    import re  # local import: the module does not import ``re`` at file level

    accession_pattern = (
        r"(?:[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2})"
        r"(?:-[0-9]+)?"
    )
    return re.fullmatch(accession_pattern, gene_id) is not None
94
+
95
+
96
def lookup_uniprot_accession(ctx: RunContext[TalismanConfig], gene_symbol: str) -> str:
    """Resolve a gene symbol to a UniProt accession.

    The symbol is normalized first. If it already looks like a UniProt ID it
    is returned as is; otherwise UniProt is searched for reviewed entries
    with that gene name and the accession in the first data row is returned.

    Args:
        ctx: The run context with access to the config
        gene_symbol: The gene symbol to look up

    Returns:
        The UniProt accession if one was found; otherwise the symbol itself
        (also when the lookup raises)
    """
    logging.info(f"Looking up UniProt accession for: {gene_symbol}")

    settings = ctx.deps or get_config()
    client = settings.get_uniprot_client()

    try:
        gene_symbol = normalize_gene_id(gene_symbol)

        if not is_uniprot_id(gene_symbol):
            # Throttle before touching the remote service
            uniprot_limiter.wait()

            logging.info(f"Searching UniProt for gene symbol: {gene_symbol}")
            response = client.search(
                f'gene:{gene_symbol} AND reviewed:yes',
                frmt="tsv",
                columns="accession,gene_names",
            )

            if response and response.strip() != "":
                rows = response.strip().split('\n')
                # Row 0 is the column header; the first hit is row 1
                if len(rows) > 1:
                    accession = rows[1].partition('\t')[0]
                    logging.info(f"Found UniProt accession: {accession} for {gene_symbol}")
                    return accession

            logging.info(f"No UniProt accession found for {gene_symbol}, using original symbol")
            return gene_symbol

        # Nothing to resolve: the input is already accession-like
        logging.info(f"{gene_symbol} appears to be a UniProt ID already")
        return gene_symbol
    except Exception as e:
        # Best effort: any failure falls back to the symbol
        logging.warning(f"Error looking up UniProt accession for {gene_symbol}: {e}")
        return gene_symbol
141
+
142
+
143
def _ncbi_search_ids(ncbi: Any, database: str, query: str) -> List[str]:
    """Run one rate-limited ESearch and return its ID list (empty when no hit)."""
    ncbi_limiter.wait()
    search_results = ncbi.ESearch(database, query)
    return search_results.get('idlist', []) or []


def _ncbi_payload_to_text(payload: Any) -> Any:
    """Turn an EFetch payload into text when it arrives as bytes or a bytes repr."""
    if isinstance(payload, bytes):
        # errors="replace" keeps the record usable even with stray bytes
        return payload.decode('utf-8', errors='replace')
    if isinstance(payload, str) and payload.startswith('b\''):
        # A stringified bytes object: drop the b'...' wrapper
        return payload[2:-1].replace('\\n', '\n')
    return payload


def _summarize_ncbi_protein(summary_data: Any, protein_id: str) -> str:
    """Build the "Title/Organism" header from an ESummary result ("" if unavailable)."""
    if not isinstance(summary_data, dict) or not summary_data:
        return ""

    # Layout 1: the summary is keyed directly by the protein ID
    if protein_id in summary_data:
        details = summary_data[protein_id]
        title = details.get('title', 'No title available')
        source_organism = details.get('organism', 'Unknown organism')
        logging.info(f"Found protein: {title} ({source_organism})")
        return f"Title: {title}\nOrganism: {source_organism}\n\n"

    # Layout 2: scan nested dicts; the last one carrying a field wins
    title = None
    source_organism = None
    for value in summary_data.values():
        if isinstance(value, dict):
            title = value.get('title', title)
            source_organism = value.get('organism', source_organism)

    if not (title or source_organism):
        return ""
    if title:
        logging.info(f"Found protein: {title}")
    return f"Title: {title or 'Not available'}\nOrganism: {source_organism or 'Unknown'}\n\n"


def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism: Optional[str] = None) -> Optional[str]:
    """Look up gene information in NCBI Entrez.

    Databases are tried in this order, returning on the first hit:

    1. ``gene``: once per candidate organism in priority order, then once
       without any organism constraint.
    2. ``protein``: organism-constrained for bacterial-looking genes,
       unconstrained otherwise; the FASTA record is prefixed with a short
       title/organism summary when ESummary provides one.
    3. ``nuccore``: GenBank text of the first hit.

    Args:
        ctx: The run context with access to the config
        gene_id: Gene ID or symbol to look up
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        Gene information from NCBI if found. None if nothing was found or
        if the lookup failed (the failure is logged), so callers never
        mistake an error message for gene data.
    """
    logging.info(f"Looking up NCBI information for: {gene_id}")

    config = ctx.deps or get_config()
    ncbi = config.get_ncbi_client()

    # Check if the gene looks like bacterial (common for Salmonella).
    # NOTE(review): this is a case-insensitive prefix match, so upper-case
    # symbols such as "SPARC" also match; confirm that is intended.
    bacterial_gene_patterns = ("inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg")
    is_likely_bacterial = gene_id.lower().startswith(bacterial_gene_patterns)

    # Candidate organisms in priority order
    if organism:
        organisms_to_try = [organism]
    elif is_likely_bacterial:
        organisms_to_try = ["Salmonella", "Escherichia coli", "Bacteria"]
    else:
        organisms_to_try = ["Homo sapiens"]  # Try human first as default

    try:
        # --- gene database -------------------------------------------------
        gene_id_found = None
        for org in organisms_to_try:
            logging.info(f"Searching NCBI gene database for: {gene_id} in organism: {org}")
            gene_ids = _ncbi_search_ids(ncbi, "gene", f"{gene_id}[Gene Symbol] AND {org}[Organism]")
            if gene_ids:
                gene_id_found = gene_ids[0]
                logging.info(f"Found gene ID: {gene_id_found} in {org}, fetching details")
                break

        # Fall back to an unconstrained search only after every candidate
        # organism has been tried, so the priority order is respected and
        # the same query is not repeated per organism.
        if gene_id_found is None:
            logging.info(f"Trying gene symbol search without organism constraint for: {gene_id}")
            gene_ids = _ncbi_search_ids(ncbi, "gene", f"{gene_id}[Gene Symbol]")
            if gene_ids:
                gene_id_found = gene_ids[0]
                logging.info(f"Found gene ID: {gene_id_found}, fetching details")

        if gene_id_found is not None:
            ncbi_limiter.wait()
            gene_data = _ncbi_payload_to_text(ncbi.EFetch("gene", id=gene_id_found))
            return f"NCBI Entrez Gene Information:\n{gene_data}"

        # --- protein database ----------------------------------------------
        protein_ids = []
        if is_likely_bacterial:
            # For bacterial genes, search organism by organism
            for org in organisms_to_try:
                logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
                protein_ids = _ncbi_search_ids(ncbi, "protein", f"{gene_id} AND {org}[Organism]")
                if protein_ids:
                    logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
                    break
        else:
            # Standard protein search (no organism constraint)
            logging.info(f"Searching NCBI protein database for: {gene_id}")
            protein_ids = _ncbi_search_ids(ncbi, "protein", gene_id)

        if protein_ids:
            protein_id = protein_ids[0]
            logging.info(f"Found protein ID: {protein_id}, fetching sequence")
            ncbi_limiter.wait()
            protein_data = _ncbi_payload_to_text(
                ncbi.EFetch("protein", id=protein_id, rettype="fasta", retmode="text")
            )

            # The summary is optional decoration: a failure here must not
            # throw away the sequence that was already fetched.
            protein_summary = ""
            try:
                logging.info(f"Fetching protein summary for: {protein_id}")
                ncbi_limiter.wait()
                summary_data = ncbi.ESummary("protein", id=protein_id)
                protein_summary = _summarize_ncbi_protein(summary_data, protein_id)
            except Exception as e:
                logging.warning(f"Could not fetch protein summary for {protein_id}: {e}")

            combined_data = f"{protein_summary}{protein_data}"
            return f"NCBI Entrez Protein Information:\n{combined_data}"

        # --- nucleotide database -------------------------------------------
        logging.info(f"No protein found, trying NCBI nucleotide database for: {gene_id}")
        nuccore_ids = _ncbi_search_ids(ncbi, "nuccore", gene_id)

        if nuccore_ids:
            nuccore_id = nuccore_ids[0]
            logging.info(f"Found nucleotide ID: {nuccore_id}, fetching details")
            ncbi_limiter.wait()
            nuccore_data = _ncbi_payload_to_text(
                ncbi.EFetch("nuccore", id=nuccore_id, rettype="gb", retmode="text")
            )
            return f"NCBI Entrez Nucleotide Information:\n{nuccore_data}"

        logging.info(f"No information found in NCBI for: {gene_id}")
        return None
    except Exception as e:
        # Return None if lookup fails; the reason is kept in the log
        logging.warning(f"Error querying NCBI Entrez for {gene_id}: {str(e)}")
        return None
304
+
305
+
306
def _uniprot_tsv_has_rows(results: Any) -> bool:
    """Return True when a UniProt TSV response has a data row after the header line."""
    if not isinstance(results, str):
        # Anything that is not text (e.g. an error code) counts as no result
        return False
    return len(results.strip().split('\n')) > 1


def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism: Optional[str] = None) -> str:
    """Get description for a single gene ID, using UniProt and falling back to NCBI Entrez.

    Steps:

    1. Normalize the ID and, when it is not accession-like, try to resolve
       it to a UniProt accession.
    2. Retrieve the UniProt entry directly if an accession is available.
    3. Otherwise search UniProt (field-specific query first, then free text).
    4. If UniProt yields nothing, query NCBI Entrez with the normalized
       input ID (not a substituted accession).

    Args:
        ctx: The run context with access to the config
        gene_id: The gene identifier (UniProt ID, gene symbol, etc.)
        organism: Optional organism name to restrict search (e.g., "Salmonella", "Homo sapiens")

    Returns:
        The gene description in a structured format

    Raises:
        ModelRetry: If no source has information for the gene, or if an
            unexpected error occurs during the lookup
    """
    logging.info(f"Getting description for gene: {gene_id}")
    config = ctx.deps or get_config()
    u = config.get_uniprot_client()

    try:
        # Normalize first so prefixed IDs are handled by every later step
        gene_id = normalize_gene_id(gene_id)
        logging.info(f"Normalized gene ID: {gene_id}")

        # Auto-detect organism based on gene pattern.
        # NOTE(review): case-insensitive prefix match, so upper-case symbols
        # such as "CHEK2" also match; confirm that is intended.
        bacterial_gene_patterns = (
            "inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che",
        )
        if not organism and gene_id.lower().startswith(bacterial_gene_patterns):
            logging.info(f"Gene {gene_id} matches bacterial pattern, setting organism to Salmonella")
            organism = "Salmonella"

        # ``uniprot_query`` may become an accession; ``gene_id`` keeps the
        # normalized input so the NCBI fallback still searches by symbol.
        uniprot_query = gene_id
        if not is_uniprot_id(gene_id):
            logging.info(f"Not a UniProt ID, looking up accession for: {gene_id}")
            uniprot_query = lookup_uniprot_accession(ctx, gene_id)
            if uniprot_query != gene_id:
                logging.info(f"Using UniProt ID: {uniprot_query} instead of {gene_id}")

        uniprot_info = None

        # Direct lookup for UniProt IDs
        if is_uniprot_id(uniprot_query):
            try:
                logging.info(f"Performing direct UniProt lookup for: {uniprot_query}")
                uniprot_limiter.wait()
                result = u.retrieve(uniprot_query, frmt="txt")
                if isinstance(result, str) and result.strip():
                    logging.info(f"Found direct UniProt entry for: {uniprot_query}")
                    uniprot_info = result
                else:
                    logging.info(f"No direct UniProt entry found for: {uniprot_query}")
            except Exception as e:
                # If direct lookup fails, continue with search
                logging.warning(f"Error in direct UniProt lookup: {str(e)}")

        # If we don't have UniProt info yet, try the search
        if not uniprot_info:
            columns = "accession,id,gene_names,organism,protein_name,function,cc_disease"

            logging.info(f"Performing UniProt search for: {uniprot_query}")
            uniprot_limiter.wait()
            search_query = f'gene:{uniprot_query} OR accession:{uniprot_query} OR id:{uniprot_query}'
            results = u.search(search_query, frmt="tsv", columns=columns)

            # A response holding only the TSV header line is not a hit;
            # treating it as one would suppress the fallbacks below.
            if _uniprot_tsv_has_rows(results):
                logging.info(f"Found UniProt entries in specific search for: {uniprot_query}")
                uniprot_info = results
            else:
                # Try a broader search if the specific one failed
                logging.info(f"No specific match found, trying broader UniProt search for: {uniprot_query}")
                uniprot_limiter.wait()
                results = u.search(uniprot_query, frmt="tsv", columns=columns)

                if _uniprot_tsv_has_rows(results):
                    logging.info(f"Found UniProt entries in broader search for: {uniprot_query}")
                    uniprot_info = results
                else:
                    logging.info(f"No UniProt entries found in broader search for: {uniprot_query}")

        if uniprot_info:
            logging.info(f"Returning UniProt information for: {gene_id}")
            return uniprot_info

        # Check NCBI Entrez if we couldn't find anything in UniProt
        logging.info(f"No UniProt information found, checking NCBI for: {gene_id}")
        # Pass the organism if we have one or auto-detected one
        ncbi_info = get_ncbi_gene_info(ctx, gene_id, organism)
        if ncbi_info:
            logging.info(f"Found NCBI information for: {gene_id}")
            logging.info(f"Returning NCBI information for: {gene_id}")
            return ncbi_info

        logging.warning(f"No NCBI information found for: {gene_id}")
        logging.error(f"No gene information found for: {gene_id} in either UniProt or NCBI")
        raise ModelRetry(f"No gene information found for: {gene_id} in either UniProt or NCBI Entrez")

    except ModelRetry:
        # Already the exception type callers expect; pass it through untouched
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene description for {gene_id}: {str(e)}")
        raise ModelRetry(f"Error retrieving gene description: {str(e)}") from e
417
+
418
+
419
def get_gene_descriptions(ctx: RunContext[TalismanConfig], gene_ids: List[str]) -> str:
    """Get descriptions for multiple gene IDs.

    Each ID is looked up individually with ``get_gene_description``. A failure
    for one ID is recorded inline as an ``Error: ...`` entry for that gene
    rather than aborting the whole batch.

    Args:
        ctx: The run context with access to the config
        gene_ids: List of gene identifiers

    Returns:
        One ``## Gene: <id>`` markdown section per requested ID (either the
        retrieved description or the error message), joined by newlines.

    Raises:
        ModelRetry: If ``gene_ids`` is empty, or if an unexpected error occurs
            outside the per-gene lookups.

    Side effects:
        Sets ``ctx.gene_info_dict`` to a ``{gene_id: description}`` mapping
        containing only the IDs whose lookup succeeded.
    """
    # Validate before touching gene_ids so an empty/None argument produces a
    # ModelRetry instead of a TypeError from len()/join().
    if not gene_ids:
        logging.error("No gene IDs provided")
        raise ModelRetry("No gene IDs provided")

    try:
        logging.info(f"Retrieving descriptions for {len(gene_ids)} genes: {', '.join(gene_ids)}")

        results = []
        gene_info_dict = {}

        for i, gene_id in enumerate(gene_ids, start=1):
            logging.info(f"Processing gene {i}/{len(gene_ids)}: {gene_id}")
            try:
                gene_info = get_gene_description(ctx, gene_id)
            except Exception as e:
                # Best effort: keep going and report the failure for this gene inline.
                logging.warning(f"Error retrieving information for {gene_id}: {str(e)}")
                results.append(f"## Gene: {gene_id}\nError: {str(e)}\n")
            else:
                results.append(f"## Gene: {gene_id}\n{gene_info}\n")
                gene_info_dict[gene_id] = gene_info
                logging.info(f"Successfully retrieved information for {gene_id}")

        # Expose the structured results on the context. Plain assignment both
        # creates and overwrites the attribute, so no hasattr/setattr is needed.
        ctx.gene_info_dict = gene_info_dict
        logging.info(f"Successfully retrieved information for {len(gene_info_dict)} genes")

        return "\n".join(results)
    except ModelRetry:
        # Already in the form the agent framework expects; pass it through untouched.
        raise
    except Exception as e:
        logging.error(f"Error retrieving gene descriptions: {str(e)}")
        raise ModelRetry(f"Error retrieving gene descriptions: {str(e)}") from e
471
+
472
+
473
def parse_gene_list(gene_list: str) -> List[str]:
    """Parse a string containing gene IDs or symbols into a list.

    Commas, semicolons and any run of whitespace (spaces, tabs, ``\\n``,
    ``\\r``, non-breaking spaces, ...) are treated as separators, so lists
    pasted from spreadsheets or files with Windows/old-Mac line endings are
    split correctly.

    Args:
        gene_list: String of gene identifiers separated by commas, spaces, semicolons, or newlines

    Returns:
        List of gene identifiers, in input order, with empty entries removed.
        An empty or ``None`` input yields an empty list.
    """
    if not gene_list:
        return []

    # Turn the punctuation separators into spaces, then let the argument-less
    # split() collapse every kind of whitespace run and drop empty tokens.
    punctuation_to_space = str.maketrans(",;", "  ")
    return gene_list.translate(punctuation_to_space).split()
492
+
493
+
494
def get_genes_from_list(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Look up descriptions for genes supplied as a single delimited string.

    The string is tokenised with ``parse_gene_list`` and the resulting IDs are
    handed to ``get_gene_descriptions``.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        The gene descriptions produced by ``get_gene_descriptions``

    Raises:
        ModelRetry: If no gene identifier can be parsed from ``gene_list``.
    """
    logging.info(f"Parsing gene list: {gene_list}")
    parsed_ids = parse_gene_list(gene_list)

    if parsed_ids:
        joined_ids = ', '.join(parsed_ids)
        logging.info(f"Parsed {len(parsed_ids)} gene IDs: {joined_ids}")
        return get_gene_descriptions(ctx, parsed_ids)

    # Nothing usable in the input: ask the model to try again with real IDs.
    logging.error("No gene IDs could be parsed from the input string")
    raise ModelRetry("No gene IDs could be parsed from the input string")
513
+
514
+
515
def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
    """Analyze a set of genes and generate a biological summary of their properties and relationships.

    Steps:
        1. Parse ``gene_list`` and guess an organism from gene-name prefixes.
        2. Fetch each gene's description once, passing the guessed organism.
           A failure for one gene is recorded inline and does not abort the run.
        3. Look for an organism keyword in the retrieved descriptions.
        4. Ask the OpenAI chat completions API for a structured analysis.
        5. Best-effort: save the response as JSON under ``~/talisman_results``.

    Args:
        ctx: The run context with access to the config
        gene_list: String containing gene identifiers separated by commas, spaces, or newlines

    Returns:
        A structured biological summary of the gene set

    Raises:
        ModelRetry: If no gene IDs can be parsed, no gene information could be
            retrieved, the API call fails, or the model returns empty content.

    Side effects:
        Sets ``ctx.gene_info_dict``; sets the module-level ``openai.api_key``
        when the config provides ``openai_api_key``; writes a JSON file.
    """
    logging.info(f"Starting gene set analysis for: {gene_list}")

    gene_ids_list = parse_gene_list(gene_list)
    if not gene_ids_list:
        logging.error("No gene IDs could be parsed from the input string")
        raise ModelRetry("No gene IDs could be parsed from the input string")

    # Detect if these look like bacterial genes. All prefixes are lower case
    # because they are compared against lower-cased IDs; the previous mixed-case
    # "DVU" entry could never match.
    # NOTE(review): prefix matching is case-insensitive, so eukaryotic symbols
    # such as SPARC or CHEK1 also match - confirm whether that is intended.
    salmonella_prefixes = ("inv", "sip", "sop", "sic", "spa")
    desulfovibrio_prefixes = ("dvu",)
    other_bacterial_prefixes = ("ssa", "sse", "prg", "flh", "fli", "che")
    bacterial_prefixes = salmonella_prefixes + other_bacterial_prefixes + desulfovibrio_prefixes

    lowered_ids = [gene_id.lower() for gene_id in gene_ids_list]

    # Set organism based on pattern detection
    organism = None
    if any(lowered.startswith(bacterial_prefixes) for lowered in lowered_ids):
        logging.info(f"Detected likely bacterial genes: {gene_list}")
        if any(lowered.startswith(salmonella_prefixes) for lowered in lowered_ids):
            organism = "Salmonella"
            logging.info("Setting organism to Salmonella based on gene patterns")
        elif any(lowered.startswith(desulfovibrio_prefixes) for lowered in lowered_ids):
            organism = "Desulfovibrio"
            logging.info("Setting organism to Desulfovibrio based on gene patterns")

    # Fetch every gene exactly once, with the organism context, and keep the
    # result. (Previously each gene was fetched with the organism and discarded,
    # then fetched again without it.)
    logging.info("Retrieving gene descriptions...")
    sections = []
    gene_info_dict = {}
    for gene_id in gene_ids_list:
        logging.info(f"Processing {gene_id} with organism context: {organism}")
        try:
            gene_info = get_gene_description(ctx, gene_id, organism)
        except Exception as e:
            # One unknown gene should not sink the whole analysis.
            logging.warning(f"Error retrieving information for {gene_id}: {str(e)}")
            sections.append(f"## Gene: {gene_id}\nError: {str(e)}\n")
            continue
        sections.append(f"## Gene: {gene_id}\n{gene_info}\n")
        gene_info_dict[gene_id] = gene_info
    gene_descriptions = "\n".join(sections)
    logging.info("Gene descriptions retrieved successfully")

    # Kept for backward compatibility: this attribute was previously populated
    # on the context as part of running the analysis.
    ctx.gene_info_dict = gene_info_dict

    if not gene_info_dict:
        logging.error("No gene information was found to analyze")
        raise ModelRetry("No gene information was found to analyze")

    gene_ids = list(gene_info_dict)
    logging.info(f"Analyzing relationships between {len(gene_ids)} genes: {', '.join(gene_ids)}")

    # Extract organism information from the gene descriptions if possible:
    # first description containing any keyword wins, keywords tried in order.
    detected_organism = None
    organism_keywords = ["Salmonella", "Escherichia", "Desulfovibrio", "Homo sapiens", "human"]
    for gene_info in gene_info_dict.values():
        lowered_info = gene_info.lower()
        detected_organism = next(
            (keyword for keyword in organism_keywords if keyword.lower() in lowered_info),
            None,
        )
        if detected_organism:
            break

    if detected_organism:
        logging.info(f"Detected organism from gene descriptions: {detected_organism}")

    prompt = _build_gene_set_prompt(gene_ids, gene_descriptions, detected_organism or organism)

    # Access OpenAI API to generate the analysis
    try:
        # Use the configured model name / API key if available
        model_name = getattr(ctx.deps, "model_name", "gpt-4o") if ctx.deps else "gpt-4o"
        api_key = getattr(ctx.deps, "openai_api_key", None) if ctx.deps else None

        logging.info(f"Generating biological analysis using model: {model_name}")

        if api_key:
            openai.api_key = api_key

        logging.info("Sending request to OpenAI API...")
        response = openai.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a biology expert analyzing gene sets to identify functional relationships. You MUST follow all formatting instructions precisely and include ALL required sections in your response: (1) Main Analysis, (2) Terms section, and (3) Gene Summary Table."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=4000
        )
        logging.info("Received response from OpenAI API")

        # Extract the response content
        result = response.choices[0].message.content
    except Exception as e:
        logging.error(f"Error generating gene set analysis: {str(e)}")
        raise ModelRetry(f"Error generating gene set analysis: {str(e)}") from e

    # Persisting is a convenience; it must not turn a successful analysis into a retry.
    _save_gene_set_analysis(gene_ids, model_name, response, result)

    if not result:
        # The API allows a null message content; never hand None back to the caller.
        logging.error("Error generating gene set analysis: model returned an empty response")
        raise ModelRetry("Error generating gene set analysis: model returned an empty response")

    return result


def _build_gene_set_prompt(gene_ids, gene_descriptions, organism):
    """Return the user prompt for the gene set analysis; ``organism`` may be None."""
    joined_ids = ', '.join(gene_ids)
    organism_note = (
        f"IMPORTANT: These genes are from {organism}. Make sure your analysis reflects the correct organism context."
        if organism
        else ""
    )
    # NOTE(review): the prompt lines are written flush-left; confirm against the
    # released file if the original leading whitespace inside the prompt matters.
    return f"""Analyze the following set of genes and provide a detailed biological summary:

Gene IDs/Symbols: {joined_ids}

Gene Information:
{gene_descriptions}

{organism_note}

Based on this information, provide a structured analysis covering:
1. Shared biological processes these genes may participate in
2. Potential protein-protein interactions or functional relationships
3. Common cellular localization patterns
4. Involvement in similar pathways
5. Coordinated activities or cooperative functions
6. Any disease associations that multiple genes in this set share

Focus particularly on identifying relationships between at least a pair of these genes.
If the genes appear unrelated, note this but try to identify any subtle connections based on their function.

Your analysis should include multiple kinds of relationships:
- Functional relationships
- Pathway relationships
- Regulatory relationships
- Localization patterns
- Physical interactions
- Genetic interactions

Format the response with appropriate markdown headings and bullet points.

IMPORTANT: You MUST include ALL of the following sections in your response:

1. First provide your detailed analysis with appropriate headings for each section.

2. After your analysis, include a distinct section titled "## Terms"
that contains a semicolon-delimited list of functional terms relevant to the gene set,
ordered by relevance. These terms should include:
- Gene Ontology biological process terms (e.g., DNA repair, oxidative phosphorylation, signal transduction)
- Molecular function terms (e.g., kinase activity, DNA binding, transporter activity)
- Cellular component/localization terms (e.g., nucleus, plasma membrane, mitochondria)
- Pathway names (e.g., glycolysis, TCA cycle, MAPK signaling)
- Co-regulation terms (e.g., stress response regulon, heat shock response)
- Interaction networks (e.g., protein complex formation, signaling cascade)
- Metabolic process terms (e.g., fatty acid synthesis, amino acid metabolism)
- Regulatory mechanisms (e.g., transcriptional regulation, post-translational modification)
- Disease associations (if relevant, e.g., virulence, pathogenesis, antibiotic resistance)
- Structural and functional domains/motifs (e.g., helix-turn-helix, zinc finger)

Example of Terms section:
## Terms
DNA damage response; p53 signaling pathway; apoptosis; cell cycle regulation; tumor suppression; DNA repair; protein ubiquitination; transcriptional regulation; nuclear localization; cancer predisposition

3. After the Terms section, include a summary table of the genes analyzed titled "## Gene Summary Table"
Format it as a markdown table with the following columns in this exact order:
- ID: The gene identifier (same as Gene Symbol)
- Annotation: Genomic coordinates or accession with position information
- Genomic Context: Information about the genomic location (chromosome, plasmid, etc.)
- Organism: The organism the gene belongs to
- Description: The protein/gene function description

Make sure the information is accurate based on the gene information provided and do not conflate with similarly named genes from different organisms.

Example:

## Gene Summary Table
| ID | Annotation | Genomic Context | Organism | Description |
|-------------|-------------|----------|----------------|------------|
| BRCA1 | NC_000017.11 (43044295..43125483) | Chromosome 17 | Homo sapiens | Breast cancer type 1 susceptibility protein |
| TP53 | NC_000017.11 (7668402..7687550) | Chromosome 17 | Homo sapiens | Tumor suppressor protein |

For bacterial genes, the table should look like:

## Gene Summary Table
| ID | Annotation | Genomic Context | Organism | Description |
|-------------|-------------|----------|----------------|------------|
| invA | NC_003197.2 (3038407..3040471, complement) | Chromosome | Salmonella enterica | Invasion protein |
| DVUA0001 | NC_005863.1 (699..872, complement) | Plasmid pDV | Desulfovibrio vulgaris str. Hildenborough | Hypothetical protein |

REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summary Table.
"""


def _save_gene_set_analysis(gene_ids, model_name, response, result):
    """Write the analysis to a timestamped JSON file in ~/talisman_results; log and continue on failure."""
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = os.path.join(os.path.expanduser("~"), "talisman_results")
    file_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.json")

    try:
        # Create a directory for analysis results if it doesn't exist
        os.makedirs(results_dir, exist_ok=True)
        logging.info(f"Saving analysis results to: {file_path}")

        # Store both the result and the input/metadata that produced it
        output_data = {
            "timestamp": timestamp,
            "genes_analyzed": gene_ids,
            "model": model_name,
            "raw_response": response.model_dump(),
            "analysis_result": result
        }
        with open(file_path, 'w', encoding="utf-8") as f:
            json.dump(output_data, f, indent=2)
    except (OSError, TypeError, ValueError) as e:
        # Unwritable home directory, full disk, or a non-serializable payload.
        logging.warning(f"Could not save analysis results to {file_path}: {str(e)}")
        return

    logging.info(f"Analysis complete. Results saved to: {file_path}")