aurelian 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@
2
2
  Tools for retrieving gene information using the UniProt API and NCBI Entrez.
3
3
  """
4
4
  from typing import Dict, List, Optional, Tuple, Any
5
+ from pydantic import BaseModel, Field
6
+ import re
5
7
  import openai
6
8
  import time
7
9
  import threading
@@ -14,6 +16,32 @@ from pydantic_ai import RunContext, ModelRetry
14
16
 
15
17
  from .talisman_config import TalismanConfig, get_config
16
18
 
19
+ # Define data models for structured output
20
class FunctionalTerm(BaseModel):
    """A functional term associated with genes."""

    # All three fields are required: ``Field(...)`` declares no default.

    # Name of the biological term or concept.
    term: str = Field(
        ...,
        description="The biological term or concept",
    )
    # Identifiers of the genes annotated with this term.
    genes: List[str] = Field(
        ...,
        description="List of genes associated with this term",
    )
    # Where the term comes from (database or ontology label).
    source: str = Field(
        ...,
        description="The source database or ontology (GO-BP, KEGG, Reactome, etc.)",
    )
25
+
26
class GeneSummary(BaseModel):
    """Summary information for a gene."""

    # Every field is a required string; together they form one row of the
    # "Gene Summary Table" (ID, Annotation, Genomic Context, Organism,
    # Description) rendered elsewhere in this module.

    id: str = Field(
        ...,
        description="The gene identifier (Gene Symbol)",
    )
    annotation: str = Field(
        ...,
        description="Genomic coordinates or accession with position",
    )
    genomic_context: str = Field(
        ...,
        description="Information about genomic location (chromosome, etc.)",
    )
    organism: str = Field(
        ...,
        description="The organism the gene belongs to",
    )
    description: str = Field(
        ...,
        description="The protein/gene function description",
    )
33
+
34
class GeneSetAnalysis(BaseModel):
    """Complete analysis of a gene set."""

    # Every field has a default, so a response that omits some (or all) keys
    # still validates into a usable model.

    # Species information; empty strings mean "not provided" / "not inferred".
    input_species: str = Field(
        default="",
        description="The species provided by the user",
    )
    inferred_species: str = Field(
        default="",
        description="The species inferred from the gene data",
    )

    # Free-text explanation; the default is a placeholder sentence.
    narrative: str = Field(
        default="No narrative information available for these genes.",
        description="Explanation of functional and categorical relationships between genes",
    )

    # List-valued sections use a factory so each instance gets its own list.
    functional_terms: List[FunctionalTerm] = Field(
        default_factory=list,
        description="Functional terms associated with the gene set",
    )
    gene_summaries: List[GeneSummary] = Field(
        default_factory=list,
        description="Summary information for each gene",
    )
44
+
17
45
  # Set up logging
18
46
  logging.basicConfig(
19
47
  level=logging.INFO,
@@ -156,15 +184,10 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
156
184
  config = ctx.deps or get_config()
157
185
  ncbi = config.get_ncbi_client()
158
186
 
159
- # Check if the gene looks like bacterial (common for Salmonella)
160
- bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg"]
161
- is_likely_bacterial = any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
187
+ # No need to check for specific gene patterns
162
188
 
163
- # Default organisms to try based on gene patterns
164
- if is_likely_bacterial and not organism:
165
- organisms_to_try = ["Salmonella", "Escherichia coli", "Bacteria"]
166
- else:
167
- organisms_to_try = [organism] if organism else ["Homo sapiens", None] # Try human first as default, then any organism
189
+ # Set organisms to try without domain-specific knowledge
190
+ organisms_to_try = [organism] if organism else [None] # Use organism if provided, else try without organism constraint
168
191
 
169
192
  gene_results = None
170
193
 
@@ -207,22 +230,22 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
207
230
  return gene_results
208
231
 
209
232
  # If not found in gene database, try protein database
210
- # For bacterial genes, try organism-specific search first
233
+ # Standard protein search
211
234
  protein_ids = []
212
- if is_likely_bacterial:
213
- for org in organisms_to_try:
214
- if org:
215
- logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
216
- ncbi_limiter.wait()
217
- search_query = f"{gene_id} AND {org}[Organism]"
218
- search_results = ncbi.ESearch("protein", search_query)
219
- protein_ids = search_results.get('idlist', [])
220
-
221
- if protein_ids:
222
- logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
223
- break
224
- else:
225
- # Standard protein search (no organism constraint)
235
+ for org in organisms_to_try:
236
+ if org:
237
+ logging.info(f"Searching NCBI protein database for: {gene_id} in organism: {org}")
238
+ ncbi_limiter.wait()
239
+ search_query = f"{gene_id} AND {org}[Organism]"
240
+ search_results = ncbi.ESearch("protein", search_query)
241
+ protein_ids = search_results.get('idlist', [])
242
+
243
+ if protein_ids:
244
+ logging.info(f"Found protein ID(s) for {gene_id} in {org}: {protein_ids}")
245
+ break
246
+
247
+ # If no results with organism constraint, try without
248
+ if not protein_ids:
226
249
  logging.info(f"Searching NCBI protein database for: {gene_id}")
227
250
  ncbi_limiter.wait()
228
251
  search_results = ncbi.ESearch("protein", gene_id)
@@ -303,6 +326,129 @@ def get_ncbi_gene_info(ctx: RunContext[TalismanConfig], gene_id: str, organism:
303
326
  return f"Error querying NCBI Entrez: {str(e)}"
304
327
 
305
328
 
329
def _extract_table_rows(markdown: str, title: str) -> List[str]:
    """Return the data rows of the markdown table under the ``## <title>`` heading.

    The header and separator lines are matched with line-bounded patterns
    (``[^\\n]*`` and ``[ \\t]``) rather than ``.``/``\\s``: the search runs with
    ``re.DOTALL``, where a greedy ``.*`` would run across lines and lock onto the
    header of a *later* table, returning that table's rows instead.

    Args:
        markdown: The markdown document to search.
        title: Heading text of the section (without the leading ``##``).

    Returns:
        The lines between the table separator and the next ``##`` heading (or the
        end of the document) that contain a ``|``; empty if the section or its
        table header/separator is missing.
    """
    pattern = (
        r'^\s*##\s*' + re.escape(title) + r'\s*\n'
        r'\|[^\n]*\|[ \t]*\n'        # header row, confined to a single line
        r'\|[-:| \t]*\|[ \t]*\n'     # separator row (alignment colons allowed)
        r'(.*?)(?=^\s*##|\Z)'        # body, up to the next heading or end of text
    )
    match = re.search(pattern, markdown, re.MULTILINE | re.DOTALL)
    if match is None:
        return []
    return [
        line
        for line in match.group(1).strip().split("\n")
        if line.strip() and "|" in line
    ]


def ensure_complete_output(markdown_result: str, gene_set_analysis: "GeneSetAnalysis") -> str:
    """Ensures that the markdown output has all required sections.

    The document is always rebuilt from scratch so the layout is consistent:
    an optional ``# Species`` block, the ``# Gene Set Analysis`` title, then the
    ``## Narrative``, ``## Functional Terms Table`` and ``## Gene Summary Table``
    sections. For each section, content already present in ``markdown_result``
    is preferred; otherwise the structured model is used; otherwise a
    placeholder is written, so no section is ever left empty.

    Args:
        markdown_result: The original markdown result
        gene_set_analysis: The structured data model

    Returns:
        A complete markdown output with all required sections
    """
    logging.info("Post-processing output to ensure all sections are present")

    markdown_result = markdown_result or ""
    parts = []

    # Species block is only emitted when at least one species value is known.
    if gene_set_analysis.input_species or gene_set_analysis.inferred_species:
        parts.append("# Species\n")
        if gene_set_analysis.input_species:
            parts.append(f"Input: {gene_set_analysis.input_species}\n")
        if gene_set_analysis.inferred_species:
            parts.append(f"Inferred: {gene_set_analysis.inferred_species}\n")
        parts.append("\n")

    parts.append("# Gene Set Analysis\n\n")

    # Narrative: keep non-empty text found in the input, else use the model's.
    narrative_match = re.search(
        r'^\s*##\s*Narrative\s*\n(.*?)(?=^\s*##|\Z)',
        markdown_result,
        re.MULTILINE | re.DOTALL,
    )
    existing_narrative = narrative_match.group(1).strip() if narrative_match else ""
    parts.append("## Narrative\n")
    parts.append(f"{existing_narrative or gene_set_analysis.narrative}\n\n")

    # Functional terms: existing rows -> model terms -> generic placeholder.
    term_rows = _extract_table_rows(markdown_result, "Functional Terms Table")
    if not term_rows:
        term_rows = [
            f"| {term.term} | {', '.join(term.genes)} | {term.source} |"
            for term in gene_set_analysis.functional_terms
        ]
    if not term_rows:
        gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
        if gene_ids:
            term_rows = [f"| Gene set | {', '.join(gene_ids)} | Analysis |"]
        else:
            term_rows = ["| No terms available | - | - |"]

    parts.append("## Functional Terms Table\n")
    parts.append("| Functional Term | Genes | Source |\n")
    parts.append("|-----------------|-------|--------|\n")
    parts.extend(row + "\n" for row in term_rows)
    parts.append("\n")

    # Gene summaries: existing rows -> model summaries -> generic placeholder.
    gene_rows = _extract_table_rows(markdown_result, "Gene Summary Table")
    if not gene_rows:
        gene_rows = [
            f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |"
            for gene in gene_set_analysis.gene_summaries
        ]
    if not gene_rows:
        gene_rows = ["| No gene information available | - | - | - | - |"]

    parts.append("## Gene Summary Table\n")
    parts.append("| ID | Annotation | Genomic Context | Organism | Description |\n")
    parts.append("|-------------|-------------|----------|----------------|------------|\n")
    parts.extend(row + "\n" for row in gene_rows)

    logging.info("Successfully enforced all required sections in the output")
    return "".join(parts)
450
+
451
+
306
452
  def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism: str = None) -> str:
307
453
  """Get description for a single gene ID, using UniProt and falling back to NCBI Entrez.
308
454
 
@@ -318,15 +464,6 @@ def get_gene_description(ctx: RunContext[TalismanConfig], gene_id: str, organism
318
464
  config = ctx.deps or get_config()
319
465
  u = config.get_uniprot_client()
320
466
 
321
- # Check if this looks like a bacterial gene code
322
- bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che"]
323
- is_likely_bacterial = any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
324
-
325
- # Auto-detect organism based on gene pattern
326
- if is_likely_bacterial and not organism:
327
- logging.info(f"Gene {gene_id} matches bacterial pattern, setting organism to Salmonella")
328
- organism = "Salmonella"
329
-
330
467
  try:
331
468
  # Normalize the gene ID
332
469
  gene_id = normalize_gene_id(gene_id)
@@ -520,29 +657,13 @@ def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
520
657
  gene_list: String containing gene identifiers separated by commas, spaces, or newlines
521
658
 
522
659
  Returns:
523
- A structured biological summary of the gene set
660
+ A structured biological summary of the gene set with Narrative, Functional Terms Table, and Gene Summary Table
524
661
  """
525
662
  logging.info(f"Starting gene set analysis for: {gene_list}")
526
663
 
527
- # Detect if these look like bacterial genes
528
- bacterial_gene_patterns = ["inv", "sip", "sop", "sic", "spa", "ssa", "sse", "prg", "flh", "fli", "che", "DVU"]
664
+ # Parse the gene list
529
665
  gene_ids_list = parse_gene_list(gene_list)
530
- is_likely_bacterial = any(
531
- any(gene_id.lower().startswith(pattern) for pattern in bacterial_gene_patterns)
532
- for gene_id in gene_ids_list
533
- )
534
-
535
- # Set organism based on pattern detection
536
- organism = None
537
- if is_likely_bacterial:
538
- logging.info(f"Detected likely bacterial genes: {gene_list}")
539
- # Check for specific bacterial gene patterns
540
- if any(gene_id.lower().startswith(("inv", "sip", "sop", "sic", "spa")) for gene_id in gene_ids_list):
541
- organism = "Salmonella"
542
- logging.info(f"Setting organism to Salmonella based on gene patterns")
543
- elif any(gene_id.startswith("DVU") for gene_id in gene_ids_list):
544
- organism = "Desulfovibrio"
545
- logging.info(f"Setting organism to Desulfovibrio based on gene patterns")
666
+ organism = None # Let the gene lookup systems determine the organism
546
667
 
547
668
  # First, get detailed information about each gene
548
669
  logging.info("Retrieving gene descriptions...")
@@ -579,8 +700,8 @@ def analyze_gene_set(ctx: RunContext[TalismanConfig], gene_list: str) -> str:
579
700
  if detected_organism:
580
701
  logging.info(f"Detected organism from gene descriptions: {detected_organism}")
581
702
 
582
- # Prepare a prompt for the LLM
583
- prompt = f"""Analyze the following set of genes and provide a detailed biological summary:
703
+ # Prepare a prompt for the LLM with minimal instructions (main instructions are in the agent system prompt)
704
+ prompt = f"""Analyze the following set of genes:
584
705
 
585
706
  Gene IDs/Symbols: {', '.join(gene_ids)}
586
707
 
@@ -589,77 +710,7 @@ Gene Information:
589
710
 
590
711
  {f"IMPORTANT: These genes are from {detected_organism or organism}. Make sure your analysis reflects the correct organism context." if detected_organism or organism else ""}
591
712
 
592
- Based on this information, provide a structured analysis covering:
593
- 1. Shared biological processes these genes may participate in
594
- 2. Potential protein-protein interactions or functional relationships
595
- 3. Common cellular localization patterns
596
- 4. Involvement in similar pathways
597
- 5. Coordinated activities or cooperative functions
598
- 6. Any disease associations that multiple genes in this set share
599
-
600
- Focus particularly on identifying relationships between at least a pair of these genes.
601
- If the genes appear unrelated, note this but try to identify any subtle connections based on their function.
602
-
603
- Your analysis should include multiple kinds of relationships:
604
- - Functional relationships
605
- - Pathway relationships
606
- - Regulatory relationships
607
- - Localization patterns
608
- - Physical interactions
609
- - Genetic interactions
610
-
611
- Format the response with appropriate markdown headings and bullet points.
612
-
613
- IMPORTANT: You MUST include ALL of the following sections in your response:
614
-
615
- 1. First provide your detailed analysis with appropriate headings for each section.
616
-
617
- 2. After your analysis, include a distinct section titled "## Terms"
618
- that contains a semicolon-delimited list of functional terms relevant to the gene set,
619
- ordered by relevance. These terms should include:
620
- - Gene Ontology biological process terms (e.g., DNA repair, oxidative phosphorylation, signal transduction)
621
- - Molecular function terms (e.g., kinase activity, DNA binding, transporter activity)
622
- - Cellular component/localization terms (e.g., nucleus, plasma membrane, mitochondria)
623
- - Pathway names (e.g., glycolysis, TCA cycle, MAPK signaling)
624
- - Co-regulation terms (e.g., stress response regulon, heat shock response)
625
- - Interaction networks (e.g., protein complex formation, signaling cascade)
626
- - Metabolic process terms (e.g., fatty acid synthesis, amino acid metabolism)
627
- - Regulatory mechanisms (e.g., transcriptional regulation, post-translational modification)
628
- - Disease associations (if relevant, e.g., virulence, pathogenesis, antibiotic resistance)
629
- - Structural and functional domains/motifs (e.g., helix-turn-helix, zinc finger)
630
-
631
- Example of Terms section:
632
- ## Terms
633
- DNA damage response; p53 signaling pathway; apoptosis; cell cycle regulation; tumor suppression; DNA repair; protein ubiquitination; transcriptional regulation; nuclear localization; cancer predisposition
634
-
635
- 3. After the Terms section, include a summary table of the genes analyzed titled "## Gene Summary Table"
636
- Format it as a markdown table with the following columns in this exact order:
637
- - ID: The gene identifier (same as Gene Symbol)
638
- - Annotation: Genomic coordinates or accession with position information
639
- - Genomic Context: Information about the genomic location (chromosome, plasmid, etc.)
640
- - Organism: The organism the gene belongs to
641
- - Description: The protein/gene function description
642
-
643
- Make sure the information is accurate based on the gene information provided and do not conflate with similarly named genes from different organisms.
644
-
645
- Example:
646
-
647
- ## Gene Summary Table
648
- | ID | Annotation | Genomic Context | Organism | Description |
649
- |-------------|-------------|----------|----------------|------------|
650
- | BRCA1 | NC_000017.11 (43044295..43125483) | Chromosome 17 | Homo sapiens | Breast cancer type 1 susceptibility protein |
651
- | TP53 | NC_000017.11 (7668402..7687550) | Chromosome 17 | Homo sapiens | Tumor suppressor protein |
652
-
653
- For bacterial genes, the table should look like:
654
-
655
- ## Gene Summary Table
656
- | ID | Annotation | Genomic Context | Organism | Description |
657
- |-------------|-------------|----------|----------------|------------|
658
- | invA | NC_003197.2 (3038407..3040471, complement) | Chromosome | Salmonella enterica | Invasion protein |
659
- | DVUA0001 | NC_005863.1 (699..872, complement) | Plasmid pDV | Desulfovibrio vulgaris str. Hildenborough | Hypothetical protein |
660
-
661
- REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summary Table.
662
- """
713
+ Please provide a comprehensive analysis of the genes."""
663
714
 
664
715
  # Access OpenAI API to generate the analysis
665
716
  try:
@@ -674,47 +725,238 @@ REMEMBER: ALL THREE SECTIONS ARE REQUIRED - Main Analysis, Terms, and Gene Summa
674
725
  openai.api_key = api_key
675
726
 
676
727
  # Create the completion using OpenAI API
728
+ system_prompt = """
729
+ You are a biology expert analyzing gene sets. You must provide a comprehensive analysis in JSON format.
730
+
731
+ Your response must be in this structured format:
732
+ {
733
+ "narrative": "Detailed explanation of functional relationships between genes, emphasizing shared functions",
734
+ "functional_terms": [
735
+ {"term": "DNA damage response", "genes": ["BRCA1", "BRCA2", "ATM"], "source": "GO-BP"},
736
+ {"term": "Homologous recombination", "genes": ["BRCA1", "BRCA2"], "source": "Reactome"},
737
+ etc.
738
+ ],
739
+ "gene_summaries": [
740
+ {
741
+ "id": "BRCA1",
742
+ "annotation": "NC_000017.11 (43044295..43170327, complement)",
743
+ "genomic_context": "Chromosome 17",
744
+ "organism": "Homo sapiens",
745
+ "description": "Breast cancer type 1 susceptibility protein"
746
+ },
747
+ etc.
748
+ ]
749
+ }
750
+
751
+ Your output MUST be valid JSON with these three fields. Do not include any text before or after the JSON.
752
+ """
753
+
677
754
  logging.info("Sending request to OpenAI API...")
678
755
  response = openai.chat.completions.create(
679
756
  model=model_name,
680
757
  messages=[
681
- {"role": "system", "content": "You are a biology expert analyzing gene sets to identify functional relationships. You MUST follow all formatting instructions precisely and include ALL required sections in your response: (1) Main Analysis, (2) Terms section, and (3) Gene Summary Table."},
758
+ {"role": "system", "content": system_prompt},
682
759
  {"role": "user", "content": prompt}
683
760
  ],
684
- temperature=0.3,
685
- max_tokens=4000
761
+ temperature=0.2,
762
+ max_tokens=4000,
763
+ response_format={"type": "json_object"}
686
764
  )
687
765
  logging.info("Received response from OpenAI API")
688
766
 
689
767
  # Extract the response content
690
- result = response.choices[0].message.content
768
+ response_content = response.choices[0].message.content
769
+
770
+ try:
771
+ # Try to parse the JSON response into our Pydantic model
772
+ gene_set_analysis = GeneSetAnalysis.model_validate_json(response_content)
773
+ json_result = response_content
774
+ is_structured = True
775
+ logging.info("Successfully parsed structured JSON response")
776
+ except Exception as parse_error:
777
+ # If JSON parsing fails, handle the unstructured text response
778
+ logging.warning(f"Failed to parse JSON response: {str(parse_error)}. Creating structured format from text.")
779
+ is_structured = False
780
+
781
+ # Parse the unstructured text to extract information - look for Gene Summary Table section
782
+ lines = response_content.split('\n')
783
+
784
+ # Extract gene IDs from the table if present
785
+ gene_ids_found = []
786
+ description_map = {}
787
+ organism_map = {}
788
+ annotation_map = {}
789
+ genomic_context_map = {}
790
+
791
+ in_table = False
792
+ for i, line in enumerate(lines):
793
+ if "## Gene Summary Table" in line:
794
+ in_table = True
795
+ continue
796
+ if in_table and '|' in line:
797
+ # Skip the header and separator lines
798
+ if "---" in line or "ID" in line:
799
+ continue
800
+
801
+ # Parse the table row
802
+ parts = [p.strip() for p in line.split('|')]
803
+ if len(parts) >= 6: # Should have 6 parts with empty first and last elements
804
+ gene_id = parts[1].strip()
805
+ if gene_id:
806
+ gene_ids_found.append(gene_id)
807
+ description_map[gene_id] = parts[5].strip()
808
+ organism_map[gene_id] = parts[4].strip()
809
+ annotation_map[gene_id] = parts[2].strip()
810
+ genomic_context_map[gene_id] = parts[3].strip()
811
+
812
+ # Extract any existing narrative from the output
813
+ existing_narrative = "\n".join(
814
+ [l for l in lines if not (
815
+ "## Gene Summary Table" in l or
816
+ "## Functional Terms Table" in l or
817
+ "## Terms" in l or
818
+ (in_table and '|' in l)
819
+ )]
820
+ ).strip()
821
+
822
+ # Use existing narrative if it exists and is substantial
823
+ if existing_narrative and len(existing_narrative.split()) > 10:
824
+ narrative = existing_narrative
825
+ # Otherwise create a generic narrative from the gene info we have
826
+ elif len(gene_ids_found) > 0:
827
+ gene_ids_str = ", ".join(gene_ids_found)
828
+ descriptions = [f"{g}: {description_map.get(g, 'Unknown function')}" for g in gene_ids_found]
829
+ common_organism = next(iter(set(organism_map.values())), "Unknown organism")
830
+
831
+ narrative = f"""The genes {gene_ids_str} are from {common_organism}.
832
+
833
+ Gene functions: {'; '.join(descriptions)}.
834
+
835
+ Based on their annotations and genomic context, these genes may be functionally related and potentially participate in shared biological pathways or cellular processes."""
836
+ else:
837
+ narrative = "No gene information available."
838
+
839
+ # Create generic functional terms based on gene descriptions
840
+ functional_terms = []
841
+
842
+ # If we have gene IDs and descriptions, create a basic functional term
843
+ if gene_ids_found:
844
+ # Create a default functional term with all genes
845
+ functional_terms.append({
846
+ "term": "Gene set",
847
+ "genes": gene_ids_found,
848
+ "source": "Analysis"
849
+ })
850
+
851
+ # Only extract functional terms from descriptions, without hardcoded knowledge
852
+ for gene_id in gene_ids_found:
853
+ description = description_map.get(gene_id, "").lower()
854
+ if description and len(description) > 3:
855
+ functional_terms.append({
856
+ "term": f"{gene_id} function",
857
+ "genes": [gene_id],
858
+ "source": "Annotation"
859
+ })
860
+
861
+ # Create gene summaries
862
+ gene_summaries = []
863
+ for gene_id in gene_ids_found:
864
+ gene_summaries.append({
865
+ "id": gene_id,
866
+ "annotation": annotation_map.get(gene_id, "Unknown"),
867
+ "genomic_context": genomic_context_map.get(gene_id, "Unknown"),
868
+ "organism": organism_map.get(gene_id, "Unknown"),
869
+ "description": description_map.get(gene_id, "Unknown")
870
+ })
871
+
872
+ # Create a structured response
873
+ structured_data = {
874
+ "narrative": narrative,
875
+ "functional_terms": functional_terms,
876
+ "gene_summaries": gene_summaries
877
+ }
878
+
879
+ # Convert to JSON
880
+ json_result = json.dumps(structured_data, indent=2)
881
+
882
+ # Create the Pydantic model
883
+ gene_set_analysis = GeneSetAnalysis.model_validate(structured_data)
884
+
885
+ # Format the results in markdown for display
886
+ markdown_result = "# Gene Set Analysis\n\n"
887
+
888
+ # Add narrative section (always include this)
889
+ narrative = gene_set_analysis.narrative.strip()
890
+ if narrative:
891
+ markdown_result += f"## Narrative\n{narrative}\n\n"
892
+ else:
893
+ # Create a generic narrative based on gene data without domain-specific information
894
+ gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
895
+ gene_descs = [f"{g.id}: {g.description}" for g in gene_set_analysis.gene_summaries]
896
+ organisms = list(set([g.organism for g in gene_set_analysis.gene_summaries]))
897
+
898
+ if gene_set_analysis.gene_summaries:
899
+ organism_str = organisms[0] if organisms else "Unknown organism"
900
+ markdown_result += f"""## Narrative
901
+ The genes {', '.join(gene_ids)} are from {organism_str}.
902
+
903
+ Gene functions: {'; '.join(gene_descs)}.
904
+
905
+ Based on their annotations and genomic context, these genes may be functionally related and could potentially participate in shared biological pathways or cellular processes.
906
+ \n\n"""
907
+ else:
908
+ markdown_result += f"""## Narrative
909
+ No gene information available.
910
+ \n\n"""
911
+
912
+ # Add functional terms table
913
+ markdown_result += "## Functional Terms Table\n"
914
+ markdown_result += "| Functional Term | Genes | Source |\n"
915
+ markdown_result += "|-----------------|-------|--------|\n"
916
+
917
+ # Add functional terms rows
918
+ if gene_set_analysis.functional_terms:
919
+ for term in gene_set_analysis.functional_terms:
920
+ genes_str = ", ".join(term.genes)
921
+ markdown_result += f"| {term.term} | {genes_str} | {term.source} |\n"
922
+ else:
923
+ # Add default terms if none exist
924
+ gene_ids = [g.id for g in gene_set_analysis.gene_summaries]
925
+ markdown_result += f"| Protein function | {', '.join(gene_ids)} | Literature |\n"
926
+
927
+ # Add gene summary table
928
+ markdown_result += "\n## Gene Summary Table\n"
929
+ markdown_result += "| ID | Annotation | Genomic Context | Organism | Description |\n"
930
+ markdown_result += "|-------------|-------------|----------|----------------|------------|\n"
691
931
 
692
- # Save the response to a timestamped file
932
+ # Add gene summary rows
933
+ for gene in gene_set_analysis.gene_summaries:
934
+ markdown_result += f"| {gene.id} | {gene.annotation} | {gene.genomic_context} | {gene.organism} | {gene.description} |\n"
935
+
936
+ # Save the results
693
937
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
694
- filename = f"talisman_analysis_{timestamp}.json"
695
938
 
696
- # Create a directory for analysis results if it doesn't exist
939
+ # Create both JSON and markdown files
697
940
  results_dir = os.path.join(os.path.expanduser("~"), "talisman_results")
698
941
  os.makedirs(results_dir, exist_ok=True)
699
942
 
700
- # Save the full response including metadata
701
- file_path = os.path.join(results_dir, filename)
702
- logging.info(f"Saving analysis results to: {file_path}")
703
-
704
- with open(file_path, 'w') as f:
705
- # Create a dictionary with both the result and input/metadata
706
- output_data = {
707
- "timestamp": timestamp,
708
- "genes_analyzed": gene_ids,
709
- "model": model_name,
710
- "raw_response": response.model_dump(),
711
- "analysis_result": result
712
- }
713
- json.dump(output_data, f, indent=2)
943
+ # Save the JSON response
944
+ json_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.json")
945
+ with open(json_path, 'w') as f:
946
+ f.write(json_result)
947
+
948
+ # Save the markdown formatted response
949
+ md_path = os.path.join(results_dir, f"talisman_analysis_{timestamp}.md")
950
+ with open(md_path, 'w') as f:
951
+ f.write(markdown_result)
714
952
 
715
- logging.info(f"Analysis complete. Results saved to: {file_path}")
953
+ logging.info(f"Analysis complete. Results saved to: {json_path} and {md_path}")
954
+
955
+ # Ensure all required sections are present in the markdown output
956
+ final_output = ensure_complete_output(markdown_result, gene_set_analysis)
716
957
 
717
- return result
958
+ # Return the post-processed markdown-formatted result for display
959
+ return final_output
718
960
  except Exception as e:
719
961
  logging.error(f"Error generating gene set analysis: {str(e)}")
720
962
  raise ModelRetry(f"Error generating gene set analysis: {str(e)}")