genomix-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. genomix/__init__.py +3 -0
  2. genomix/agent/__init__.py +0 -0
  3. genomix/agent/context_compressor.py +31 -0
  4. genomix/agent/loop.py +93 -0
  5. genomix/agent/prompt_builder.py +68 -0
  6. genomix/agent/session_store.py +41 -0
  7. genomix/builtin_skills/common/file-formats/SKILL.md +94 -0
  8. genomix/builtin_skills/common/genome-references/SKILL.md +86 -0
  9. genomix/builtin_skills/comparative/blast-analysis/SKILL.md +72 -0
  10. genomix/builtin_skills/comparative/multiple-alignment/SKILL.md +67 -0
  11. genomix/builtin_skills/comparative/phylogenetics/SKILL.md +79 -0
  12. genomix/builtin_skills/exploration/database-search/SKILL.md +73 -0
  13. genomix/builtin_skills/exploration/sequence-summary/SKILL.md +93 -0
  14. genomix/builtin_skills/exploration/variant-explain/SKILL.md +66 -0
  15. genomix/builtin_skills/sequencing/alignment/SKILL.md +83 -0
  16. genomix/builtin_skills/sequencing/annotation/SKILL.md +77 -0
  17. genomix/builtin_skills/sequencing/quality-control/SKILL.md +60 -0
  18. genomix/builtin_skills/sequencing/variant-calling/SKILL.md +93 -0
  19. genomix/cli.py +193 -0
  20. genomix/config.py +80 -0
  21. genomix/errors.py +37 -0
  22. genomix/output.py +20 -0
  23. genomix/project/__init__.py +0 -0
  24. genomix/project/manager.py +73 -0
  25. genomix/project/setup_wizard.py +44 -0
  26. genomix/providers/__init__.py +20 -0
  27. genomix/providers/base.py +27 -0
  28. genomix/providers/claude.py +124 -0
  29. genomix/providers/openai_provider.py +33 -0
  30. genomix/providers/opencode.py +71 -0
  31. genomix/runtime.py +75 -0
  32. genomix/skills/__init__.py +1 -0
  33. genomix/skills/loader.py +58 -0
  34. genomix/skills/registry.py +48 -0
  35. genomix/swarm/__init__.py +0 -0
  36. genomix/swarm/manager.py +70 -0
  37. genomix/tools/__init__.py +0 -0
  38. genomix/tools/file_tools.py +37 -0
  39. genomix/tools/mcp_bridge.py +85 -0
  40. genomix/tools/mcp_manager.py +281 -0
  41. genomix/tools/registry.py +25 -0
  42. genomix/tui.py +512 -0
  43. genomix_cli-0.1.0.dist-info/METADATA +217 -0
  44. genomix_cli-0.1.0.dist-info/RECORD +61 -0
  45. genomix_cli-0.1.0.dist-info/WHEEL +4 -0
  46. genomix_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. genomix_cli-0.1.0.dist-info/licenses/LICENSE +194 -0
  48. mcp_servers/__init__.py +0 -0
  49. mcp_servers/base_biotool.py +27 -0
  50. mcp_servers/base_database.py +111 -0
  51. mcp_servers/biotools/__init__.py +0 -0
  52. mcp_servers/biotools/blast_server.py +69 -0
  53. mcp_servers/biotools/bwa_server.py +37 -0
  54. mcp_servers/biotools/fastqc_server.py +29 -0
  55. mcp_servers/biotools/gatk_server.py +47 -0
  56. mcp_servers/biotools/samtools_server.py +76 -0
  57. mcp_servers/databases/__init__.py +0 -0
  58. mcp_servers/databases/clinvar_server.py +53 -0
  59. mcp_servers/databases/dbsnp_server.py +53 -0
  60. mcp_servers/databases/ensembl_server.py +100 -0
  61. mcp_servers/databases/ncbi_server.py +68 -0
genomix/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Genomix CLI — AI-powered DNA sequence and genome analysis."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,31 @@
1
+ """Context compression: summarize old tool results when context grows."""
2
+ from __future__ import annotations
3
+ from typing import Any
4
+
5
# Rough heuristic: average number of characters that make up one token.
CHARS_PER_TOKEN = 4


def estimate_tokens(messages):
    """Return a rough token estimate for a list of chat messages.

    The estimate is the total length of every message's ``content`` divided
    by ``CHARS_PER_TOKEN``. Messages whose ``content`` is missing or ``None``
    contribute nothing.

    Args:
        messages: Iterable of message dicts; only the ``"content"`` key is read.

    Returns:
        The estimated token count as an ``int`` (rounded down).
    """
    total_chars = sum(len(m.get("content", "") or "") for m in messages)
    # Convert characters to tokens so the result is comparable to a token
    # budget rather than a character budget.
    return total_chars // CHARS_PER_TOKEN
10
+
11
+
12
def should_compress(messages, max_tokens):
    """Tell whether the conversation has outgrown 80% of ``max_tokens``.

    Args:
        messages: List of message dicts, passed to ``estimate_tokens``.
        max_tokens: Size of the budget the estimate is compared against.

    Returns:
        ``True`` when ``estimate_tokens(messages)`` is strictly greater than
        ``max_tokens * 0.8``, otherwise ``False``.
    """
    threshold = max_tokens * 0.8
    return estimate_tokens(messages) > threshold
14
+
15
+
16
def compress_messages(messages, max_tokens):
    """Shrink old tool output once the conversation nears the budget.

    When ``should_compress`` reports that no compression is needed, the input
    list is returned as-is. Otherwise a new list is built:

    * a leading ``"system"`` message, if present, is kept untouched;
    * the six most recent remaining messages are kept untouched;
    * among the older messages, any ``"tool"`` message whose content is longer
      than 500 characters is replaced by a copy holding only the first 200
      characters followed by a truncation marker.

    Args:
        messages: List of message dicts (``role`` / ``content`` keys).
        max_tokens: Budget forwarded to ``should_compress``.

    Returns:
        The original list, or a new list with long, old tool results shortened.
    """
    if not should_compress(messages, max_tokens):
        return messages

    # Peel off a leading system message so it is never counted as "old".
    head = []
    body = messages
    if body and body[0]["role"] == "system":
        head = [body[0]]
        body = body[1:]

    # Everything before ``split_at`` is old; the last (up to) six are recent.
    split_at = max(len(body) - 6, 0)
    older, newest = body[:split_at], body[split_at:]

    def _shrink(msg):
        # Only bulky tool results are shortened; the original dict is not mutated.
        if msg.get("role") == "tool" and len(msg.get("content", "") or "") > 500:
            return {**msg, "content": (msg["content"] or "")[:200] + "\n... [truncated]"}
        return msg

    return head + [_shrink(m) for m in older] + newest
genomix/agent/loop.py ADDED
@@ -0,0 +1,93 @@
1
+ """Agent conversation loop with tool calling support."""
2
+ from __future__ import annotations
3
+ import json
4
+ from typing import Any, Callable
5
+
6
+ from genomix.providers.base import BaseProvider
7
+ from genomix.tools.registry import ToolRegistry
8
+ from genomix.agent.context_compressor import should_compress, compress_messages
9
+
10
+
11
class AgentLoop:
    """Run a tool-calling conversation against a chat provider.

    The loop keeps the running message history in ``self.messages``. Each call
    to :meth:`chat` sends the history to ``provider.chat`` and, while the
    response carries tool calls, dispatches them through ``tool_registry`` and
    feeds the results back, for at most ``max_iterations`` rounds.
    """

    # Tool output longer than this many characters is cut before it is stored.
    MAX_TOOL_RESULT_CHARS = 2000
    # Number of leading characters of a tool result handed to ``on_tool_result``.
    TOOL_RESULT_PREVIEW_CHARS = 200

    def __init__(
        self,
        provider: BaseProvider,
        tool_registry: ToolRegistry,
        system_prompt: str = "",
        max_iterations: int = 30,
        on_tool_call: Callable[[str, dict], None] | None = None,
        on_tool_result: Callable[[str, str], None] | None = None,
        on_thinking: Callable[[str], None] | None = None,
    ):
        """Store collaborators and optional UI callbacks.

        Args:
            provider: Object exposing ``chat(messages, tools=...)`` and
                ``max_context_length()``.
            tool_registry: Object exposing ``list_tools()`` and
                ``dispatch(name, arguments)``.
            system_prompt: Text prepended as a ``system`` message when non-empty.
            max_iterations: Maximum number of provider round-trips per
                :meth:`chat` call before a final answer is forced.
            on_tool_call: Called with ``(name, arguments)`` before each tool runs.
            on_tool_result: Called with ``(name, preview)`` after each tool runs.
            on_thinking: Called once with a status string on the first iteration.
        """
        self.provider = provider
        self.tool_registry = tool_registry
        self.system_prompt = system_prompt
        self.max_iterations = max_iterations
        self.messages: list[dict[str, Any]] = []
        # UI callbacks
        self.on_tool_call = on_tool_call
        self.on_tool_result = on_tool_result
        self.on_thinking = on_thinking

    def _build_messages(self) -> list[dict[str, Any]]:
        """Return the message list to send: optional system prompt + history.

        The history is compressed first when it is large relative to the
        provider's reported context length; ``self.messages`` itself is left
        unmodified because a copy is compressed.
        """
        msgs = list(self.messages)

        # Compress if context is getting large
        max_tokens = self.provider.max_context_length()
        if should_compress(msgs, max_tokens):
            msgs = compress_messages(msgs, max_tokens)

        if self.system_prompt:
            return [{"role": "system", "content": self.system_prompt}] + msgs
        return msgs

    def _force_final_synthesis(self) -> str:
        """Ask the model for a final answer with tool use disabled.

        Returns:
            The response text, or a fixed fallback sentence when the provider
            returns no content.
        """
        self.messages.append(
            {
                "role": "user",
                "content": (
                    "You have enough information. Do not call any more tools. "
                    "Provide the best final answer now, clearly noting uncertainty where needed."
                ),
            }
        )
        response = self.provider.chat(self._build_messages(), tools=None)
        # Store "" rather than None so the history has the same shape as the
        # assistant messages recorded in the tool-call branch of chat().
        self.messages.append({"role": "assistant", "content": response.content or ""})
        return response.content or "Max iterations reached without a final response."

    @staticmethod
    def _stringify_result(result: Any) -> str:
        """Coerce a tool result to text so it can be sliced and stored."""
        if isinstance(result, str):
            return result
        try:
            return json.dumps(result)
        except (TypeError, ValueError):
            return str(result)

    def _run_tool(self, tc: Any) -> str:
        """Dispatch one tool call and return its (possibly truncated) text result."""
        try:
            result = self.tool_registry.dispatch(tc.name, tc.arguments)
        except Exception as e:
            # Deliberate catch-all: the failure is reported back to the model
            # as a JSON error payload instead of aborting the conversation.
            result = json.dumps({"error": str(e)})
        result = self._stringify_result(result)
        # Truncate large tool results to prevent context explosion
        if len(result) > self.MAX_TOOL_RESULT_CHARS:
            result = result[: self.MAX_TOOL_RESULT_CHARS] + "\n... [truncated]"
        return result

    def chat(self, user_message: str) -> str:
        """Send ``user_message`` and return the assistant's final text.

        Tool calls requested by the provider are executed and their results
        appended to the history until the provider answers without tool calls.
        If ``max_iterations`` rounds pass without such an answer, a final
        response is requested with tools disabled.

        Args:
            user_message: The user's input text.

        Returns:
            The assistant's answer; an empty string when the provider returned
            no content for a tool-free response.
        """
        self.messages.append({"role": "user", "content": user_message})
        tools = self.tool_registry.list_tools() or None

        for iteration in range(self.max_iterations):
            all_messages = self._build_messages()

            if self.on_thinking and iteration == 0:
                self.on_thinking("Thinking...")

            response = self.provider.chat(all_messages, tools=tools)

            if not response.tool_calls:
                final = response.content or ""
                self.messages.append({"role": "assistant", "content": final})
                return final

            # Record the assistant turn together with the tool calls it made;
            # dict arguments are serialised to a JSON string.
            self.messages.append(
                {
                    "role": "assistant",
                    "content": response.content or "",
                    "tool_calls": [
                        {
                            "id": tc.id,
                            "type": "function",
                            "function": {
                                "name": tc.name,
                                "arguments": (
                                    json.dumps(tc.arguments)
                                    if isinstance(tc.arguments, dict)
                                    else tc.arguments
                                ),
                            },
                        }
                        for tc in response.tool_calls
                    ],
                }
            )
            for tc in response.tool_calls:
                if self.on_tool_call:
                    self.on_tool_call(tc.name, tc.arguments)
                result = self._run_tool(tc)
                if self.on_tool_result:
                    self.on_tool_result(tc.name, result[: self.TOOL_RESULT_PREVIEW_CHARS])
                self.messages.append({"role": "tool", "tool_call_id": tc.id, "content": result})

        return self._force_final_synthesis()
@@ -0,0 +1,68 @@
1
+ """Assemble the system prompt from project context, skills, and mode."""
2
+ from __future__ import annotations
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from genomix.project.manager import GenomixProject
7
+
8
+ IDENTITY = """You are Genomix, an AI assistant specialized in DNA sequence and genome analysis.
9
+ You help biologists, bioinformaticians, and researchers analyze genomic data by orchestrating
10
+ bioinformatics tools and querying genomic databases.
11
+
12
+ You are proactive: suggest next steps, explain results in accessible language, and adapt
13
+ your communication to the user's expertise level. When a user speaks in natural language,
14
+ explain in plain terms. When they use slash commands, be concise and technical.
15
+
16
+ You have access to tools for: file manipulation, sequence alignment, variant calling,
17
+ annotation, BLAST searches, database queries (NCBI, Ensembl, ClinVar, dbSNP), and more.
18
+
19
+ IMPORTANT — Tool calling strategy:
20
+ - Be STRATEGIC with tool calls. Do NOT query every database for every item.
21
+ - Maximum 5-6 tool calls per question. After that, synthesize from what you have.
22
+ - Use your own knowledge to supplement — you don't need to verify everything via API.
23
+
24
+ When analyzing VCF files:
25
+ 1. Read the file. Check if INFO has annotations (GENE, EFFECT, CLNSIG).
26
+ 2. IF annotated: use them directly, no database queries needed.
27
+ 3. IF raw (no annotations, ID is "."):
28
+ → Use YOUR KNOWLEDGE FIRST to identify genes from coordinates:
29
+ chr17:43,044,000-43,170,000 = BRCA1
30
+ chr13:32,315,000-32,400,000 = BRCA2
31
+ chr7:117,480,000-117,668,000 = CFTR
32
+ chr11:5,225,000-5,228,000 = HBB
33
+ chr19:44,905,000-44,910,000 = APOE
34
+ → Only query databases (1-2 calls MAX) for coordinates you truly don't recognize.
35
+ → Interpret GT (0/1=het, 1/1=hom), DP (read depth), GQ (quality).
36
+ → Use your knowledge of well-known pathogenic variants at these positions.
37
+ → RESPOND after reading the file + at most 2 database calls. Do NOT look up every variant.
38
+
39
+ Advanced analysis capabilities — DO NOT say you can't do these:
40
+ - ANCESTRY INFERENCE: Many variants have population-specific frequencies. Use your
41
+ knowledge of ancestry-informative markers (e.g. rs334/HBB sickle cell = high frequency
42
+ in African/Mediterranean populations, CFTR deltaF508 = Northern European, APOE allele
43
+ frequencies vary by population). Use ensembl_population_frequencies to get gnomAD/1000G
44
+ frequency data across AFR/EUR/EAS/SAS/AMR populations. Combine multiple variants to
45
+ suggest likely ancestry.
46
+ - PHENOTYPE INFERENCE: From pathogenic variants, infer likely phenotypic consequences
47
+ (disease risk, carrier status for recessive conditions, drug response).
48
+ - PHARMACOGENOMICS: Some variants affect drug metabolism. Flag them if present.
49
+ - CARRIER STATUS: For autosomal recessive diseases (sickle cell, CF), heterozygous
50
+ carriers (0/1) are typically unaffected but can pass the variant to children."""
51
+
52
+ PRIVACY_ADDENDUM = """
53
+ PRIVACY MODE IS ACTIVE. You must follow these rules strictly:
54
+ - Never include raw sequence data (nucleotide strings) in your responses or reasoning
55
+ - Never include patient identifiers or sample metadata
56
+ - Only reference aggregated statistics, variant IDs (rsIDs), and gene symbols
57
+ - All tools run locally — only summaries are passed to you"""
58
+
59
+
60
def build_system_prompt(project, skill_body, privacy_mode):
    """Assemble the system prompt text.

    The prompt always starts with ``IDENTITY``. Optional sections follow in a
    fixed order, each included only when its input is truthy: the active
    project summary, the current task instructions, and the privacy addendum.

    Args:
        project: Object with ``name``, ``organism``, ``reference_genome``,
            ``data_type`` and ``root`` attributes, or a falsy value for none.
        skill_body: Instruction text for the current task, or a falsy value.
        privacy_mode: When truthy, ``PRIVACY_ADDENDUM`` is appended.

    Returns:
        The sections joined with newline characters.
    """
    project_section = (
        f"\n## Active Project\n- **Name:** {project.name}\n- **Organism:** {project.organism}\n- **Reference genome:** {project.reference_genome}\n- **Data type:** {project.data_type}\n- **Project root:** {project.root}"
        if project
        else None
    )
    skill_section = f"\n## Current Task Instructions\n\n{skill_body}" if skill_body else None
    privacy_section = PRIVACY_ADDENDUM if privacy_mode else None

    # Absent sections are None and are dropped; order is fixed.
    candidates = [IDENTITY, project_section, skill_section, privacy_section]
    return "\n".join(section for section in candidates if section is not None)
@@ -0,0 +1,41 @@
1
+ """Session history storage with SQLite + FTS5."""
2
+ import json, sqlite3, uuid
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
class SessionStore:
    """Persist chat sessions in a SQLite file with an FTS5 search index.

    Two tables are used: ``sessions`` holds the JSON-encoded messages, and the
    virtual table ``sessions_fts`` holds the text used for full-text search.
    """

    def __init__(self, db_path):
        """Create the parent directory and the tables if they do not exist.

        Args:
            db_path: Location of the SQLite database file (``str`` or ``Path``).
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _run(self, work):
        """Open a connection, run ``work(conn)`` in a transaction, then close.

        ``with conn`` only commits or rolls back; it does not close the
        connection, so the connection is closed explicitly here to avoid
        leaking one handle per call.

        Args:
            work: Callable taking the open connection.

        Returns:
            Whatever ``work`` returns.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:
                return work(conn)
        finally:
            conn.close()

    def _init_db(self):
        """Create the ``sessions`` table and the ``sessions_fts`` index."""

        def _create(conn):
            conn.execute("CREATE TABLE IF NOT EXISTS sessions (id TEXT PRIMARY KEY, title TEXT, messages TEXT, created_at TEXT)")
            conn.execute("CREATE VIRTUAL TABLE IF NOT EXISTS sessions_fts USING fts5(id, title, content)")

        self._run(_create)

    def save_session(self, messages, title=""):
        """Store a session and index its text.

        Args:
            messages: List of JSON-serialisable message dicts; each message's
                ``content`` (when present and not ``None``) is indexed.
            title: Optional title stored with the session.

        Returns:
            The generated 12-character hexadecimal session id.
        """
        sid = uuid.uuid4().hex[:12]
        now = datetime.now(timezone.utc).isoformat()
        content = " ".join(m.get("content", "") or "" for m in messages)

        def _insert(conn):
            conn.execute("INSERT INTO sessions VALUES (?,?,?,?)", (sid, title, json.dumps(messages), now))
            conn.execute("INSERT INTO sessions_fts VALUES (?,?,?)", (sid, title, content))

        self._run(_insert)
        return sid

    def load_session(self, session_id):
        """Return the stored messages for ``session_id``, or ``[]`` if unknown."""
        row = self._run(
            lambda conn: conn.execute("SELECT messages FROM sessions WHERE id=?", (session_id,)).fetchone()
        )
        return json.loads(row[0]) if row else []

    def search(self, query):
        """Full-text search over indexed sessions.

        Args:
            query: Expression passed to the FTS5 ``MATCH`` operator.

        Returns:
            A list of ``{"id", "title"}`` dicts for matching sessions.
        """
        rows = self._run(
            lambda conn: conn.execute("SELECT id, title FROM sessions_fts WHERE sessions_fts MATCH ?", (query,)).fetchall()
        )
        return [{"id": r[0], "title": r[1]} for r in rows]

    def list_sessions(self, limit=20):
        """Return up to ``limit`` sessions, newest first.

        Returns:
            A list of ``{"id", "title", "created_at"}`` dicts.
        """
        rows = self._run(
            lambda conn: conn.execute("SELECT id, title, created_at FROM sessions ORDER BY created_at DESC LIMIT ?", (limit,)).fetchall()
        )
        return [{"id": r[0], "title": r[1], "created_at": r[2]} for r in rows]
@@ -0,0 +1,94 @@
1
+ ---
2
+ name: file-formats
3
+ description: Recognize and validate FASTA, FASTQ, BAM, VCF, and GFF files by inspecting their content
4
+ version: 1.0.0
5
+ author: genomix-cli
6
+ license: Apache-2.0
7
+ metadata:
8
+ genomix:
9
+ tags: [common, file-formats, fasta, fastq, bam, vcf, gff, detection]
10
+ tools_used: [read_file, run_command]
11
+ ---
12
+
13
+ # Genomic File Format Recognition
14
+
15
+ ## Detection by Content
16
+
17
+ Never rely solely on file extensions — always verify content.
18
+
19
+ ### FASTA
20
+ - First non-empty line starts with `>`.
21
+ - Second line is a sequence (A, T, G, C, N, IUPAC ambiguity codes, or amino acid letters).
22
+ - Multi-line sequences are allowed; next record starts with `>`.
23
+
24
+ ```
25
+ >sequence_id optional description
26
+ ATCGATCGATCG
27
+ ATCGATCGATCG
28
+ ```
29
+
30
+ ### FASTQ
31
+ - Records are 4 lines: `@header`, sequence, `+` (optionally repeated header), quality string.
32
+ - Quality string length must equal sequence length.
33
+ - Quality characters are ASCII 33–126 (Phred+33 encoding for modern Illumina).
34
+
35
+ ```
36
+ @read_id
37
+ ATCGATCGATCG
38
+ +
39
+ FFFFIIIIBBBB
40
+ ```
41
+
42
+ Distinguish FASTQ from FASTA: FASTQ starts with `@`, FASTA with `>`. Note: `@` also appears in quality lines, so always check that records are 4 lines.
43
+
44
+ ### BAM / SAM
45
+ - **SAM**: plain text; starts with `@HD` (header line), `@SQ` (reference sequences), then alignment records (11 mandatory tab-separated fields).
46
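+ - **BAM**: BGZF-compressed binary; the raw file starts with the gzip magic bytes (hex `1F 8B`), and the decompressed stream starts with `BAM\1` (hex `42 41 4D 01`). Use `samtools view -H` to inspect.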
47
+
48
+ ```bash
49
+ samtools view -H sample.bam | head -5
50
+ ```
51
+
52
+ ### VCF
53
+ - Starts with `##fileformat=VCFv4.x`.
54
+ - Meta-information lines begin with `##`.
55
+ - Header line begins with `#CHROM`.
56
+ - Data lines: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO (+ optional FORMAT and sample columns).
57
+
58
+ ```
59
+ ##fileformat=VCFv4.2
60
+ #CHROM POS ID REF ALT QUAL FILTER INFO
61
+ chr17 41245466 rs28897696 G A 100 PASS .
62
+ ```
63
+
64
+ ### GFF / GTF
65
+ - GFF3: starts with `##gff-version 3`. 9 tab-separated fields; attributes in `key=value` format.
66
+ - GTF (GFF2): attributes in `key "value"` format (used by GENCODE, Ensembl downloads).
67
+
68
+ ```
69
+ ##gff-version 3
70
+ chr1 RefSeq gene 11874 14409 . + . ID=gene1;Name=DDX11L1
71
+ ```
72
+
73
+ ## Format Validation Commands
74
+
75
+ ```bash
76
+ # Check FASTA integrity
77
+ seqkit stats sequences.fasta
78
+
79
+ # Inspect FASTQ quality (quality encoding, per-base quality); FastQC does not check read pairing
80
+ fastqc --nogroup sample.fastq.gz
81
+
82
+ # Validate BAM (check for truncation, index)
83
+ samtools quickcheck sample.bam && samtools index sample.bam
84
+
85
+ # Check VCF (parses the file and prints summary statistics; this is not a full spec validation)
86
+ bcftools stats sample.vcf.gz | head -30
87
+ ```
88
+
89
+ ## Common Pitfalls
90
+
91
+ - **FASTA with wrapped lines**: parsers must handle variable line widths.
92
+ - **FASTQ quality encoding**: older data may use Phred+64 (Illumina 1.3–1.7); seqkit or FastQC can auto-detect.
93
+ - **Multi-sample VCF**: sample columns appear after FORMAT — always check the header for sample names.
94
+ - **BGZF vs gzip**: BAM and indexed VCF use BGZF (block gzip), which is compatible with regular gzip but supports random access via `.tbi` / `.csi` indexes.
@@ -0,0 +1,86 @@
1
+ ---
2
+ name: genome-references
3
+ description: Choose between GRCh38 and GRCh37, perform coordinate liftover, and select organism-specific references
4
+ version: 1.0.0
5
+ author: genomix-cli
6
+ license: Apache-2.0
7
+ metadata:
8
+ genomix:
9
+ tags: [common, references, GRCh38, GRCh37, hg38, hg19, liftover, assembly]
10
+ tools_used: [run_command, ncbi_search]
11
+ ---
12
+
13
+ # Genome Reference Selection
14
+
15
+ ## GRCh38 vs GRCh37
16
+
17
+ | Feature | GRCh37 (hg19) | GRCh38 (hg38) |
18
+ |---------|---------------|----------------|
19
+ | Release | 2009 | 2013 (last patch 2022) |
20
+ | Alternate loci | 9 alternate loci in 3 regions | 261 alternate loci |
21
+ | Centromere resolution | Incomplete | Near-complete |
22
+ | Annotation currency | Legacy databases | Active updates (GENCODE, RefSeq) |
23
+ | Clinical databases | ClinVar legacy data | Current default |
24
+ | Chromosome names | chr1–22, chrX, chrY | chr1–22, chrX, chrY (same) |
25
+
26
+ **Use GRCh38** for all new projects. GRCh37 is only justified when:
27
+ - Reanalyzing legacy data that must remain comparable to a previous GRCh37 run.
28
+ - A clinical database or variant report explicitly uses GRCh37 coordinates.
29
+ - A collaboration partner mandates GRCh37.
30
+
31
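+ Note: UCSC names (hg19/hg38) and Ensembl/NCBI names (GRCh37/GRCh38) refer to the same primary assembly — the nuclear chromosome sequences are identical. They differ in chromosome naming (`chr1` vs `1`), and the hg19 `chrM` sequence is not the same mitochondrial reference as GRCh37 `MT`.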
32
+
33
+ ## Coordinate Liftover
34
+
35
+ To convert coordinates from GRCh37 to GRCh38 (or reverse), use UCSC liftOver or Picard LiftoverVcf.
36
+
37
+ ### liftOver (BED)
38
+
39
+ ```bash
40
+ # Download chain file
41
+ wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
42
+
43
+ liftOver \
44
+ input.hg19.bed \
45
+ hg19ToHg38.over.chain.gz \
46
+ output.hg38.bed \
47
+ unmapped.bed
48
+ ```
49
+
50
+ ### Picard LiftoverVcf
51
+
52
+ ```bash
53
+ gatk LiftoverVcf \
54
+ -I sample.hg19.vcf.gz \
55
+ -O sample.hg38.vcf.gz \
56
+ --CHAIN hg19ToHg38.over.chain.gz \
57
+ -R GRCh38.fa \
58
+ --REJECT rejected.vcf.gz
59
+ ```
60
+
61
+ Always inspect the `unmapped.bed` or `rejected.vcf.gz` — coordinates in regions that were restructured between assemblies cannot be lifted over reliably.
62
+
63
+ ## Common Organism References
64
+
65
+ | Organism | Assembly | NCBI Accession |
66
+ |----------|----------|----------------|
67
+ | Human | GRCh38.p14 | GCA_000001405.29 |
68
+ | Mouse | GRCm39 | GCA_000001635.9 |
69
+ | Zebrafish | GRCz11 | GCA_000002035.4 |
70
+ | Drosophila | dm6 | GCA_000001215.4 |
71
+ | C. elegans | WBcel235 | GCA_000002985.3 |
72
+ | E. coli K-12 | ASM584v2 | GCA_000005845.2 |
73
+ | SARS-CoV-2 | ASM985889v3 | GCA_009858895.3 |
74
+ | Arabidopsis | TAIR10.1 | GCA_000001735.2 |
75
+
76
+ ## Reference Download
77
+
78
+ ```bash
79
+ # Human GRCh38 from NCBI
80
+ wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/GCA_000001405.15_GRCh38_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/chroms.tar.gz
81
+
82
+ # Or via Ensembl (unmasked `dna` file; the soft-masked equivalent is Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz)
83
+ wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
84
+ ```
85
+
86
+ Ensembl FASTA uses chromosome names without `chr` prefix (e.g., `1`, `X`). UCSC/NCBI FASTAs use `chr1`, `chrX`. Ensure the chromosome naming convention matches your annotation files to avoid silently skipping records.
@@ -0,0 +1,72 @@
1
+ ---
2
+ name: blast-analysis
3
+ description: Select the correct BLAST program (blastn/blastp/blastx), set e-value thresholds, and interpret results
4
+ version: 1.0.0
5
+ author: genomix-cli
6
+ license: Apache-2.0
7
+ metadata:
8
+ genomix:
9
+ tags: [comparative, blast, blastn, blastp, blastx, similarity, e-value]
10
+ tools_used: [run_blast, ncbi_search]
11
+ ---
12
+
13
+ # BLAST Analysis
14
+
15
+ ## Program Selection
16
+
17
+ | Query | Database | Program |
18
+ |-------|----------|---------|
19
+ | Nucleotide | Nucleotide | blastn |
20
+ | Protein | Protein | blastp |
21
+ | Nucleotide (translated) | Protein | blastx |
22
+ | Protein | Nucleotide (translated) | tblastn |
23
+ | Nucleotide (translated) | Nucleotide (translated) | tblastx |
24
+
25
+ Decision rules:
26
+ - **blastn**: Comparing closely related sequences (same species, >70% identity), primer design, confirming PCR products.
27
+ - **blastp**: Comparing protein sequences across species, finding orthologs, functional domain analysis.
28
+ - **blastx**: Annotating novel nucleotide sequences (EST, genome contig) — finds protein homologs in 6-frame translation.
29
+ - **tblastn**: Finding gene locations in an unannotated genome assembly using a known protein query.
30
+
31
+ Avoid tblastx for large queries — it is computationally expensive.
32
+
33
+ ## E-value Interpretation
34
+
35
+ The e-value (expect value) is the number of alignments of equal or better score expected by chance in a database of this size.
36
+
37
+ | E-value | Interpretation |
38
+ |---------|----------------|
39
+ | < 1e-100 | Nearly identical sequences (>95% identity at full length) |
40
+ | 1e-50 to 1e-100 | Very strong homology, high confidence |
41
+ | 1e-10 to 1e-50 | Significant homology, likely true hit |
42
+ | 1e-3 to 1e-10 | Moderate confidence, check alignment manually |
43
+ | 0.01 to 1 | Weak hit, may be spurious — check length and identity |
44
+ | > 1 | Likely noise, not significant |
45
+
46
+ Default e-value cutoff is 10. For genomics, use `1e-5` as a starting filter; tighten to `1e-20` for confident functional annotation.
47
+
48
+ ## Key Result Fields
49
+
50
+ - **% identity**: Fraction of aligned positions with exact match.
51
+ - **query coverage**: Fraction of the query sequence in the alignment. Low coverage (<50%) with high identity often means domain hit rather than full-length homolog.
52
+ - **bit score**: Normalized score independent of database size — use for ranking across runs.
53
+ - **gaps**: High gap content with moderate identity may indicate divergent or frameshifted sequences.
54
+
55
+ ## Local BLAST Command Example
56
+
57
+ ```bash
58
+ # Build local database
59
+ makeblastdb -in proteins.fasta -dbtype prot -out mydb
60
+
61
+ # Run blastp
62
+ blastp \
63
+ -query query.fasta \
64
+ -db mydb \
65
+ -out results.txt \
66
+ -evalue 1e-10 \
67
+ -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" \
68
+ -num_threads 8 \
69
+ -max_target_seqs 10
70
+ ```
71
+
72
+ Format 6 (tabular) is easiest for downstream filtering. Always specify `-max_target_seqs` to avoid memory issues with large databases.
@@ -0,0 +1,67 @@
1
+ ---
2
+ name: multiple-alignment
3
+ description: Perform multiple sequence alignment with MAFFT, choose the right algorithm, and interpret output
4
+ version: 1.0.0
5
+ author: genomix-cli
6
+ license: Apache-2.0
7
+ metadata:
8
+ genomix:
9
+ tags: [comparative, mafft, multiple-alignment, msa, phylogenetics]
10
+ tools_used: [run_command, read_file]
11
+ ---
12
+
13
+ # Multiple Sequence Alignment with MAFFT
14
+
15
+ ## When to Use MAFFT
16
+
17
+ Use multiple sequence alignment (MSA) when you need to:
18
+ - Compare homologous sequences to identify conserved regions.
19
+ - Prepare input for phylogenetic tree construction.
20
+ - Detect insertions/deletions (indels) across taxa.
21
+ - Find functional motifs conserved across species.
22
+
23
+ MAFFT is preferred over Clustal Omega for most tasks: it scales to thousands of sequences and handles both protein and nucleotide input.
24
+
25
+ ## Algorithm Selection
26
+
27
+ | Flag | Algorithm | Use When |
28
+ |------|-----------|----------|
29
+ | `--auto` | Auto-select | Default; let MAFFT decide based on input size |
30
+ | `--localpair --maxiterate 1000` (L-INS-i) | Iterative local alignment | <200 sequences, highly accurate, long gaps expected |
31
+ | `--globalpair --maxiterate 1000` (G-INS-i) | Iterative global alignment | <200 sequences, global homology throughout |
32
+ | `--ep 0 --genafpair --maxiterate 1000` (E-INS-i) | Iterative with multiple conserved domains | <200 sequences, unalignable regions between conserved blocks |
33
+ | `--retree 2 --maxiterate 0` (FFT-NS-2) | Progressive | >1000 sequences, speed is priority |
34
+
35
+ For phylogenetics: use L-INS-i or G-INS-i on <200 sequences for best accuracy.
36
+
37
+ ## Basic Commands
38
+
39
+ ```bash
40
+ # Protein alignment (auto mode)
41
+ mafft --auto --thread 4 input.fasta > aligned.fasta
42
+
43
+ # High-accuracy nucleotide alignment (<200 seqs)
44
+ mafft --localpair --maxiterate 1000 --thread 4 input.fasta > aligned.fasta
45
+
46
+ # Large dataset (>500 seqs)
47
+ mafft --retree 2 --maxiterate 0 --thread 8 input.fasta > aligned.fasta
48
+ ```
49
+
50
+ ## Output Interpretation
51
+
52
+ MAFFT outputs FASTA with gaps represented as `-`. All sequences are padded to the same length.
53
+
54
+ Key things to check:
55
+ - **Alignment length vs. average sequence length**: A ratio > 3x suggests many large insertions or misaligned sequences — consider removing outliers.
56
+ - **Conserved columns**: Columns with identical residues across all sequences indicate functional/structural constraints.
57
+ - **Gap-rich regions**: Highly gapped columns (>50% gaps) are often unreliable — mask them before phylogenetic analysis using trimAl or Gblocks.
58
+
59
+ ## Downstream Processing
60
+
61
+ Before building a phylogenetic tree, trim poorly aligned columns:
62
+
63
+ ```bash
64
+ trimal -in aligned.fasta -out trimmed.fasta -automated1
65
+ ```
66
+
67
+ Visualize alignments with AliView or JalView to inspect quality before proceeding.
@@ -0,0 +1,79 @@
1
+ ---
2
+ name: phylogenetics
3
+ description: Build phylogenetic trees with FastTree, interpret Newick format, and assess bootstrap support
4
+ version: 1.0.0
5
+ author: genomix-cli
6
+ license: Apache-2.0
7
+ metadata:
8
+ genomix:
9
+ tags: [comparative, phylogenetics, fasttree, newick, tree-building]
10
+ tools_used: [run_command, read_file]
11
+ ---
12
+
13
+ # Phylogenetic Tree Building with FastTree
14
+
15
+ ## When to Use FastTree
16
+
17
+ FastTree approximates maximum-likelihood (ML) trees using neighbor-joining for an initial topology, then applies NNI (nearest-neighbor interchange) and SPR (subtree pruning/regrafting) moves. It is orders of magnitude faster than RAxML or IQ-TREE for large datasets (>500 sequences).
18
+
19
+ Use FastTree when:
20
+ - You have >200 sequences and need a result in minutes.
21
+ - You need a quick exploratory tree before committing to a slower, more thorough method.
22
+
23
+ Use IQ-TREE or RAxML when:
24
+ - Accuracy is paramount (publication-quality tree with model selection).
25
+ - Dataset is <500 sequences and run time is not a constraint.
26
+
27
+ ## Input Requirements
28
+
29
+ FastTree requires a multiple sequence alignment in FASTA or PHYLIP format. Always trim the alignment first (see multiple-alignment skill).
30
+
31
+ ## Commands
32
+
33
+ ```bash
34
+ # Nucleotide alignment (GTR+CAT model; without -gtr FastTree uses Jukes-Cantor+CAT)
35
+ FastTree -gtr -nt trimmed.fasta > tree.nwk
36
+
37
+ # Protein alignment (JTT+CAT model by default; add -wag to use WAG instead)
38
+ FastTree trimmed.fasta > tree.nwk
39
+
40
+ # With 1000 resamples for the SH-like local support values (1000 is also FastTree's default)
41
+ FastTree -gtr -nt -boot 1000 trimmed.fasta > tree.nwk
42
+ ```
43
+
44
+ ## Newick Format
45
+
46
+ A Newick tree encodes topology and branch lengths as nested parentheses:
47
+
48
+ ```
49
+ ((A:0.1,B:0.2):0.05,(C:0.3,D:0.15):0.08);
50
+ ```
51
+
52
+ - Each leaf is a sequence identifier.
53
+ - Numbers after `:` are branch lengths (substitutions per site).
54
+ - Numbers before `:` at internal nodes are support values (SH-like local support on a 0–1 scale for FastTree; bootstrap percentages on a 0–100 scale for RAxML/IQ-TREE).
55
+ - The semicolon terminates the tree.
56
+
57
+ ## Interpreting Bootstrap Support
58
+
59
+ | Value (0–1 scale) | Interpretation |
60
+ |-------------------|----------------|
61
+ | ≥ 0.95 | Strong support — clade is well-resolved |
62
+ | 0.70–0.94 | Moderate support — likely correct |
63
+ | 0.50–0.69 | Weak support — treat with caution |
64
+ | < 0.50 | Unresolved — do not over-interpret the clade |
65
+
66
+ ## Visualization
67
+
68
+ Convert and visualize Newick trees with:
69
+ - **FigTree** (GUI, free): color branches by bootstrap, midpoint-root.
70
+ - **iTOL** (web): publication-quality with metadata overlays.
71
+ - **ETE3** (Python): programmatic rendering.
72
+
73
+ ```python
74
+ from ete3 import Tree
75
+ t = Tree("tree.nwk")
76
+ print(t.get_ascii(show_internal=True))
77
+ ```
78
+
79
+ Always root the tree at an outgroup or by midpoint before interpreting topology.