genomix-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genomix/__init__.py +3 -0
- genomix/agent/__init__.py +0 -0
- genomix/agent/context_compressor.py +31 -0
- genomix/agent/loop.py +93 -0
- genomix/agent/prompt_builder.py +68 -0
- genomix/agent/session_store.py +41 -0
- genomix/builtin_skills/common/file-formats/SKILL.md +94 -0
- genomix/builtin_skills/common/genome-references/SKILL.md +86 -0
- genomix/builtin_skills/comparative/blast-analysis/SKILL.md +72 -0
- genomix/builtin_skills/comparative/multiple-alignment/SKILL.md +67 -0
- genomix/builtin_skills/comparative/phylogenetics/SKILL.md +79 -0
- genomix/builtin_skills/exploration/database-search/SKILL.md +73 -0
- genomix/builtin_skills/exploration/sequence-summary/SKILL.md +93 -0
- genomix/builtin_skills/exploration/variant-explain/SKILL.md +66 -0
- genomix/builtin_skills/sequencing/alignment/SKILL.md +83 -0
- genomix/builtin_skills/sequencing/annotation/SKILL.md +77 -0
- genomix/builtin_skills/sequencing/quality-control/SKILL.md +60 -0
- genomix/builtin_skills/sequencing/variant-calling/SKILL.md +93 -0
- genomix/cli.py +193 -0
- genomix/config.py +80 -0
- genomix/errors.py +37 -0
- genomix/output.py +20 -0
- genomix/project/__init__.py +0 -0
- genomix/project/manager.py +73 -0
- genomix/project/setup_wizard.py +44 -0
- genomix/providers/__init__.py +20 -0
- genomix/providers/base.py +27 -0
- genomix/providers/claude.py +124 -0
- genomix/providers/openai_provider.py +33 -0
- genomix/providers/opencode.py +71 -0
- genomix/runtime.py +75 -0
- genomix/skills/__init__.py +1 -0
- genomix/skills/loader.py +58 -0
- genomix/skills/registry.py +48 -0
- genomix/swarm/__init__.py +0 -0
- genomix/swarm/manager.py +70 -0
- genomix/tools/__init__.py +0 -0
- genomix/tools/file_tools.py +37 -0
- genomix/tools/mcp_bridge.py +85 -0
- genomix/tools/mcp_manager.py +281 -0
- genomix/tools/registry.py +25 -0
- genomix/tui.py +512 -0
- genomix_cli-0.1.0.dist-info/METADATA +217 -0
- genomix_cli-0.1.0.dist-info/RECORD +61 -0
- genomix_cli-0.1.0.dist-info/WHEEL +4 -0
- genomix_cli-0.1.0.dist-info/entry_points.txt +2 -0
- genomix_cli-0.1.0.dist-info/licenses/LICENSE +194 -0
- mcp_servers/__init__.py +0 -0
- mcp_servers/base_biotool.py +27 -0
- mcp_servers/base_database.py +111 -0
- mcp_servers/biotools/__init__.py +0 -0
- mcp_servers/biotools/blast_server.py +69 -0
- mcp_servers/biotools/bwa_server.py +37 -0
- mcp_servers/biotools/fastqc_server.py +29 -0
- mcp_servers/biotools/gatk_server.py +47 -0
- mcp_servers/biotools/samtools_server.py +76 -0
- mcp_servers/databases/__init__.py +0 -0
- mcp_servers/databases/clinvar_server.py +53 -0
- mcp_servers/databases/dbsnp_server.py +53 -0
- mcp_servers/databases/ensembl_server.py +100 -0
- mcp_servers/databases/ncbi_server.py +68 -0
genomix/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Context compression: summarize old tool results when context grows."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
# Rough heuristic used for budgeting: about four characters of text per token.
CHARS_PER_TOKEN = 4


def estimate_tokens(messages):
    """Return an approximate token count for a list of chat messages.

    Only each message's ``"content"`` field is measured; a missing or ``None``
    content counts as empty. The character total is converted to tokens with
    the ``CHARS_PER_TOKEN`` heuristic so the result is comparable to a
    token-denominated limit (previously the raw character count was returned,
    which overstated usage roughly four-fold).

    Args:
        messages: Iterable of message dicts.

    Returns:
        Estimated number of tokens as an ``int`` (rounded down).
    """
    total_chars = sum(len(m.get("content", "") or "") for m in messages)
    return total_chars // CHARS_PER_TOKEN
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def should_compress(messages, max_tokens):
    """Tell whether the conversation has outgrown 80% of ``max_tokens``.

    Args:
        messages: List of message dicts, measured with ``estimate_tokens``.
        max_tokens: Size limit the estimate is compared against.

    Returns:
        ``True`` when the estimate is strictly greater than ``max_tokens * 0.8``.
    """
    threshold = max_tokens * 0.8
    estimated = estimate_tokens(messages)
    return estimated > threshold
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def compress_messages(messages, max_tokens):
    """Shorten bulky, older tool outputs once the conversation is near its limit.

    If ``should_compress`` says no compression is needed, the input list is
    returned unchanged. Otherwise a new list is built: a leading ``system``
    message (if any) is kept as-is, the six most recent remaining messages are
    kept as-is, and among the older ones every ``tool`` message whose content
    exceeds 500 characters is replaced by a copy holding its first 200
    characters plus a truncation marker. No message is dropped or reordered.

    Args:
        messages: List of message dicts.
        max_tokens: Limit forwarded to ``should_compress``.

    Returns:
        The original list, or a new list with older tool results truncated.
    """
    if not should_compress(messages, max_tokens):
        return messages

    def _shrink(entry):
        # Only long tool outputs are shortened; everything else passes through.
        text = entry.get("content", "") or ""
        if entry.get("role") == "tool" and len(text) > 500:
            return {**entry, "content": text[:200] + "\n... [truncated]"}
        return entry

    head = []
    body = messages
    if body and body[0]["role"] == "system":
        head = [body[0]]
        body = body[1:]

    # Everything before ``split`` is "old"; the last (up to) six entries are recent.
    split = max(len(body) - 6, 0)
    older = body[:split]
    newest = body[split:]

    return head + [_shrink(entry) for entry in older] + newest
|
genomix/agent/loop.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Agent conversation loop with tool calling support."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import json
|
|
4
|
+
from typing import Any, Callable
|
|
5
|
+
|
|
6
|
+
from genomix.providers.base import BaseProvider
|
|
7
|
+
from genomix.tools.registry import ToolRegistry
|
|
8
|
+
from genomix.agent.context_compressor import should_compress, compress_messages
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AgentLoop:
    """Conversation loop that lets a provider call tools until it answers.

    Each call to :meth:`chat` appends the user message to ``self.messages`` and
    then repeatedly asks the provider for a response. Tool calls requested by
    the provider are dispatched through the tool registry and their results are
    appended as ``tool`` messages; the first response without tool calls is
    returned. After ``max_iterations`` rounds, a final answer is requested with
    tools disabled.
    """

    # Tool output longer than this many characters is cut before being stored.
    MAX_TOOL_RESULT_CHARS = 2000
    # Number of leading characters of a tool result handed to ``on_tool_result``.
    TOOL_RESULT_PREVIEW_CHARS = 200

    def __init__(
        self,
        provider: BaseProvider,
        tool_registry: ToolRegistry,
        system_prompt: str = "",
        max_iterations: int = 30,
        on_tool_call: Callable[[str, dict], None] | None = None,
        on_tool_result: Callable[[str, str], None] | None = None,
        on_thinking: Callable[[str], None] | None = None,
    ):
        """Store collaborators, limits, and optional UI callbacks.

        Args:
            provider: Object exposing ``chat(messages, tools=...)`` and
                ``max_context_length()``.
            tool_registry: Object exposing ``list_tools()`` and
                ``dispatch(name, arguments)``.
            system_prompt: Text prepended as a ``system`` message on every
                provider call when non-empty.
            max_iterations: Maximum provider round-trips per :meth:`chat` call
                before a final answer is forced.
            on_tool_call: Called with ``(tool_name, arguments)`` before each
                tool is dispatched.
            on_tool_result: Called with ``(tool_name, preview)`` after each
                tool finishes.
            on_thinking: Called once with a status string at the start of
                :meth:`chat`.
        """
        self.provider = provider
        self.tool_registry = tool_registry
        self.system_prompt = system_prompt
        self.max_iterations = max_iterations
        self.messages: list[dict[str, Any]] = []
        # UI callbacks
        self.on_tool_call = on_tool_call
        self.on_tool_result = on_tool_result
        self.on_thinking = on_thinking

    def _build_messages(self) -> list[dict[str, Any]]:
        """Return the message list to send: history (compressed if large) plus system prompt.

        ``self.messages`` itself is never modified here; compression applies
        only to the copy that is sent.
        """
        msgs = list(self.messages)

        # Compress if context is getting large
        max_tokens = self.provider.max_context_length()
        if should_compress(msgs, max_tokens):
            msgs = compress_messages(msgs, max_tokens)

        if self.system_prompt:
            return [{"role": "system", "content": self.system_prompt}] + msgs
        return msgs

    def _force_final_synthesis(self) -> str:
        """Ask the model for a final answer with tool use disabled."""
        self.messages.append(
            {
                "role": "user",
                "content": (
                    "You have enough information. Do not call any more tools. "
                    "Provide the best final answer now, clearly noting uncertainty where needed."
                ),
            }
        )
        response = self.provider.chat(self._build_messages(), tools=None)
        # Store "" rather than None so history entries always carry string
        # content, matching what the tool-call branch of chat() records.
        self.messages.append({"role": "assistant", "content": response.content or ""})
        return response.content or "Max iterations reached without a final response."

    @staticmethod
    def _stringify_result(result: Any) -> str:
        """Convert a non-string tool result to text (JSON when possible)."""
        try:
            return json.dumps(result)
        except (TypeError, ValueError):
            return str(result)

    def _run_tool_call(self, tc: Any) -> str:
        """Dispatch one tool call and return its (possibly truncated) text result.

        Exceptions raised by the tool are reported back to the model as a JSON
        ``{"error": ...}`` payload instead of aborting the conversation.
        """
        if self.on_tool_call:
            self.on_tool_call(tc.name, tc.arguments)
        try:
            result = self.tool_registry.dispatch(tc.name, tc.arguments)
        except Exception as e:
            result = json.dumps({"error": str(e)})

        # NOTE(review): dispatch() is defined outside this file; if it ever
        # returns a non-string (None, dict, ...), the length check and slicing
        # below would raise, so normalise to text first.
        if not isinstance(result, str):
            result = self._stringify_result(result)

        # Truncate large tool results to prevent context explosion
        if len(result) > self.MAX_TOOL_RESULT_CHARS:
            result = result[: self.MAX_TOOL_RESULT_CHARS] + "\n... [truncated]"
        if self.on_tool_result:
            self.on_tool_result(tc.name, result[: self.TOOL_RESULT_PREVIEW_CHARS])
        return result

    def chat(self, user_message: str) -> str:
        """Send ``user_message`` and return the assistant's final text.

        Args:
            user_message: The user's input for this turn.

        Returns:
            The content of the first response without tool calls (``""`` if it
            is empty), or the forced final synthesis once ``max_iterations``
            rounds have been used.
        """
        self.messages.append({"role": "user", "content": user_message})
        # An empty tool list is sent as None.
        tools = self.tool_registry.list_tools() or None

        for iteration in range(self.max_iterations):
            all_messages = self._build_messages()

            if self.on_thinking and iteration == 0:
                self.on_thinking("Thinking...")

            response = self.provider.chat(all_messages, tools=tools)

            if not response.tool_calls:
                answer = response.content or ""
                self.messages.append({"role": "assistant", "content": answer})
                return answer

            # Record the assistant turn with its tool calls; dict arguments
            # are serialised to a JSON string, other values are passed through.
            self.messages.append(
                {
                    "role": "assistant",
                    "content": response.content or "",
                    "tool_calls": [
                        {
                            "id": tc.id,
                            "type": "function",
                            "function": {
                                "name": tc.name,
                                "arguments": json.dumps(tc.arguments)
                                if isinstance(tc.arguments, dict)
                                else tc.arguments,
                            },
                        }
                        for tc in response.tool_calls
                    ],
                }
            )
            for tc in response.tool_calls:
                result = self._run_tool_call(tc)
                self.messages.append({"role": "tool", "tool_call_id": tc.id, "content": result})

        return self._force_final_synthesis()
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Assemble the system prompt from project context, skills, and mode."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from genomix.project.manager import GenomixProject
|
|
7
|
+
|
|
8
|
+
IDENTITY = """You are Genomix, an AI assistant specialized in DNA sequence and genome analysis.
|
|
9
|
+
You help biologists, bioinformaticians, and researchers analyze genomic data by orchestrating
|
|
10
|
+
bioinformatics tools and querying genomic databases.
|
|
11
|
+
|
|
12
|
+
You are proactive: suggest next steps, explain results in accessible language, and adapt
|
|
13
|
+
your communication to the user's expertise level. When a user speaks in natural language,
|
|
14
|
+
explain in plain terms. When they use slash commands, be concise and technical.
|
|
15
|
+
|
|
16
|
+
You have access to tools for: file manipulation, sequence alignment, variant calling,
|
|
17
|
+
annotation, BLAST searches, database queries (NCBI, Ensembl, ClinVar, dbSNP), and more.
|
|
18
|
+
|
|
19
|
+
IMPORTANT — Tool calling strategy:
|
|
20
|
+
- Be STRATEGIC with tool calls. Do NOT query every database for every item.
|
|
21
|
+
- Maximum 5-6 tool calls per question. After that, synthesize from what you have.
|
|
22
|
+
- Use your own knowledge to supplement — you don't need to verify everything via API.
|
|
23
|
+
|
|
24
|
+
When analyzing VCF files:
|
|
25
|
+
1. Read the file. Check if INFO has annotations (GENE, EFFECT, CLNSIG).
|
|
26
|
+
2. IF annotated: use them directly, no database queries needed.
|
|
27
|
+
3. IF raw (no annotations, ID is "."):
|
|
28
|
+
→ Use YOUR KNOWLEDGE FIRST to identify genes from coordinates:
|
|
29
|
+
chr17:43,044,000-43,170,000 = BRCA1
|
|
30
|
+
chr13:32,315,000-32,400,000 = BRCA2
|
|
31
|
+
chr7:117,480,000-117,668,000 = CFTR
|
|
32
|
+
chr11:5,225,000-5,228,000 = HBB
|
|
33
|
+
chr19:44,905,000-44,910,000 = APOE
|
|
34
|
+
→ Only query databases (1-2 calls MAX) for coordinates you truly don't recognize.
|
|
35
|
+
→ Interpret GT (0/1=het, 1/1=hom), DP (read depth), GQ (quality).
|
|
36
|
+
→ Use your knowledge of well-known pathogenic variants at these positions.
|
|
37
|
+
→ RESPOND after reading the file + at most 2 database calls. Do NOT look up every variant.
|
|
38
|
+
|
|
39
|
+
Advanced analysis capabilities — DO NOT say you can't do these:
|
|
40
|
+
- ANCESTRY INFERENCE: Many variants have population-specific frequencies. Use your
|
|
41
|
+
knowledge of ancestry-informative markers (e.g. rs334/HBB sickle cell = high frequency
|
|
42
|
+
in African/Mediterranean populations, CFTR deltaF508 = Northern European, APOE allele
|
|
43
|
+
frequencies vary by population). Use ensembl_population_frequencies to get gnomAD/1000G
|
|
44
|
+
frequency data across AFR/EUR/EAS/SAS/AMR populations. Combine multiple variants to
|
|
45
|
+
suggest likely ancestry.
|
|
46
|
+
- PHENOTYPE INFERENCE: From pathogenic variants, infer likely phenotypic consequences
|
|
47
|
+
(disease risk, carrier status for recessive conditions, drug response).
|
|
48
|
+
- PHARMACOGENOMICS: Some variants affect drug metabolism. Flag them if present.
|
|
49
|
+
- CARRIER STATUS: For autosomal recessive diseases (sickle cell, CF), heterozygous
|
|
50
|
+
carriers (0/1) are typically unaffected but can pass the variant to children."""
|
|
51
|
+
|
|
52
|
+
PRIVACY_ADDENDUM = """
|
|
53
|
+
PRIVACY MODE IS ACTIVE. You must follow these rules strictly:
|
|
54
|
+
- Never include raw sequence data (nucleotide strings) in your responses or reasoning
|
|
55
|
+
- Never include patient identifiers or sample metadata
|
|
56
|
+
- Only reference aggregated statistics, variant IDs (rsIDs), and gene symbols
|
|
57
|
+
- All tools run locally — only summaries are passed to you"""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_system_prompt(project, skill_body, privacy_mode):
    """Compose the system prompt from the identity text and optional sections.

    Args:
        project: Active project, or a falsy value for none. When given, its
            ``name``, ``organism``, ``reference_genome``, ``data_type`` and
            ``root`` attributes are rendered into an "Active Project" section.
        skill_body: Task instructions to embed, or a falsy value to omit them.
        privacy_mode: When truthy, the privacy rules are appended.

    Returns:
        The sections joined with newlines, always starting with ``IDENTITY``.
    """
    sections = [IDENTITY]

    if project:
        project_section = f"\n## Active Project\n- **Name:** {project.name}\n- **Organism:** {project.organism}\n- **Reference genome:** {project.reference_genome}\n- **Data type:** {project.data_type}\n- **Project root:** {project.root}"
        sections.append(project_section)

    if skill_body:
        task_section = f"\n## Current Task Instructions\n\n{skill_body}"
        sections.append(task_section)

    if privacy_mode:
        sections.append(PRIVACY_ADDENDUM)

    prompt = "\n".join(sections)
    return prompt
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Session history storage with SQLite + FTS5."""
|
|
2
|
+
import json, sqlite3, uuid
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SessionStore:
    """Persist chat sessions in SQLite with an FTS5 index for text search.

    Two tables are used: ``sessions`` holds the JSON-encoded message list per
    session, and the ``sessions_fts`` virtual table holds the title and the
    concatenated message contents for full-text matching.
    """

    def __init__(self, db_path):
        """Create the parent directory and tables for the database at ``db_path``."""
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _connect(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``sqlite3.Connection`` used directly in a ``with`` statement only
        commits or rolls back the transaction; it does not close the
        connection. Wrapping it in ``closing`` releases the handle as well.
        """
        from contextlib import closing

        return closing(sqlite3.connect(self.db_path))

    def _init_db(self):
        """Create the ``sessions`` table and FTS5 index if they do not exist."""
        # ``with conn`` (second item) commits on success / rolls back on error.
        with self._connect() as conn, conn:
            conn.execute("CREATE TABLE IF NOT EXISTS sessions (id TEXT PRIMARY KEY, title TEXT, messages TEXT, created_at TEXT)")
            conn.execute("CREATE VIRTUAL TABLE IF NOT EXISTS sessions_fts USING fts5(id, title, content)")

    def save_session(self, messages, title=""):
        """Store ``messages`` under a new id and index their text.

        Args:
            messages: List of JSON-serialisable message dicts.
            title: Optional session title.

        Returns:
            The generated 12-character hexadecimal session id.
        """
        sid = uuid.uuid4().hex[:12]
        now = datetime.now(timezone.utc).isoformat()
        # Missing or None content contributes an empty string to the index.
        content = " ".join(m.get("content", "") or "" for m in messages)
        with self._connect() as conn, conn:
            conn.execute("INSERT INTO sessions VALUES (?,?,?,?)", (sid, title, json.dumps(messages), now))
            conn.execute("INSERT INTO sessions_fts VALUES (?,?,?)", (sid, title, content))
        return sid

    def load_session(self, session_id):
        """Return the message list saved under ``session_id``, or ``[]`` if unknown."""
        with self._connect() as conn, conn:
            row = conn.execute("SELECT messages FROM sessions WHERE id=?", (session_id,)).fetchone()
        return json.loads(row[0]) if row else []

    def search(self, query):
        """Return ``{"id", "title"}`` dicts for sessions matching the FTS5 ``query``.

        ``query`` is passed to ``MATCH`` as-is, so it follows FTS5 query syntax.
        """
        with self._connect() as conn, conn:
            rows = conn.execute("SELECT id, title FROM sessions_fts WHERE sessions_fts MATCH ?", (query,)).fetchall()
        return [{"id": r[0], "title": r[1]} for r in rows]

    def list_sessions(self, limit=20):
        """Return up to ``limit`` sessions, newest first, as id/title/created_at dicts."""
        with self._connect() as conn, conn:
            rows = conn.execute("SELECT id, title, created_at FROM sessions ORDER BY created_at DESC LIMIT ?", (limit,)).fetchall()
        return [{"id": r[0], "title": r[1], "created_at": r[2]} for r in rows]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: file-formats
|
|
3
|
+
description: Recognize and validate FASTA, FASTQ, BAM, VCF, and GFF files by inspecting their content
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: genomix-cli
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
metadata:
|
|
8
|
+
genomix:
|
|
9
|
+
tags: [common, file-formats, fasta, fastq, bam, vcf, gff, detection]
|
|
10
|
+
tools_used: [read_file, run_command]
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Genomic File Format Recognition
|
|
14
|
+
|
|
15
|
+
## Detection by Content
|
|
16
|
+
|
|
17
|
+
Never rely solely on file extensions — always verify content.
|
|
18
|
+
|
|
19
|
+
### FASTA
|
|
20
|
+
- First non-empty line starts with `>`.
|
|
21
|
+
- Second line is a sequence (A, T, G, C, N, IUPAC ambiguity codes, or amino acid letters).
|
|
22
|
+
- Multi-line sequences are allowed; next record starts with `>`.
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
>sequence_id optional description
|
|
26
|
+
ATCGATCGATCG
|
|
27
|
+
ATCGATCGATCG
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### FASTQ
|
|
31
|
+
- Records are 4 lines: `@header`, sequence, `+` (optionally repeated header), quality string.
|
|
32
|
+
- Quality string length must equal sequence length.
|
|
33
|
+
- Quality characters are ASCII 33–126 (Phred+33 encoding for modern Illumina).
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
@read_id
|
|
37
|
+
ATCGATCGATCG
|
|
38
|
+
+
|
|
39
|
+
FFFFIIIIBBBB
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Distinguish FASTQ from FASTA: FASTQ starts with `@`, FASTA with `>`. Note: `@` also appears in quality lines, so always check that records are 4 lines.
|
|
43
|
+
|
|
44
|
+
### BAM / SAM
|
|
45
|
+
- **SAM**: plain text; starts with `@HD` (header line), `@SQ` (reference sequences), then alignment records (11 mandatory tab-separated fields).
|
|
46
|
+
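- **BAM**: binary and BGZF-compressed; the raw file begins with the gzip magic bytes (hex `1F 8B`), and the decompressed stream begins with the magic `BAM\1` (hex `42 41 4D 01`). Use `samtools view -H` to inspect.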
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
samtools view -H sample.bam | head -5
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### VCF
|
|
53
|
+
- Starts with `##fileformat=VCFv4.x`.
|
|
54
|
+
- Meta-information lines begin with `##`.
|
|
55
|
+
- Header line begins with `#CHROM`.
|
|
56
|
+
- Data lines: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO (+ optional FORMAT and sample columns).
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
##fileformat=VCFv4.2
|
|
60
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO
|
|
61
|
+
chr17 41245466 rs28897696 G A 100 PASS .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### GFF / GTF
|
|
65
|
+
- GFF3: starts with `##gff-version 3`. 9 tab-separated fields; attributes in `key=value` format.
|
|
66
|
+
- GTF (GFF2): attributes in `key "value"` format (used by GENCODE, Ensembl downloads).
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
##gff-version 3
|
|
70
|
+
chr1 RefSeq gene 11874 14409 . + . ID=gene1;Name=DDX11L1
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Format Validation Commands
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Check FASTA integrity
|
|
77
|
+
seqkit stats sequences.fasta
|
|
78
|
+
|
|
79
|
+
# Validate FASTQ (check pairing, quality encoding)
|
|
80
|
+
fastqc --nogroup sample.fastq.gz
|
|
81
|
+
|
|
82
|
+
# Validate BAM (check for truncation, index)
|
|
83
|
+
samtools quickcheck sample.bam && samtools index sample.bam
|
|
84
|
+
|
|
85
|
+
# Summarize VCF contents (surfaces parse errors, but is not a full spec validation)
|
|
86
|
+
bcftools stats sample.vcf.gz | head -30
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Common Pitfalls
|
|
90
|
+
|
|
91
|
+
- **FASTA with wrapped lines**: parsers must handle variable line widths.
|
|
92
|
+
- **FASTQ quality encoding**: older data may use Phred+64 (Illumina 1.3–1.7); seqkit or FastQC can auto-detect.
|
|
93
|
+
- **Multi-sample VCF**: sample columns appear after FORMAT — always check the header for sample names.
|
|
94
|
+
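- **BGZF vs gzip**: BAM and indexed VCF use BGZF (block gzip), which regular gzip tools can decompress but which additionally supports random access via index files (`.bai` / `.csi` for BAM, `.tbi` / `.csi` for VCF). A file compressed with plain `gzip` cannot be indexed — recompress it with `bgzip` first.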
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: genome-references
|
|
3
|
+
description: Choose between GRCh38 and GRCh37, perform coordinate liftover, and select organism-specific references
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: genomix-cli
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
metadata:
|
|
8
|
+
genomix:
|
|
9
|
+
tags: [common, references, GRCh38, GRCh37, hg38, hg19, liftover, assembly]
|
|
10
|
+
tools_used: [run_command, ncbi_search]
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Genome Reference Selection
|
|
14
|
+
|
|
15
|
+
## GRCh38 vs GRCh37
|
|
16
|
+
|
|
17
|
+
| Feature | GRCh37 (hg19) | GRCh38 (hg38) |
|
|
18
|
+
|---------|---------------|----------------|
|
|
19
|
+
| Release | 2009 | 2013 (last patch 2022) |
|
|
20
|
+
| Alternate loci | None | 261 ALT sequences |
|
|
21
|
+
| Centromere resolution | Incomplete | Near-complete |
|
|
22
|
+
| Annotation currency | Legacy databases | Active updates (GENCODE, RefSeq) |
|
|
23
|
+
| Clinical databases | ClinVar legacy data | Current default |
|
|
24
|
+
| Chromosome names | chr1–22, chrX, chrY | chr1–22, chrX, chrY (same) |
|
|
25
|
+
|
|
26
|
+
**Use GRCh38** for all new projects. GRCh37 is only justified when:
|
|
27
|
+
- Reanalyzing legacy data that must remain comparable to a previous GRCh37 run.
|
|
28
|
+
- A clinical database or variant report explicitly uses GRCh37 coordinates.
|
|
29
|
+
- A collaboration partner mandates GRCh37.
|
|
30
|
+
|
|
31
|
+
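Note: UCSC names (hg19/hg38) and Ensembl/NCBI names (GRCh37/GRCh38) refer to the same primary-assembly sequences for the nuclear chromosomes, but the files are not drop-in interchangeable: contig naming differs (`chr1` vs `1`), and hg19's `chrM` is an older mitochondrial sequence (NC_001807) than the rCRS (NC_012920) used as `MT` in GRCh37.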
|
|
32
|
+
|
|
33
|
+
## Coordinate Liftover
|
|
34
|
+
|
|
35
|
+
To convert coordinates from GRCh37 to GRCh38 (or reverse), use UCSC liftOver or Picard LiftoverVcf.
|
|
36
|
+
|
|
37
|
+
### liftOver (BED)
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Download chain file
|
|
41
|
+
wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
|
|
42
|
+
|
|
43
|
+
liftOver \
|
|
44
|
+
input.hg19.bed \
|
|
45
|
+
hg19ToHg38.over.chain.gz \
|
|
46
|
+
output.hg38.bed \
|
|
47
|
+
unmapped.bed
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Picard LiftoverVcf
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
gatk LiftoverVcf \
|
|
54
|
+
-I sample.hg19.vcf.gz \
|
|
55
|
+
-O sample.hg38.vcf.gz \
|
|
56
|
+
--CHAIN hg19ToHg38.over.chain.gz \
|
|
57
|
+
-R GRCh38.fa \
|
|
58
|
+
--REJECT rejected.vcf.gz
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Always inspect the `unmapped.bed` or `rejected.vcf.gz` — coordinates in regions that were restructured between assemblies cannot be lifted over reliably.
|
|
62
|
+
|
|
63
|
+
## Common Organism References
|
|
64
|
+
|
|
65
|
+
| Organism | Assembly | NCBI Accession |
|
|
66
|
+
|----------|----------|----------------|
|
|
67
|
+
| Human | GRCh38.p14 | GCA_000001405.29 |
|
|
68
|
+
| Mouse | GRCm39 | GCA_000001635.9 |
|
|
69
|
+
| Zebrafish | GRCz11 | GCA_000002035.4 |
|
|
70
|
+
| Drosophila | dm6 | GCA_000001215.4 |
|
|
71
|
+
| C. elegans | WBcel235 | GCA_000002985.3 |
|
|
72
|
+
| E. coli K-12 | ASM584v2 | GCA_000005845.2 |
|
|
73
|
+
| SARS-CoV-2 | ASM985889v3 | GCA_009858895.3 |
|
|
74
|
+
| Arabidopsis | TAIR10.1 | GCA_000001735.2 |
|
|
75
|
+
|
|
76
|
+
## Reference Download
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Human GRCh38 from NCBI
|
|
80
|
+
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/GCA_000001405.15_GRCh38_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/chroms.tar.gz
|
|
81
|
+
|
|
82
|
+
# Or via Ensembl (soft-masked)
|
|
83
|
+
wget https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Ensembl FASTA uses chromosome names without `chr` prefix (e.g., `1`, `X`). UCSC/NCBI FASTAs use `chr1`, `chrX`. Ensure the chromosome naming convention matches your annotation files to avoid silently skipping records.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: blast-analysis
|
|
3
|
+
description: Select the correct BLAST program (blastn/blastp/blastx), set e-value thresholds, and interpret results
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: genomix-cli
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
metadata:
|
|
8
|
+
genomix:
|
|
9
|
+
tags: [comparative, blast, blastn, blastp, blastx, similarity, e-value]
|
|
10
|
+
tools_used: [run_blast, ncbi_search]
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# BLAST Analysis
|
|
14
|
+
|
|
15
|
+
## Program Selection
|
|
16
|
+
|
|
17
|
+
| Query | Database | Program |
|
|
18
|
+
|-------|----------|---------|
|
|
19
|
+
| Nucleotide | Nucleotide | blastn |
|
|
20
|
+
| Protein | Protein | blastp |
|
|
21
|
+
| Nucleotide (translated) | Protein | blastx |
|
|
22
|
+
| Protein | Nucleotide (translated) | tblastn |
|
|
23
|
+
| Nucleotide (translated) | Nucleotide (translated) | tblastx |
|
|
24
|
+
|
|
25
|
+
Decision rules:
|
|
26
|
+
- **blastn**: Comparing closely related sequences (same species, >70% identity), primer design, confirming PCR products.
|
|
27
|
+
- **blastp**: Comparing protein sequences across species, finding orthologs, functional domain analysis.
|
|
28
|
+
- **blastx**: Annotating novel nucleotide sequences (EST, genome contig) — finds protein homologs in 6-frame translation.
|
|
29
|
+
- **tblastn**: Finding gene locations in an unannotated genome assembly using a known protein query.
|
|
30
|
+
|
|
31
|
+
Avoid tblastx for large queries — it is computationally expensive.
|
|
32
|
+
|
|
33
|
+
## E-value Interpretation
|
|
34
|
+
|
|
35
|
+
The e-value (expect value) is the number of alignments of equal or better score expected by chance in a database of this size.
|
|
36
|
+
|
|
37
|
+
| E-value | Interpretation |
|
|
38
|
+
|---------|----------------|
|
|
39
|
+
| < 1e-100 | Nearly identical sequences (>95% identity at full length) |
|
|
40
|
+
| 1e-50 to 1e-100 | Very strong homology, high confidence |
|
|
41
|
+
| 1e-10 to 1e-50 | Significant homology, likely true hit |
|
|
42
|
+
| 1e-3 to 1e-10 | Moderate confidence, check alignment manually |
|
|
43
|
+
| 1e-3 to 1 | Weak hit, may be spurious — check length and identity |
|
|
44
|
+
| > 1 | Likely noise, not significant |
|
|
45
|
+
|
|
46
|
+
Default e-value cutoff is 10. For genomics, use `1e-5` as a starting filter; tighten to `1e-20` for confident functional annotation.
|
|
47
|
+
|
|
48
|
+
## Key Result Fields
|
|
49
|
+
|
|
50
|
+
- **% identity**: Fraction of aligned positions with exact match.
|
|
51
|
+
- **query coverage**: Fraction of the query sequence in the alignment. Low coverage (<50%) with high identity often means domain hit rather than full-length homolog.
|
|
52
|
+
- **bit score**: Normalized score independent of database size — use for ranking across runs.
|
|
53
|
+
- **gaps**: High gap content with moderate identity may indicate divergent or frameshifted sequences.
|
|
54
|
+
|
|
55
|
+
## Local BLAST Command Example
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
# Build local database
|
|
59
|
+
makeblastdb -in proteins.fasta -dbtype prot -out mydb
|
|
60
|
+
|
|
61
|
+
# Run blastp
|
|
62
|
+
blastp \
|
|
63
|
+
-query query.fasta \
|
|
64
|
+
-db mydb \
|
|
65
|
+
-out results.txt \
|
|
66
|
+
-evalue 1e-10 \
|
|
67
|
+
-outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" \
|
|
68
|
+
-num_threads 8 \
|
|
69
|
+
-max_target_seqs 10
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Format 6 (tabular) is easiest for downstream filtering. Always specify `-max_target_seqs` to avoid memory issues with large databases.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: multiple-alignment
|
|
3
|
+
description: Perform multiple sequence alignment with MAFFT, choose the right algorithm, and interpret output
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: genomix-cli
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
metadata:
|
|
8
|
+
genomix:
|
|
9
|
+
tags: [comparative, mafft, multiple-alignment, msa, phylogenetics]
|
|
10
|
+
tools_used: [run_command, read_file]
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Multiple Sequence Alignment with MAFFT
|
|
14
|
+
|
|
15
|
+
## When to Use MAFFT
|
|
16
|
+
|
|
17
|
+
Use multiple sequence alignment (MSA) when you need to:
|
|
18
|
+
- Compare homologous sequences to identify conserved regions.
|
|
19
|
+
- Prepare input for phylogenetic tree construction.
|
|
20
|
+
- Detect insertions/deletions (indels) across taxa.
|
|
21
|
+
- Find functional motifs conserved across species.
|
|
22
|
+
|
|
23
|
+
MAFFT is preferred over Clustal Omega for most tasks: it scales to thousands of sequences and handles both protein and nucleotide input.
|
|
24
|
+
|
|
25
|
+
## Algorithm Selection
|
|
26
|
+
|
|
27
|
+
| Flag | Algorithm | Use When |
|
|
28
|
+
|------|-----------|----------|
|
|
29
|
+
| `--auto` | Auto-select | Default; let MAFFT decide based on input size |
|
|
30
|
+
| `--localpair --maxiterate 1000` (L-INS-i) | Iterative local alignment | <200 sequences, highly accurate, long gaps expected |
|
|
31
|
+
| `--globalpair --maxiterate 1000` (G-INS-i) | Iterative global alignment | <200 sequences, global homology throughout |
|
|
32
|
+
| `--ep 0 --genafpair --maxiterate 1000` (E-INS-i) | Iterative with multiple conserved domains | <200 sequences, unalignable regions between conserved blocks |
|
|
33
|
+
| `--retree 2 --maxiterate 0` (FFT-NS-2) | Progressive | >1000 sequences, speed is priority |
|
|
34
|
+
|
|
35
|
+
For phylogenetics: use L-INS-i or G-INS-i on <200 sequences for best accuracy.
|
|
36
|
+
|
|
37
|
+
## Basic Commands
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Protein alignment (auto mode)
|
|
41
|
+
mafft --auto --thread 4 input.fasta > aligned.fasta
|
|
42
|
+
|
|
43
|
+
# High-accuracy nucleotide alignment (<200 seqs)
|
|
44
|
+
mafft --localpair --maxiterate 1000 --thread 4 input.fasta > aligned.fasta
|
|
45
|
+
|
|
46
|
+
# Large dataset (>500 seqs)
|
|
47
|
+
mafft --retree 2 --maxiterate 0 --thread 8 input.fasta > aligned.fasta
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Output Interpretation
|
|
51
|
+
|
|
52
|
+
MAFFT outputs FASTA with gaps represented as `-`. All sequences are padded to the same length.
|
|
53
|
+
|
|
54
|
+
Key things to check:
|
|
55
|
+
- **Alignment length vs. average sequence length**: A ratio > 3x suggests many large insertions or misaligned sequences — consider removing outliers.
|
|
56
|
+
- **Conserved columns**: Columns with identical residues across all sequences indicate functional/structural constraints.
|
|
57
|
+
- **Gap-rich regions**: Highly gapped columns (>50% gaps) are often unreliable — mask them before phylogenetic analysis using trimAl or Gblocks.
|
|
58
|
+
|
|
59
|
+
## Downstream Processing
|
|
60
|
+
|
|
61
|
+
Before building a phylogenetic tree, trim poorly aligned columns:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
trimal -in aligned.fasta -out trimmed.fasta -automated1
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Visualize alignments with AliView or JalView to inspect quality before proceeding.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: phylogenetics
|
|
3
|
+
description: Build phylogenetic trees with FastTree, interpret Newick format, and assess bootstrap support
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: genomix-cli
|
|
6
|
+
license: Apache-2.0
|
|
7
|
+
metadata:
|
|
8
|
+
genomix:
|
|
9
|
+
tags: [comparative, phylogenetics, fasttree, newick, tree-building]
|
|
10
|
+
tools_used: [run_command, read_file]
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Phylogenetic Tree Building with FastTree
|
|
14
|
+
|
|
15
|
+
## When to Use FastTree
|
|
16
|
+
|
|
17
|
+
FastTree approximates maximum-likelihood (ML) trees using neighbor-joining for an initial topology, then applies NNI (nearest-neighbor interchange) and SPR (subtree pruning/regrafting) moves. It is orders of magnitude faster than RAxML or IQ-TREE for large datasets (>500 sequences).
|
|
18
|
+
|
|
19
|
+
Use FastTree when:
|
|
20
|
+
- You have >200 sequences and need a result in minutes.
|
|
21
|
+
- You need a quick exploratory tree before committing to a slower, more thorough method.
|
|
22
|
+
|
|
23
|
+
Use IQ-TREE or RAxML when:
|
|
24
|
+
- Accuracy is paramount (publication-quality tree with model selection).
|
|
25
|
+
- Dataset is <500 sequences and run time is not a constraint.
|
|
26
|
+
|
|
27
|
+
## Input Requirements
|
|
28
|
+
|
|
29
|
+
FastTree requires a multiple sequence alignment in FASTA or PHYLIP format. Always trim the alignment first (see multiple-alignment skill).
|
|
30
|
+
|
|
31
|
+
## Commands
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# Nucleotide alignment (GTR+CAT model; without -gtr FastTree uses Jukes-Cantor+CAT for DNA)
|
|
35
|
+
FastTree -gtr -nt trimmed.fasta > tree.nwk
|
|
36
|
+
|
|
37
|
+
# Protein alignment (JTT+CAT model — the default; add -wag or -lg to use WAG or LG instead)
|
|
38
|
+
FastTree trimmed.fasta > tree.nwk
|
|
39
|
+
|
|
40
|
+
# With 1000 resamples for SH-like local support values (1000 is already the default; this is not a full bootstrap)
|
|
41
|
+
FastTree -gtr -nt -boot 1000 trimmed.fasta > tree.nwk
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Newick Format
|
|
45
|
+
|
|
46
|
+
A Newick tree encodes topology and branch lengths as nested parentheses:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
((A:0.1,B:0.2):0.05,(C:0.3,D:0.15):0.08);
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- Each leaf is a sequence identifier.
|
|
53
|
+
- Numbers after `:` are branch lengths (substitutions per site).
|
|
54
|
+
- Numbers before `:` at internal nodes are support values (FastTree: SH-like local support on a 0–1 scale; RAxML/IQ-TREE: bootstrap percentages, 0–100).
|
|
55
|
+
- The semicolon terminates the tree.
|
|
56
|
+
|
|
57
|
+
## Interpreting Bootstrap Support
|
|
58
|
+
|
|
59
|
+
| Value (0–1 scale) | Interpretation |
|
|
60
|
+
|-------------------|----------------|
|
|
61
|
+
| ≥ 0.95 | Strong support — clade is well-resolved |
|
|
62
|
+
| 0.70–0.94 | Moderate support — likely correct |
|
|
63
|
+
| 0.50–0.69 | Weak support — treat with caution |
|
|
64
|
+
| < 0.50 | Unresolved — do not over-interpret the clade |
|
|
65
|
+
|
|
66
|
+
## Visualization
|
|
67
|
+
|
|
68
|
+
Convert and visualize Newick trees with:
|
|
69
|
+
- **FigTree** (GUI, free): color branches by bootstrap, midpoint-root.
|
|
70
|
+
- **iTOL** (web): publication-quality with metadata overlays.
|
|
71
|
+
- **ETE3** (Python): programmatic rendering.
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from ete3 import Tree
|
|
75
|
+
t = Tree("tree.nwk")
|
|
76
|
+
print(t.get_ascii(show_internal=True))
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Always root the tree at an outgroup or by midpoint before interpreting topology.
|