@nahisaho/satori 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -39
- package/package.json +1 -1
- package/src/.github/skills/scientific-biomedical-pubtator/SKILL.md +331 -0
- package/src/.github/skills/scientific-cell-line-resources/SKILL.md +258 -0
- package/src/.github/skills/scientific-ebi-databases/SKILL.md +280 -0
- package/src/.github/skills/scientific-ontology-enrichment/SKILL.md +340 -0
- package/src/.github/skills/scientific-phylogenetics/SKILL.md +297 -0
- package/src/.github/skills/scientific-preprint-archive/SKILL.md +476 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +322 -0
- package/src/.github/skills/scientific-regulatory-genomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-reinforcement-learning/SKILL.md +280 -0
- package/src/.github/skills/scientific-symbolic-mathematics/SKILL.md +277 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-phylogenetics
|
|
3
|
+
description: |
|
|
4
|
+
系統解析スキル。ete3/ETE Toolkit による系統樹構築・可視化、
|
|
5
|
+
scikit-bio 系統的多様性、配列アライメントベース進化解析、
|
|
6
|
+
分子時計・分岐年代推定、祖先配列再構成パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Phylogenetics
|
|
10
|
+
|
|
11
|
+
ETE Toolkit / scikit-bio を中心とした
|
|
12
|
+
分子系統解析・進化生物学パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 分子系統樹を構築・可視化するとき (NJ/ML/ベイズ法)
|
|
17
|
+
- 多重配列アライメントから系統推定するとき
|
|
18
|
+
- 分岐年代推定 (分子時計) を行うとき
|
|
19
|
+
- 系統的多様性 (PD: Phylogenetic Diversity) を計算するとき
|
|
20
|
+
- 祖先配列再構成を行うとき
|
|
21
|
+
- 系統比較法 (PGLS 等) で形質進化を解析するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. ETE Toolkit 系統樹構築
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from ete3 import Tree, TreeStyle, NodeStyle, faces, AttrFace
|
|
31
|
+
import subprocess
|
|
32
|
+
import tempfile
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def build_phylogenetic_tree(sequences_fasta, method="fasttree", model="GTR"):
    """
    Build a phylogenetic tree from an aligned FASTA file with an external tool.

    Parameters:
        sequences_fasta: str — path to the (already aligned) FASTA file
        method: str — "fasttree", "raxml" or "iqtree"; any other value
            falls back to FastTree
        model: str — substitution model ("GTR", "JTT", "WAG", "LG")

    Returns:
        ete3.Tree — tree parsed from the Newick output of the chosen tool

    Raises:
        RuntimeError: if the external program exits with a non-zero status

    K-Dense: etetoolkit — Phylogenetics toolkit
    """
    # Unknown methods keep the historical fallback to FastTree.
    tool = method if method in ("fasttree", "raxml", "iqtree") else "fasttree"
    model_key = str(model).upper()
    protein_models = ("JTT", "WAG", "LG")

    if tool == "fasttree":
        # FastTree defaults to JTT for proteins; -wag / -lg select the other
        # protein matrices. Everything else runs as nucleotide GTR, which is
        # what this function always did before.
        if model_key == "JTT":
            model_flags = []
        elif model_key in ("WAG", "LG"):
            model_flags = [f"-{model_key.lower()}"]
        else:
            model_flags = ["-gtr", "-nt"]
        cmd = ["fasttree", *model_flags, sequences_fasta]
    elif tool == "raxml":
        if model_key in protein_models:
            raxml_model = f"PROTGAMMA{model_key}"
        else:
            raxml_model = "GTRGAMMA"
        cmd = [
            "raxmlHPC", "-s", sequences_fasta, "-n", "tree",
            "-m", raxml_model, "-p", "12345",
        ]
    else:
        cmd = [
            "iqtree2", "-s", sequences_fasta,
            "-m", model, "-bb", "1000", "--prefix", "iqtree_out",
        ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Fail loudly instead of parsing empty output or a stale tree file.
        raise RuntimeError(
            f"{cmd[0]} failed (exit {result.returncode}): "
            f"{result.stderr.strip()[-500:]}"
        )

    if tool == "fasttree":
        # FastTree prints the Newick tree on stdout.
        newick = result.stdout
    elif tool == "iqtree":
        with open("iqtree_out.treefile", "r") as f:
            newick = f.read()
    else:
        # RAxML writes the best tree to RAxML_bestTree.<run name>; its stdout
        # is only a log. The run name is the "-n tree" argument above.
        with open("RAxML_bestTree.tree", "r") as f:
            newick = f.read()

    tree = Tree(newick)
    print(f"Phylogenetic tree ({method}, {model}): "
          f"{len(tree)} leaves, {len(list(tree.traverse()))} total nodes")
    return tree
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def visualize_tree(tree, output_file="phylogenetic_tree.png",
                   layout="rectangular", show_support=True):
    """
    Render a tree to an image file with ETE3.

    Parameters:
        tree: ete3.Tree — tree object to draw
        output_file: str — path of the image to write
        layout: str — "circular" for a circular layout; any other value
            gives a rectangular layout
        show_support: bool — show branch support values

    Returns:
        str — the output_file path that was passed in
    """
    leaf_color, leaf_size = "#2196F3", 8
    internal_color, internal_size = "#E91E63", 5
    supported_color = "#4CAF50"

    ts = TreeStyle()
    if layout == "circular":
        ts.mode = "c"
    else:
        ts.mode = "r"
    ts.show_leaf_name = True
    ts.show_branch_length = True
    ts.show_branch_support = show_support
    ts.branch_vertical_margin = 10

    # Give every node its own style object.
    for current in tree.traverse():
        if current.is_leaf():
            color, size = leaf_color, leaf_size
        else:
            color, size = internal_color, internal_size
            # Internal nodes with support >= 0.9 get the highlight color.
            if show_support and current.support >= 0.9:
                color = supported_color
        node_style = NodeStyle()
        node_style["fgcolor"] = color
        node_style["size"] = size
        current.set_style(node_style)

    tree.render(output_file, tree_style=ts, w=800, units="px")
    print(f"Tree rendered: {output_file} ({layout} layout)")
    return output_file
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 2. 多重配列アライメント
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from Bio import AlignIO, SeqIO
|
|
115
|
+
from Bio.Align.Applications import MafftCommandline, MuscleCommandline
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def run_multiple_alignment(input_fasta, method="mafft", output_fasta=None):
    """
    Run a multiple sequence alignment with an external aligner.

    Parameters:
        input_fasta: str — path to the input FASTA file
        method: str — "mafft" or "muscle"; any other value runs ClustalW
            (clustalw2)
        output_fasta: str — output path; defaults to
            "<input stem>_aligned_<method>.fasta" next to the input

    Returns:
        Bio.Align.MultipleSeqAlignment — alignment read back from output_fasta

    Raises:
        subprocess.CalledProcessError: if the aligner exits with an error
    """
    # Imported locally so the snippet also works when copied on its own.
    import subprocess
    from pathlib import Path

    if output_fasta is None:
        # Build the name from the stem so the output can never be the input
        # file itself, whatever extension the input uses (.fa, .fas, ...).
        source = Path(input_fasta)
        output_fasta = str(
            source.with_name(f"{source.stem}_aligned_{method}.fasta")
        )

    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact.
    if method == "mafft":
        # MAFFT writes the alignment to stdout; send it to the output file.
        with open(output_fasta, "w") as handle:
            subprocess.run(
                ["mafft", "--auto", str(input_fasta)],
                stdout=handle, check=True,
            )
    elif method == "muscle":
        subprocess.run(
            ["muscle", "-in", str(input_fasta), "-out", str(output_fasta)],
            check=True,
        )
    else:
        # ClustalW writes Clustal format by default; request FASTA so the
        # AlignIO.read call below can parse the result.
        subprocess.run(
            [
                "clustalw2",
                f"-INFILE={input_fasta}",
                f"-OUTFILE={output_fasta}",
                "-OUTPUT=FASTA",
            ],
            check=True,
        )

    alignment = AlignIO.read(output_fasta, "fasta")
    print(f"Alignment ({method}): {len(alignment)} sequences, "
          f"{alignment.get_alignment_length()} positions")
    return alignment
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## 3. 系統的多様性 (Phylogenetic Diversity)
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
import skbio
|
|
149
|
+
from skbio import TreeNode
|
|
150
|
+
from skbio.diversity import alpha_diversity, beta_diversity
|
|
151
|
+
import numpy as np
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def calculate_phylogenetic_diversity(newick_string, sample_otus):
    """
    Compute Faith's PD and weighted UniFrac for a set of samples.

    Parameters:
        newick_string: str — tree in Newick format
        sample_otus: dict — {sample_id: {otu_id: abundance}}

    Returns:
        dict — {"faith_pd": alpha_diversity result,
                "unifrac": beta_diversity result}

    K-Dense: scikit-bio — PD & UniFrac
    """
    phylo_tree = TreeNode.read([newick_string])

    # Collect every OTU seen in any sample, in sorted order.
    otu_universe = set()
    for abundances in sample_otus.values():
        otu_universe.update(abundances)
    all_otus = sorted(otu_universe)
    sample_names = list(sample_otus)

    # Dense sample x OTU table; OTUs absent from a sample stay at 0.
    otu_table = np.zeros((len(sample_names), len(all_otus)))
    for row, sample in zip(otu_table, sample_names):
        abundances = sample_otus[sample]
        row[:] = [abundances.get(otu, 0) for otu in all_otus]

    # Faith's PD (alpha diversity)
    pd_values = alpha_diversity(
        "faith_pd", otu_table,
        ids=sample_names, tree=phylo_tree, otu_ids=all_otus,
    )
    pd_mean = pd_values.mean()
    pd_low = pd_values.min()
    pd_high = pd_values.max()
    print(f"Faith's PD: mean={pd_mean:.3f}, "
          f"range=[{pd_low:.3f}, {pd_high:.3f}]")

    # Weighted UniFrac (beta diversity)
    unifrac_dm = beta_diversity(
        "weighted_unifrac", otu_table,
        ids=sample_names, tree=phylo_tree, otu_ids=all_otus,
    )
    mean_distance = unifrac_dm.condensed_form().mean()
    print(f"Weighted UniFrac: mean distance = "
          f"{mean_distance:.4f}")

    return {"faith_pd": pd_values, "unifrac": unifrac_dm}
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## 4. 分子時計・分岐年代推定
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
def estimate_divergence_times(tree, calibrations, rate_model="strict"):
    """
    Estimate the ages of internal nodes with a strict molecular clock.

    A node's age is taken as proportional to its height, i.e. the
    root-to-tip distance of the deepest leaf minus the node's distance from
    the root. Each calibration that can be placed on the tree gives one rate
    estimate (height of the calibrated ancestor / calibration midpoint), and
    the mean of those rates converts heights into ages.

    Parameters:
        tree: ete3.Tree — tree with branch lengths
        calibrations: dict — {(taxon1, taxon2): (min_age, max_age)}
            e.g., {("human", "mouse"): (85, 95)}  # MYA
        rate_model: str — "strict" or "relaxed"; only the strict clock is
            implemented, other values emit a warning and use it anyway

    Returns:
        dict — {node name (or "node_<id>" for unnamed nodes): estimated age},
        in the same unit as the calibration ages

    Raises:
        ValueError: if calibrations is empty
    """
    import warnings

    if not calibrations:
        raise ValueError("at least one calibration is required")
    if rate_model != "strict":
        warnings.warn(
            f"rate_model={rate_model!r} is not implemented; "
            "using a strict clock",
            stacklevel=2,
        )

    # Root-to-tip distance of the deepest leaf = height of the whole tree.
    tree_height = max(tree.get_distance(leaf) for leaf in tree.get_leaves())

    # One rate estimate (branch length per unit time) per usable calibration.
    rates = []
    for (t1, t2), (min_age, max_age) in calibrations.items():
        node1 = tree.search_nodes(name=t1)
        node2 = tree.search_nodes(name=t2)
        if not (node1 and node2):
            continue  # taxa not in this tree: calibration cannot be placed
        ancestor = tree.get_common_ancestor(node1[0], node2[0])
        # Ages are measured back from the tips, so use the node's height
        # above the tips rather than its depth below the root.
        height = tree_height - tree.get_distance(ancestor)
        calibration_age = (min_age + max_age) / 2
        rate = height / calibration_age if calibration_age > 0 else 1
        print(f"Calibration {t1}-{t2}: {calibration_age} MYA, rate={rate:.6f}")
        if calibration_age > 0 and height > 0:
            rates.append(rate)

    if rates:
        # Time per unit of branch length under the mean calibrated rate.
        scale = 1 / (sum(rates) / len(rates))
    else:
        # No calibration could be placed: pin the root at the oldest
        # calibration midpoint and scale heights proportionally.
        root_age = max(
            (min_age + max_age) / 2
            for (min_age, max_age) in calibrations.values()
        )
        scale = root_age / tree_height if tree_height > 0 else 0.0

    # Estimate ages for all internal nodes
    node_ages = {}
    for node in tree.traverse("postorder"):
        if not node.is_leaf():
            height = tree_height - tree.get_distance(node)
            node_ages[node.name or f"node_{id(node)}"] = height * scale

    return node_ages
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## 5. 祖先配列再構成
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
def ancestral_sequence_reconstruction(alignment_file, tree_file, model="JTT"):
    """
    Reconstruct ancestral sequences by maximum likelihood with IQ-TREE.

    Runs `iqtree2 -asr` on a fixed tree and reads the resulting
    "asr_output.state" table from the current working directory.

    Parameters:
        alignment_file: str — path to the alignment file
        tree_file: str — path to the tree file (Newick)
        model: str — amino-acid substitution model

    Returns:
        dict — {node name: [state at site 1, state at site 2, ...]}; empty
        if IQ-TREE succeeded but wrote no state file. None if IQ-TREE
        exited with an error.
    """
    # Imported locally: os was never imported in this file, and this keeps
    # the snippet usable when copied on its own.
    import csv
    import os
    import subprocess

    # Using IQ-TREE for ASR
    cmd = [
        "iqtree2", "-s", alignment_file, "-te", tree_file,
        "-m", model, "-asr", "--prefix", "asr_output",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"ASR failed: {result.stderr[:200]}")
        return None

    # Parse ancestral sequences
    asr_file = "asr_output.state"
    ancestral_seqs = {}
    if os.path.exists(asr_file):
        with open(asr_file, newline="") as f:
            for row in csv.reader(f, delimiter="\t"):
                # Skip blank/short rows and '#' comment lines.
                if len(row) < 3 or row[0].startswith("#"):
                    continue
                node, site, state = row[0], row[1], row[2]
                # The column header row ("Node", "Site", "State", ...) has
                # a non-numeric site field and is not a data row.
                if not site.isdigit():
                    continue
                ancestral_seqs.setdefault(node, []).append(state)

    print(f"ASR ({model}): {len(ancestral_seqs)} ancestral nodes reconstructed")
    return ancestral_seqs
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## パイプライン出力
|
|
279
|
+
|
|
280
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
281
|
+
|---|---|---|
|
|
282
|
+
| `results/phylogenetic_tree.nwk` | Newick 系統樹 | → infectious-disease, microbiome-metagenomics |
|
|
283
|
+
| `figures/phylogenetic_tree.png` | 系統樹可視化 | → publication-figures, presentation |
|
|
284
|
+
| `results/divergence_times.json` | 分岐年代推定 | → population-genetics, environmental-ecology |
|
|
285
|
+
| `results/ancestral_sequences.fasta` | 祖先配列 | → protein-design, sequence-analysis |
|
|
286
|
+
| `results/phylo_diversity.json` | 系統的多様性 | → microbiome-metagenomics |
|
|
287
|
+
|
|
288
|
+
## パイプライン統合
|
|
289
|
+
|
|
290
|
+
```
|
|
291
|
+
sequence-analysis ──→ phylogenetics ──→ infectious-disease
|
|
292
|
+
(アライメント) (系統樹構築) (病原体系統解析)
|
|
293
|
+
│
|
|
294
|
+
├──→ microbiome-metagenomics (UniFrac)
|
|
295
|
+
├──→ population-genetics (分岐推定)
|
|
296
|
+
└──→ environmental-ecology (系統的多様性)
|
|
297
|
+
```
|