@nahisaho/satori 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ ---
2
+ name: scientific-phylogenetics
3
+ description: |
4
+ 系統解析スキル。ete3/ETE Toolkit による系統樹構築・可視化、
5
+ scikit-bio 系統的多様性、配列アライメントベース進化解析、
6
+ 分子時計・分岐年代推定、祖先配列再構成パイプライン。
7
+ ---
8
+
9
+ # Scientific Phylogenetics
10
+
11
+ ETE Toolkit / scikit-bio を中心とした
12
+ 分子系統解析・進化生物学パイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - 分子系統樹を構築・可視化するとき (NJ/ML/ベイズ法)
17
+ - 多重配列アライメントから系統推定するとき
18
+ - 分岐年代推定 (分子時計) を行うとき
19
+ - 系統的多様性 (PD: Phylogenetic Diversity) を計算するとき
20
+ - 祖先配列再構成を行うとき
21
+ - 系統比較法 (PGLS 等) で形質進化を解析するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. ETE Toolkit 系統樹構築
28
+
29
+ ```python
30
+ from ete3 import Tree, TreeStyle, NodeStyle, faces, AttrFace
31
+ import subprocess
32
+ import tempfile
33
+
34
+
35
def build_phylogenetic_tree(sequences_fasta, method="fasttree", model="GTR"):
    """
    Infer a phylogenetic tree from an aligned FASTA file with an external tool.

    Parameters:
        sequences_fasta: str — path to the (already aligned) FASTA file
        method: str — "fasttree", "raxml" or "iqtree"; any other value falls
            back to the FastTree command, as before
        model: str — substitution model. "JTT", "WAG" and "LG" are treated as
            protein models; anything else is run as nucleotide GTR for
            FastTree/RAxML. IQ-TREE receives the value unchanged.

    Returns:
        ete3.Tree — the tree parsed from the tool's Newick output

    Raises:
        RuntimeError: if the external program exits with a non-zero status.

    K-Dense: etetoolkit — Phylogenetics toolkit
    """
    model_key = model.upper()
    is_protein = model_key in ("JTT", "WAG", "LG")

    # FastTree: JTT is the protein default, WAG/LG have dedicated switches,
    # and nucleotide GTR needs "-gtr -nt".
    fasttree_flags = {"JTT": [], "WAG": ["-wag"], "LG": ["-lg"]}.get(
        model_key, ["-gtr", "-nt"]
    )
    # RAxML spells protein models as PROTGAMMA<matrix>.
    raxml_model = f"PROTGAMMA{model_key}" if is_protein else "GTRGAMMA"

    commands = {
        "fasttree": ["fasttree", *fasttree_flags, sequences_fasta],
        "raxml": [
            "raxmlHPC", "-s", sequences_fasta, "-n", "tree",
            "-m", raxml_model, "-p", "12345",
        ],
        "iqtree": [
            "iqtree2", "-s", sequences_fasta,
            "-m", model, "-bb", "1000", "--prefix", "iqtree_out",
        ],
    }

    cmd = commands.get(method, commands["fasttree"])
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Fail here with the tool's own message instead of a confusing
        # Newick parse error further down.
        raise RuntimeError(
            f"{cmd[0]} exited with status {result.returncode}: "
            f"{result.stderr[:200]}"
        )

    # IQ-TREE and RAxML write the tree to a file in the working directory;
    # only FastTree (and the FastTree fallback) prints Newick on stdout.
    tree_files = {
        "iqtree": "iqtree_out.treefile",
        "raxml": "RAxML_bestTree.tree",
    }
    tree_path = tree_files.get(method)
    if tree_path is None:
        newick = result.stdout
    else:
        with open(tree_path, "r") as f:
            newick = f.read()

    tree = Tree(newick)
    print(f"Phylogenetic tree ({method}, {model}): "
          f"{len(tree)} leaves, {len(list(tree.traverse()))} total nodes")
    return tree
73
+
74
+
75
def visualize_tree(tree, output_file="phylogenetic_tree.png",
                   layout="rectangular", show_support=True):
    """
    Render an ete3 tree to an image file.

    Parameters:
        tree: ete3.Tree — tree to draw
        output_file: str — path of the image to write
        layout: str — "circular" for a circular layout; any other value
            gives the rectangular layout
        show_support: bool — show branch support values and highlight
            internal nodes whose support is at least 0.9

    Returns:
        str — output_file, unchanged
    """
    tree_style = TreeStyle()
    tree_style.show_leaf_name = True
    tree_style.show_branch_length = True
    tree_style.show_branch_support = show_support
    tree_style.branch_vertical_margin = 10
    if layout == "circular":
        tree_style.mode = "c"
    else:
        tree_style.mode = "r"

    # Leaves: large blue dots. Internal nodes: small pink dots, or green
    # when support is shown and reaches the 0.9 threshold.
    for current in tree.traverse():
        if current.is_leaf():
            color, size = "#2196F3", 8
        elif show_support and current.support >= 0.9:
            color, size = "#4CAF50", 5
        else:
            color, size = "#E91E63", 5

        node_style = NodeStyle()
        node_style["fgcolor"] = color
        node_style["size"] = size
        current.set_style(node_style)

    tree.render(output_file, tree_style=tree_style, w=800, units="px")
    print(f"Tree rendered: {output_file} ({layout} layout)")
    return output_file
109
+ ```
110
+
111
+ ## 2. 多重配列アライメント
112
+
113
+ ```python
114
+ from Bio import AlignIO, SeqIO
115
+ from Bio.Align.Applications import MafftCommandline, MuscleCommandline
116
+
117
+
118
def run_multiple_alignment(input_fasta, method="mafft", output_fasta=None):
    """
    Run a multiple sequence alignment with an external aligner.

    Parameters:
        input_fasta: str — path to the unaligned input FASTA file
        method: str — "mafft" or "muscle"; any other value runs clustalw2
        output_fasta: str — path for the aligned FASTA output. Defaults to
            "<input stem>_aligned_<method>.fasta" next to the input file.

    Returns:
        The alignment object returned by AlignIO.read for the output file.

    Raises:
        subprocess.CalledProcessError: if the aligner exits with a non-zero
            status.
    """
    from pathlib import Path

    if output_fasta is None:
        # Derive the name from the stem rather than str.replace(".fasta", ...):
        # for inputs such as "seqs.fa" the old form returned the input path
        # itself, so the aligner would overwrite its own input.
        source = Path(input_fasta)
        output_fasta = str(
            source.with_name(f"{source.stem}_aligned_{method}.fasta")
        )

    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact.
    if method == "mafft":
        # MAFFT writes the alignment to stdout, so redirect it ourselves.
        with open(output_fasta, "w") as out_handle:
            subprocess.run(
                ["mafft", "--auto", input_fasta],
                stdout=out_handle, check=True,
            )
    elif method == "muscle":
        subprocess.run(
            ["muscle", "-in", input_fasta, "-out", output_fasta],
            check=True,
        )
    else:
        # Force FASTA output: ClustalW's default format would not be
        # readable by the FASTA parser below.
        subprocess.run(
            [
                "clustalw2",
                f"-INFILE={input_fasta}",
                f"-OUTFILE={output_fasta}",
                "-OUTPUT=FASTA",
            ],
            check=True,
        )

    alignment = AlignIO.read(output_fasta, "fasta")
    print(f"Alignment ({method}): {len(alignment)} sequences, "
          f"{alignment.get_alignment_length()} positions")
    return alignment
143
+ ```
144
+
145
+ ## 3. 系統的多様性 (Phylogenetic Diversity)
146
+
147
+ ```python
148
+ import skbio
149
+ from skbio import TreeNode
150
+ from skbio.diversity import alpha_diversity, beta_diversity
151
+ import numpy as np
152
+
153
+
154
def calculate_phylogenetic_diversity(newick_string, sample_otus):
    """
    Compute Faith's PD per sample and weighted UniFrac between samples.

    Parameters:
        newick_string: str — phylogenetic tree in Newick format
        sample_otus: dict — {sample_id: {otu_id: abundance}}

    Returns:
        dict — {"faith_pd": result of alpha_diversity("faith_pd", ...),
                "unifrac": result of beta_diversity("weighted_unifrac", ...)}

    K-Dense: scikit-bio — PD & UniFrac
    """
    phylogeny = TreeNode.read([newick_string])

    # Dense samples x OTUs table; OTUs absent from a sample count as 0.
    all_otus = sorted({
        otu_id
        for abundances in sample_otus.values()
        for otu_id in abundances
    })
    sample_names = list(sample_otus)
    otu_table = np.zeros((len(sample_names), len(all_otus)))
    for row, sample_id in enumerate(sample_names):
        abundances = sample_otus[sample_id]
        for col, otu_id in enumerate(all_otus):
            otu_table[row, col] = abundances.get(otu_id, 0)

    # Both metrics take the same identifiers and tree.
    shared_kwargs = {
        "ids": sample_names,
        "tree": phylogeny,
        "otu_ids": all_otus,
    }

    # Alpha diversity: Faith's PD.
    pd_values = alpha_diversity("faith_pd", otu_table, **shared_kwargs)
    pd_mean = pd_values.mean()
    pd_low = pd_values.min()
    pd_high = pd_values.max()
    print(f"Faith's PD: mean={pd_mean:.3f}, "
          f"range=[{pd_low:.3f}, {pd_high:.3f}]")

    # Beta diversity: weighted UniFrac.
    unifrac_dm = beta_diversity("weighted_unifrac", otu_table, **shared_kwargs)
    mean_distance = unifrac_dm.condensed_form().mean()
    print(f"Weighted UniFrac: mean distance = "
          f"{mean_distance:.4f}")

    return {"faith_pd": pd_values, "unifrac": unifrac_dm}
189
+ ```
190
+
191
+ ## 4. 分子時計・分岐年代推定
192
+
193
+ ```python
194
def estimate_divergence_times(tree, calibrations, rate_model="strict"):
    """
    Estimate internal-node ages with a single global (strict) clock rate.

    A node's height is the largest root-to-leaf distance minus the node's
    distance from the root, so the root is the oldest node. Each usable
    calibration gives a rate (MRCA height / calibration midpoint); the mean
    rate converts every internal node's height into an age.

    Parameters:
        tree: ete3.Tree — tree with branch lengths
        calibrations: dict — {(taxon1, taxon2): (min_age, max_age)}
            e.g., {("human", "mouse"): (85, 95)}  # MYA
        rate_model: str — "strict" or "relaxed". Kept for interface
            compatibility; this implementation does not read it and always
            applies one global rate.

    Returns:
        dict — {node name, or "node_<id>" for unnamed nodes: estimated age},
            in the same units as the calibration ages

    Raises:
        ValueError: if calibrations is empty, the tree has no branch length,
            or no calibration can be applied (taxa not found, non-positive
            age, or zero-height MRCA).
    """
    if not calibrations:
        raise ValueError("At least one calibration is required")

    # Tree height: distance from the root to the farthest leaf.
    tree_height = max(tree.get_distance(leaf) for leaf in tree.get_leaves())
    if tree_height <= 0:
        raise ValueError("Tree has no positive branch lengths")

    # One rate estimate (branch-length units per time unit) per calibration.
    rates = []
    for (t1, t2), (min_age, max_age) in calibrations.items():
        node1 = tree.search_nodes(name=t1)
        node2 = tree.search_nodes(name=t2)
        if not (node1 and node2):
            continue
        ancestor = tree.get_common_ancestor(node1[0], node2[0])
        ancestor_height = tree_height - tree.get_distance(ancestor)
        calibration_age = (min_age + max_age) / 2
        if calibration_age <= 0 or ancestor_height <= 0:
            continue
        rate = ancestor_height / calibration_age
        rates.append(rate)
        print(f"Calibration {t1}-{t2}: {calibration_age} MYA, rate={rate:.6f}")

    if not rates:
        raise ValueError("No calibration could be applied to this tree")
    mean_rate = sum(rates) / len(rates)

    # Convert every internal node's height into an age.
    node_ages = {}
    for node in tree.traverse("postorder"):
        if not node.is_leaf():
            height = tree_height - tree.get_distance(node)
            node_ages[node.name or f"node_{id(node)}"] = height / mean_rate

    return node_ages
231
+ ```
232
+
233
+ ## 5. 祖先配列再構成
234
+
235
+ ```python
236
def ancestral_sequence_reconstruction(alignment_file, tree_file, model="JTT"):
    """
    Reconstruct ancestral sequences by running IQ-TREE on a fixed tree.

    Parameters:
        alignment_file: str — path to the alignment file
        tree_file: str — path to the tree file (Newick)
        model: str — substitution model passed to IQ-TREE via -m

    Returns:
        dict — {node label: [state, state, ...]} in file order, read from
            "asr_output.state"; empty if that file does not exist.
        None — if IQ-TREE exits with a non-zero status (the first 200
            characters of stderr are printed).
    """
    # Local imports: the surrounding file does not import these modules.
    import csv
    import os

    cmd = [
        "iqtree2", "-s", alignment_file, "-te", tree_file,
        "-m", model, "-asr", "--prefix", "asr_output",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        print(f"ASR failed: {result.stderr[:200]}")
        return None

    asr_file = "asr_output.state"
    ancestral_seqs = {}
    if os.path.exists(asr_file):
        with open(asr_file, newline="") as f:
            for row in csv.reader(f, delimiter="\t"):
                # Skip blank/short rows and "#" comment lines.
                if len(row) < 3 or row[0].startswith("#"):
                    continue
                node, site, state = row[:3]
                # The column header row ("Node", "Site", "State", ...) has a
                # non-numeric site field; it is not sequence data.
                if not site.isdigit():
                    continue
                ancestral_seqs.setdefault(node, []).append(state)

    print(f"ASR ({model}): {len(ancestral_seqs)} ancestral nodes reconstructed")
    return ancestral_seqs
274
+ ```
275
+
276
+ ---
277
+
278
+ ## パイプライン出力
279
+
280
+ | 出力ファイル | 説明 | 連携先スキル |
281
+ |---|---|---|
282
+ | `results/phylogenetic_tree.nwk` | Newick 系統樹 | → infectious-disease, microbiome-metagenomics |
283
+ | `figures/phylogenetic_tree.png` | 系統樹可視化 | → publication-figures, presentation |
284
+ | `results/divergence_times.json` | 分岐年代推定 | → population-genetics, environmental-ecology |
285
+ | `results/ancestral_sequences.fasta` | 祖先配列 | → protein-design, sequence-analysis |
286
+ | `results/phylo_diversity.json` | 系統的多様性 | → microbiome-metagenomics |
287
+
288
+ ## パイプライン統合
289
+
290
+ ```
291
+ sequence-analysis ──→ phylogenetics ──→ infectious-disease
292
+ (アライメント) (系統樹構築) (病原体系統解析)
293
+
294
+ ├──→ microbiome-metagenomics (UniFrac)
295
+ ├──→ population-genetics (分岐推定)
296
+ └──→ environmental-ecology (系統的多様性)
297
+ ```