@nahisaho/satori 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -39
- package/package.json +1 -1
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +4 -0
- package/src/.github/skills/scientific-cellxgene-census/SKILL.md +257 -0
- package/src/.github/skills/scientific-clingen-curation/SKILL.md +258 -0
- package/src/.github/skills/scientific-clinical-nlp/SKILL.md +250 -0
- package/src/.github/skills/scientific-clinical-pharmacology/SKILL.md +361 -0
- package/src/.github/skills/scientific-clinical-standards/SKILL.md +444 -0
- package/src/.github/skills/scientific-crispr-design/SKILL.md +369 -0
- package/src/.github/skills/scientific-drug-repurposing/SKILL.md +4 -0
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +5 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +5 -0
- package/src/.github/skills/scientific-epigenomics-chromatin/SKILL.md +5 -0
- package/src/.github/skills/scientific-glycomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-gtex-tissue-expression/SKILL.md +5 -2
- package/src/.github/skills/scientific-hgnc-nomenclature/SKILL.md +282 -0
- package/src/.github/skills/scientific-human-cell-atlas/SKILL.md +3 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +4 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +9 -0
- package/src/.github/skills/scientific-lipidomics/SKILL.md +284 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +3 -0
- package/src/.github/skills/scientific-metabolomics-network/SKILL.md +311 -0
- package/src/.github/skills/scientific-metagenome-assembled-genomes/SKILL.md +299 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +8 -0
- package/src/.github/skills/scientific-pharmacogenomics/SKILL.md +4 -0
- package/src/.github/skills/scientific-pharos-targets/SKILL.md +276 -0
- package/src/.github/skills/scientific-protein-structure-analysis/SKILL.md +4 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +11 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +11 -0
- package/src/.github/skills/scientific-variant-effect-prediction/SKILL.md +7 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-metabolomics-network
|
|
3
|
+
description: |
|
|
4
|
+
代謝物ネットワーク構築スキル。KEGG/Reactome 代謝パスウェイ
|
|
5
|
+
グラフ抽出・代謝物相関ネットワーク構築 (GGM/WGCNA)・
|
|
6
|
+
ハブ代謝物同定・MetaboAnalyst 統合エンリッチメント
|
|
7
|
+
パイプライン。
|
|
8
|
+
TU 外スキル (直接 Python ライブラリ + REST API)。
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Scientific Metabolomics Network
|
|
12
|
+
|
|
13
|
+
KEGG/Reactome 代謝パスウェイからのネットワーク構築、
|
|
14
|
+
代謝物間相関解析 (Gaussian Graphical Model / WGCNA)、
|
|
15
|
+
ハブ代謝物同定、MetaboAnalyst 統合エンリッチメントの
|
|
16
|
+
パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- 代謝物相関ネットワーク (partial correlation) を構築するとき
|
|
21
|
+
- KEGG/Reactome 代謝パスウェイをグラフ化するとき
|
|
22
|
+
- ハブ代謝物 (高接続度) を同定するとき
|
|
23
|
+
- 代謝パスウェイエンリッチメント解析を行うとき
|
|
24
|
+
- メタボロームデータのネットワーク可視化
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
## 1. 代謝物相関ネットワーク
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import numpy as np
|
|
34
|
+
import pandas as pd
|
|
35
|
+
import networkx as nx
|
|
36
|
+
from sklearn.covariance import GraphicalLassoCV
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def metabolite_correlation_network(
        data, method="glasso", threshold=0.1):
    """
    Build an undirected metabolite association network.

    Parameters:
        data: pd.DataFrame — metabolite concentration matrix
            (rows = samples, columns = metabolites)
        method: str — "glasso" fits GraphicalLassoCV and converts the
            estimated precision matrix into partial correlations.
            Any other value (e.g. "pearson") uses the pairwise Pearson
            correlation from ``data.corr()``; note that this is the
            marginal correlation, not a partial correlation.
        threshold: float — an edge is kept only when the absolute
            association is strictly greater than this value

    Returns:
        nx.Graph with one node per column of ``data``. Every edge
        carries ``weight`` (absolute association rounded to 4
        decimals) and ``sign`` ("+" or "-").
    """
    metabolites = data.columns.tolist()

    if method == "glasso":
        model = GraphicalLassoCV(cv=5)
        model.fit(data.values)
        precision = model.precision_
        # Partial correlation: rho_ij = -p_ij / sqrt(p_ii * p_jj)
        diag = np.sqrt(np.diag(precision))
        assoc = -(precision / np.outer(diag, diag))
        np.fill_diagonal(assoc, 1.0)
    else:
        assoc = data.corr().values

    G = nx.Graph()
    G.add_nodes_from(metabolites)

    # Upper triangle (k=1) visits each unordered pair once, in the same
    # row-major order as a nested i < j loop.
    rows, cols = np.triu_indices(len(metabolites), k=1)
    strength = np.abs(assoc[rows, cols])
    keep = strength > threshold  # NaN compares False, so NaN pairs are dropped
    for i, j, w in zip(rows[keep], cols[keep], strength[keep]):
        G.add_edge(
            metabolites[i],
            metabolites[j],
            weight=round(w, 4),
            sign=("+" if assoc[i, j] > 0 else "-"))

    print(f"Metabolite network: "
          f"{G.number_of_nodes()} nodes, "
          f"{G.number_of_edges()} edges "
          f"(threshold={threshold})")
    return G
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def hub_metabolites(G, top_n=10):
    """
    Rank hub metabolites by degree centrality.

    Parameters:
        G: nx.Graph — metabolite network
        top_n: int — number of top-ranked rows to keep

    Returns:
        pd.DataFrame with columns ``metabolite``, ``degree``,
        ``degree_centrality`` and ``betweenness``, sorted by
        ``degree_centrality`` in descending order and truncated to
        ``top_n`` rows. A graph without nodes yields an empty frame
        that still has these columns.
    """
    degree_cent = nx.degree_centrality(G)
    betweenness = nx.betweenness_centrality(G)

    rows = [
        {
            "metabolite": node,
            "degree": G.degree(node),
            "degree_centrality": round(degree_cent[node], 4),
            "betweenness": round(betweenness[node], 4),
        }
        for node in G.nodes()
    ]

    # Explicit columns keep sort_values() valid when the graph is empty.
    columns = ["metabolite", "degree",
               "degree_centrality", "betweenness"]
    df = pd.DataFrame(rows, columns=columns).sort_values(
        "degree_centrality",
        ascending=False).head(top_n)
    print(f"Top {top_n} hub metabolites:")
    for _, row in df.iterrows():
        print(f"  {row['metabolite']}: "
              f"deg={row['degree']}, "
              f"bc={row['betweenness']}")
    return df
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## 2. KEGG 代謝パスウェイグラフ
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
def kegg_pathway_graph(pathway_id):
    """
    Fetch a KEGG pathway as KGML and convert it to a directed graph.

    Parameters:
        pathway_id: str — KEGG pathway ID (e.g. "hsa00010")

    Returns:
        nx.DiGraph whose nodes are KGML entries (keyed by their
        graphics label, falling back to the entry name) with an
        ``entry_type`` attribute. Edges come from ``relation``
        elements (attribute ``relation_type``) and from ``reaction``
        elements (substrate -> product, attribute ``reaction``).

    Raises:
        requests.HTTPError — when the KEGG REST call returns an error
        status.
    """
    import requests
    import xml.etree.ElementTree as ET

    # Download the KGML document
    url = (f"https://rest.kegg.jp/get/"
           f"{pathway_id}/kgml")
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    root = ET.fromstring(resp.text)

    G = nx.DiGraph()

    # Nodes: remember entry id -> node label so edges can be resolved
    entry_map = {}
    for entry in root.findall("entry"):
        eid = entry.get("id")
        name = entry.get("name", "")
        etype = entry.get("type", "")
        graphics = entry.find("graphics")
        label = (graphics.get("name", name)
                 if graphics is not None else name)
        entry_map[eid] = label
        G.add_node(label, entry_type=etype)

    # Edges from relations (both endpoints must be known entries)
    for relation in root.findall("relation"):
        e1 = relation.get("entry1")
        e2 = relation.get("entry2")
        rtype = relation.get("type", "")
        if e1 in entry_map and e2 in entry_map:
            G.add_edge(entry_map[e1],
                       entry_map[e2],
                       relation_type=rtype)

    def _node_for(element):
        # Resolve a substrate/product through its entry id so the edge
        # lands on the node created above; fall back to the raw name
        # when the id is missing or unknown.
        return entry_map.get(element.get("id"),
                             element.get("name", ""))

    # Edges from reactions: every substrate -> every product
    for reaction in root.findall("reaction"):
        rname = reaction.get("name", "")
        substrates = [_node_for(s)
                      for s in reaction.findall("substrate")]
        products = [_node_for(p)
                    for p in reaction.findall("product")]
        for s in substrates:
            for p in products:
                G.add_edge(s, p,
                           reaction=rname)

    print(f"KEGG pathway {pathway_id}: "
          f"{G.number_of_nodes()} nodes, "
          f"{G.number_of_edges()} edges")
    return G
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## 3. パスウェイエンリッチメント
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
def metabolite_pathway_enrichment(
        metabolite_list, organism="hsa"):
    """
    Over-representation analysis of metabolites against KEGG pathways.

    Parameters:
        metabolite_list: list[str] — KEGG compound IDs
            (e.g. ["C00031", "C00158"])
        organism: str — organism code. Currently not used: only the
            reference ("map") pathways from the compound-to-pathway
            link table are tested.

    Returns:
        pd.DataFrame sorted by ``pvalue`` with columns ``pathway``,
        ``overlap``, ``pathway_size``, ``pvalue`` and ``metabolites``.
        Pathways without any query compound are omitted; if nothing
        overlaps, an empty frame with these columns is returned.

    Raises:
        requests.HTTPError — when the KEGG REST call returns an error
        status.
    """
    import requests
    from scipy.stats import hypergeom

    # KEGG compound -> pathway mapping
    url = "https://rest.kegg.jp/link/pathway/compound"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()

    all_cpds = set()
    pw_to_cpd = {}
    for line in resp.text.strip().split("\n"):
        parts = line.split("\t")
        if len(parts) != 2:  # also skips blank lines
            continue
        cpd = parts[0].replace("cpd:", "")
        pw = parts[1].replace("path:", "")
        if not pw.startswith("map"):
            continue
        all_cpds.add(cpd)
        pw_to_cpd.setdefault(pw, set()).add(cpd)

    # Hypergeometric test: background = every compound that is mapped
    # to at least one reference pathway.
    query_set = set(metabolite_list)
    M = len(all_cpds)
    n = len(query_set & all_cpds)

    results = []
    for pw, pw_cpds in pw_to_cpd.items():
        hits = query_set & pw_cpds
        k = len(hits)
        if k == 0:
            continue
        N = len(pw_cpds)
        # P(X >= k); the distribution is symmetric in (N, n)
        pval = hypergeom.sf(k - 1, M, N, n)
        results.append({
            "pathway": pw,
            "overlap": k,
            "pathway_size": N,
            "pvalue": pval,
            # sorted so the output is reproducible between runs
            "metabolites": ", ".join(sorted(hits)),
        })

    # Explicit columns keep sort_values() valid when nothing overlaps.
    columns = ["pathway", "overlap", "pathway_size",
               "pvalue", "metabolites"]
    df = pd.DataFrame(results, columns=columns).sort_values("pvalue")
    print(f"Pathway enrichment: "
          f"{len(df)} pathways (p<0.05: "
          f"{(df['pvalue'] < 0.05).sum()})")
    return df
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## 4. 代謝ネットワーク統合パイプライン
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
def metabolomics_network_pipeline(
        data, metabolite_ids=None,
        output_dir="results"):
    """
    Run the metabolite network workflow end to end.

    Parameters:
        data: pd.DataFrame — metabolite concentration matrix
        metabolite_ids: list[str] | None — KEGG compound IDs used
            for pathway enrichment; the step is skipped when this is
            None or empty
        output_dir: str — directory that receives the output files

    Returns:
        dict with keys ``network`` (the graph), ``hubs`` (hub table)
        and ``enrichment`` (enrichment table, or None when the
        enrichment step was skipped).
    """
    from pathlib import Path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Correlation network
    G = metabolite_correlation_network(data)
    nx.write_graphml(
        G, str(output_dir / "metabolite_network.graphml"))

    # 2) Hub metabolites
    hubs = hub_metabolites(G)
    hubs.to_csv(
        output_dir / "hub_metabolites.csv",
        index=False)

    # 3) Pathway enrichment (optional)
    enrich = None
    if metabolite_ids:
        enrich = metabolite_pathway_enrichment(
            metabolite_ids)
        enrich.to_csv(
            output_dir / "pathway_enrichment.csv",
            index=False)

    print(f"Metabolomics network pipeline → "
          f"{output_dir}")
    # "enrichment" is an additional key; "network" and "hubs" are
    # returned as before.
    return {"network": G, "hubs": hubs, "enrichment": enrich}
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## パイプライン統合
|
|
296
|
+
|
|
297
|
+
```
|
|
298
|
+
metabolomics → metabolomics-network → pathway-enrichment
|
|
299
|
+
(LC-MS/NMR) (GGM/グラフ構築) (KEGG/Reactome)
|
|
300
|
+
│ │ ↓
|
|
301
|
+
lipidomics ────────────┘ systems-biology
|
|
302
|
+
(脂質サブクラス) (マルチオミクス統合)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
## パイプライン出力
|
|
306
|
+
|
|
307
|
+
| ファイル | 説明 | 次スキル |
|
|
308
|
+
|---------|------|---------|
|
|
309
|
+
| `results/metabolite_network.graphml` | 相関ネットワーク | → systems-biology |
|
|
310
|
+
| `results/hub_metabolites.csv` | ハブ代謝物 | → biomarker-discovery |
|
|
311
|
+
| `results/pathway_enrichment.csv` | パスウェイエンリッチメント | → pathway-enrichment |
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-metagenome-assembled-genomes
|
|
3
|
+
description: |
|
|
4
|
+
メタゲノムアセンブルゲノム (MAG) 解析スキル。
|
|
5
|
+
MetaBAT2 / CONCOCT / MaxBin2 ビニング・CheckM2 品質評価・
|
|
6
|
+
GTDB-Tk 分類学的分類・dRep 脱重複・Prokka アノテーション・
|
|
7
|
+
MAG アセンブリ品質レポートパイプライン。
|
|
8
|
+
TU 外スキル (CLI ラッパー + Python ライブラリ)。
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Scientific Metagenome-Assembled Genomes
|
|
12
|
+
|
|
13
|
+
メタゲノムリードから個別ゲノム (MAG) を再構築する
|
|
14
|
+
ビニング・品質評価・分類・アノテーションの
|
|
15
|
+
統合パイプラインを提供する。
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
- メタゲノムショットガンデータから MAG を再構築するとき
|
|
20
|
+
- コンティグビニング (MetaBAT2/CONCOCT/MaxBin2) を実行するとき
|
|
21
|
+
- CheckM/CheckM2 でゲノム完全性・コンタミネーションを評価するとき
|
|
22
|
+
- GTDB-Tk で MAG の分類学的位置づけを行うとき
|
|
23
|
+
- dRep で冗長な MAG を脱重複するとき
|
|
24
|
+
- Prokka/Bakta で MAG のアノテーションを行うとき
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
## 1. MetaBAT2 ビニング
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import subprocess
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_metabat2(assembly_fasta, bam_file,
                 output_dir="metabat2_bins",
                 min_contig=2500):
    """
    Bin metagenome contigs with MetaBAT2.

    Parameters:
        assembly_fasta: str — assembly FASTA
        bam_file: str — sorted BAM
        output_dir: str — output directory
        min_contig: int — minimum contig length

    Returns:
        list of paths matching ``bin.*.fa`` inside ``output_dir``.
    """
    bin_root = Path(output_dir)
    bin_root.mkdir(parents=True, exist_ok=True)
    coverage_table = bin_root / "depth.txt"

    # Step 1: per-contig depth table from the BAM
    depth_cmd = [
        "jgi_summarize_bam_contig_depths",
        "--outputDepth", str(coverage_table),
        bam_file,
    ]
    subprocess.run(depth_cmd, check=True)

    # Step 2: binning; fixed seed is passed to MetaBAT2
    binning_cmd = [
        "metabat2",
        "-i", assembly_fasta,
        "-a", str(coverage_table),
        "-o", str(bin_root / "bin"),
        "-m", str(min_contig),
        "--seed", "42",
    ]
    subprocess.run(binning_cmd, check=True)

    produced = list(bin_root.glob("bin.*.fa"))
    print(f"MetaBAT2: {len(produced)} bins generated")
    return produced
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. CheckM2 品質評価
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
def run_checkm2(bin_dir, output_dir="checkm2_out",
                threads=8):
    """
    Assess MAG quality with CheckM2 and label each MAG.

    Parameters:
        bin_dir: str — directory containing the bins
        output_dir: str — output directory
        threads: int — number of threads

    Returns:
        pd.DataFrame read from ``quality_report.tsv`` with an added
        ``quality`` column:
          "high"   — Completeness >= 90 and Contamination < 5
          "medium" — Completeness >= 50 and Contamination < 10
          "low"    — everything else
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    subprocess.run([
        "checkm2", "predict",
        "--input", bin_dir,
        "--output-directory", str(out),
        "--threads", str(threads),
        "-x", "fa",
    ], check=True)

    report = out / "quality_report.tsv"
    df = pd.read_csv(report, sep="\t")

    # Tier classification with boolean masks instead of a row-wise
    # apply(), so an empty report still gets a valid "quality" column.
    completeness = df["Completeness"]
    contamination = df["Contamination"]
    is_high = (completeness >= 90) & (contamination < 5)
    is_medium = (completeness >= 50) & (contamination < 10)

    df["quality"] = "low"
    df.loc[is_medium, "quality"] = "medium"
    # Assigned last: every "high" row also satisfies the medium mask.
    df.loc[is_high, "quality"] = "high"

    n_hq = (df["quality"] == "high").sum()
    n_mq = (df["quality"] == "medium").sum()
    n_lq = (df["quality"] == "low").sum()
    print(f"CheckM2: {n_hq} HQ, {n_mq} MQ, "
          f"{n_lq} LQ MAGs")
    return df
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def filter_quality_mags(checkm_df,
                        min_completeness=50,
                        max_contamination=10):
    """
    Keep only the MAGs that meet the quality cut-offs.

    Parameters:
        checkm_df: pd.DataFrame — CheckM2 results
        min_completeness: float — minimum completeness (%), inclusive
        max_contamination: float — maximum contamination (%), inclusive

    Returns:
        pd.DataFrame — copy of the rows that satisfy both cut-offs.
    """
    complete_enough = checkm_df["Completeness"].ge(min_completeness)
    clean_enough = checkm_df["Contamination"].le(max_contamination)
    kept = checkm_df.loc[complete_enough & clean_enough].copy()

    summary = (f"Filter: {len(kept)}/"
               f"{len(checkm_df)} MAGs passed "
               f"(≥{min_completeness}% comp, "
               f"≤{max_contamination}% contam)")
    print(summary)
    return kept
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## 3. GTDB-Tk 分類
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
def run_gtdbtk(bin_dir, output_dir="gtdbtk_out",
               threads=8):
    """
    Classify genomes taxonomically with GTDB-Tk (GTDB taxonomy).

    Parameters:
        bin_dir: str — directory of (filtered) bins
        output_dir: str — output directory
        threads: int — number of threads

    Returns:
        pd.DataFrame — concatenation of the bac120 and ar53 summary
        tables that exist, each tagged with a ``domain_marker``
        column; an empty DataFrame when neither summary file exists.
    """
    result_root = Path(output_dir)
    result_root.mkdir(parents=True, exist_ok=True)

    classify_cmd = [
        "gtdbtk", "classify_wf",
        "--genome_dir", bin_dir,
        "--out_dir", str(result_root),
        "--cpus", str(threads),
        "-x", "fa",
    ]
    subprocess.run(classify_cmd, check=True)

    # Gather the bacterial / archaeal summaries that were produced
    tables = []
    for marker_set in ("bac120", "ar53"):
        summary_path = result_root / f"gtdbtk.{marker_set}.summary.tsv"
        if not summary_path.exists():
            continue
        table = pd.read_csv(summary_path, sep="\t")
        table["domain_marker"] = marker_set
        tables.append(table)

    if not tables:
        print("GTDB-Tk: no classification results")
        return pd.DataFrame()

    merged = pd.concat(tables, ignore_index=True)
    print(f"GTDB-Tk: {len(merged)} MAGs "
          f"classified")
    return merged
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## 4. dRep 脱重複
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
def run_drep(bin_dir, output_dir="drep_out",
             ani_threshold=0.95):
    """
    Dereplicate MAGs with dRep (ANI based).

    Parameters:
        bin_dir: str — directory containing the ``*.fa`` bins
        output_dir: str — dRep work/output directory
        ani_threshold: float — ANI threshold passed to ``-sa``

    Returns:
        list of ``*.fa`` paths found in
        ``<output_dir>/dereplicated_genomes``; an empty list when
        ``bin_dir`` holds no ``*.fa`` file (dRep is not invoked then).
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # subprocess runs the command without a shell, so a "dir/*.fa"
    # argument would reach dRep as a literal string. Expand the
    # pattern here and pass each genome path explicitly.
    genomes = sorted(str(p) for p in Path(bin_dir).glob("*.fa"))
    if not genomes:
        print(f"dRep: no *.fa genomes found in {bin_dir}")
        return []

    subprocess.run([
        "dRep", "dereplicate",
        str(out),
        "-g", *genomes,
        "-sa", str(ani_threshold),
        "--ignoreGenomeQuality",
    ], check=True)

    derep = list(
        (out / "dereplicated_genomes").glob("*.fa"))
    print(f"dRep: {len(derep)} dereplicated MAGs "
          f"(ANI ≥ {ani_threshold})")
    return derep
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## 5. MAG パイプライン統合
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
def mag_pipeline(assembly_fasta, bam_file,
                 output_dir="mag_results",
                 threads=8):
    """
    Run the MAG workflow: binning -> QC -> filter -> taxonomy -> dRep.

    Parameters:
        assembly_fasta: str — metagenome assembly
        bam_file: str — sorted BAM
        output_dir: str — output root
        threads: int — number of threads

    Returns:
        dict with keys ``bins`` (all bin paths), ``checkm`` (full QC
        table), ``quality`` (QC-passed rows), ``taxonomy`` (GTDB-Tk
        table) and ``dereplicated`` (dRep genome paths). When no MAG
        passes QC, ``taxonomy`` is an empty DataFrame and
        ``dereplicated`` an empty list.
    """
    import shutil

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    bin_dir = out / "bins"

    # 1) Binning
    bins = run_metabat2(
        assembly_fasta, bam_file,
        str(bin_dir))

    # 2) Quality assessment
    checkm = run_checkm2(
        str(bin_dir),
        str(out / "checkm2"),
        threads)

    # 3) Filtering (default cut-offs of filter_quality_mags)
    quality = filter_quality_mags(checkm)

    # Stage the QC-passed bins in their own directory so that the
    # downstream tools only see those genomes. The directory is
    # rebuilt on every run to avoid stale files.
    # NOTE(review): assumes the CheckM2 "Name" column holds the bin
    # file name without the ".fa" extension — confirm against the
    # CheckM2 version in use.
    passed_dir = out / "bins_qc_passed"
    shutil.rmtree(passed_dir, ignore_errors=True)
    passed_dir.mkdir(parents=True, exist_ok=True)
    for name in quality["Name"]:
        src = bin_dir / f"{name}.fa"
        if src.exists():
            shutil.copy2(src, passed_dir / src.name)

    if any(passed_dir.glob("*.fa")):
        # 4) GTDB-Tk classification
        taxonomy = run_gtdbtk(
            str(passed_dir),
            str(out / "gtdbtk"),
            threads)

        # 5) Dereplication
        derep = run_drep(
            str(passed_dir),
            str(out / "drep"))
    else:
        print("MAG pipeline: no MAGs passed QC; "
              "skipping GTDB-Tk and dRep")
        taxonomy = pd.DataFrame()
        derep = []

    print(f"MAG pipeline: {len(bins)} bins → "
          f"{len(quality)} QC passed → "
          f"{len(derep)} dereplicated")

    return {
        "bins": bins,
        "checkm": checkm,
        "quality": quality,
        "taxonomy": taxonomy,
        "dereplicated": derep,
    }
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## パイプライン統合
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
microbiome-metagenomics → metagenome-assembled-genomes → environmental-ecology
|
|
286
|
+
(メタゲノム組成解析) (MAG 再構築) (生態系統合)
|
|
287
|
+
│ │ ↓
|
|
288
|
+
long-read-sequencing ─────────┘ phylogenomics
|
|
289
|
+
(ロングリードアセンブリ) (系統解析)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## パイプライン出力
|
|
293
|
+
|
|
294
|
+
| ファイル | 説明 | 次スキル |
|
|
295
|
+
|---------|------|---------|
|
|
296
|
+
| `*_bins/bin.*.fa` | ビンゲノム | → dRep, GTDB-Tk |
|
|
297
|
+
| `checkm2_out/quality_report.tsv` | 品質レポート | → フィルタリング |
|
|
298
|
+
| `gtdbtk_out/*.summary.tsv` | 分類結果 | → phylogenomics |
|
|
299
|
+
| `drep_out/dereplicated_genomes/` | 脱重複 MAG | → environmental-ecology |
|
|
@@ -5,6 +5,14 @@ description: |
|
|
|
5
5
|
WormBase (線虫)、ZFIN (ゼブラフィッシュ)、RGD (ラット)、
|
|
6
6
|
MGI (マウス) の REST API を統合した
|
|
7
7
|
モデル生物遺伝子・表現型・疾患モデル横断検索パイプライン。
|
|
8
|
+
ToolUniverse 連携: impc, mpd。
|
|
9
|
+
tu_tools:
|
|
10
|
+
- key: impc
|
|
11
|
+
name: IMPC
|
|
12
|
+
description: 国際マウス表現型解析コンソーシアム
|
|
13
|
+
- key: mpd
|
|
14
|
+
name: MPD
|
|
15
|
+
description: Mouse Phenome Database マウス表現型
|
|
8
16
|
---
|
|
9
17
|
|
|
10
18
|
# Scientific Model Organism Database
|
|
@@ -6,6 +6,10 @@ description: |
|
|
|
6
6
|
代謝酵素表現型判定 (PM/IM/NM/RM/UM)、FDA 薬理ゲノムバイオマーカー、
|
|
7
7
|
投与量レコメンデーション、PGx レポート生成を統合した
|
|
8
8
|
個別化薬物療法支援パイプライン。
|
|
9
|
+
tu_tools:
|
|
10
|
+
- key: fda_pharmacogenomic_biomarkers
|
|
11
|
+
name: FDA PGx Biomarkers
|
|
12
|
+
description: FDA 薬理ゲノミクスバイオマーカーテーブル
|
|
9
13
|
---
|
|
10
14
|
|
|
11
15
|
# Scientific Pharmacogenomics
|