@nahisaho/satori 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -23
- package/package.json +1 -1
- package/src/.github/skills/scientific-clinical-pharmacology/SKILL.md +361 -0
- package/src/.github/skills/scientific-clinical-standards/SKILL.md +444 -0
- package/src/.github/skills/scientific-crispr-design/SKILL.md +369 -0
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +5 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +5 -0
- package/src/.github/skills/scientific-epigenomics-chromatin/SKILL.md +5 -0
- package/src/.github/skills/scientific-glycomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +9 -0
- package/src/.github/skills/scientific-lipidomics/SKILL.md +284 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +3 -0
- package/src/.github/skills/scientific-metagenome-assembled-genomes/SKILL.md +299 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +8 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +11 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +11 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-lipidomics
|
|
3
|
+
description: |
|
|
4
|
+
リピドミクス解析スキル。LipidMAPS / SwissLipids / LION
|
|
5
|
+
脂質データベース統合検索・脂質サブクラス分類・
|
|
6
|
+
脂質 MS/MS スペクトル同定・脂質パスウェイエンリッチメント・
|
|
7
|
+
脂質プロファイリングパイプライン。
|
|
8
|
+
TU 外スキル (直接 REST API + Python ライブラリ)。
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Scientific Lipidomics
|
|
12
|
+
|
|
13
|
+
LipidMAPS / SwissLipids / LION 脂質データベースを統合した
|
|
14
|
+
脂質構造検索・サブクラス分類・MS/MS 同定・
|
|
15
|
+
脂質パスウェイエンリッチメント解析パイプラインを提供する。
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
- LC-MS/MS リピドミクスデータの脂質同定を行うとき
|
|
20
|
+
- LipidMAPS で脂質構造・サブクラスを検索するとき
|
|
21
|
+
- 脂質プロファイルの差次解析 (fold change/p-value) を行うとき
|
|
22
|
+
- LION エンリッチメントで脂質機能解析を行うとき
|
|
23
|
+
- 脂質パスウェイ (スフィンゴ脂質/リン脂質代謝) を可視化するとき
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. LipidMAPS 構造検索
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import requests
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
# Base URL of the LIPID MAPS REST service; endpoint paths are appended to it.
LIPIDMAPS_API = "https://www.lipidmaps.org/rest"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def lipidmaps_search(name=None, formula=None,
                     mass=None, tolerance=0.01):
    """
    Search LIPID MAPS for compounds by mass, name, or formula.

    Exactly one criterion is used, in this priority order: ``mass``
    (together with ``tolerance``), then ``name``, then ``formula``.

    Parameters:
        name: str | None — lipid name (the original note says partial match)
        formula: str | None — molecular formula
        mass: float | None — exact mass
        tolerance: float — mass tolerance (Da)

    Returns:
        pd.DataFrame — one row per hit with the columns lm_id, name,
        sys_name, formula, mass, main_class and sub_class. The frame is
        empty when no criterion is given or the service returns no record.

    Raises:
        requests.HTTPError — when the service answers with an error status.
    """
    from urllib.parse import quote

    columns = ["lm_id", "name", "sys_name", "formula",
               "mass", "main_class", "sub_class"]

    # NOTE(review): the path layout below is kept exactly as the original
    # wrote it; confirm it against the LIPID MAPS REST documentation.
    if mass is not None:
        query = f"mass/{mass}/{tolerance}"
    elif name:
        # Lipid names routinely contain "/", "(", ":" or spaces
        # (e.g. "PC(16:0/18:1)"); left unescaped they would be read as
        # extra path segments, so the value is percent-encoded.
        query = f"name/{quote(str(name), safe='')}"
    elif formula:
        query = f"formula/{quote(str(formula), safe='')}"
    else:
        print("LipidMAPS: provide name, formula, "
              "or mass")
        return pd.DataFrame(columns=columns)

    url = f"{LIPIDMAPS_API}/compound/lm_id/{query}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # A single record arrives as a dict; an empty dict means "no hit" and
    # must not be turned into an all-blank row.
    if isinstance(data, dict):
        data = [data] if data else []
    elif not isinstance(data, list):
        data = []

    rows = [
        {
            "lm_id": item.get("lm_id", ""),
            "name": item.get("name", ""),
            "sys_name": item.get("systematic_name", ""),
            "formula": item.get("formula", ""),
            "mass": item.get("mass", 0),
            "main_class": item.get("main_class", ""),
            "sub_class": item.get("sub_class", ""),
        }
        for item in data
        if isinstance(item, dict)
    ]

    df = pd.DataFrame(rows, columns=columns)
    print(f"LipidMAPS: {len(df)} lipids found")
    return df
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def lipidmaps_classification(lm_id):
    """
    Fetch the classification hierarchy of one LIPID MAPS entry.

    Parameters:
        lm_id: str — LIPID MAPS ID (e.g. "LMFA01010001")

    Returns:
        dict — keys lm_id, category, main_class, sub_class, class_level4,
        smiles and inchi_key. Fields missing from the response (or every
        field, when no record is returned) are empty strings.

    Raises:
        requests.HTTPError — when the service answers with an error status.
    """
    url = f"{LIPIDMAPS_API}/compound/lm_id/{lm_id}/all"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # The sibling search function accepts either a dict or a list payload
    # from this service; do the same here instead of crashing on `.get`.
    if isinstance(data, list):
        data = data[0] if data else {}
    if not isinstance(data, dict):
        data = {}

    classification = {
        "lm_id": data.get("lm_id", ""),
        # The response field "core" is exposed under the key "category".
        "category": data.get("core", ""),
        "main_class": data.get("main_class", ""),
        "sub_class": data.get("sub_class", ""),
        "class_level4": data.get("class_level4", ""),
        "smiles": data.get("smiles", ""),
        "inchi_key": data.get("inchi_key", ""),
    }

    print(f"LipidMAPS: {lm_id} → "
          f"{classification['main_class']} / "
          f"{classification['sub_class']}")
    return classification
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## 2. 脂質差次解析
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
import numpy as np
|
|
123
|
+
from scipy import stats
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def lipid_differential_analysis(data, groups,
                                fdr_threshold=0.05):
    """
    Compare lipid abundances between two groups (fold change + t-test).

    For every column of ``data`` the two group means, the fold change
    (group 1 / group 0), its log2 and a two-sample t-test p-value are
    computed. The p-values are then Benjamini-Hochberg adjusted.

    Parameters:
        data: pd.DataFrame — lipid abundance matrix
            (rows = samples, columns = lipids)
        groups: list[int] — group label per row (0 or 1)
        fdr_threshold: float — FDR cut-off for the ``significant`` flag

    Returns:
        pd.DataFrame — sorted by ascending p-value, with the columns lipid,
        mean_ctrl, mean_case, fold_change, log2FC, pvalue, fdr and
        significant. Lipids whose t-test is undefined keep a NaN p-value
        and NaN fdr and are never flagged significant.
    """
    groups = np.array(groups)
    g0 = data[groups == 0]
    g1 = data[groups == 1]

    results = []
    for lipid in data.columns:
        mean0 = g0[lipid].mean()
        mean1 = g1[lipid].mean()
        fc, log2fc = _fold_change(mean0, mean1)
        _, pval = stats.ttest_ind(g0[lipid], g1[lipid])
        results.append({
            "lipid": lipid,
            "mean_ctrl": round(mean0, 4),
            "mean_case": round(mean1, 4),
            "fold_change": round(fc, 4),
            "log2FC": round(log2fc, 4),
            "pvalue": pval,
        })

    # Explicit columns keep the frame well-formed when `data` has no lipids.
    df = pd.DataFrame(results, columns=[
        "lipid", "mean_ctrl", "mean_case",
        "fold_change", "log2FC", "pvalue"])
    df["fdr"] = _bh_adjust(df["pvalue"])
    df["significant"] = df["fdr"] < fdr_threshold

    n_sig = df["significant"].sum()
    n_up = ((df["significant"]) &
            (df["log2FC"] > 0)).sum()
    n_down = ((df["significant"]) &
              (df["log2FC"] < 0)).sum()
    print(f"Lipid DA: {n_sig} significant "
          f"({n_up} up, {n_down} down)")
    return df.sort_values("pvalue")


def _fold_change(mean0, mean1):
    """Return (fold change, log2 fold change) of mean1 relative to mean0."""
    if mean0 > 0:
        fc = mean1 / mean0
    elif mean0 == 0 and mean1 == 0:
        # 0/0: the lipid is absent from both groups, so the ratio is
        # undefined rather than infinite.
        fc = np.nan
    else:
        fc = np.inf

    if fc > 0:
        log2fc = np.log2(fc)
    elif fc == 0:
        # Present in the control group only: a complete loss, which must
        # count as "down" instead of being reported as no change.
        log2fc = -np.inf
    else:
        # NaN or negative ratio: the logarithm is undefined.
        log2fc = np.nan
    return fc, log2fc


def _bh_adjust(pvalues):
    """Benjamini-Hochberg adjusted p-values; NaN entries stay NaN."""
    p = np.asarray(pvalues, dtype=float)
    adjusted = np.full(p.shape, np.nan)
    valid = ~np.isnan(p)
    m = int(valid.sum())
    if m == 0:
        return adjusted

    kept = p[valid]
    order = np.argsort(kept)
    scaled = kept[order] * m / np.arange(1, m + 1)
    # Enforce monotonicity from the largest p-value downwards.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    restored = np.empty(m)
    restored[order] = np.minimum(monotone, 1.0)
    adjusted[valid] = restored
    return adjusted
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## 3. 脂質サブクラスエンリッチメント
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
def lipid_subclass_enrichment(
        sig_lipids, all_lipids, class_map):
    """
    Test lipid subclasses for over-representation (Fisher exact test).

    For each subclass with at least one significant lipid, a one-sided
    ("greater") Fisher exact test compares the subclass share among the
    significant lipids with its share in the background.

    Parameters:
        sig_lipids: list[str] — significant lipids
        all_lipids: list[str] — background (all measured lipids)
        class_map: dict — {lipid: subclass} mapping

    Returns:
        pd.DataFrame — sorted by ascending p-value, with the columns
        subclass, sig_in_class, total_in_class, pvalue and ratio. The
        frame is empty (but keeps its columns) when no subclass contains
        a significant lipid.
    """
    from collections import defaultdict
    from scipy.stats import fisher_exact

    columns = ["subclass", "sig_in_class", "total_in_class",
               "pvalue", "ratio"]

    all_set = set(all_lipids)
    # Significant lipids outside the background would make the
    # contingency table inconsistent (negative cell `d`), so drop them.
    sig_set = set(sig_lipids) & all_set

    # Group the background lipids by subclass once.
    members = defaultdict(set)
    for lipid, subclass in class_map.items():
        if lipid in all_set:
            members[subclass].add(lipid)

    results = []
    # Sorted iteration keeps the row order reproducible between runs.
    for subclass in sorted(members, key=str):
        in_class = members[subclass]
        a = len(in_class & sig_set)      # significant, in class
        if a == 0:
            continue
        b = len(sig_set) - a             # significant, other classes
        c = len(in_class) - a            # not significant, in class
        d = len(all_set) - a - b - c     # not significant, other classes
        _, pval = fisher_exact(
            [[a, b], [c, d]],
            alternative="greater")
        results.append({
            "subclass": subclass,
            "sig_in_class": a,
            "total_in_class": len(in_class),
            "pvalue": pval,
            "ratio": round(a / len(in_class), 3),
        })

    df = pd.DataFrame(results, columns=columns).sort_values("pvalue")
    n_enriched = int(sum(row["pvalue"] < 0.05 for row in results))
    print(f"Subclass enrichment: "
          f"{n_enriched} "
          f"significant subclasses")
    return df
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## 4. リピドミクス統合パイプライン
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
def lipidomics_pipeline(data, groups,
                        output_dir="results",
                        max_annotations=30):
    """
    Run the lipidomics workflow: differential analysis, then annotation.

    Parameters:
        data: pd.DataFrame — lipid abundance matrix
        groups: list[int] — group label per row
        output_dir: str — directory that receives the CSV outputs
        max_annotations: int | None — how many leading lipid columns are
            looked up in LIPID MAPS (one request each); ``None`` looks up
            every column. The default keeps the original cap of 30.

    Returns:
        dict — ``"da"``: the differential-analysis table (written to
        ``lipid_da.csv``); ``"annotations"``: the first LIPID MAPS hit per
        queried lipid (written to ``lipid_annotations.csv`` only when at
        least one lipid was annotated).
    """
    from pathlib import Path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Differential analysis
    da = lipid_differential_analysis(data, groups)
    da.to_csv(output_dir / "lipid_da.csv",
              index=False)

    # 2) LIPID MAPS annotation (column names are used as the query)
    annotations = []
    for lipid in data.columns[:max_annotations]:
        result = lipidmaps_search(name=lipid)
        if not result.empty:
            row = result.iloc[0].to_dict()
            row["query"] = lipid
            annotations.append(row)

    ann_df = pd.DataFrame(annotations)
    if annotations:
        ann_df.to_csv(
            output_dir / "lipid_annotations.csv",
            index=False)

    print(f"Lipidomics pipeline → {output_dir}")
    # The annotations used to be computed and then discarded; returning
    # them is additive, so callers reading only "da" are unaffected.
    return {"da": da, "annotations": ann_df}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## パイプライン統合
|
|
270
|
+
|
|
271
|
+
```
|
|
272
|
+
metabolomics → lipidomics → pathway-enrichment
|
|
273
|
+
(LC-MS 全代謝物) (脂質特化) (脂質代謝パスウェイ)
|
|
274
|
+
│ │ ↓
|
|
275
|
+
metabolomics-network ─┘ multi-omics
|
|
276
|
+
(代謝物相関) (オミクス統合)
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## パイプライン出力
|
|
280
|
+
|
|
281
|
+
| ファイル | 説明 | 次スキル |
|
|
282
|
+
|---------|------|---------|
|
|
283
|
+
| `results/lipid_da.csv` | 差次脂質 | → biomarker-discovery |
|
|
284
|
+
| `results/lipid_annotations.csv` | LipidMAPS 注釈 | → pathway-enrichment |
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-metagenome-assembled-genomes
|
|
3
|
+
description: |
|
|
4
|
+
メタゲノムアセンブルゲノム (MAG) 解析スキル。
|
|
5
|
+
MetaBAT2 / CONCOCT / MaxBin2 ビニング・CheckM2 品質評価・
|
|
6
|
+
GTDB-Tk 分類学的分類・dRep 脱重複・Prokka アノテーション・
|
|
7
|
+
MAG アセンブリ品質レポートパイプライン。
|
|
8
|
+
TU 外スキル (CLI ラッパー + Python ライブラリ)。
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Scientific Metagenome-Assembled Genomes
|
|
12
|
+
|
|
13
|
+
メタゲノムリードから個別ゲノム (MAG) を再構築する
|
|
14
|
+
ビニング・品質評価・分類・アノテーションの
|
|
15
|
+
統合パイプラインを提供する。
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
- メタゲノムショットガンデータから MAG を再構築するとき
|
|
20
|
+
- コンティグビニング (MetaBAT2/CONCOCT/MaxBin2) を実行するとき
|
|
21
|
+
- CheckM/CheckM2 でゲノム完全性・コンタミネーションを評価するとき
|
|
22
|
+
- GTDB-Tk で MAG の分類学的位置づけを行うとき
|
|
23
|
+
- dRep で冗長な MAG を脱重複するとき
|
|
24
|
+
- Prokka/Bakta で MAG のアノテーションを行うとき
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
## 1. MetaBAT2 ビニング
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import subprocess
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run_metabat2(assembly_fasta, bam_file,
                 output_dir="metabat2_bins",
                 min_contig=2500):
    """
    Bin metagenome contigs with MetaBAT2.

    First runs ``jgi_summarize_bam_contig_depths`` on ``bam_file`` to
    write ``depth.txt`` into ``output_dir``, then runs ``metabat2`` on the
    assembly with that depth table and a fixed seed.

    Parameters:
        assembly_fasta: str — assembly FASTA
        bam_file: str — sorted BAM
        output_dir: str — output directory (created if missing)
        min_contig: int — minimum contig length

    Returns:
        list[Path] — files in ``output_dir`` matching ``bin.*.fa``.

    Raises:
        subprocess.CalledProcessError — when either tool exits non-zero.
    """
    bin_root = Path(output_dir)
    bin_root.mkdir(parents=True, exist_ok=True)
    depth_table = bin_root / "depth.txt"

    # Step 1 builds the depth table; step 2 consumes it.
    depth_cmd = [
        "jgi_summarize_bam_contig_depths",
        "--outputDepth", str(depth_table),
        bam_file,
    ]
    binning_cmd = [
        "metabat2",
        "-i", assembly_fasta,
        "-a", str(depth_table),
        "-o", str(bin_root / "bin"),
        "-m", str(min_contig),
        "--seed", "42",
    ]
    for command in (depth_cmd, binning_cmd):
        subprocess.run(command, check=True)

    found = [*bin_root.glob("bin.*.fa")]
    print(f"MetaBAT2: {len(found)} bins generated")
    return found
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. CheckM2 品質評価
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
def run_checkm2(bin_dir, output_dir="checkm2_out",
                threads=8):
    """
    Assess MAG quality with CheckM2 and tier the bins.

    Runs ``checkm2 predict`` on the ``.fa`` files in ``bin_dir``, loads
    ``quality_report.tsv`` from the output directory and adds a
    ``quality`` column (what the original comment calls the MIMAG
    criteria):

    - "high":   Completeness >= 90 and Contamination < 5
    - "medium": Completeness >= 50 and Contamination < 10
    - "low":    everything else

    Parameters:
        bin_dir: str — directory containing the bins
        output_dir: str — output directory (created if missing)
        threads: int — number of threads

    Returns:
        pd.DataFrame — the CheckM2 report plus the ``quality`` column.

    Raises:
        subprocess.CalledProcessError — when CheckM2 exits non-zero.
    """
    def _tier(row):
        # Same thresholds and evaluation order as the tier list above.
        if row["Completeness"] >= 90 and row["Contamination"] < 5:
            return "high"
        if row["Completeness"] >= 50 and row["Contamination"] < 10:
            return "medium"
        return "low"

    report_dir = Path(output_dir)
    report_dir.mkdir(parents=True, exist_ok=True)

    command = [
        "checkm2", "predict",
        "--input", bin_dir,
        "--output-directory", str(report_dir),
        "--threads", str(threads),
        "-x", "fa",
    ]
    subprocess.run(command, check=True)

    df = pd.read_csv(report_dir / "quality_report.tsv", sep="\t")
    df["quality"] = df.apply(_tier, axis=1)

    n_hq, n_mq, n_lq = (
        (df["quality"] == tier).sum()
        for tier in ("high", "medium", "low")
    )
    print(f"CheckM2: {n_hq} HQ, {n_mq} MQ, "
          f"{n_lq} LQ MAGs")
    return df
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def filter_quality_mags(checkm_df,
                        min_completeness=50,
                        max_contamination=10):
    """
    Keep only the MAGs that satisfy the quality thresholds.

    Both bounds are inclusive: a row passes when its ``Completeness`` is
    at least ``min_completeness`` and its ``Contamination`` is at most
    ``max_contamination``.

    Parameters:
        checkm_df: pd.DataFrame — CheckM2 results
        min_completeness: float — minimum completeness (%)
        max_contamination: float — maximum contamination (%)

    Returns:
        pd.DataFrame — an independent copy of the passing rows, keeping
        their original index labels.
    """
    complete_enough = checkm_df["Completeness"] >= min_completeness
    clean_enough = checkm_df["Contamination"] <= max_contamination
    passed = checkm_df[complete_enough & clean_enough].copy()

    n_passed, n_total = len(passed), len(checkm_df)
    print(f"Filter: {n_passed}/"
          f"{n_total} MAGs passed "
          f"(≥{min_completeness}% comp, "
          f"≤{max_contamination}% contam)")
    return passed
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## 3. GTDB-Tk 分類
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
def run_gtdbtk(bin_dir, output_dir="gtdbtk_out",
               threads=8):
    """
    Classify genomes taxonomically with GTDB-Tk (``classify_wf``).

    After the workflow finishes, the ``bac120`` and ``ar53`` summary
    tables that exist in the output directory are loaded, tagged with
    their marker set in a ``domain_marker`` column and concatenated.

    Parameters:
        bin_dir: str — directory with the (filtered) bins, ``.fa`` files
        output_dir: str — output directory (created if missing)
        threads: int — number of CPUs passed to GTDB-Tk

    Returns:
        pd.DataFrame — the combined summary tables, or an empty frame
        when neither summary file was produced.

    Raises:
        subprocess.CalledProcessError — when GTDB-Tk exits non-zero.
    """
    work_dir = Path(output_dir)
    work_dir.mkdir(parents=True, exist_ok=True)

    command = [
        "gtdbtk", "classify_wf",
        "--genome_dir", bin_dir,
        "--out_dir", str(work_dir),
        "--cpus", str(threads),
        "-x", "fa",
    ]
    subprocess.run(command, check=True)

    # Collect whichever of the two per-marker-set summaries are present.
    tables = []
    for marker_set in ("bac120", "ar53"):
        summary = work_dir / f"gtdbtk.{marker_set}.summary.tsv"
        if not summary.exists():
            continue
        table = pd.read_csv(summary, sep="\t")
        table["domain_marker"] = marker_set
        tables.append(table)

    if not tables:
        print("GTDB-Tk: no classification results")
        return pd.DataFrame()

    combined = pd.concat(tables, ignore_index=True)
    print(f"GTDB-Tk: {len(combined)} MAGs classified")
    return combined
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## 4. dRep 脱重複
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
def run_drep(bin_dir, output_dir="drep_out",
             ani_threshold=0.95):
    """
    Dereplicate MAGs with dRep (ANI based).

    Parameters:
        bin_dir: str — directory containing the bins (``*.fa``)
        output_dir: str — dRep work directory (created if missing)
        ani_threshold: float — ANI threshold passed to dRep as ``-sa``

    Returns:
        list[Path] — ``*.fa`` files in ``<output_dir>/dereplicated_genomes``;
        an empty list when ``bin_dir`` holds no ``*.fa`` file (dRep is not
        started in that case).

    Raises:
        subprocess.CalledProcessError — when dRep exits non-zero.
    """
    # subprocess.run is called without a shell, so a "*.fa" pattern would
    # reach dRep as a literal, non-existent path. Expand it here instead.
    genomes = sorted(str(path) for path in Path(bin_dir).glob("*.fa"))
    if not genomes:
        print(f"dRep: no *.fa genomes found in {bin_dir}")
        return []

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    subprocess.run([
        "dRep", "dereplicate",
        str(out),
        "-g", *genomes,
        "-sa", str(ani_threshold),
        "--ignoreGenomeQuality",
    ], check=True)

    derep = list(
        (out / "dereplicated_genomes").glob("*.fa"))
    print(f"dRep: {len(derep)} dereplicated MAGs "
          f"(ANI ≥ {ani_threshold})")
    return derep
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## 5. MAG パイプライン統合
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
def mag_pipeline(assembly_fasta, bam_file,
                 output_dir="mag_results",
                 threads=8):
    """
    Run the MAG workflow: binning, QC, filtering, taxonomy, dereplication.

    Only the bins that pass the quality filter are handed to GTDB-Tk and
    dRep; they are copied into ``<output_dir>/qc_bins`` for that purpose.

    Parameters:
        assembly_fasta: str — metagenome assembly
        bam_file: str — sorted BAM
        output_dir: str — output root (created if missing)
        threads: int — number of threads

    Returns:
        dict — ``bins`` (all MetaBAT2 bins), ``checkm`` (full CheckM2
        table), ``quality`` (rows passing the filter), ``taxonomy``
        (GTDB-Tk table) and ``dereplicated`` (dRep genomes). The last two
        are empty when no bin passes QC.
    """
    import shutil

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    bins_dir = out / "bins"

    # 1) Binning
    bins = run_metabat2(
        assembly_fasta, bam_file,
        str(bins_dir))

    # 2) Quality assessment
    checkm = run_checkm2(
        str(bins_dir),
        str(out / "checkm2"),
        threads)

    # 3) Filtering (default thresholds of filter_quality_mags)
    quality = filter_quality_mags(checkm)

    # Stage the QC-passed bins so the downstream tools really work on the
    # filtered set; previously they were pointed at all raw bins and the
    # filter result was ignored.
    qc_dir = out / "qc_bins"
    qc_dir.mkdir(parents=True, exist_ok=True)
    for stale in qc_dir.glob("*.fa"):
        stale.unlink()  # leftovers from an earlier run
    # NOTE(review): assumes the CheckM2 report identifies each bin in a
    # "Name" column holding the file name without ".fa" — confirm.
    passed = set(quality["Name"].astype(str))
    qc_bins = []
    for bin_path in bins:
        if bin_path.stem in passed or bin_path.name in passed:
            shutil.copy2(bin_path, qc_dir / bin_path.name)
            qc_bins.append(qc_dir / bin_path.name)

    if qc_bins:
        # 4) GTDB-Tk classification
        taxonomy = run_gtdbtk(
            str(qc_dir),
            str(out / "gtdbtk"),
            threads)

        # 5) Dereplication
        derep = run_drep(
            str(qc_dir),
            str(out / "drep"))
    else:
        print("MAG pipeline: no bins passed QC; "
              "skipping GTDB-Tk and dRep")
        taxonomy = pd.DataFrame()
        derep = []

    print(f"MAG pipeline: {len(bins)} bins → "
          f"{len(quality)} QC passed → "
          f"{len(derep)} dereplicated")

    return {
        "bins": bins,
        "checkm": checkm,
        "quality": quality,
        "taxonomy": taxonomy,
        "dereplicated": derep,
    }
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## パイプライン統合
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
microbiome-metagenomics → metagenome-assembled-genomes → environmental-ecology
|
|
286
|
+
(メタゲノム組成解析) (MAG 再構築) (生態系統合)
|
|
287
|
+
│ │ ↓
|
|
288
|
+
long-read-sequencing ─────────┘ phylogenomics
|
|
289
|
+
(ロングリードアセンブリ) (系統解析)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## パイプライン出力
|
|
293
|
+
|
|
294
|
+
| ファイル | 説明 | 次スキル |
|
|
295
|
+
|---------|------|---------|
|
|
296
|
+
| `*_bins/bin.*.fa` | ビンゲノム | → dRep, GTDB-Tk |
|
|
297
|
+
| `checkm2_out/quality_report.tsv` | 品質レポート | → フィルタリング |
|
|
298
|
+
| `gtdbtk_out/*.summary.tsv` | 分類結果 | → phylogenomics |
|
|
299
|
+
| `drep_out/dereplicated_genomes/` | 脱重複 MAG | → environmental-ecology |
|
|
@@ -5,6 +5,14 @@ description: |
|
|
|
5
5
|
WormBase (線虫)、ZFIN (ゼブラフィッシュ)、RGD (ラット)、
|
|
6
6
|
MGI (マウス) の REST API を統合した
|
|
7
7
|
モデル生物遺伝子・表現型・疾患モデル横断検索パイプライン。
|
|
8
|
+
ToolUniverse 連携: impc, mpd。
|
|
9
|
+
tu_tools:
|
|
10
|
+
- key: impc
|
|
11
|
+
name: IMPC
|
|
12
|
+
description: 国際マウス表現型解析コンソーシアム
|
|
13
|
+
- key: mpd
|
|
14
|
+
name: MPD
|
|
15
|
+
description: Mouse Phenome Database マウス表現型
|
|
8
16
|
---
|
|
9
17
|
|
|
10
18
|
# Scientific Model Organism Database
|
|
@@ -4,6 +4,17 @@ description: |
|
|
|
4
4
|
公衆衛生データアクセススキル。NHANES 疫学調査データ、MedlinePlus 一般向け
|
|
5
5
|
健康情報、RxNorm 薬剤標準語彙、ODPHP 健康目標・ガイドライン、
|
|
6
6
|
Health Disparities 健康格差データ統合パイプライン。
|
|
7
|
+
ToolUniverse 連携: nhanes, medlineplus, odphp。
|
|
8
|
+
tu_tools:
|
|
9
|
+
- key: nhanes
|
|
10
|
+
name: NHANES
|
|
11
|
+
description: 全米健康栄養調査データ
|
|
12
|
+
- key: medlineplus
|
|
13
|
+
name: MedlinePlus
|
|
14
|
+
description: NLM 一般向け健康情報 API
|
|
15
|
+
- key: odphp
|
|
16
|
+
name: ODPHP
|
|
17
|
+
description: Healthy People 健康目標・ガイドライン
|
|
7
18
|
---
|
|
8
19
|
|
|
9
20
|
# Scientific Public Health Data
|
|
@@ -5,6 +5,17 @@ description: |
|
|
|
5
5
|
代謝フラックス解析(FBA / pFBA)・遺伝子制御ネットワーク推定(GRN)・
|
|
6
6
|
シグナル伝達経路モデリング・パラメータ推定・感度解析・
|
|
7
7
|
BioModels/Reactome/KEGG/BiGG 統合パイプライン。
|
|
8
|
+
ToolUniverse 連携: bigg_models, complex_portal, wikipathways。
|
|
9
|
+
tu_tools:
|
|
10
|
+
- key: bigg_models
|
|
11
|
+
name: BiGG Models
|
|
12
|
+
description: ゲノムスケール代謝モデル BiGG REST API
|
|
13
|
+
- key: complex_portal
|
|
14
|
+
name: Complex Portal
|
|
15
|
+
description: EBI タンパク質複合体データベース
|
|
16
|
+
- key: wikipathways
|
|
17
|
+
name: WikiPathways
|
|
18
|
+
description: WikiPathways コミュニティパスウェイ
|
|
8
19
|
---
|
|
9
20
|
|
|
10
21
|
# Scientific Systems Biology
|