@nahisaho/satori 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -4
- package/package.json +1 -1
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +295 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +332 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +341 -0
- package/src/.github/skills/scientific-infectious-disease/SKILL.md +342 -0
- package/src/.github/skills/scientific-microbiome-metagenomics/SKILL.md +349 -0
- package/src/.github/skills/scientific-population-genetics/SKILL.md +336 -0
- package/src/.github/skills/scientific-single-cell-genomics/SKILL.md +361 -0
- package/src/.github/skills/scientific-spatial-transcriptomics/SKILL.md +281 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +310 -0
- package/src/.github/skills/scientific-text-mining-nlp/SKILL.md +358 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-infectious-disease
|
|
3
|
+
description: |
|
|
4
|
+
感染症ゲノミクス・疫学スキル。病原体ゲノム解析(SNP/系統樹)・
|
|
5
|
+
AMR(薬剤耐性)遺伝子検出・分子疫学(MLST/cgMLST)・
|
|
6
|
+
アウトブレイク調査トレーシング・疫学的 SIR/SEIR コンパートメントモデル・
|
|
7
|
+
伝播ネットワーク推定パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Infectious Disease Genomics
|
|
11
|
+
|
|
12
|
+
病原体ゲノミクスと感染症疫学の統合解析パイプラインを提供する。
|
|
13
|
+
病原体配列タイピング、系統解析、薬剤耐性遺伝子検出、
|
|
14
|
+
アウトブレイク伝播推定、数理疫学モデルを体系的に扱う。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 病原体の全ゲノムシーケンスデータの解析が必要なとき
|
|
19
|
+
- 薬剤耐性(AMR)遺伝子を検出・分類するとき
|
|
20
|
+
- 分子疫学タイピング(MLST, cgMLST, SNP)を行うとき
|
|
21
|
+
- アウトブレイクの伝播経路を推定するとき
|
|
22
|
+
- SIR / SEIR 等のコンパートメントモデルで感染拡大をシミュレーションするとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. 病原体ゲノム前処理
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
def pathogen_qc_pipeline(fastq_r1, fastq_r2, reference_genome,
                         min_depth=30, min_coverage=0.95):
    """
    Run the pathogen WGS preprocessing pipeline and return the VCF path.

    Steps:
        1. fastp - read QC and adapter trimming
        2. bwa-mem2 - reference mapping, piped through
           ``samtools fixmate -m`` and ``samtools sort``
        3. samtools markdup / index - duplicate marking
        4. freebayes - variant calling into ``variants.vcf``
        5. samtools depth - mean depth and covered fraction

    Quality criteria (reported; a RuntimeWarning is issued when not met):
        - mean depth >= ``min_depth`` (default 30x)
        - fraction of reference positions with depth >= 1 is
          >= ``min_coverage`` (default 0.95)

    All intermediate and output files are written to the current working
    directory under fixed names (``trim_R1.fq.gz``, ``aligned.bam``,
    ``dedup.bam``, ``variants.vcf``, ...).

    Args:
        fastq_r1: Path to the forward-read FASTQ file.
        fastq_r2: Path to the reverse-read FASTQ file.
        reference_genome: Path to the reference FASTA.
        min_depth: Minimum acceptable mean depth.
        min_coverage: Minimum acceptable covered fraction (0-1).

    Returns:
        The string ``"variants.vcf"``.

    Raises:
        subprocess.CalledProcessError: If any external tool exits non-zero.
    """
    import shlex
    import subprocess
    import warnings

    r1, r2, ref = str(fastq_r1), str(fastq_r2), str(reference_genome)

    # QC + trimming (argument list, so paths with spaces are safe)
    subprocess.run(
        ["fastp", "-i", r1, "-I", r2,
         "-o", "trim_R1.fq.gz", "-O", "trim_R2.fq.gz",
         "--json", "qc_report.json"],
        check=True,
    )

    # Mapping. The aligner output is grouped by read name, so fixmate can
    # run directly on the stream; `-m` adds the mate-score tags that
    # `samtools markdup` needs. A shell is required for the pipe, so the
    # user-supplied path is quoted explicitly.
    mapping = (
        f"bwa-mem2 mem -t 8 {shlex.quote(ref)} trim_R1.fq.gz trim_R2.fq.gz | "
        "samtools fixmate -m - - | "
        "samtools sort -@ 4 -o aligned.bam -"
    )
    subprocess.run(mapping, shell=True, check=True)

    # Mark duplicates
    subprocess.run(["samtools", "markdup", "aligned.bam", "dedup.bam"],
                   check=True)
    subprocess.run(["samtools", "index", "dedup.bam"], check=True)

    # Variant calling
    with open("variants.vcf", "w") as vcf:
        subprocess.run(["freebayes", "-f", ref, "dedup.bam"],
                       stdout=vcf, check=True)

    # Coverage stats: stream `samtools depth -a` (one line per reference
    # position, depth in the last column) instead of holding it in memory.
    total_depth = 0
    n_positions = 0
    n_covered = 0
    depth_cmd = ["samtools", "depth", "-a", "dedup.bam"]
    with subprocess.Popen(depth_cmd, stdout=subprocess.PIPE, text=True) as proc:
        for line in proc.stdout:
            depth = int(line.rstrip("\n").rsplit("\t", 1)[-1])
            total_depth += depth
            n_positions += 1
            if depth > 0:
                n_covered += 1
    if proc.returncode:
        raise subprocess.CalledProcessError(proc.returncode, depth_cmd)

    mean_depth = total_depth / n_positions if n_positions else 0.0
    coverage = n_covered / n_positions if n_positions else 0.0
    print(f" QC: mean depth={mean_depth:.1f}x, coverage={coverage:.2%}")

    if mean_depth < min_depth:
        warnings.warn(
            f"mean depth {mean_depth:.1f}x is below min_depth={min_depth}",
            RuntimeWarning,
        )
    if coverage < min_coverage:
        warnings.warn(
            f"genome coverage {coverage:.3f} is below "
            f"min_coverage={min_coverage}",
            RuntimeWarning,
        )

    print(f" Pipeline complete: variants.vcf generated")
    return "variants.vcf"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 2. AMR 遺伝子検出
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
def detect_amr_genes(assembly_fasta, database="resfinder"):
    """
    Detect antimicrobial resistance (AMR) genes in an assembly.

    Supported ``database`` values:
        - "resfinder": runs ``python -m resfinder`` with ``--acquired
          --point`` into ``resfinder_results/`` and parses
          ``ResFinder_results_tab.txt`` with ``parse_resfinder_output``.
        - "card": runs ``rgi main`` (contig input, BLAST aligner) and loads
          ``rgi_results.txt``.

    AMRFinderPlus is not wired up in this function; requesting it raises
    ``ValueError`` like any other unsupported value.

    Args:
        assembly_fasta: Path to the assembled genome FASTA.
        database: Which detector to run ("resfinder" or "card").

    Returns:
        For "resfinder", a list of dicts from ``parse_resfinder_output``;
        for "card", a pandas DataFrame of the RGI tab-separated output.

    Raises:
        ValueError: If ``database`` is not supported.
        subprocess.CalledProcessError: If the external tool exits non-zero.
    """
    import subprocess

    supported = ("resfinder", "card")
    if database not in supported:
        # Previously this fell through and failed on an unbound `results`.
        raise ValueError(
            f"Unsupported AMR database {database!r}; expected one of {supported}"
        )

    fasta = str(assembly_fasta)

    if database == "resfinder":
        # NOTE(review): ResFinder's --point mode may also need a species
        # (-s) argument - confirm against the installed version.
        subprocess.run(
            ["python", "-m", "resfinder", "-ifa", fasta,
             "--acquired", "--point", "-o", "resfinder_results/"],
            check=True,
        )
        with open("resfinder_results/ResFinder_results_tab.txt") as f:
            results = parse_resfinder_output(f.readlines())
    else:
        subprocess.run(
            ["rgi", "main", "-i", fasta, "-o", "rgi_results",
             "-t", "contig", "-a", "BLAST"],
            check=True,
        )
        results = pd.read_csv("rgi_results.txt", sep="\t")

    # len() gives the row count for both the list and the DataFrame.
    n_genes = len(results)
    print(f" AMR: {n_genes} resistance genes detected ({database})")
    return results
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def parse_resfinder_output(lines):
    """
    Parse the lines of a ResFinder tab-separated results table.

    The first line is treated as the header. Columns are located by header
    name (case-insensitive: "Resistance gene", "Identity", "Coverage",
    "Phenotype"). Any name that is not present falls back to the historical
    fixed positions 0, 1, 2 and 5.

    Rows are skipped when they are too short to contain the gene, identity
    and coverage columns, or when identity/coverage are not numeric. A
    missing or empty phenotype is reported as "Unknown".

    Args:
        lines: Sequence of text lines, header first.

    Returns:
        List of dicts with keys "gene", "identity", "coverage", "phenotype".
    """
    if not lines:
        return []

    header = [name.strip().lower()
              for name in lines[0].rstrip("\r\n").split("\t")]

    def column(name, fallback):
        # Prefer the named column; keep the legacy position otherwise.
        return header.index(name) if name in header else fallback

    gene_col = column("resistance gene", 0)
    identity_col = column("identity", 1)
    coverage_col = column("coverage", 2)
    phenotype_col = column("phenotype", 5)
    required = max(gene_col, identity_col, coverage_col) + 1

    results = []
    for line in lines[1:]:
        # Strip only the line ending: stripping tabs would drop empty
        # trailing cells and shift the column count.
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < required:
            continue
        try:
            identity = float(fields[identity_col])
            coverage = float(fields[coverage_col])
        except ValueError:
            # Not a data row (e.g. a repeated header or a note line).
            continue
        phenotype = ""
        if len(fields) > phenotype_col:
            phenotype = fields[phenotype_col].strip()
        results.append({
            "gene": fields[gene_col].strip(),
            "identity": identity,
            "coverage": coverage,
            "phenotype": phenotype or "Unknown",
        })
    return results
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 3. 分子疫学タイピング
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
def molecular_typing(assembly_fasta, organism, scheme="mlst"):
    """
    Run molecular epidemiology typing on an assembly.

    Supported ``scheme`` values:
        - "mlst": runs the ``mlst`` command with ``--scheme organism`` and
          parses its tab-separated output line into file, scheme, sequence
          type (ST) and allele calls.
        - "cgmlst": runs ``chewbbaca AlleleCall`` against ``schema/`` and
          writes to ``cgmlst_results/``.

    "wgmlst" is not implemented here and raises ``ValueError``.

    Args:
        assembly_fasta: Path to the assembly FASTA (passed to the tool).
        organism: Scheme name passed to ``mlst --scheme``; unused for
            "cgmlst".
        scheme: Typing scheme to run.

    Returns:
        For "mlst": dict with keys "file", "scheme", "ST", "alleles".
        For "cgmlst": dict with keys "scheme" and "results_dir".

    Raises:
        ValueError: If ``scheme`` is not supported.
        RuntimeError: If ``mlst`` produced no parsable output line.
        subprocess.CalledProcessError: If the external tool exits non-zero.
    """
    import subprocess

    supported = ("mlst", "cgmlst")
    if scheme not in supported:
        # Previously this fell through and failed on an unbound variable.
        raise ValueError(
            f"Unsupported typing scheme {scheme!r}; expected one of {supported}"
        )

    fasta = str(assembly_fasta)

    if scheme == "mlst":
        proc = subprocess.run(
            ["mlst", fasta, "--scheme", str(organism)],
            capture_output=True, text=True, check=True,
        )
        output_lines = proc.stdout.strip().splitlines()
        fields = output_lines[0].split("\t") if output_lines else []
        if len(fields) < 3:
            raise RuntimeError(
                f"Unexpected mlst output for {fasta!r}: {proc.stdout!r}"
            )
        result = {
            "file": fields[0],
            "scheme": fields[1],
            "ST": fields[2],
            "alleles": fields[3:],
        }
    else:
        # NOTE(review): the executable name is kept as in the original
        # snippet; some chewBBACA installs expose `chewBBACA.py` instead.
        subprocess.run(
            ["chewbbaca", "AlleleCall", "-i", fasta,
             "-g", "schema/", "-o", "cgmlst_results/"],
            check=True,
        )
        result = {"scheme": "cgMLST", "results_dir": "cgmlst_results/"}

    print(f" Typing: ST={result.get('ST', 'N/A')} ({scheme})")
    return result
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## 4. 系統解析・伝播推定
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
def phylogenetic_analysis(alignment_fasta, method="iqtree", model="GTR+G"):
    """
    Infer a maximum-likelihood phylogeny from an alignment with IQ-TREE 2.

    Only ``method="iqtree"` is implemented: it runs ``iqtree2`` with the
    given substitution ``model``, 1000 ultrafast bootstrap replicates
    (``-bb``) and 1000 SH-aLRT replicates (``-alrt``), then reads the
    resulting ``<alignment_fasta>.treefile``.

    The default model is the fixed string "GTR+G".
    NOTE(review): to use IQ-TREE's automatic model selection, a ModelFinder
    value such as "MFP" presumably has to be passed as ``model`` - confirm.

    RAxML-NG and BEAST 2 are not implemented; requesting them raises
    ``ValueError``.

    Args:
        alignment_fasta: Path to the multiple sequence alignment.
        method: Inference backend; only "iqtree" is supported.
        model: Value passed to ``iqtree2 -m``.

    Returns:
        The tree parsed by ``Bio.Phylo.read(..., "newick")``.

    Raises:
        ValueError: If ``method`` is not "iqtree".
        subprocess.CalledProcessError: If ``iqtree2`` exits non-zero.
    """
    if method != "iqtree":
        # Previously this fell through and failed on an unbound `tree`.
        raise ValueError(
            f"Unsupported phylogenetic method {method!r}; only 'iqtree' "
            "is implemented"
        )

    import subprocess
    from Bio import Phylo

    subprocess.run(
        ["iqtree2", "-s", str(alignment_fasta), "-m", str(model),
         "-bb", "1000", "-alrt", "1000", "-nt", "AUTO"],
        check=True,
    )
    tree = Phylo.read(f"{alignment_fasta}.treefile", "newick")

    return tree
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def transmission_network(snp_matrix, max_snp_distance=10):
    """
    Build a SNP-distance-based transmission network.

    Every label in ``snp_matrix.index`` becomes a node. For each unordered
    pair of samples, the distance is read positionally from the matrix
    (row i, column j with i < j), and an edge is added when it is
    <= ``max_snp_distance``. The edge carries the distance as both
    ``weight`` and ``snp_distance``.

    Args:
        snp_matrix: Square pairwise distance table indexed by sample
            (assumes a pandas DataFrame whose column order matches its
            index order - TODO confirm with callers).
        max_snp_distance: Largest distance still linked by an edge.

    Returns:
        Tuple ``(G, mst, clusters)``: the undirected graph, the result of
        ``networkx.minimum_spanning_tree`` on it, and the list of its
        connected components (unlinked samples form their own component).
    """
    import networkx as nx

    sample_ids = snp_matrix.index.tolist()
    graph = nx.Graph()
    graph.add_nodes_from(sample_ids)

    n_samples = len(sample_ids)
    for row in range(n_samples):
        # Upper triangle only: each unordered pair is visited once.
        for col in range(row + 1, n_samples):
            distance = snp_matrix.iloc[row, col]
            if distance <= max_snp_distance:
                graph.add_edge(sample_ids[row], sample_ids[col],
                               weight=distance, snp_distance=distance)

    spanning = nx.minimum_spanning_tree(graph)
    components = list(nx.connected_components(graph))

    n_links = graph.number_of_edges()
    print(f" Transmission: {n_links} links, "
          f"{len(components)} clusters")
    return graph, spanning, components
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## 5. SIR / SEIR コンパートメントモデル
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from scipy.integrate import odeint
|
|
232
|
+
|
|
233
|
+
def sir_model(y, t, beta, gamma, N):
    """
    Right-hand side of the SIR compartment model.

        dS/dt = -beta * S * I / N
        dI/dt =  beta * S * I / N - gamma * I
        dR/dt =  gamma * I

    The basic reproduction number is R0 = beta / gamma.

    Args:
        y: Current state, unpacked as (S, I, R).
        t: Time; accepted to match the ODE-solver callback signature but
            not used in the equations.
        beta: Transmission rate.
        gamma: Recovery rate.
        N: Total population size used to normalise the infection term.

    Returns:
        List ``[dS/dt, dI/dt, dR/dt]``.
    """
    susceptible, infected, _recovered = y
    new_infections = beta * susceptible * infected / N
    recoveries = gamma * infected
    return [-new_infections, new_infections - recoveries, recoveries]
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def seir_model(y, t, beta, sigma, gamma, N):
    """
    Right-hand side of the SEIR compartment model (with a latent stage).

        dS/dt = -beta * S * I / N
        dE/dt =  beta * S * I / N - sigma * E
        dI/dt =  sigma * E - gamma * I
        dR/dt =  gamma * I

    Args:
        y: Current state, unpacked as (S, E, I, R).
        t: Time; accepted to match the ODE-solver callback signature but
            not used in the equations.
        beta: Transmission rate.
        sigma: Rate of leaving the exposed stage (1 / incubation period).
        gamma: Recovery rate.
        N: Total population size used to normalise the infection term.

    Returns:
        List ``[dS/dt, dE/dt, dI/dt, dR/dt]``.
    """
    susceptible, exposed, infected, _recovered = y
    new_exposures = beta * susceptible * infected / N
    onsets = sigma * exposed
    recoveries = gamma * infected
    return [-new_exposures, new_exposures - onsets,
            onsets - recoveries, recoveries]
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def run_epidemic_simulation(model="SIR", N=1e6, I0=10, R0=2.5,
                            gamma=1/10, sigma=1/5, days=180):
    """
    Simulate an epidemic with the SIR or SEIR compartment model.

    The transmission rate is derived as ``beta = R0 * gamma``. The system
    is integrated with ``scipy.integrate.odeint`` on ``int(days * 10)``
    evenly spaced time points from 0 to ``days``. All ``I0`` initial cases
    start in the I compartment (for SEIR the E compartment starts at 0).

    Args:
        model: "SIR" or "SEIR" (case-sensitive).
        N: Total population size.
        I0: Initial number of infectious individuals.
        R0: Basic reproduction number.
        gamma: Recovery rate (1 / infectious period).
        sigma: Onset rate (1 / latent period); SEIR only.
        days: Simulation length in days; must be positive.

    Returns:
        pandas DataFrame with one column per compartment plus "t".

    Raises:
        ValueError: If ``model`` is unknown or ``days`` is not positive.
    """
    if model not in ("SIR", "SEIR"):
        # Previously this fell through and failed on an unbound `df`.
        raise ValueError(f"Unknown model {model!r}; expected 'SIR' or 'SEIR'")
    if days <= 0:
        raise ValueError(f"days must be positive, got {days!r}")

    beta = R0 * gamma
    # np.linspace needs an integer sample count; int() also accepts a
    # float `days` such as 90.0.
    n_points = int(days * 10)
    t = np.linspace(0, days, n_points)

    if model == "SIR":
        y0 = [N - I0, I0, 0]
        sol = odeint(sir_model, y0, t, args=(beta, gamma, N))
        df = pd.DataFrame(sol, columns=["S", "I", "R"])
    else:
        y0 = [N - I0, 0, I0, 0]
        sol = odeint(seir_model, y0, t, args=(beta, sigma, gamma, N))
        df = pd.DataFrame(sol, columns=["S", "E", "I", "R"])

    df["t"] = t
    peak_I = df["I"].max()
    peak_day = df.loc[df["I"].idxmax(), "t"]

    print(f" {model}: R₀={R0:.1f}, peak infection={peak_I:.0f} at day {peak_day:.0f}")
    return df
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## References
|
|
301
|
+
|
|
302
|
+
### Output Files
|
|
303
|
+
|
|
304
|
+
| ファイル | 形式 |
|
|
305
|
+
|---|---|
|
|
306
|
+
| `results/amr_genes.csv` | CSV |
|
|
307
|
+
| `results/mlst_typing.json` | JSON |
|
|
308
|
+
| `results/snp_matrix.csv` | CSV |
|
|
309
|
+
| `results/transmission_network.json` | JSON |
|
|
310
|
+
| `results/epidemic_simulation.csv` | CSV |
|
|
311
|
+
| `figures/phylogenetic_tree.png` | PNG |
|
|
312
|
+
| `figures/transmission_network.png` | PNG |
|
|
313
|
+
| `figures/epidemic_curves.png` | PNG |
|
|
314
|
+
|
|
315
|
+
### 利用可能ツール
|
|
316
|
+
|
|
317
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
318
|
+
|
|
319
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
320
|
+
|---|---|---|
|
|
321
|
+
| EUHealthInfo | `euhealthinfo_search_infectious_diseases` | 感染症サーベイランスデータ |
|
|
322
|
+
| EUHealthInfo | `euhealthinfo_search_surveillance` | 疫学サーベイランス |
|
|
323
|
+
| CDC | `cdc_data_search_datasets` | CDC データセット検索 |
|
|
324
|
+
| CDC | `cdc_data_get_dataset` | CDC データ取得 |
|
|
325
|
+
| NCBI | `BLAST_nucleotide_search` | 病原体配列同定 |
|
|
326
|
+
| NCBI | `NCBI_get_sequence` | ゲノム配列取得 |
|
|
327
|
+
| PubMed | `PubMed_search_articles` | 感染症文献検索 |
|
|
328
|
+
| ClinicalTrials | `search_clinical_trials` | 感染症治療臨床試験 |
|
|
329
|
+
|
|
330
|
+
### 参照スキル
|
|
331
|
+
|
|
332
|
+
| スキル | 連携内容 |
|
|
333
|
+
|---|---|
|
|
334
|
+
| [scientific-sequence-analysis](../scientific-sequence-analysis/SKILL.md) | 配列アライメント・BLAST |
|
|
335
|
+
| [scientific-bioinformatics](../scientific-bioinformatics/SKILL.md) | ゲノムアノテーション |
|
|
336
|
+
| [scientific-network-analysis](../scientific-network-analysis/SKILL.md) | 伝播ネットワーク可視化 |
|
|
337
|
+
| [scientific-survival-clinical](../scientific-survival-clinical/SKILL.md) | 感染症アウトカム解析 |
|
|
338
|
+
| [scientific-bayesian-statistics](../scientific-bayesian-statistics/SKILL.md) | ベイズ系統年代学 |
|
|
339
|
+
|
|
340
|
+
#### 依存パッケージ
|
|
341
|
+
|
|
342
|
+
- Python パッケージ: biopython, ete3, numpy, pandas, scipy, networkx
- 外部 CLI ツール(subprocess 経由で実行): fastp, bwa-mem2, samtools, freebayes, iqtree2, mlst, ResFinder, RGI, chewBBACA
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-microbiome-metagenomics
|
|
3
|
+
description: |
|
|
4
|
+
マイクロバイオーム・メタゲノミクス解析スキル。16S rRNA アンプリコン解析(DADA2)・
|
|
5
|
+
ショットガンメタゲノム解析(MetaPhlAn / HUMAnN)・α/β 多様性・
|
|
6
|
+
差次存在量解析(DESeq2 / ANCOM-BC)・機能的プロファイリング・
|
|
7
|
+
組成データ解析(CoDA)パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Microbiome & Metagenomics
|
|
11
|
+
|
|
12
|
+
マイクロバイオーム解析の標準パイプラインを提供する。
|
|
13
|
+
16S rRNA アンプリコンおよびショットガンメタゲノムデータの
|
|
14
|
+
品質管理、分類学的プロファイリング、多様性評価、
|
|
15
|
+
差次存在量解析、機能的アノテーションを体系的に扱う。
|
|
16
|
+
|
|
17
|
+
## When to Use
|
|
18
|
+
|
|
19
|
+
- 16S rRNA アンプリコンシーケンスの解析が必要なとき
|
|
20
|
+
- ショットガンメタゲノムの分類学的・機能的プロファイリングを行うとき
|
|
21
|
+
- 群集の α / β 多様性を比較するとき
|
|
22
|
+
- 群間で差次存在量の微生物を同定するとき
|
|
23
|
+
- 組成データ(compositional data)の統計解析を行うとき
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. 16S rRNA アンプリコン解析(DADA2)
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
def dada2_pipeline(fastq_dir, trim_left=20, trunc_len_f=240, trunc_len_r=200,
                   min_overlap=12,
                   taxonomy_db="silva_nr99_v138.1_train_set.fa.gz"):
    """
    Run the DADA2 amplicon pipeline through a generated R script.

    The script is written to ``_dada2_pipeline.R`` in the current working
    directory and executed with ``Rscript``. Steps in the script:
        1. filterAndTrim - quality filtering and left-trimming
        2. learnErrors - error model learning
        3. dada - ASV (Amplicon Sequence Variant) inference
        4. mergePairs - paired-end merging
        5. removeBimeraDenovo - chimera removal
        6. assignTaxonomy - classification against ``taxonomy_db``

    Input files are matched by the patterns ``_R1_001.fastq.gz`` and
    ``_R2_001.fastq.gz`` inside ``fastq_dir``. Filtered reads go to
    ``<fastq_dir>/filtered``. Results are written to
    ``results/asv_table.csv`` and ``results/taxonomy.csv``.

    Args:
        fastq_dir: Directory containing the paired FASTQ files.
        trim_left: Passed to ``filterAndTrim(trimLeft=...)``.
        trunc_len_f: Forward-read truncation length.
        trunc_len_r: Reverse-read truncation length.
        min_overlap: Passed to ``mergePairs(minOverlap=...)``.
        taxonomy_db: Training set passed to ``assignTaxonomy``.

    Returns:
        Tuple ``(asv_table, taxonomy)`` of pandas DataFrames read from the
        two result CSV files.

    Raises:
        subprocess.CalledProcessError: If ``Rscript`` exits non-zero.
    """
    import json
    import os
    import subprocess

    # write.csv does not create directories; without this the R script
    # fails at the very end of the run.
    os.makedirs("results", exist_ok=True)

    # json.dumps yields a double-quoted literal whose escapes R also
    # understands, so backslashes and quotes in paths stay intact.
    r_path = json.dumps(str(fastq_dir), ensure_ascii=False)
    r_taxonomy_db = json.dumps(str(taxonomy_db), ensure_ascii=False)

    r_script = f"""
library(dada2)

path <- {r_path}
fnFs <- sort(list.files(path, pattern="_R1_001.fastq.gz", full.names=TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq.gz", full.names=TRUE))

# Filter and trim
filtFs <- file.path(path, "filtered", basename(fnFs))
filtRs <- file.path(path, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
                     trimLeft={trim_left}, truncLen=c({trunc_len_f},{trunc_len_r}),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE)

# Error learning
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)

# Denoise
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)

# Merge
merged <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, minOverlap={min_overlap})

# ASV table
seqtab <- makeSequenceTable(merged)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus")

# Taxonomy
taxa <- assignTaxonomy(seqtab.nochim, {r_taxonomy_db})

write.csv(seqtab.nochim, "results/asv_table.csv")
write.csv(taxa, "results/taxonomy.csv")
"""

    with open("_dada2_pipeline.R", "w", encoding="utf-8") as f:
        f.write(r_script)
    subprocess.run(["Rscript", "_dada2_pipeline.R"], check=True)

    asv_table = pd.read_csv("results/asv_table.csv", index_col=0)
    taxonomy = pd.read_csv("results/taxonomy.csv", index_col=0)
    print(f" DADA2: {asv_table.shape[1]} ASVs from {asv_table.shape[0]} samples")
    return asv_table, taxonomy
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## 2. ショットガン分類学的プロファイリング
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
def shotgun_taxonomic_profiling(fastq_files, method="metaphlan"):
    """
    Taxonomically profile shotgun metagenome samples, one file at a time.

    Supported ``method`` values:
        - "metaphlan": runs ``metaphlan`` and reads ``<sample>_profile.txt``.
        - "kraken2": runs ``kraken2`` against ``kraken2_db`` and reads the
          report file ``<sample>_report.txt``.

    "sourmash" is not implemented and raises ``ValueError``.

    The sample name is the file's base name with ``.fastq.gz`` removed.
    Tool outputs are written to the current working directory. Each table
    is read without a header row, skipping ``#`` comment lines, and gets a
    ``sample`` column before the tables are concatenated.

    Args:
        fastq_files: Iterable of FASTQ paths.
        method: Profiler to run.

    Returns:
        Concatenated pandas DataFrame; an empty DataFrame when
        ``fastq_files`` is empty.

    Raises:
        ValueError: If ``method`` is not supported.
        subprocess.CalledProcessError: If the profiler exits non-zero.
    """
    import os
    import subprocess

    supported = ("metaphlan", "kraken2")
    if method not in supported:
        # Previously this fell through and failed on an unbound `cmd`.
        raise ValueError(
            f"Unsupported profiling method {method!r}; "
            f"expected one of {supported}"
        )

    profiles = []
    for fq in fastq_files:
        fq = str(fq)
        sample = os.path.basename(fq).replace(".fastq.gz", "")

        if method == "metaphlan":
            table_path = f"{sample}_profile.txt"
            cmd = ["metaphlan", fq, "--input_type", "fastq",
                   "--nproc", "8", "-o", table_path,
                   "--bowtie2out", f"{sample}.bt2out"]
        else:
            # Kraken2 writes its per-taxon summary to the --report file;
            # the original code tried to read a _profile.txt that this
            # branch never creates.
            table_path = f"{sample}_report.txt"
            cmd = ["kraken2", "--db", "kraken2_db", "--threads", "8",
                   "--report", table_path,
                   "--output", f"{sample}_kraken.txt", fq]

        subprocess.run(cmd, check=True)
        profile = pd.read_csv(table_path, sep="\t",
                              comment="#", header=None)
        profile["sample"] = sample
        profiles.append(profile)

    # pd.concat raises on an empty list, so handle "no input" explicitly.
    merged = pd.concat(profiles, ignore_index=True) if profiles else pd.DataFrame()
    print(f" Profiling ({method}): {len(profiles)} samples processed")
    return merged
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## 3. α / β 多様性解析
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
from scipy.spatial.distance import braycurtis, pdist, squareform
|
|
145
|
+
from scipy.stats import mannwhitneyu, kruskal
|
|
146
|
+
from skbio.diversity import alpha_diversity, beta_diversity
|
|
147
|
+
|
|
148
|
+
def alpha_diversity_analysis(asv_table, metadata, group_col,
                             metrics=None):
    """
    Compute alpha (within-sample) diversity and compare it between groups.

    Each metric name is passed to ``skbio.diversity.alpha_diversity`` with
    the rows of ``asv_table`` as samples. Default metrics:
    "observed_features", "shannon", "simpson", "chao1".
    NOTE(review): the logarithm base used for "shannon" is decided by
    scikit-bio, not by this function - confirm for the installed version.

    Group comparison per metric:
        - exactly two groups: Mann-Whitney U test
        - three or more groups: Kruskal-Wallis test
        - fewer than two groups: no test; statistic and p-value are NaN

    Samples whose group is missing after the join with ``metadata`` stay
    in the returned table but are excluded from the tests.

    Args:
        asv_table: Samples x features count table (pandas DataFrame).
        metadata: Sample metadata indexed like ``asv_table``.
        group_col: Metadata column holding the group label.
        metrics: Optional list of metric names.

    Returns:
        Tuple ``(alpha_df, comparisons)``: per-sample metric values with
        the group column, and a dict
        ``{metric: {"statistic": ..., "p_value": ...}}``.
    """
    if metrics is None:
        metrics = ["observed_features", "shannon", "simpson", "chao1"]

    results = {
        metric: alpha_diversity(metric, asv_table.values, asv_table.index)
        for metric in metrics
    }

    alpha_df = pd.DataFrame(results, index=asv_table.index)
    alpha_df = alpha_df.join(metadata[[group_col]])

    # The left join yields NaN for samples missing from the metadata. NaN
    # never compares equal, so it would form an empty "group" and make the
    # tests raise.
    groups = alpha_df[group_col].dropna().unique()

    comparisons = {}
    for metric in metrics:
        group_data = [alpha_df.loc[alpha_df[group_col] == g, metric]
                      for g in groups]
        if len(group_data) < 2:
            # Nothing to compare against.
            stat, pval = float("nan"), float("nan")
        elif len(group_data) == 2:
            stat, pval = mannwhitneyu(group_data[0], group_data[1])
        else:
            stat, pval = kruskal(*group_data)
        comparisons[metric] = {"statistic": stat, "p_value": pval}

    print(f" α diversity: {len(metrics)} indices computed for {len(alpha_df)} samples")
    return alpha_df, comparisons
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def beta_diversity_analysis(asv_table, metadata, group_col,
                            metric="braycurtis", n_perms=999):
    """
    Compute beta (between-sample) diversity, PERMANOVA and PCoA.

    The distance matrix comes from ``skbio.diversity.beta_diversity``
    using ``metric`` (default "braycurtis"). Group differences are tested
    with ``skbio.stats.distance.permanova``, and the matrix is ordinated
    with ``skbio.stats.ordination.pcoa``.
    NOTE(review): which metric names are accepted (e.g. UniFrac variants,
    Aitchison) depends on scikit-bio - confirm before relying on them.
    No dispersion test (PERMDISP) is run here.

    Args:
        asv_table: Samples x features count table (pandas DataFrame).
        metadata: Sample metadata; must contain every sample of
            ``asv_table`` in its index.
        group_col: Metadata column holding the group label.
        metric: Distance metric name.
        n_perms: Number of PERMANOVA permutations.

    Returns:
        Tuple ``(dm, pcoa_result, permanova_result)``.
    """
    from skbio.stats.distance import permanova
    from skbio.stats.ordination import pcoa

    dm = beta_diversity(metric, asv_table.values, asv_table.index)

    # PERMANOVA
    groups = metadata.loc[asv_table.index, group_col]
    permanova_result = permanova(dm, groups, permutations=n_perms)

    # PCoA
    pcoa_result = pcoa(dm)

    # The 'test statistic' reported by skbio's permanova is the pseudo-F
    # value, not R², so label it accordingly.
    print(f" β diversity ({metric}): PERMANOVA pseudo-F={permanova_result['test statistic']:.4f}, "
          f"p={permanova_result['p-value']:.4f}")
    return dm, pcoa_result, permanova_result
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## 4. 差次存在量解析
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
def differential_abundance(asv_table, metadata, group_col,
                           formula="~group", method="ancombc"):
    """
    Differential abundance analysis through a generated R script.

    Supported ``method`` values:
        - "ancombc": ANCOM-BC2 (``ANCOMBC::ancombc2``) on a phyloseq object
        - "deseq2": DESeq2 with ``sfType="poscounts"``

    "aldex2" is not implemented and raises ``ValueError``.

    ``asv_table`` (samples x features) and the matching ``metadata`` rows
    are exported to temporary CSV files and loaded by the R script. The
    script is written to ``_da_analysis.R`` and the result table to
    ``results/da_results.csv``. If the metadata has no column named
    "group", ``group_col`` is copied to one so that the default formula
    ``"~group"`` refers to the requested grouping variable.

    Args:
        asv_table: Samples x features count table (pandas DataFrame).
        metadata: Sample metadata; must contain every sample of
            ``asv_table`` in its index.
        group_col: Metadata column holding the group label.
        formula: Model formula, with or without the leading "~".
        method: Analysis backend.

    Returns:
        pandas DataFrame read from ``results/da_results.csv``.

    Raises:
        ValueError: If ``method`` is unsupported or ``formula`` is empty.
        subprocess.CalledProcessError: If ``Rscript`` exits non-zero.
    """
    import json
    import os
    import subprocess
    import tempfile

    supported = ("ancombc", "deseq2")
    if method not in supported:
        # Previously this fell through and failed on an unbound `results`.
        raise ValueError(
            f"Unsupported method {method!r}; expected one of {supported}"
        )

    # Right-hand side of the formula: ancombc2 takes it without the tilde,
    # DESeq2 needs a full formula, so normalise once.
    rhs = str(formula).strip()
    if rhs.startswith("~"):
        rhs = rhs[1:].strip()
    if not rhs:
        raise ValueError("formula must name at least one variable")

    meta = metadata.loc[asv_table.index].copy()
    if "group" not in meta.columns:
        meta["group"] = meta[group_col]

    os.makedirs("results", exist_ok=True)

    with tempfile.TemporaryDirectory() as tmp_dir:
        counts_csv = os.path.join(tmp_dir, "counts.csv")
        meta_csv = os.path.join(tmp_dir, "metadata.csv")
        asv_table.to_csv(counts_csv)
        meta.to_csv(meta_csv)

        # json.dumps yields double-quoted literals with escapes that R
        # accepts, which keeps paths and the formula from breaking the
        # generated source.
        load_inputs = f"""
counts <- as.matrix(read.csv({json.dumps(counts_csv)}, row.names=1, check.names=FALSE))
meta <- read.csv({json.dumps(meta_csv)}, row.names=1, check.names=FALSE,
                 stringsAsFactors=TRUE)
"""

        if method == "ancombc":
            r_script = f"""
library(ANCOMBC)
library(phyloseq)
{load_inputs}
# Rows of `counts` are samples, so transpose to taxa-by-sample.
ps <- phyloseq(otu_table(t(counts), taxa_are_rows=TRUE), sample_data(meta))
# ANCOM-BC2 analysis
res <- ancombc2(data=ps, fix_formula={json.dumps(rhs)},
                p_adj_method="holm", alpha=0.05)
write.csv(res$res, "results/da_results.csv")
"""
        else:
            r_script = f"""
library(DESeq2)
{load_inputs}
asv_counts <- t(counts)
storage.mode(asv_counts) <- "integer"
dds <- DESeqDataSetFromMatrix(countData=asv_counts,
                              colData=meta,
                              design=as.formula({json.dumps("~ " + rhs)}))
# poscounts size factors tolerate features that are zero in some samples.
dds <- DESeq(dds, sfType="poscounts")
res <- results(dds)
write.csv(as.data.frame(res), "results/da_results.csv")
"""

        with open("_da_analysis.R", "w", encoding="utf-8") as f:
            f.write(r_script)
        subprocess.run(["Rscript", "_da_analysis.R"], check=True)

    results = pd.read_csv("results/da_results.csv", index_col=0)

    # Count significant taxa: DESeq2 reports "padj"; "q_val" is kept as a
    # fallback; otherwise use the per-term "q_*" columns, ignoring the
    # intercept.
    if "padj" in results.columns:
        n_sig = int((results["padj"] < 0.05).sum())
    elif "q_val" in results.columns:
        n_sig = int((results["q_val"] < 0.05).sum())
    else:
        q_cols = [c for c in results.columns
                  if str(c).startswith("q_") and "Intercept" not in str(c)]
        n_sig = int((results[q_cols] < 0.05).any(axis=1).sum()) if q_cols else 0

    print(f" DA ({method}): {n_sig} differentially abundant taxa")
    return results
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## 5. 機能的プロファイリング
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
def functional_profiling(fastq_files, method="humann"):
    """
    Functionally profile metagenome samples with HUMAnN.

    For each input file, ``humann`` is run with its output directed to
    ``humann_results/<sample>/``, where the sample name is the file's base
    name with ``.fastq.gz`` removed. The per-sample tables are then merged
    with ``humann_join_tables`` into ``results/pathway_abundance.tsv`` and
    ``results/genefamilies.tsv``.

    Only ``method="humann"`` is implemented. "picrust2" (or any other
    value) raises ``ValueError`` instead of silently running HUMAnN.

    Args:
        fastq_files: Iterable of FASTQ paths.
        method: Profiler to run; must be "humann".

    Returns:
        pandas DataFrame read from ``results/pathway_abundance.tsv``.

    Raises:
        ValueError: If ``method`` is not "humann".
        subprocess.CalledProcessError: If an external tool exits non-zero.
    """
    import os
    import subprocess

    if method != "humann":
        raise ValueError(
            f"Unsupported method {method!r}; only 'humann' is implemented"
        )

    for fq in fastq_files:
        fq = str(fq)
        sample = os.path.basename(fq).replace(".fastq.gz", "")
        subprocess.run(
            ["humann", "--input", fq,
             "--output", f"humann_results/{sample}/",
             "--threads", "8",
             "--nucleotide-database", "chocophlan",
             "--protein-database", "uniref"],
            check=True,
        )

    # Merge results. The join tool does not create the output directory,
    # and the per-sample tables live one level down, so subdirectories
    # must be searched explicitly.
    os.makedirs("results", exist_ok=True)
    join_jobs = (
        ("results/pathway_abundance.tsv", "pathabundance"),
        ("results/genefamilies.tsv", "genefamilies"),
    )
    for output_path, file_name in join_jobs:
        subprocess.run(
            ["humann_join_tables", "-i", "humann_results/",
             "-o", output_path, "--file_name", file_name,
             "--search-subdirectories"],
            check=True,
        )

    pathways = pd.read_csv("results/pathway_abundance.tsv", sep="\t", index_col=0)
    print(f" HUMAnN: {pathways.shape[0]} pathways across {pathways.shape[1]} samples")
    return pathways
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## References
|
|
309
|
+
|
|
310
|
+
### Output Files
|
|
311
|
+
|
|
312
|
+
| ファイル | 形式 |
|
|
313
|
+
|---|---|
|
|
314
|
+
| `results/asv_table.csv` | CSV |
|
|
315
|
+
| `results/taxonomy.csv` | CSV |
|
|
316
|
+
| `results/alpha_diversity.csv` | CSV |
|
|
317
|
+
| `results/beta_distance_matrix.csv` | CSV |
|
|
318
|
+
| `results/da_results.csv` | CSV |
|
|
319
|
+
| `results/pathway_abundance.tsv` | TSV |
|
|
320
|
+
| `figures/alpha_boxplot.png` | PNG |
|
|
321
|
+
| `figures/pcoa_plot.png` | PNG |
|
|
322
|
+
| `figures/barplot_taxonomy.png` | PNG |
|
|
323
|
+
|
|
324
|
+
### 利用可能ツール
|
|
325
|
+
|
|
326
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
327
|
+
|
|
328
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
329
|
+
|---|---|---|
|
|
330
|
+
| MGnify | `MGnify_search_studies` | メタゲノム研究検索 |
|
|
331
|
+
| MGnify | `MGnify_list_analyses` | メタゲノム解析一覧 |
|
|
332
|
+
| KEGG | `kegg_get_pathway_info` | 代謝パスウェイ情報 |
|
|
333
|
+
| KEGG | `kegg_search_pathway` | パスウェイ検索 |
|
|
334
|
+
| MetaCyc | `MetaCyc_search_pathways` | 代謝経路検索 |
|
|
335
|
+
| PubMed | `PubMed_search_articles` | マイクロバイオーム文献検索 |
|
|
336
|
+
|
|
337
|
+
### 参照スキル
|
|
338
|
+
|
|
339
|
+
| スキル | 連携内容 |
|
|
340
|
+
|---|---|
|
|
341
|
+
| [scientific-metabolomics](../scientific-metabolomics/SKILL.md) | 代謝物-微生物相関 |
|
|
342
|
+
| [scientific-network-analysis](../scientific-network-analysis/SKILL.md) | 微生物共起ネットワーク |
|
|
343
|
+
| [scientific-statistical-testing](../scientific-statistical-testing/SKILL.md) | 多重検定補正 |
|
|
344
|
+
| [scientific-multi-omics](../scientific-multi-omics/SKILL.md) | マルチオミクス統合 |
|
|
345
|
+
| [scientific-causal-inference](../scientific-causal-inference/SKILL.md) | 因果推論(微生物-表現型) |
|
|
346
|
+
|
|
347
|
+
#### 依存パッケージ
|
|
348
|
+
|
|
349
|
+
- Python パッケージ: numpy, pandas, scipy, scikit-bio, biom-format, qiime2
- R パッケージ: dada2, ANCOM-BC, DESeq2, phyloseq
- 外部 CLI ツール(subprocess 経由で実行): Rscript, MetaPhlAn, Kraken2, HUMAnN
|