@nahisaho/satori 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +123 -4
- package/package.json +1 -1
- package/src/.github/skills/scientific-environmental-ecology/SKILL.md +295 -0
- package/src/.github/skills/scientific-epidemiology-public-health/SKILL.md +332 -0
- package/src/.github/skills/scientific-immunoinformatics/SKILL.md +341 -0
- package/src/.github/skills/scientific-infectious-disease/SKILL.md +342 -0
- package/src/.github/skills/scientific-microbiome-metagenomics/SKILL.md +349 -0
- package/src/.github/skills/scientific-population-genetics/SKILL.md +336 -0
- package/src/.github/skills/scientific-single-cell-genomics/SKILL.md +361 -0
- package/src/.github/skills/scientific-spatial-transcriptomics/SKILL.md +281 -0
- package/src/.github/skills/scientific-systems-biology/SKILL.md +310 -0
- package/src/.github/skills/scientific-text-mining-nlp/SKILL.md +358 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-population-genetics
|
|
3
|
+
description: |
|
|
4
|
+
集団遺伝学解析スキル。アレル頻度解析・Hardy-Weinberg 平衡検定・
|
|
5
|
+
集団構造解析(PCA / ADMIXTURE)・Fst 分化指標・選択圧検出(iHS / XP-EHH)・
|
|
6
|
+
連鎖不平衡(LD)解析・GWAS Catalog / gnomAD データ統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Population Genetics
|
|
10
|
+
|
|
11
|
+
集団遺伝学に特化した解析パイプラインを提供する。
|
|
12
|
+
アレル頻度、集団構造、遺伝的分化、自然選択シグナル、
|
|
13
|
+
連鎖不平衡の解析を体系的に扱い、GWAS Catalog・gnomAD との統合を支援する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- アレル頻度分布や Hardy-Weinberg 平衡を検定するとき
|
|
18
|
+
- 集団構造(PCA / ADMIXTURE / STRUCTURE)を解析するとき
|
|
19
|
+
- 集団間の遺伝的分化(Fst)を評価するとき
|
|
20
|
+
- 自然選択シグナル(iHS / Tajima's D / XP-EHH)を検出するとき
|
|
21
|
+
- GWAS 関連バリアントの集団遺伝学的解釈を行うとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. QC・アレル頻度解析
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
|
|
33
|
+
def genotype_qc(plink_prefix, mind=0.02, geno=0.02, maf=0.01,
                hwe_p=1e-6):
    """
    Run genotype QC with PLINK 2 and return the prefix of the filtered data.

    Three ``plink2`` commands are executed in order:

    1. Filter ``plink_prefix`` with ``--mind``, ``--geno``, ``--maf`` and
       ``--hwe`` and write a new binary file set ``{plink_prefix}_qc``.
    2. LD-prune the filtered data (``--indep-pairwise 50 5 0.2``), writing
       ``{plink_prefix}_prune.prune.in``.
    3. Estimate pairwise relatedness on the pruned SNPs, writing output
       under ``{plink_prefix}_ibd``.

    Only the filtering of step 1 is reflected in the returned data set.
    Related samples are NOT removed here (the relatedness output is only
    written to disk), and no sex check or PCA outlier removal is performed.

    Args:
        plink_prefix: Path prefix of the input PLINK binary file set.
        mind: Value passed to ``--mind`` (per-sample missingness limit).
        geno: Value passed to ``--geno`` (per-variant missingness limit).
        maf: Value passed to ``--maf`` (minimum minor allele frequency).
        hwe_p: Value passed to ``--hwe`` (Hardy-Weinberg p-value threshold).

    Returns:
        The string ``f"{plink_prefix}_qc"``.

    Raises:
        subprocess.CalledProcessError: If any ``plink2`` call exits non-zero.
        FileNotFoundError: If the ``plink2`` executable is not on ``PATH``.
    """
    import subprocess

    qc_prefix = f"{plink_prefix}_qc"
    prune_prefix = f"{plink_prefix}_prune"
    ibd_prefix = f"{plink_prefix}_ibd"

    def run_plink2(*args):
        # An argument vector (no shell) keeps prefixes containing spaces or
        # shell metacharacters from being split or interpreted as commands.
        subprocess.run(["plink2", *(str(arg) for arg in args)], check=True)

    # Step 1: missingness / MAF / HWE filters.
    run_plink2("--bfile", plink_prefix,
               "--mind", mind, "--geno", geno, "--maf", maf,
               "--hwe", hwe_p,
               "--make-bed", "--out", qc_prefix)

    # Step 2: LD pruning so that relatedness is estimated on roughly
    # independent markers.
    run_plink2("--bfile", qc_prefix,
               "--indep-pairwise", 50, 5, 0.2,
               "--out", prune_prefix)

    # Step 3: relatedness estimation on the pruned marker set.
    # NOTE(review): `--genome` is a PLINK 1.9 flag; PLINK 2 is believed to
    # reject it in favour of `--make-king-table` / `--king-cutoff`. Confirm
    # against the installed plink2 before relying on this step.
    run_plink2("--bfile", qc_prefix,
               "--extract", f"{prune_prefix}.prune.in",
               "--genome", "--out", ibd_prefix)

    return qc_prefix
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def allele_frequency_stats(genotype_matrix, populations):
    """
    Compute per-population, per-SNP allele-frequency statistics.

    Genotypes are read as allele-dosage codes: 0 = AA, 1 = Aa, 2 = aa.
    Any other value (including NaN) is treated as missing and excluded from
    every count, so frequencies are based on called genotypes only.

    Args:
        genotype_matrix: DataFrame with one row per sample and one column
            per SNP, holding the 0/1/2 codes.
        populations: DataFrame with a ``"population"`` column whose index
            labels select rows of ``genotype_matrix``.

    Returns:
        DataFrame with one row per (population, SNP) and the columns
        ``snp``, ``population``, ``MAF``, ``p`` (frequency of allele A),
        ``Het_obs`` (n_Aa / n_called), ``Het_exp`` (2pq) and ``HWE_p``.
        Frequencies are rounded to 4 decimals. ``HWE_p`` is the 1-df
        chi-square p-value and is NOT rounded, so that very small p-values
        (e.g. below a 1e-6 QC threshold) remain distinguishable from zero.
        ``HWE_p`` is 1.0 when an expected genotype count is zero
        (monomorphic SNP). All statistics are NaN when the population has
        no called genotype at the SNP.
    """
    from scipy.stats import chi2

    nan = float("nan")
    results = []
    for pop in populations["population"].unique():
        pop_idx = populations[populations["population"] == pop].index
        geno_pop = genotype_matrix.loc[pop_idx]

        for snp in geno_pop.columns:
            counts = geno_pop[snp].value_counts()
            n_0 = int(counts.get(0, 0))  # AA
            n_1 = int(counts.get(1, 0))  # Aa
            n_2 = int(counts.get(2, 0))  # aa
            # Denominator is the number of called genotypes; codes other
            # than 0/1/2 must not dilute the allele frequency.
            n = n_0 + n_1 + n_2

            if n == 0:
                results.append({
                    "snp": snp, "population": pop,
                    "MAF": nan, "p": nan,
                    "Het_obs": nan, "Het_exp": nan,
                    "HWE_p": nan,
                })
                continue

            p = (2 * n_0 + n_1) / (2 * n)
            q = 1 - p
            maf = min(p, q)

            # Hardy-Weinberg chi-square goodness-of-fit test.
            exp_0 = n * p**2
            exp_1 = n * 2 * p * q
            exp_2 = n * q**2
            if exp_0 > 0 and exp_1 > 0 and exp_2 > 0:
                chi2_stat = ((n_0 - exp_0)**2 / exp_0 +
                             (n_1 - exp_1)**2 / exp_1 +
                             (n_2 - exp_2)**2 / exp_2)
                # Survival function keeps precision in the far tail, where
                # `1 - cdf` underflows to exactly 0.
                hwe_p = float(chi2.sf(chi2_stat, df=1))
            else:
                hwe_p = 1.0

            het_obs = n_1 / n
            het_exp = 2 * p * q

            results.append({
                "snp": snp, "population": pop,
                "MAF": round(maf, 4), "p": round(p, 4),
                "Het_obs": round(het_obs, 4), "Het_exp": round(het_exp, 4),
                "HWE_p": hwe_p,
            })

    return pd.DataFrame(results)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## 2. 集団構造解析
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
def population_structure(plink_prefix, n_components=10, method="pca"):
    """
    Run a population-structure analysis with PLINK 2 PCA or ADMIXTURE.

    ``method="pca"``
        Runs ``plink2 --pca n_components`` and reads the resulting
        ``{plink_prefix}_pca.eigenvec`` / ``.eigenval`` files.
        Conceptually, for a standardized genotype matrix X of shape
        (n_samples x n_snps), sample PCs are the eigenvectors of the
        (n_samples x n_samples) matrix X Xᵀ / n_snps.

    ``method="admixture"``
        Runs ``admixture --cv {plink_prefix}.bed K`` for K = 2..10, parses
        the "CV error" line from stdout, and loads the Q matrix of the K
        with the smallest CV error. ``n_components`` is not used.

    Args:
        plink_prefix: Path prefix of the PLINK binary file set.
        n_components: Number of principal components requested from PLINK.
        method: ``"pca"`` or ``"admixture"``.

    Returns:
        For ``"pca"``: ``(eigenvec, var_explained)`` where ``eigenvec`` is
        the eigenvector table and ``var_explained`` is each eigenvalue
        divided by the sum of the eigenvalues that were written (i.e. the
        share among the computed PCs, not of total genetic variance).
        For ``"admixture"``: ``(Q, cv_errors, best_K)``.

    Raises:
        ValueError: If ``method`` is not supported, or if no CV error could
            be parsed from any ADMIXTURE run.
        subprocess.CalledProcessError: If ``plink2`` exits non-zero.
    """
    if method not in ("pca", "admixture"):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'pca' or 'admixture'")

    import os
    import subprocess

    if method == "pca":
        out_prefix = f"{plink_prefix}_pca"
        # Argument vector instead of a shell string: safe for prefixes with
        # spaces or shell metacharacters.
        subprocess.run(
            ["plink2", "--bfile", str(plink_prefix),
             "--pca", str(n_components), "--out", out_prefix],
            check=True)

        eigenvec = pd.read_csv(f"{out_prefix}.eigenvec", sep="\t")
        eigenval = pd.read_csv(f"{out_prefix}.eigenval", header=None)
        var_explained = eigenval[0] / eigenval[0].sum()

        # Report at most the first two PCs so n_components=1 does not fail.
        summary = ", ".join(f"PC{i + 1}={v:.3f}"
                            for i, v in enumerate(var_explained[:2]))
        print(f" PCA: {summary}")
        return eigenvec, var_explained

    # method == "admixture"
    bed_path = f"{plink_prefix}.bed"
    cv_errors = {}
    last_stderr = ""
    for K in range(2, 11):
        result = subprocess.run(["admixture", "--cv", bed_path, str(K)],
                                capture_output=True, text=True)
        last_stderr = result.stderr
        # A failed run simply contributes no CV error for this K.
        for line in result.stdout.split("\n"):
            if "CV error" in line:
                cv_errors[K] = float(line.rsplit(":", 1)[-1])

    if not cv_errors:
        raise ValueError(
            "ADMIXTURE produced no 'CV error' line for any K in 2..10; "
            f"last stderr: {last_stderr.strip()!r}")

    best_K = min(cv_errors, key=cv_errors.get)
    # ADMIXTURE writes <basename>.<K>.Q into the current working directory,
    # not next to the input file, so drop any directory part of the prefix.
    q_path = f"{os.path.basename(str(plink_prefix))}.{best_K}.Q"
    Q = pd.read_csv(q_path, sep=" ", header=None)

    print(f" ADMIXTURE: best K={best_K} (CV error={cv_errors[best_K]:.4f})")
    return Q, cv_errors, best_K
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## 3. 遺伝的分化(Fst)
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
def calculate_fst(genotype_matrix, populations, method="weir_cockerham"):
    """
    Estimate per-SNP and genome-wide Fst between populations.

    Genotypes are allele-dosage codes (0 = AA, 1 = Aa, 2 = aa); NaN is
    missing. For each SNP the estimator is computed from population allele
    frequencies p_i and called-sample counts n_i:

        MSP = sum_i n_i (p_i - p_bar)^2 / (r - 1)
        MSG = sum_i n_i p_i (1 - p_i) / (N - r)
        n_c = (N - sum_i n_i^2 / N) / (r - 1)
        Fst = (MSP - MSG) / (MSP + (n_c - 1) MSG)

    with r populations having data at the SNP and N = sum_i n_i.
    NOTE(review): this is a frequency-based ANOVA estimator; it does not
    use observed heterozygosity, so it is a simplification of the full
    three-component Weir & Cockerham (1984) estimator — confirm this is
    the intended variant.

    Conventional reading of Fst: < 0.05 low, 0.05-0.15 moderate,
    0.15-0.25 large, >= 0.25 very large differentiation.

    Args:
        genotype_matrix: DataFrame, rows = samples, columns = SNPs.
        populations: DataFrame with a ``"population"`` column whose index
            labels select rows of ``genotype_matrix``.
        method: Accepted for interface compatibility; currently ignored.

    Returns:
        ``(fst_df, genome_fst)``. ``fst_df`` has columns ``snp``, ``Fst``
        (clipped at 0; 0.0 where the estimator is undefined, e.g. fewer
        than two populations with data or a monomorphic SNP) and ``p_bar``
        (sample-size-weighted mean allele frequency; NaN if no genotype is
        called). ``genome_fst`` is the weighted average over SNPs, computed
        as sum(numerators) / sum(denominators) and clipped at 0; it is 0.0
        if no SNP is informative and NaN if there are no SNPs at all.
    """
    pop_labels = populations["population"]
    # Row labels of each population do not depend on the SNP: resolve once.
    pop_rows = [pop_labels[pop_labels == pop].index
                for pop in pop_labels.unique()]

    records = []
    num_total = 0.0
    den_total = 0.0
    for snp in genotype_matrix.columns:
        freqs = []
        sizes = []
        for rows in pop_rows:
            geno = genotype_matrix.loc[rows, snp].dropna()
            n_i = len(geno)
            if n_i == 0:
                # A population with no called genotype carries no
                # information here and would otherwise inject NaN.
                continue
            n_ref_alleles = 2 * (geno == 0).sum() + (geno == 1).sum()
            freqs.append(float(n_ref_alleles) / (2 * n_i))
            sizes.append(n_i)

        n_pops = len(sizes)
        n_total = sum(sizes)
        if n_total:
            p_bar = sum(f * n for f, n in zip(freqs, sizes)) / n_total
        else:
            p_bar = float("nan")

        fst = 0.0
        if n_pops >= 2 and n_total > n_pops:
            msp = sum(n * (f - p_bar) ** 2
                      for f, n in zip(freqs, sizes)) / (n_pops - 1)
            msg = sum(n * f * (1 - f)
                      for f, n in zip(freqs, sizes)) / (n_total - n_pops)
            nc = (n_total - sum(n ** 2 for n in sizes) / n_total) / (n_pops - 1)

            numerator = msp - msg
            denominator = msp + (nc - 1) * msg
            if denominator > 0:
                fst = numerator / denominator
                # Accumulate unclipped components: averaging clipped
                # per-SNP ratios would bias the genome-wide value upward.
                num_total += numerator
                den_total += denominator

        records.append({"snp": snp, "Fst": max(fst, 0.0), "p_bar": p_bar})

    fst_df = pd.DataFrame(records, columns=["snp", "Fst", "p_bar"])

    if den_total > 0:
        genome_fst = max(num_total / den_total, 0.0)
    elif records:
        genome_fst = 0.0
    else:
        genome_fst = float("nan")

    max_fst = fst_df["Fst"].max() if records else float("nan")
    print(f" Fst: genome-wide={genome_fst:.4f}, "
          f"max per-SNP={max_fst:.4f}")
    return fst_df, genome_fst
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## 4. 自然選択シグナル検出
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
def selection_scan(haplotype_matrix, positions, method="ihs"):
    """
    Scan for signals of natural selection with scikit-allel.

    ``method="tajima_d"``
        Sliding-window Tajima's D (50 kb windows, 10 kb step, starting at
        coordinate 0); windows with 5 or fewer SNPs are skipped.
        D > 0 suggests balancing selection or population contraction,
        D < 0 positive selection or expansion, D ~ 0 neutrality.
    ``method="ihs"``
        Integrated haplotype score per SNP from ``allel.ihs``, then
        z-scored over all SNPs. |iHS| > 2 marks candidates.
    ``method="xpehh"``
        Not implemented: it needs haplotypes from two populations, which
        this signature cannot carry.

    Args:
        haplotype_matrix: 2-D array of 0/1 haplotype calls. Either
            orientation is accepted; the axis whose length equals
            ``len(positions)`` is taken as the variant axis. If both axes
            match, ``"ihs"`` reads it as (variants x haplotypes) and
            ``"tajima_d"`` as (haplotypes x variants), which is how each
            branch historically interpreted its input.
            # assumes biallelic 0/1 coding with no missing calls — TODO confirm
        positions: 1-D array of variant coordinates, ascending.
        method: ``"ihs"`` or ``"tajima_d"``.

    Returns:
        ``"tajima_d"``: DataFrame with columns ``start``, ``end``, ``D``,
        ``n_snps`` (empty if no window qualifies).
        ``"ihs"``: array of standardized iHS values, one per variant.

    Raises:
        NotImplementedError: For ``method="xpehh"``.
        ValueError: For any other unknown method, or if neither axis of
            ``haplotype_matrix`` matches ``len(positions)``.
    """
    if method == "xpehh":
        raise NotImplementedError(
            "method='xpehh' needs haplotypes from two populations and is "
            "not implemented by selection_scan")
    if method not in ("tajima_d", "ihs"):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'ihs' or 'tajima_d'")

    positions = np.asarray(positions)
    haps = np.asarray(haplotype_matrix)
    if haps.ndim != 2:
        raise ValueError("haplotype_matrix must be 2-dimensional")

    # Normalize to scikit-allel's (n_variants, n_haplotypes) layout.
    n_variants = positions.shape[0]
    rows_match = haps.shape[0] == n_variants
    cols_match = haps.shape[1] == n_variants
    if rows_match and cols_match:
        variants_first = method == "ihs"
    elif rows_match:
        variants_first = True
    elif cols_match:
        variants_first = False
    else:
        raise ValueError(
            f"haplotype_matrix shape {haps.shape} has no axis of length "
            f"len(positions)={n_variants}")
    if not variants_first:
        haps = haps.T

    import allel

    if method == "tajima_d":
        window_size = 50000
        step = 10000
        columns = ["start", "end", "D", "n_snps"]

        if n_variants == 0:
            return pd.DataFrame(columns=columns)

        # tajima_d needs one count column per allele; a single column of
        # alternate-allele counts makes every site look monomorphic.
        n_haplotypes = haps.shape[1]
        alt_counts = haps.sum(axis=1).astype(np.int64)
        allele_counts = np.column_stack([n_haplotypes - alt_counts, alt_counts])

        D_values = []
        for start in range(0, int(positions[-1]), step):
            end = start + window_size
            mask = (positions >= start) & (positions < end)
            if mask.sum() > 5:
                ac = allel.AlleleCountsArray(allele_counts[mask])
                D = allel.tajima_d(ac)
                D_values.append({"start": start, "end": end, "D": D,
                                 "n_snps": int(mask.sum())})

        df = pd.DataFrame(D_values, columns=columns)
        if df.empty:
            print(" Tajima's D: no window with more than 5 SNPs")
        else:
            print(f" Tajima's D: mean={df['D'].mean():.3f}, "
                  f"range=[{df['D'].min():.3f}, {df['D'].max():.3f}]")
        return df

    # method == "ihs"
    ihs = allel.ihs(haps, positions)
    # Genome-wide z-score, ignoring NaN scores.
    # NOTE(review): iHS is conventionally standardized within
    # allele-frequency bins (see allel.standardize_by_allele_count);
    # confirm a single global z-score is intended.
    ihs_std = (ihs - np.nanmean(ihs)) / np.nanstd(ihs)

    n_sig = np.sum(np.abs(ihs_std) > 2)
    print(f" iHS: {n_sig} candidate regions (|iHS|>2)")
    return ihs_std
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
## References
|
|
294
|
+
|
|
295
|
+
### Output Files
|
|
296
|
+
|
|
297
|
+
| ファイル | 形式 |
|
|
298
|
+
|---|---|
|
|
299
|
+
| `results/allele_frequencies.csv` | CSV |
|
|
300
|
+
| `results/pca_eigenvec.csv` | CSV |
|
|
301
|
+
| `results/admixture_Q.csv` | CSV |
|
|
302
|
+
| `results/fst_per_snp.csv` | CSV |
|
|
303
|
+
| `results/selection_scan.csv` | CSV |
|
|
304
|
+
| `figures/pca_populations.png` | PNG |
|
|
305
|
+
| `figures/admixture_barplot.png` | PNG |
|
|
306
|
+
| `figures/manhattan_fst.png` | PNG |
|
|
307
|
+
|
|
308
|
+
### 利用可能ツール
|
|
309
|
+
|
|
310
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
311
|
+
|
|
312
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
313
|
+
|---|---|---|
|
|
314
|
+
| gnomAD | `gnomad_get_variant` | バリアント集団頻度 |
|
|
315
|
+
| gnomAD | `gnomad_get_gene_constraints` | 遺伝子制約指標 |
|
|
316
|
+
| gnomAD | `gnomad_get_region` | 領域別バリアント |
|
|
317
|
+
| gnomAD | `gnomad_search_variants` | バリアント検索 |
|
|
318
|
+
| GWAS | `GWAS_search_associations_by_gene` | 遺伝子別 GWAS 関連 |
|
|
319
|
+
| GWAS | `gwas_search_studies` | GWAS 研究検索 |
|
|
320
|
+
| GWAS | `gwas_get_variants_for_trait` | 形質別バリアント |
|
|
321
|
+
| GWAS | `gwas_get_associations_for_snp` | SNP 別関連 |
|
|
322
|
+
| GWAS | `gwas_get_snps_for_gene` | 遺伝子近傍 SNP |
|
|
323
|
+
|
|
324
|
+
### 参照スキル
|
|
325
|
+
|
|
326
|
+
| スキル | 連携内容 |
|
|
327
|
+
|---|---|
|
|
328
|
+
| [scientific-variant-interpretation](../scientific-variant-interpretation/SKILL.md) | バリアント臨床解釈 |
|
|
329
|
+
| [scientific-bioinformatics](../scientific-bioinformatics/SKILL.md) | ゲノムアノテーション |
|
|
330
|
+
| [scientific-disease-research](../scientific-disease-research/SKILL.md) | 疾患-遺伝子関連 |
|
|
331
|
+
| [scientific-statistical-testing](../scientific-statistical-testing/SKILL.md) | 統計検定 |
|
|
332
|
+
| [scientific-pca-tsne](../scientific-pca-tsne/SKILL.md) | 次元削減 |
|
|
333
|
+
|
|
334
|
+
#### 依存パッケージ
|
|
335
|
+
|
|
336
|
+
- Python パッケージ: scikit-allel, pandas, numpy, scipy
- 外部コマンドラインツール(PATH 上に必要): plink2, admixture
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-single-cell-genomics
|
|
3
|
+
description: |
|
|
4
|
+
シングルセルゲノミクス解析スキル。scRNA-seq データの品質管理・正規化・
|
|
5
|
+
次元削減(PCA/UMAP)・クラスタリング(Leiden)・差次発現遺伝子(DEG)同定・
|
|
6
|
+
セルタイプアノテーション・RNA velocity・細胞間コミュニケーション推定パイプライン。
|
|
7
|
+
Scanpy / AnnData フレームワークに準拠。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Single-Cell Genomics
|
|
11
|
+
|
|
12
|
+
scRNA-seq / snRNA-seq データを対象に、QC → 正規化 → 高変動遺伝子選択 →
|
|
13
|
+
次元削減 → クラスタリング → DEG → セルタイプアノテーションの標準パイプラインを提供する。
|
|
14
|
+
CELLxGENE Census・HCA データポータルとの連携も組み込む。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- scRNA-seq / snRNA-seq データの解析パイプラインが必要なとき
|
|
19
|
+
- セルタイプのクラスタリングとアノテーションを行うとき
|
|
20
|
+
- クラスタ間の差次発現遺伝子を同定するとき
|
|
21
|
+
- RNA velocity による細胞分化軌跡を解析するとき
|
|
22
|
+
- CellChat / CellPhoneDB による細胞間コミュニケーション推定を行うとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. QC・前処理パイプライン
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import scanpy as sc
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
def sc_qc_preprocessing(adata, min_genes=200, max_genes=5000,
                        max_pct_mito=20, min_cells=3):
    """
    Apply standard scRNA-seq quality control and return the filtered data.

    Steps:
        1. Flag genes whose name starts with "MT-" or "mt-" in
           ``adata.var["mt"]`` and compute QC metrics with scanpy.
        2. Drop cells with fewer than ``min_genes`` detected genes.
        3. Keep cells with ``n_genes_by_counts <= max_genes`` and
           ``pct_counts_mt <= max_pct_mito``.
        4. Drop genes expressed in fewer than ``min_cells`` cells.

    Steps 1-2 modify the ``adata`` object that was passed in; steps 3-4
    operate on a copy, which is what gets returned.

    Args:
        adata: AnnData object holding the count matrix.
        min_genes: Minimum detected genes per cell.
        max_genes: Maximum detected genes per cell.
        max_pct_mito: Maximum percentage of mitochondrial counts per cell.
        min_cells: Minimum number of cells in which a gene must be detected.

    Returns:
        The filtered AnnData copy.
    """
    # Annotate mitochondrial genes and derive per-cell QC metrics.
    is_mito = adata.var_names.str.startswith(("MT-", "mt-"))
    adata.var["mt"] = is_mito
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

    n_before = adata.n_obs

    # Cell-level filters: lower bound in place, then both upper bounds.
    sc.pp.filter_cells(adata, min_genes=min_genes)
    within_gene_limit = adata.obs["n_genes_by_counts"] <= max_genes
    within_mito_limit = adata.obs["pct_counts_mt"] <= max_pct_mito
    adata = adata[within_gene_limit & within_mito_limit].copy()

    # Gene-level filter.
    sc.pp.filter_genes(adata, min_cells=min_cells)

    n_after = adata.n_obs
    print(f" QC: {n_before} → {n_after} cells ({n_before - n_after} removed)")

    return adata
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def sc_normalize(adata, target_sum=1e4, n_top_genes=2000):
    """
    Normalize counts, log-transform, and flag highly variable genes.

    The incoming ``adata.X`` is preserved in ``adata.layers["counts"]``
    before ``adata.X`` is library-size normalized to ``target_sum`` and
    log1p-transformed; the transformed matrix is also stored in
    ``adata.layers["log_normalized"]``. Highly variable genes are then
    selected with the "seurat_v3" flavor, computed on the "counts" layer.

    Args:
        adata: AnnData object; modified in place.
        target_sum: Per-cell total after normalization.
        n_top_genes: Number of highly variable genes to select.

    Returns:
        The same ``adata`` object.
    """
    # Keep the untransformed matrix: HVG selection below reads this layer.
    raw_counts = adata.X.copy()
    adata.layers["counts"] = raw_counts

    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)
    adata.layers["log_normalized"] = adata.X.copy()

    sc.pp.highly_variable_genes(
        adata,
        n_top_genes=n_top_genes,
        flavor="seurat_v3",
        layer="counts",
    )

    print(f" HVG: {adata.var['highly_variable'].sum()} genes selected")
    return adata
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 2. 次元削減・クラスタリング
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def sc_clustering(adata, n_pcs=50, n_neighbors=15, resolution=1.0,
                  random_state=42):
    """
    Scale, run PCA, build the neighbor graph, embed with UMAP, and cluster.

    The number of PCs fed to the neighbor graph is the smallest count whose
    cumulative explained-variance ratio reaches 90%, with a floor of 15.
    If the computed PCs never reach 90%, all computed PCs are used. The
    result never exceeds the number of PCs actually computed.

    Args:
        adata: AnnData object; modified in place (``adata.X`` is scaled).
        n_pcs: Number of principal components to compute.
        n_neighbors: Neighborhood size for the k-NN graph.
        resolution: Leiden resolution parameter.
        random_state: Seed passed to PCA, neighbors, UMAP and Leiden.

    Returns:
        The same ``adata`` object, with Leiden labels in
        ``adata.obs["leiden"]``.
    """
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, n_comps=n_pcs, random_state=random_state)

    # Cumulative explained variance of the computed components.
    variance_ratio = np.asarray(adata.uns["pca"]["variance_ratio"])
    cumvar = np.cumsum(variance_ratio)
    n_available = len(variance_ratio)

    reached = cumvar >= 0.9
    if reached.any():
        n_pcs_use = int(np.argmax(reached)) + 1
        reason = "cumulative variance ≥ 90%"
    else:
        # np.argmax on an all-False mask returns 0, which would silently
        # select a single PC; fall back to every computed component.
        n_pcs_use = n_available
        reason = (f"cumulative variance {cumvar[-1]:.1%} < 90%; "
                  "using all computed PCs")
    # Floor of 15 PCs, capped at what PCA actually produced.
    n_pcs_use = min(max(n_pcs_use, 15), n_available)
    print(f" PCA: using {n_pcs_use} PCs ({reason})")

    sc.pp.neighbors(adata, n_pcs=n_pcs_use, n_neighbors=n_neighbors,
                    random_state=random_state)
    sc.tl.umap(adata, random_state=random_state)
    sc.tl.leiden(adata, resolution=resolution, random_state=random_state)

    n_clusters = adata.obs["leiden"].nunique()
    print(f" Leiden: {n_clusters} clusters (resolution={resolution})")
    return adata
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## 3. 差次発現遺伝子(DEG)同定
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
def sc_deg_analysis(adata, groupby="leiden", method="wilcoxon",
                    n_genes=200, min_logfc=0.25, max_pval=0.05):
    """
    Rank genes per group and keep the significant ones.

    Runs ``sc.tl.rank_genes_groups`` and, for every group in
    ``adata.obs[groupby]``, keeps genes with
    ``|logfoldchanges| >= min_logfc`` and ``pvals_adj < max_pval``.
    A filter is skipped when its column is absent from the ranking table,
    so methods that do not report it return the top ``n_genes`` by score.
    # NOTE(review): "logreg" is expected to lack p-values/log fold changes
    # — confirm against the installed scanpy version.

    Args:
        adata: AnnData object; ranking results are stored in it by scanpy.
        groupby: Column of ``adata.obs`` defining the groups.
        method: Test passed to scanpy, e.g. "wilcoxon",
            "t-test_overestim_var" or "logreg".
        n_genes: Number of top-ranked genes retained per group.
        min_logfc: Minimum absolute log fold change.
        max_pval: Upper bound (exclusive) on the adjusted p-value.

    Returns:
        DataFrame of the retained genes for all groups, with an added
        ``cluster`` column.
    """
    sc.tl.rank_genes_groups(adata, groupby=groupby, method=method,
                            n_genes=n_genes)

    deg_results = []
    for cluster in adata.obs[groupby].unique():
        df = sc.get.rank_genes_groups_df(adata, group=cluster)
        df["cluster"] = cluster

        # Apply each threshold only if the statistic exists for `method`.
        keep = pd.Series(True, index=df.index)
        if "logfoldchanges" in df.columns:
            keep &= df["logfoldchanges"].abs() >= min_logfc
        if "pvals_adj" in df.columns:
            keep &= df["pvals_adj"] < max_pval
        deg_results.append(df[keep])

    deg_df = pd.concat(deg_results, ignore_index=True)
    print(f" DEG: {len(deg_df)} significant genes across {adata.obs[groupby].nunique()} clusters")
    return deg_df
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## 4. セルタイプアノテーション
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
def annotate_celltypes_marker(adata, marker_dict, groupby="leiden",
                              threshold=0.5):
    """
    Assign a cell type to every cluster from marker-gene scores.

    For each cell type, ``sc.tl.score_genes`` is run on the markers that
    exist in ``adata.var_names`` and the per-cell score is stored in
    ``adata.obs["score_<cell type with spaces replaced by _>"]``. Each
    cluster is labeled with the cell type whose mean score is highest,
    provided that mean exceeds ``threshold``; otherwise it is "Unknown".

    Example ``marker_dict``::

        {
            "T cells": ["CD3D", "CD3E", "CD8A", "CD4"],
            "B cells": ["CD19", "MS4A1", "CD79A"],
            "Monocytes": ["CD14", "LYZ", "FCGR3A"],
            "NK cells": ["NKG7", "GNLY", "KLRD1"],
            "Dendritic": ["FCER1A", "CST3", "CLEC10A"],
        }

    Args:
        adata: AnnData object; score columns and ``obs["cell_type"]`` are
            added in place.
        marker_dict: Mapping of cell-type name to list of marker genes.
        groupby: Column of ``adata.obs`` holding the cluster labels.
        threshold: Mean score a cluster must exceed to receive a label.

    Returns:
        ``(adata, cluster_annotations)`` where ``cluster_annotations`` maps
        each cluster label to its assigned cell type.
    """
    # Score each cell type on the markers actually present in the data.
    # (No warm-up call with an empty gene list: it computes nothing useful
    # and scanpy rejects an empty list.)
    scores = {}
    for cell_type, markers in marker_dict.items():
        valid_markers = [m for m in markers if m in adata.var_names]
        if valid_markers:
            score_name = f"score_{cell_type.replace(' ', '_')}"
            sc.tl.score_genes(adata, gene_list=valid_markers,
                              score_name=score_name)
            scores[cell_type] = score_name

    # Pick, per cluster, the best-scoring cell type above the threshold.
    cluster_annotations = {}
    for cluster in adata.obs[groupby].unique():
        mask = adata.obs[groupby] == cluster
        best_type = "Unknown"
        best_score = -np.inf
        for cell_type, score_col in scores.items():
            mean_score = adata.obs.loc[mask, score_col].mean()
            if mean_score > best_score and mean_score > threshold:
                best_score = mean_score
                best_type = cell_type
        cluster_annotations[cluster] = best_type

    adata.obs["cell_type"] = adata.obs[groupby].map(cluster_annotations)
    print(f" Annotation: {len(set(cluster_annotations.values()))} cell types assigned")
    return adata, cluster_annotations
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## 5. RNA Velocity
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
def rna_velocity_analysis(adata_loom_path, adata, basis="umap"):
    """
    Estimate RNA velocity and latent time with scVelo.

    RNA velocity infers the direction of expression change from the
    balance of unspliced and spliced counts, so ``adata`` must carry
    "spliced" and "unspliced" layers. If it does not, they are loaded from
    the loom file at ``adata_loom_path`` and merged into ``adata``.

    Pipeline: filter/normalize -> moments -> velocity (stochastic mode)
    -> velocity graph -> stream plot -> latent time.

    Args:
        adata_loom_path: Path to a loom file with spliced/unspliced counts.
            Only read when ``adata`` lacks those layers.
        adata: AnnData object with the embedding named by ``basis``.
        basis: Embedding used for the stream plot.

    Returns:
        The AnnData object with velocity results and
        ``obs["latent_time"]``. This is a new object when the loom file
        was merged.

    Raises:
        ValueError: If spliced/unspliced layers are missing and no loom
            path was given.
    """
    import scvelo as scv

    # Bring in spliced/unspliced counts when the caller's object lacks them.
    missing_layers = [name for name in ("spliced", "unspliced")
                      if name not in adata.layers]
    if missing_layers:
        if not adata_loom_path:
            raise ValueError(
                f"adata lacks layers {missing_layers} and no loom file "
                "was provided")
        ldata = scv.read(adata_loom_path, cache=True)
        adata = scv.utils.merge(adata, ldata)

    # Preprocessing
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

    # Velocity estimation
    scv.tl.velocity(adata, mode="stochastic")
    scv.tl.velocity_graph(adata)

    # Visualization
    scv.pl.velocity_embedding_stream(adata, basis=basis,
                                     save="figures/velocity_stream.png")

    # Latent time (pseudotime)
    # NOTE(review): scVelo's latent_time is understood to require the
    # dynamical-model fit ("fit_t" layer from recover_dynamics); it is
    # computed here only if absent. This step is slow — confirm it is
    # acceptable, or switch to a velocity-pseudotime measure instead.
    if "fit_t" not in adata.layers:
        scv.tl.recover_dynamics(adata)
    scv.tl.latent_time(adata)
    print(f" Velocity: latent time range [{adata.obs['latent_time'].min():.3f}, "
          f"{adata.obs['latent_time'].max():.3f}]")

    return adata
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## 6. 細胞間コミュニケーション推定
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
def cell_communication_analysis(adata, groupby="cell_type",
                                database="CellChatDB.human"):
    """
    Infer ligand-receptor communication between cell groups with CellChat.

    Call sequence on the CellChat object:
        1. preprocess and identify over-expressed genes / interactions
        2. compute communication probabilities against ``database``
        3. filter communications (``min_cells=10``) and aggregate to
           pathway level
        4. aggregate the network and run network analysis
        5. fetch the significant interactions

    # NOTE(review): relies on a Python module named `cellchat` exposing
    # this object API; CellChat is primarily distributed as an R package —
    # confirm the module and method names against what is installed.

    Args:
        adata: AnnData object with expression data.
        groupby: Column of ``adata.obs`` defining the cell groups.
        database: Ligand-receptor database identifier.

    Returns:
        ``(cc, lr_pairs)``: the CellChat object and its significant
        ligand-receptor interactions.
    """
    import cellchat

    chat = cellchat.CellChat(adata, groupby=groupby)

    # Expression preprocessing and over-expression screening.
    chat.preprocess()
    chat.identify_overexpressed_genes()
    chat.identify_overexpressed_interactions()

    # Communication probabilities, filtered and rolled up to pathways.
    chat.compute_communication_prob(database=database)
    chat.filter_communication(min_cells=10)
    chat.compute_communication_prob_pathway()

    # Network aggregation and analysis.
    chat.aggregate_net()
    chat.net_analysis()

    # Collect results.
    interactions = chat.get_significant_interactions()
    print(f" CellChat: {len(interactions)} significant L-R interactions")

    return chat, interactions
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## 7. 可視化
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
def sc_visualization_panel(adata, deg_df=None, save_dir="figures"):
    """
    Render the standard single-cell figure panel into ``save_dir``.

    Figures produced (file names are chosen by scanpy from the ``save``
    suffix of each call):
        1. QC violin plot (n_genes_by_counts, total_counts, pct_counts_mt)
        2. UMAP colored by Leiden cluster
        3. UMAP colored by cell type (only if ``obs["cell_type"]`` exists)
        4. Dot plot of the top 5 DEGs per cluster (at most 30 genes)
        5. Stacked violin of the top 3 DEGs per cluster (at most 20 genes)

    Figures 4-5 are skipped when ``deg_df`` is None or empty.

    Args:
        adata: AnnData object with QC metrics, a UMAP embedding and
            ``obs["leiden"]``.
        deg_df: Optional DataFrame with ``cluster`` and ``names`` columns,
            ordered by rank within each cluster.
        save_dir: Directory that receives the figures; created if needed.

    Returns:
        None.
    """
    import os

    os.makedirs(save_dir, exist_ok=True)

    # scanpy writes `save=` figures to sc.settings.figdir, not to an
    # arbitrary path, so point it at save_dir for the duration of the call.
    previous_figdir = sc.settings.figdir
    sc.settings.figdir = save_dir
    try:
        # 1. QC violin
        sc.pl.violin(adata, ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
                     jitter=0.4, multi_panel=True, save="_qc.png")

        # 2. UMAP — clusters
        sc.pl.umap(adata, color="leiden", legend_loc="on data",
                   title="Leiden Clusters", save="_leiden.png")

        # 3. UMAP — cell types
        if "cell_type" in adata.obs.columns:
            sc.pl.umap(adata, color="cell_type",
                       title="Cell Type Annotation", save="_celltypes.png")

        if deg_df is not None and len(deg_df) > 0:
            per_cluster = deg_df.groupby("cluster")

            # 4. DEG dot plot
            top_markers = per_cluster.head(5)["names"].unique().tolist()
            sc.pl.dotplot(adata, var_names=top_markers[:30],
                          groupby="leiden", save="_markers.png")

            # 5. Stacked violin
            top_stacked = per_cluster.head(3)["names"].unique().tolist()[:20]
            sc.pl.stacked_violin(adata, var_names=top_stacked,
                                 groupby="leiden", save="_stacked.png")
    finally:
        # Leave the global scanpy setting as the caller had it.
        sc.settings.figdir = previous_figdir

    print(f" Figures saved to {save_dir}/")
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## References
|
|
318
|
+
|
|
319
|
+
### Output Files
|
|
320
|
+
|
|
321
|
+
| ファイル | 形式 |
|
|
322
|
+
|---|---|
|
|
323
|
+
| `results/sc_qc_summary.json` | JSON |
|
|
324
|
+
| `results/sc_deg_results.csv` | CSV |
|
|
325
|
+
| `results/sc_celltype_annotations.json` | JSON |
|
|
326
|
+
| `results/sc_velocity_summary.json` | JSON |
|
|
327
|
+
| `results/sc_cellchat_interactions.csv` | CSV |
|
|
328
|
+
| `figures/umap_leiden.png` | PNG |
|
|
329
|
+
| `figures/umap_celltypes.png` | PNG |
|
|
330
|
+
| `figures/velocity_stream.png` | PNG |
|
|
331
|
+
| `figures/deg_dotplot.png` | PNG |
|
|
332
|
+
|
|
333
|
+
### 利用可能ツール
|
|
334
|
+
|
|
335
|
+
> [ToolUniverse](https://github.com/mims-harvard/ToolUniverse) SMCP 経由で利用可能な外部ツール。
|
|
336
|
+
|
|
337
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
338
|
+
|---|---|---|
|
|
339
|
+
| CELLxGENE | `CELLxGENE_get_expression_data` | シングルセル発現データ取得 |
|
|
340
|
+
| CELLxGENE | `CELLxGENE_get_cell_metadata` | 細胞メタデータ取得 |
|
|
341
|
+
| CELLxGENE | `CELLxGENE_get_gene_metadata` | 遺伝子メタデータ取得 |
|
|
342
|
+
| CELLxGENE | `CELLxGENE_get_presence_matrix` | 遺伝子存在マトリクス |
|
|
343
|
+
| CELLxGENE | `CELLxGENE_get_embeddings` | 埋め込みベクトル取得 |
|
|
344
|
+
| CELLxGENE | `CELLxGENE_download_h5ad` | H5AD ファイルダウンロード |
|
|
345
|
+
| HCA | `hca_search_projects` | Human Cell Atlas プロジェクト検索 |
|
|
346
|
+
| HCA | `hca_get_file_manifest` | HCA ファイルマニフェスト取得 |
|
|
347
|
+
| HPA | `HPA_get_rna_expression_by_source` | 組織別 RNA 発現データ |
|
|
348
|
+
|
|
349
|
+
### 参照スキル
|
|
350
|
+
|
|
351
|
+
| スキル | 連携内容 |
|
|
352
|
+
|---|---|
|
|
353
|
+
| [scientific-bioinformatics](../scientific-bioinformatics/SKILL.md) | 遺伝子アノテーション・パスウェイ解析 |
|
|
354
|
+
| [scientific-multi-omics](../scientific-multi-omics/SKILL.md) | マルチオミクス統合 |
|
|
355
|
+
| [scientific-network-analysis](../scientific-network-analysis/SKILL.md) | 遺伝子制御ネットワーク |
|
|
356
|
+
| [scientific-deep-learning](../scientific-deep-learning/SKILL.md) | scVI / scGPT 等の深層学習モデル |
|
|
357
|
+
| [scientific-pca-tsne](../scientific-pca-tsne/SKILL.md) | 次元削減手法 |
|
|
358
|
+
|
|
359
|
+
#### 依存パッケージ
|
|
360
|
+
|
|
361
|
+
- scanpy, anndata, scvelo, cellchat, leidenalg, umap-learn, scikit-misc(`flavor="seurat_v3"` の HVG 選択に必要), numpy, pandas, matplotlib
|