@nahisaho/satori 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -15
- package/package.json +1 -1
- package/src/.github/skills/scientific-advanced-imaging/SKILL.md +382 -0
- package/src/.github/skills/scientific-chembl-assay-mining/SKILL.md +509 -0
- package/src/.github/skills/scientific-deep-chemistry/SKILL.md +350 -0
- package/src/.github/skills/scientific-ensembl-genomics/SKILL.md +378 -0
- package/src/.github/skills/scientific-expression-comparison/SKILL.md +303 -0
- package/src/.github/skills/scientific-md-simulation/SKILL.md +315 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +329 -0
- package/src/.github/skills/scientific-perturbation-analysis/SKILL.md +297 -0
- package/src/.github/skills/scientific-scvi-integration/SKILL.md +344 -0
- package/src/.github/skills/scientific-string-network-api/SKILL.md +376 -0
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-perturbation-analysis
|
|
3
|
+
description: |
|
|
4
|
+
シングルセル摂動解析スキル。pertpy による CRISPR スクリーン解析・
|
|
5
|
+
薬剤応答分析・scGen 摂動予測・Augur 摂動応答性スコアリング・
|
|
6
|
+
scIB 統合ベンチマーク・差次的摂動応答パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Perturbation Analysis
|
|
10
|
+
|
|
11
|
+
pertpy / Augur / scIB を活用したシングルセルレベルの摂動解析
|
|
12
|
+
パイプラインを提供する。CRISPR スクリーン、薬剤処理、
|
|
13
|
+
遺伝子ノックダウンなどの摂動データの統合解析。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- CRISPR スクリーンデータ (Perturb-seq) を解析するとき
|
|
18
|
+
- 薬剤処理前後のシングルセル発現変動を評価するとき
|
|
19
|
+
- 摂動応答の細胞型特異性を定量するとき
|
|
20
|
+
- 複数のバッチ統合手法をベンチマークするとき (scIB)
|
|
21
|
+
- 摂動の効果を in silico で予測するとき (scGen)
|
|
22
|
+
- 差次的優先度 (Augur) で摂動応答性の高い細胞型を特定するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. pertpy セットアップ & データ読込み
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import pertpy as pt
|
|
32
|
+
import scanpy as sc
|
|
33
|
+
import anndata as ad
|
|
34
|
+
import pandas as pd
|
|
35
|
+
import numpy as np
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def load_perturbation_data(adata_path, perturbation_key="perturbation",
                           control_label="control"):
    """
    Read a perturbation-experiment AnnData file and apply basic preprocessing.

    The data is filtered (cells with fewer than 200 genes, genes seen in fewer
    than 3 cells), normalized to a total of 1e4 per cell and log1p-transformed.
    A short summary of cell counts is printed.

    Parameters:
        adata_path: str — path to the AnnData (.h5ad) file
        perturbation_key: str — obs column holding the perturbation labels
        control_label: str — label that marks control cells

    Returns:
        The preprocessed AnnData object.

    K-Dense: pertpy
    """
    dataset = sc.read_h5ad(adata_path)

    # Basic QC and normalization; each call modifies `dataset` in place.
    sc.pp.filter_cells(dataset, min_genes=200)
    sc.pp.filter_genes(dataset, min_cells=3)
    sc.pp.normalize_total(dataset, target_sum=1e4)
    sc.pp.log1p(dataset)

    labels = dataset.obs[perturbation_key]
    # The distinct-label count covers every value in the column, so the
    # control label is included when it is present.
    n_perturbations = labels.nunique()
    n_control = (labels == control_label).sum()
    n_perturbed = len(dataset) - n_control

    print(f"Loaded: {len(dataset)} cells, {n_perturbations} perturbations")
    print(f"Control: {n_control}, Perturbed: {n_perturbed}")
    return dataset
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## 2. 差次的遺伝子発現 (摂動 vs コントロール)
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
def differential_perturbation(adata, perturbation_key="perturbation",
                              control="control", target=None):
    """
    Differential expression of each perturbation against the control group.

    Runs a Wilcoxon rank-sum test via scanpy with `control` as the reference
    and summarises, per perturbation, the genes with adjusted p-value < 0.05.

    Parameters:
        adata: AnnData — perturbation data
        perturbation_key: str — obs column holding the perturbation labels
        control: str — label of the control group
        target: str — single perturbation to compare (None = all perturbations)

    Returns:
        dict mapping perturbation label → dict with keys
        "n_degs", "n_up", "n_down" (plain ints) and "top_genes"
        (names of the first 10 significant rows).
    """
    import warnings

    # `is not None` so that falsy-but-valid labels (e.g. 0) are not mistaken
    # for "compare everything".
    if target is not None:
        keep = adata.obs[perturbation_key].isin([control, target])
        adata_sub = adata[keep].copy()
    else:
        adata_sub = adata.copy()

    sc.tl.rank_genes_groups(
        adata_sub,
        groupby=perturbation_key,
        reference=control,
        method="wilcoxon",
    )

    results = {}
    for group in adata_sub.obs[perturbation_key].unique():
        if group == control:
            continue
        try:
            degs = sc.get.rank_genes_groups_df(adata_sub, group=group)
        except Exception as exc:
            # Best effort: keep analysing the other groups, but report the
            # skipped one instead of dropping it silently.
            warnings.warn(
                f"Skipping perturbation {group!r}: "
                f"could not retrieve DE results ({exc})",
                stacklevel=2,
            )
            continue

        degs_sig = degs[degs["pvals_adj"] < 0.05]
        lfc = degs_sig["logfoldchanges"]
        results[group] = {
            "n_degs": len(degs_sig),
            # Cast numpy integers to int so the summary is JSON-serialisable.
            "n_up": int((lfc > 0).sum()),
            "n_down": int((lfc < 0).sum()),
            "top_genes": degs_sig.head(10)["names"].tolist(),
        }

    print(f"DE results: {len(results)} perturbations analyzed")
    return results
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 3. Augur 摂動応答性スコアリング
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
def augur_prioritization(adata, perturbation_key="perturbation",
                         cell_type_key="cell_type", control="control",
                         treatment=None):
    """
    Score how strongly each cell type responds to a perturbation with Augur.

    Parameters:
        adata: AnnData — perturbation data
        perturbation_key: str — obs column holding the perturbation labels
        cell_type_key: str — obs column holding the cell-type labels
        control: str — label of the control group
        treatment: str — perturbation label to contrast with `control`.
            When None and exactly one non-control label exists, that label
            is used; otherwise all labels are handed to Augur unfiltered.

    Returns:
        pandas.DataFrame with columns "cell_type" and "auc", sorted by
        "auc" in descending order.

    K-Dense: augur (via pertpy)
    """
    ag = pt.tl.Augur(estimator="random_forest_classifier")

    # Column names and the control/treatment contrast are given to
    # `Augur.load`; `Augur.predict` then works on the loaded object.
    # NOTE(review): follows pertpy's documented load → predict flow —
    # confirm against the installed pertpy version.
    load_kwargs = {"label_col": perturbation_key, "cell_type_col": cell_type_key}
    if treatment is None:
        non_control = [
            label for label in adata.obs[perturbation_key].unique()
            if label != control
        ]
        if len(non_control) == 1:
            treatment = non_control[0]
    if treatment is not None:
        load_kwargs["condition_label"] = control
        load_kwargs["treatment_label"] = treatment
    loaded = ag.load(adata, **load_kwargs)

    _, results = ag.predict(loaded)

    # `summary_metrics` has one row per metric and one column per cell type;
    # reshape the score row into a tidy (cell_type, auc) table.
    # NOTE(review): "mean_augur_score" is presumably the cross-validated AUC
    # for the classifier estimator — confirm.
    summary = results["summary_metrics"]
    auc_df = (
        summary.loc["mean_augur_score"]
        .rename("auc")
        .rename_axis("cell_type")
        .reset_index()
        .sort_values("auc", ascending=False)
    )

    print(f"Augur prioritization:")
    for _, row in auc_df.head(5).iterrows():
        print(f" {row['cell_type']}: AUC={row['auc']:.3f}")

    return auc_df
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## 4. scGen 摂動予測
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
def scgen_perturbation_prediction(adata, perturbation_key="perturbation",
                                  cell_type_key="cell_type",
                                  control="control", target_perturbation=None,
                                  target_cell_type=None):
    """
    Predict the effect of a perturbation in silico with scGen.

    Parameters:
        adata: AnnData — training data
        perturbation_key: str — obs column holding the perturbation labels
        cell_type_key: str — obs column holding the cell-type labels
        control: str — label of the control condition
        target_perturbation: str — perturbation whose effect is predicted
        target_cell_type: str — cell type to predict for

    Returns:
        (pred, delta) exactly as returned by `SCGEN.predict`.

    Raises:
        ValueError: if `target_perturbation` or `target_cell_type` is None.
    """
    # Fail before the (slow) training step instead of deep inside predict().
    if target_perturbation is None or target_cell_type is None:
        raise ValueError(
            "target_perturbation and target_cell_type must both be provided"
        )

    import scgen

    # The AnnData must be registered before the model is built: the
    # condition column is the batch key, the cell types are the labels.
    scgen.SCGEN.setup_anndata(
        adata,
        batch_key=perturbation_key,
        labels_key=cell_type_key,
    )

    # Train the model
    scg = scgen.SCGEN(adata)
    scg.train(max_epochs=100, batch_size=32)

    # Predict
    pred, delta = scg.predict(
        ctrl_key=control,
        stim_key=target_perturbation,
        celltype_to_predict=target_cell_type,
    )

    print(f"scGen prediction: {target_cell_type} under {target_perturbation}")
    print(f" Predicted cells: {pred.shape[0]}")
    return pred, delta
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## 5. scIB 統合ベンチマーク
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
def benchmark_integration(adata, batch_key="batch", label_key="cell_type",
                          methods=None, embed="X_pca", cluster_key="leiden"):
    """
    Benchmark a batch integration with scIB metrics.

    Parameters:
        adata: AnnData — integrated data containing several batches
        batch_key: str — obs column holding the batch labels
        label_key: str — obs column holding the cell-type labels
        methods: list — accepted for backward compatibility; currently
            unused, all metrics below are always computed
        embed: str — obsm key of the embedding to evaluate
        cluster_key: str — obs column holding the cluster assignment
            compared with `label_key` for NMI / ARI

    Returns:
        dict with keys "batch_kbet", "batch_silhouette", "bio_nmi",
        "bio_ari", "bio_silhouette" and "overall".

    Raises:
        KeyError: if `embed` is missing from adata.obsm or `cluster_key`
            from adata.obs.

    K-Dense: scib
    """
    import scib

    if embed not in adata.obsm:
        raise KeyError(f"embedding {embed!r} not found in adata.obsm")
    if cluster_key not in adata.obs:
        raise KeyError(f"cluster column {cluster_key!r} not found in adata.obs")

    metrics = {}

    # batch correction metrics
    # kBET needs to be told what kind of representation it evaluates.
    metrics["batch_kbet"] = scib.me.kBET(
        adata, batch_key=batch_key, label_key=label_key,
        type_="embed", embed=embed,
    )
    metrics["batch_silhouette"] = scib.me.silhouette_batch(
        adata, batch_key=batch_key, label_key=label_key, embed=embed
    )

    # bio conservation metrics
    metrics["bio_nmi"] = scib.me.nmi(adata, cluster_key, label_key)
    metrics["bio_ari"] = scib.me.ari(adata, cluster_key, label_key)
    metrics["bio_silhouette"] = scib.me.silhouette(
        adata, label_key=label_key, embed=embed
    )

    # Overall score: 0.6 × mean(bio conservation) + 0.4 × mean(batch correction)
    bio_score = np.mean([
        metrics["bio_nmi"], metrics["bio_ari"], metrics["bio_silhouette"]
    ])
    batch_score = np.mean([
        metrics["batch_kbet"], metrics["batch_silhouette"]
    ])
    metrics["overall"] = 0.6 * bio_score + 0.4 * batch_score

    print(f"scIB benchmark:")
    for k, v in metrics.items():
        print(f" {k}: {v:.4f}")
    return metrics
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## 6. 摂動シグネチャ解析
|
|
237
|
+
|
|
238
|
+
```python
|
|
239
|
+
def perturbation_signature(adata, perturbation_key="perturbation",
                           control="control", n_top_genes=50):
    """
    Extract a gene signature for every perturbation.

    For each non-control perturbation the per-gene mean expression is compared
    with the control mean, and the `n_top_genes` genes with the largest
    absolute difference are reported.

    Parameters:
        adata: AnnData — perturbation data
        perturbation_key: str — obs column holding the perturbation labels
        control: str — label of the control group
        n_top_genes: int — number of top genes per signature

    Returns:
        dict mapping perturbation label → dict with keys "top_genes"
        (gene names), "deltas" (mean difference vs. control, same order)
        and "n_cells" (cells carrying that perturbation).

    Raises:
        ValueError: if no cell carries the `control` label.
    """
    labels = adata.obs[perturbation_key]

    ctrl_mask = labels == control
    # Without control cells the mean of an empty selection is NaN and every
    # signature would silently be NaN as well.
    if not ctrl_mask.any():
        raise ValueError(
            f"no cells with {perturbation_key!r} == {control!r}; "
            "cannot compute a control baseline"
        )

    perturbations = [p for p in labels.unique() if p != control]

    # np.asarray(...).flatten() reduces the result of `.mean(axis=0)` to a
    # 1-D array regardless of the container type of `X`.
    ctrl_mean = np.asarray(adata[ctrl_mask].X.mean(axis=0)).flatten()

    signatures = {}
    for pert in perturbations:
        pert_mask = labels == pert
        pert_mean = np.asarray(adata[pert_mask].X.mean(axis=0)).flatten()

        delta = pert_mean - ctrl_mean
        # Indices of the largest |delta| first.
        gene_indices = np.argsort(np.abs(delta))[::-1][:n_top_genes]

        signatures[pert] = {
            "top_genes": adata.var_names[gene_indices].tolist(),
            "deltas": delta[gene_indices].tolist(),
            "n_cells": int(pert_mask.sum()),
        }

    print(f"Signatures extracted: {len(signatures)} perturbations, "
          f"{n_top_genes} genes each")
    return signatures
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## パイプライン統合
|
|
279
|
+
|
|
280
|
+
```
|
|
281
|
+
single-cell-genomics → perturbation-analysis → pathway-enrichment
|
|
282
|
+
(scRNA-seq QC) (摂動 DE/Augur/scGen) (KEGG/Reactome)
|
|
283
|
+
│ │ ↓
|
|
284
|
+
spatial-transcriptomics ──┘ │ disease-research
|
|
285
|
+
(Visium/MERFISH) ↓ (GWAS/DisGeNET)
|
|
286
|
+
drug-target-profiling
|
|
287
|
+
(標的候補評価)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## パイプライン出力
|
|
291
|
+
|
|
292
|
+
| ファイル | 説明 | 次スキル |
|
|
293
|
+
|---------|------|---------|
|
|
294
|
+
| `results/perturbation_de.json` | 差次的発現結果 | → pathway-enrichment |
|
|
295
|
+
| `results/augur_scores.csv` | Augur 応答性スコア | → single-cell-genomics |
|
|
296
|
+
| `results/perturbation_signatures.json` | 摂動シグネチャ | → drug-target-profiling |
|
|
297
|
+
| `results/scib_benchmark.json` | 統合ベンチマーク | → spatial-transcriptomics |
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-scvi-integration
|
|
3
|
+
description: |
|
|
4
|
+
scvi-tools シングルセル統合スキル。scVI 変分オートエンコーダ統合・
|
|
5
|
+
scANVI 半教師有りアノテーション・totalVI CITE-seq
|
|
6
|
+
RNA+タンパク質結合解析・SOLO ダブレット検出・潜在空間解析。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific scVI Integration
|
|
10
|
+
|
|
11
|
+
scvi-tools を活用したシングルセル確率的モデルベース統合
|
|
12
|
+
パイプラインを提供する。scVI/scANVI/totalVI/SOLO による
|
|
13
|
+
バッチ統合、半教師有りアノテーション、マルチモーダル解析。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 複数バッチの scRNA-seq データを統合するとき (scVI)
|
|
18
|
+
- 半教師有りで細胞型アノテーションを転移するとき (scANVI)
|
|
19
|
+
- CITE-seq (RNA + ADT) データを結合解析するとき (totalVI)
|
|
20
|
+
- ダブレット (doublet) を検出・除去するとき (SOLO)
|
|
21
|
+
- 差次的発現を確率的にテストするとき
|
|
22
|
+
- 潜在空間を用いたクラスタリングを行うとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. scVI モデルセットアップ & 訓練
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import scvi
|
|
32
|
+
import scanpy as sc
|
|
33
|
+
import anndata as ad
|
|
34
|
+
import numpy as np
|
|
35
|
+
import pandas as pd
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def setup_scvi(adata, batch_key="batch", layer=None, n_latent=30,
               n_hidden=128, n_layers=2):
    """
    Register an AnnData with scVI, train the model and store its latent space.

    The latent representation is written to `adata.obsm["X_scVI"]`.

    Parameters:
        adata: AnnData — input data (raw counts)
        batch_key: str — obs column holding the batch labels
        layer: str — layer that stores the counts
        n_latent: int — dimensionality of the latent space
        n_hidden: int — number of hidden units
        n_layers: int — number of neural-network layers

    Returns:
        The trained scvi.model.SCVI instance.

    K-Dense: scvi-tools
    """
    # Register the data with scVI.
    scvi.model.SCVI.setup_anndata(adata, batch_key=batch_key, layer=layer)

    # Build the model; "zinb" = zero-inflated negative binomial likelihood.
    architecture = {
        "n_latent": n_latent,
        "n_hidden": n_hidden,
        "n_layers": n_layers,
        "gene_likelihood": "zinb",
    }
    vae = scvi.model.SCVI(adata, **architecture)

    # Train.
    vae.train(max_epochs=400, early_stopping=True)

    # Keep the latent representation alongside the data.
    adata.obsm["X_scVI"] = vae.get_latent_representation()

    n_batches = adata.obs[batch_key].nunique()
    print(f"scVI trained: {len(adata)} cells → {n_latent}D latent space")
    print(f" Batches: {n_batches}")
    return vae
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## 2. scVI バッチ統合 & UMAP
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
def scvi_integration(adata, model, resolution=1.0):
    """
    Cluster and embed cells using the scVI latent space.

    Builds a neighbour graph on `adata.obsm["X_scVI"]`, then computes a UMAP
    and a Leiden clustering. If the latent representation is not stored yet,
    it is obtained from `model` first.

    Parameters:
        adata: AnnData — data registered with scVI
        model: scvi.model.SCVI — trained model
        resolution: float — Leiden resolution

    Returns:
        The same AnnData object, updated in place.
    """
    # Use the trained model as a fallback so the function does not depend on
    # another step having filled obsm["X_scVI"] beforehand.
    if "X_scVI" not in adata.obsm:
        adata.obsm["X_scVI"] = model.get_latent_representation(adata)

    # Neighbour graph on the latent space
    sc.pp.neighbors(adata, use_rep="X_scVI")
    sc.tl.umap(adata)
    sc.tl.leiden(adata, resolution=resolution)

    n_clusters = adata.obs["leiden"].nunique()
    print(f"scVI integration: {n_clusters} clusters (resolution={resolution})")
    return adata
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## 3. scANVI 半教師有りアノテーション
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
def scanvi_annotation(adata, scvi_model, labels_key="cell_type",
                      unlabeled_category="Unknown", n_epochs=20):
    """
    Transfer cell-type annotations semi-supervisedly with scANVI.

    Writes the predicted label to `adata.obs["scANVI_prediction"]` and the
    highest class probability to `adata.obs["scANVI_confidence"]`.

    Parameters:
        adata: AnnData — data registered with scVI
        scvi_model: scvi.model.SCVI — trained scVI model
        labels_key: str — obs column holding the known labels
        unlabeled_category: str — value that marks unlabeled cells
        n_epochs: int — additional training epochs

    Returns:
        The trained scvi.model.SCANVI instance.
    """
    # scANVI is initialised from the trained scVI model.
    classifier = scvi.model.SCANVI.from_scvi_model(
        scvi_model,
        unlabeled_category=unlabeled_category,
        labels_key=labels_key,
    )
    classifier.train(max_epochs=n_epochs)

    # Hard label predictions.
    adata.obs["scANVI_prediction"] = classifier.predict()

    # Soft predictions; the row-wise maximum is used as the confidence.
    class_probabilities = classifier.predict(soft=True)
    adata.obs["scANVI_confidence"] = class_probabilities.max(axis=1)

    given_labels = adata.obs[labels_key]
    n_labeled = (given_labels != unlabeled_category).sum()
    n_unlabeled = (given_labels == unlabeled_category).sum()
    mean_conf = adata.obs["scANVI_confidence"].mean()

    print(f"scANVI annotation:")
    print(f" Labeled: {n_labeled}, Unlabeled: {n_unlabeled}")
    print(f" Mean confidence: {mean_conf:.3f}")
    return classifier
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## 4. totalVI CITE-seq 統合
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
def totalvi_citeseq(adata, protein_expression_obsm="protein_expression",
                    batch_key="batch", n_latent=20, n_epochs=400):
    """
    Joint RNA + ADT (CITE-seq) analysis with totalVI.

    The latent representation is written to `adata.obsm["X_totalVI"]`.

    Parameters:
        adata: AnnData — RNA counts + protein expression
        protein_expression_obsm: str — obsm key that stores the ADT data
        batch_key: str — obs column holding the batch labels
        n_latent: int — dimensionality of the latent space
        n_epochs: int — maximum number of training epochs

    Returns:
        The trained scvi.model.TOTALVI instance.
    """
    scvi.model.TOTALVI.setup_anndata(
        adata,
        batch_key=batch_key,
        protein_expression_obsm_key=protein_expression_obsm,
    )

    totalvi = scvi.model.TOTALVI(
        adata,
        n_latent=n_latent,
        latent_distribution="normal",
    )
    totalvi.train(max_epochs=n_epochs, early_stopping=True)

    # Latent space.
    adata.obsm["X_totalVI"] = totalvi.get_latent_representation()

    # Normalized expression; only the second (protein) element is kept.
    # NOTE(review): the result is used solely to count proteins below.
    _, denoised_protein = totalvi.get_normalized_expression(
        n_samples=25,
        return_mean=True,
    )

    if hasattr(denoised_protein, 'shape'):
        n_proteins = denoised_protein.shape[1]
    else:
        n_proteins = 0

    print(f"totalVI: {len(adata)} cells, {n_proteins} proteins")
    print(f" Latent dim: {n_latent}")
    return totalvi
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 5. SOLO ダブレット検出
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
def solo_doublet_detection(adata, scvi_model, threshold=0.5):
    """
    Detect doublets with SOLO.

    Writes the call ("doublet"/"singlet") to `adata.obs["solo_doublet"]` and
    the doublet score to `adata.obs["solo_score"]`.

    Parameters:
        adata: AnnData — data registered with scVI
        scvi_model: scvi.model.SCVI — trained scVI model
        threshold: float — a cell is called a doublet when its "doublet"
            score is strictly greater than this value

    Returns:
        The prediction DataFrame from SOLO with an added "label" column.
    """
    solo = scvi.external.SOLO.from_scvi_model(scvi_model)
    solo.train()

    # Doublet prediction.
    # NOTE(review): depending on the scvi-tools version, predict() may return
    # logits rather than probabilities — confirm the scale `threshold` is
    # compared against.
    predictions = solo.predict()
    predictions["label"] = [
        "doublet" if score > threshold else "singlet"
        for score in predictions["doublet"]
    ]

    # Values are copied positionally into the AnnData annotations.
    adata.obs["solo_doublet"] = predictions["label"].values
    adata.obs["solo_score"] = predictions["doublet"].values

    total_cells = len(adata)
    n_doublets = (adata.obs["solo_doublet"] == "doublet").sum()
    doublet_rate = n_doublets / total_cells * 100

    print(f"SOLO doublet detection:")
    print(f" Doublets: {n_doublets} ({doublet_rate:.1f}%)")
    print(f" Singlets: {total_cells - n_doublets}")
    return predictions
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## 6. scVI 差次的発現
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
def scvi_differential_expression(model, adata, groupby="leiden",
                                 group1="0", group2="1",
                                 delta=0.25, batch_size=256):
    """
    Probabilistic differential expression with a trained scVI model.

    Genes are kept when they are flagged in the "is_de_fdr_0.05" column and
    their absolute "lfc_mean" exceeds `delta`; the result is sorted by
    "lfc_mean" in descending order.

    Parameters:
        model: scvi.model.SCVI — trained model
        adata: AnnData — input data (not used by this function)
        groupby: str — obs column defining the groups
        group1: str — first group of the comparison
        group2: str — second group of the comparison
        delta: float — log-fold-change threshold
        batch_size: int — batch size passed to the model

    Returns:
        DataFrame of the significant genes.
    """
    de_table = model.differential_expression(
        groupby=groupby,
        group1=group1,
        group2=group2,
        delta=delta,
        batch_size=batch_size,
    )

    # Keep genes that pass both the FDR flag and the effect-size cut-off.
    passes_fdr = de_table["is_de_fdr_0.05"]
    large_effect = de_table["lfc_mean"].abs() > delta
    hits = de_table[passes_fdr & large_effect]
    hits = hits.sort_values("lfc_mean", ascending=False)

    n_up_group1 = (hits['lfc_mean'] > 0).sum()
    n_up_group2 = (hits['lfc_mean'] < 0).sum()

    print(f"scVI DE: {group1} vs {group2}")
    print(f" Significant genes: {len(hits)}")
    print(f" Up in {group1}: {n_up_group1}")
    print(f" Up in {group2}: {n_up_group2}")
    return hits
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## 7. 統合パイプライン
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
def scvi_pipeline(adata_paths, batch_labels=None, labels_key="cell_type",
                  output_dir="results"):
    """
    End-to-end scVI → SOLO → scANVI integration pipeline.

    Steps: concatenate the inputs, select 2000 highly variable genes, train
    scVI, remove doublets called by SOLO, retrain scVI on the singlets,
    optionally transfer labels with scANVI, then compute UMAP and Leiden
    clusters. The integrated data and the scVI model are saved to
    `output_dir`.

    Parameters:
        adata_paths: list — AnnData file paths
        batch_labels: list — one batch label per path; when empty or None
            the labels "batch_0", "batch_1", ... are used
        labels_key: str — obs column holding the cell-type labels; scANVI
            runs only if this column exists
        output_dir: str — output directory

    Returns:
        (adata, scvi_model) — the integrated AnnData and the trained model.

    Raises:
        ValueError: if `adata_paths` is empty or `batch_labels` has fewer
            entries than `adata_paths`.
    """
    from pathlib import Path

    # Validate before any file is read or directory is created, so a bad
    # call fails immediately rather than with an IndexError mid-loading.
    adata_paths = list(adata_paths)
    if not adata_paths:
        raise ValueError("adata_paths must contain at least one file path")
    if batch_labels and len(batch_labels) < len(adata_paths):
        raise ValueError(
            f"batch_labels has {len(batch_labels)} entries but "
            f"{len(adata_paths)} paths were given"
        )

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Load and concatenate
    adatas = []
    for i, path in enumerate(adata_paths):
        a = sc.read_h5ad(path)
        a.obs["batch"] = batch_labels[i] if batch_labels else f"batch_{i}"
        adatas.append(a)

    adata = ad.concat(adatas, join="inner")
    adata.obs_names_make_unique()

    # 2) Preprocessing: batch-aware highly variable gene selection
    sc.pp.highly_variable_genes(
        adata, n_top_genes=2000, batch_key="batch", flavor="seurat_v3"
    )
    adata = adata[:, adata.var["highly_variable"]].copy()

    # 3) Train scVI
    scvi_model = setup_scvi(adata, batch_key="batch")

    # 4) Remove doublets called by SOLO (the call is stored in adata.obs)
    solo_doublet_detection(adata, scvi_model)
    adata = adata[adata.obs["solo_doublet"] == "singlet"].copy()

    # 5) Retrain on the singlets only
    scvi_model = setup_scvi(adata, batch_key="batch")

    # 6) scANVI annotation, only when labels are available
    if labels_key in adata.obs.columns:
        scanvi_annotation(adata, scvi_model, labels_key=labels_key)

    # 7) UMAP & clustering
    scvi_integration(adata, scvi_model)

    # Save. The output directory is reused across runs (exist_ok above), so
    # an existing model directory is overwritten rather than raising after
    # all training has finished.
    adata.write(output_dir / "integrated.h5ad")
    scvi_model.save(str(output_dir / "scvi_model"), overwrite=True)

    print(f"Pipeline complete: {len(adata)} cells integrated")
    return adata, scvi_model
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## パイプライン統合
|
|
325
|
+
|
|
326
|
+
```
|
|
327
|
+
single-cell-genomics → scvi-integration → spatial-transcriptomics
|
|
328
|
+
(scRNA-seq QC) (scVI/scANVI/totalVI) (Visium/MERFISH)
|
|
329
|
+
│ │ ↓
|
|
330
|
+
perturbation-analysis ────────┘ gene-expression
|
|
331
|
+
(Perturb-seq) │ (DEG/マーカー)
|
|
332
|
+
↓
|
|
333
|
+
expression-comparison
|
|
334
|
+
(Expression Atlas 比較)
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## パイプライン出力
|
|
338
|
+
|
|
339
|
+
| ファイル | 説明 | 次スキル |
|
|
340
|
+
|---------|------|---------|
|
|
341
|
+
| `results/integrated.h5ad` | 統合 AnnData | → spatial-transcriptomics |
|
|
342
|
+
| `results/scvi_model/` | scVI 訓練済みモデル | → perturbation-analysis |
|
|
343
|
+
| `results/de_results.csv` | 差次的発現結果 | → gene-expression |
|
|
344
|
+
| `results/annotations.csv` | scANVI アノテーション | → expression-comparison |
|