@nahisaho/satori 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -29
- package/package.json +1 -1
- package/src/.github/skills/scientific-data-submission/SKILL.md +357 -0
- package/src/.github/skills/scientific-encode-screen/SKILL.md +315 -0
- package/src/.github/skills/scientific-environmental-geodata/SKILL.md +255 -0
- package/src/.github/skills/scientific-geo-expression/SKILL.md +274 -0
- package/src/.github/skills/scientific-gpu-singlecell/SKILL.md +296 -0
- package/src/.github/skills/scientific-human-cell-atlas/SKILL.md +294 -0
- package/src/.github/skills/scientific-marine-ecology/SKILL.md +429 -0
- package/src/.github/skills/scientific-metabolic-atlas/SKILL.md +263 -0
- package/src/.github/skills/scientific-nci60-screening/SKILL.md +307 -0
- package/src/.github/skills/scientific-paleobiology/SKILL.md +265 -0
- package/src/.github/skills/scientific-parasite-genomics/SKILL.md +280 -0
- package/src/.github/skills/scientific-plant-biology/SKILL.md +321 -0
- package/src/.github/skills/scientific-rrna-taxonomy/SKILL.md +379 -0
- package/src/.github/skills/scientific-scatac-signac/SKILL.md +300 -0
- package/src/.github/skills/scientific-squidpy-advanced/SKILL.md +251 -0
- package/src/.github/skills/scientific-toxicology-env/SKILL.md +309 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-geo-expression
|
|
3
|
+
description: |
|
|
4
|
+
GEO (Gene Expression Omnibus) 発現プロファイルスキル。GEO REST
|
|
5
|
+
API データセット検索・サンプル情報・発現マトリクス取得・バルク
|
|
6
|
+
RNA-seq/マイクロアレイ差次的発現解析。ToolUniverse 連携: geo。
|
|
7
|
+
tu_tools:
|
|
8
|
+
- key: geo
|
|
9
|
+
name: GEO (Gene Expression Omnibus)
|
|
10
|
+
description: GEO データセット・サンプル情報・発現データ検索
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Scientific GEO Expression
|
|
14
|
+
|
|
15
|
+
GEO REST API を活用したトランスクリプトーム発現プロファイル
|
|
16
|
+
解析パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- GEO データセット (GDS/GSE) を検索・ダウンロードするとき
|
|
21
|
+
- マイクロアレイ/RNA-seq 発現マトリクスを取得するとき
|
|
22
|
+
- 条件間差次的発現解析 (DEG) を実行するとき
|
|
23
|
+
- 複数 GEO データセットを横断比較するとき
|
|
24
|
+
- GEO メタデータから実験条件を構造化するとき
|
|
25
|
+
- 再解析パイプラインで GEO データを再利用するとき
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
## 1. GEO データセット検索
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import requests
|
|
35
|
+
import pandas as pd
|
|
36
|
+
import GEOparse
|
|
37
|
+
from io import StringIO
|
|
38
|
+
|
|
39
|
+
# Base URL shared by the NCBI E-utilities endpoints (esearch.fcgi and
# esummary.fcgi) that the dataset search below calls.
GEO_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def geo_search_datasets(query, organism="Homo sapiens",
                        study_type=None, limit=20):
    """
    Search GEO DataSets (Entrez database "gds") through NCBI E-utilities.

    Runs ESearch to collect the matching UIDs and then ESummary to fetch
    one summary record per UID.

    Parameters:
        query: str — free-text search query (e.g. "breast cancer RNA-seq")
        organism: str — organism name applied as an [Organism] filter;
            pass None or "" to search without an organism filter
        study_type: str — optional dataset type filter
            (e.g. "Expression profiling by array")
        limit: int — maximum number of records requested (ESearch retmax)

    Returns:
        pd.DataFrame — one row per record with the columns accession,
        title, summary (first 200 characters), organism, platform,
        sample_count, series_type and pub_date. An empty DataFrame is
        returned when the search yields no UIDs.

    Raises:
        requests.HTTPError — raised by raise_for_status() when either
        request comes back with an error status.
    """
    terms = [query]
    if organism:
        # Skipping the filter for a falsy organism avoids sending the
        # literal term "None[Organism]".
        terms.append(f"{organism}[Organism]")
    if study_type:
        # The gds database indexes the dataset type under the
        # "DataSet Type" field; "Study Type" is not one of its fields.
        terms.append(f'"{study_type}"[DataSet Type]')
    search_term = " AND ".join(terms)

    # ESearch: resolve the query to a list of UIDs.
    resp = requests.get(
        f"{GEO_BASE}/esearch.fcgi",
        params={
            "db": "gds",
            "term": search_term,
            "retmax": limit,
            "retmode": "json",
        },
        timeout=30,
    )
    resp.raise_for_status()
    ids = resp.json().get("esearchresult", {}).get("idlist", [])

    if not ids:
        print("No GEO datasets found")
        return pd.DataFrame()

    # ESummary: fetch the summary records for all UIDs in one request.
    resp = requests.get(
        f"{GEO_BASE}/esummary.fcgi",
        params={"db": "gds", "id": ",".join(ids), "retmode": "json"},
        timeout=30,
    )
    resp.raise_for_status()
    summaries = resp.json().get("result", {})

    # Iterate over the ESearch id list so the rows keep the search order;
    # a UID missing from the summary payload yields a row of defaults.
    records = [
        {
            "accession": info.get("accession", ""),
            "title": info.get("title", ""),
            "summary": info.get("summary", "")[:200],
            "organism": info.get("taxon", ""),
            "platform": info.get("gpl", ""),
            "sample_count": info.get("n_samples", 0),
            "series_type": info.get("gdstype", ""),
            "pub_date": info.get("pdat", ""),
        }
        for info in (summaries.get(uid, {}) for uid in ids)
    ]

    df = pd.DataFrame(records)
    print(f"GEO search: {len(df)} datasets")
    return df
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 2. GEO 発現マトリクス取得
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def geo_get_expression_matrix(gse_id, log2_transform=True):
    """
    Download a GEO series with GEOparse and return metadata plus values.

    Parameters:
        gse_id: str — GSE accession (e.g. "GSE12345")
        log2_transform: bool — when True the matrix is returned as
            log2(value + 1)

    Returns:
        tuple — (sample_df, matrix). sample_df holds one row per GSM with
        the columns sample_id, title, source, characteristics and
        platform. matrix is the "VALUE" column pivoted across samples, or
        an empty DataFrame when the pivot is empty.
    """
    import numpy as np

    series = GEOparse.get_GEO(geo=gse_id, destdir="/tmp", silent=True)

    def _first(metadata, key):
        # Metadata entries are indexed as lists here; use the first item
        # and fall back to an empty string when the key is absent.
        return metadata.get(key, [""])[0]

    # Per-sample metadata table.
    sample_df = pd.DataFrame([
        {
            "sample_id": name,
            "title": _first(gsm.metadata, "title"),
            "source": _first(gsm.metadata, "source_name_ch1"),
            "characteristics": "; ".join(
                gsm.metadata.get("characteristics_ch1", [])),
            "platform": _first(gsm.metadata, "platform_id"),
        }
        for name, gsm in series.gsms.items()
    ])

    # Expression matrix built from each sample's VALUE column.
    matrix = series.pivot_samples("VALUE")
    if matrix.empty:
        print(f"No expression data in {gse_id}")
        return sample_df, pd.DataFrame()

    if log2_transform:
        # NOTE(review): applied unconditionally; values that are already
        # log-scaled would be transformed a second time — confirm per series.
        matrix = np.log2(matrix.astype(float) + 1)

    n_probes, n_samples = matrix.shape
    print(f"GEO {gse_id}: {n_probes} probes × "
          f"{n_samples} samples")
    return sample_df, matrix
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## 3. 差次的発現解析
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from scipy import stats
|
|
146
|
+
import numpy as np
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _bh_adjust(p_values):
    """Return Benjamini-Hochberg adjusted p-values; NaN inputs stay NaN."""
    p_values = np.asarray(p_values, dtype=float)
    adjusted = np.full(p_values.shape, np.nan)
    finite = np.isfinite(p_values)
    n_tests = int(finite.sum())
    if n_tests == 0:
        return adjusted
    tested = p_values[finite]
    order = np.argsort(tested)
    scaled = tested[order] * n_tests / np.arange(1, n_tests + 1)
    # Enforce monotonicity starting from the largest p-value, cap at 1.
    scaled = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)
    restored = np.empty(n_tests)
    restored[order] = scaled
    adjusted[finite] = restored
    return adjusted


def geo_differential_expression(expr_matrix, group_a_samples,
                                group_b_samples, method="ttest",
                                fdr_threshold=0.05, lfc_threshold=1.0):
    """
    Per-gene two-group differential expression test with BH correction.

    Parameters:
        expr_matrix: pd.DataFrame — expression matrix (genes × samples)
        group_a_samples: list[str] — sample IDs (columns) of group A
        group_b_samples: list[str] — sample IDs (columns) of group B
        method: str — "ttest" runs scipy.stats.ttest_ind; any other value
            (e.g. "wilcoxon") runs a two-sided scipy.stats.mannwhitneyu
        fdr_threshold: float — a gene is a DEG only when fdr < this value
        lfc_threshold: float — a gene is a DEG only when |log2fc| > this

    Returns:
        pd.DataFrame — one row per tested gene, sorted by p_value, with
        the columns gene, log2fc, mean_a, mean_b, statistic, p_value, fdr
        and is_deg. Genes with fewer than two non-missing values in
        either group are skipped. When no gene can be tested an empty
        DataFrame with the same columns is returned.

    Raises:
        KeyError — when a requested sample ID is not a column of
        expr_matrix.

    Notes:
        log2fc is mean(B) - mean(A), so it is a log2 fold change only if
        expr_matrix is already on a log2 scale.
    """
    columns = ["gene", "log2fc", "mean_a", "mean_b", "statistic", "p_value"]

    # Positional row iteration keeps duplicated gene labels working, which
    # label-based .loc lookups do not.
    a_block = expr_matrix[group_a_samples].to_numpy(dtype=float)
    b_block = expr_matrix[group_b_samples].to_numpy(dtype=float)

    rows = []
    for gene, a_row, b_row in zip(expr_matrix.index, a_block, b_block):
        a_vals = a_row[~np.isnan(a_row)]
        b_vals = b_row[~np.isnan(b_row)]

        if len(a_vals) < 2 or len(b_vals) < 2:
            continue

        mean_a = a_vals.mean()
        mean_b = b_vals.mean()

        if method == "ttest":
            stat, pval = stats.ttest_ind(a_vals, b_vals)
        else:
            stat, pval = stats.mannwhitneyu(a_vals, b_vals,
                                            alternative="two-sided")

        rows.append({
            "gene": gene,
            "log2fc": mean_b - mean_a,
            "mean_a": mean_a,
            "mean_b": mean_b,
            "statistic": stat,
            "p_value": pval,
        })

    if not rows:
        # Nothing testable (e.g. a group with fewer than two samples):
        # return an empty, correctly shaped frame instead of failing on a
        # missing "p_value" column.
        print(f"DEG: 0 genes (↑0 / ↓0), "
              f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
        return pd.DataFrame(columns=columns + ["fdr", "is_deg"])

    df = pd.DataFrame(rows, columns=columns)

    # FDR correction (Benjamini-Hochberg). Undefined p-values (NaN, e.g.
    # from genes that are constant in both groups) are left out of the
    # correction so they cannot turn every adjusted value into NaN.
    df["fdr"] = _bh_adjust(df["p_value"].to_numpy(dtype=float))

    # DEG filter; comparisons against a NaN fdr evaluate to False.
    df["is_deg"] = (df["fdr"] < fdr_threshold) & (df["log2fc"].abs() > lfc_threshold)
    n_deg = df["is_deg"].sum()
    n_up = (df["is_deg"] & (df["log2fc"] > 0)).sum()
    n_down = (df["is_deg"] & (df["log2fc"] < 0)).sum()

    print(f"DEG: {n_deg} genes (↑{n_up} / ↓{n_down}), "
          f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
    return df.sort_values("p_value")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## 4. GEO 発現プロファイリングパイプライン
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
def geo_expression_pipeline(gse_id, group_col="condition",
                            group_a="control", group_b="treatment",
                            output_dir="results"):
    """
    End-to-end GEO expression profiling: download, group, test, export.

    Parameters:
        gse_id: str — GSE accession
        group_col: str — sample metadata column searched for the group
            labels; when it is not a column of the sample table the
            "source" column is used instead
        group_a: str — pattern identifying the control group
        group_b: str — pattern identifying the treatment group
        output_dir: str — directory that receives the CSV outputs

    Returns:
        dict — {"samples": sample metadata DataFrame,
                "expression": expression matrix,
                "deg": differential expression results}

    Raises:
        ValueError — when the series yields no expression values, or
        when either group pattern matches no sample.

    Side effects:
        Creates output_dir if needed and writes samples.csv and
        deg_results.csv into it.
    """
    from pathlib import Path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Fetch the data.
    sample_df, expr = geo_get_expression_matrix(gse_id)
    sample_df.to_csv(output_dir / "samples.csv", index=False)

    if expr.empty:
        # Fail with a clear message instead of an unrelated KeyError
        # further down.
        raise ValueError(f"{gse_id}: no expression values to analyse")

    # 2) Split samples into groups. group_col is honoured when it names a
    # metadata column; otherwise the "source" column is searched, which
    # keeps the default call behaving as before.
    match_col = group_col if group_col in sample_df.columns else "source"
    labels = sample_df[match_col]
    a_samples = sample_df[
        labels.str.contains(group_a, case=False)
    ]["sample_id"].tolist()
    b_samples = sample_df[
        labels.str.contains(group_b, case=False)
    ]["sample_id"].tolist()

    if not a_samples or not b_samples:
        raise ValueError(
            f"{gse_id}: no samples matched group_a={group_a!r} "
            f"({len(a_samples)} found) or group_b={group_b!r} "
            f"({len(b_samples)} found) in column {match_col!r}"
        )

    # 3) Differential expression.
    deg = geo_differential_expression(expr, a_samples, b_samples)
    deg.to_csv(output_dir / "deg_results.csv", index=False)

    print(f"GEO pipeline: {output_dir}")
    return {"samples": sample_df, "expression": expr, "deg": deg}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## ToolUniverse 連携
|
|
251
|
+
|
|
252
|
+
| TU Key | ツール名 | 連携内容 |
|
|
253
|
+
|--------|---------|---------|
|
|
254
|
+
| `geo` | GEO | データセット検索・サンプル情報・発現データ |
|
|
255
|
+
|
|
256
|
+
## パイプライン統合
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
ebi-databases → geo-expression → gene-expression-transcriptomics
|
|
260
|
+
(ENA/EBI Search) (GEO データ) (DESeq2/GTEx)
|
|
261
|
+
│ │ ↓
|
|
262
|
+
literature-search ────┘ pathway-enrichment
|
|
263
|
+
(PubMed/OpenAlex) │ (KEGG/Reactome/GO)
|
|
264
|
+
↓
|
|
265
|
+
multi-omics
|
|
266
|
+
(統合解析)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## パイプライン出力
|
|
270
|
+
|
|
271
|
+
| ファイル | 説明 | 次スキル |
|
|
272
|
+
|---------|------|---------|
|
|
273
|
+
| `results/samples.csv` | サンプルメタデータ | → gene-expression-transcriptomics |
|
|
274
|
+
| `results/deg_results.csv` | 差次的発現結果 | → pathway-enrichment |
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-gpu-singlecell
|
|
3
|
+
description: |
|
|
4
|
+
GPU アクセラレーション シングルセル解析スキル。
|
|
5
|
+
rapids-singlecell / cuML / cuGraph による GPU 並列処理。
|
|
6
|
+
大規模 (>1M cells) データの高速前処理・クラスタリング・
|
|
7
|
+
次元削減。K-Dense: rapids-singlecell。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific GPU Single-Cell
|
|
11
|
+
|
|
12
|
+
rapids-singlecell / cuML / cuGraph を活用した GPU アクセラレー
|
|
13
|
+
ション対応シングルセル解析パイプラインを提供する。100万細胞超
|
|
14
|
+
の大規模データセットの高速処理。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 大規模シングルセルデータ (>100k cells) の高速前処理が必要なとき
|
|
19
|
+
- GPU クラスタリング (Leiden/Louvain) を実行するとき
|
|
20
|
+
- GPU UMAP/t-SNE で次元削減を高速化するとき
|
|
21
|
+
- CPU 版 scanpy では処理時間が実用的でないとき
|
|
22
|
+
- 複数サンプル統合に GPU を活用するとき
|
|
23
|
+
- ベンチマーク (CPU vs GPU) で性能比較を行うとき
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. rapids-singlecell 前処理
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import rapids_singlecell as rsc
|
|
33
|
+
import scanpy as sc
|
|
34
|
+
import anndata as ad
|
|
35
|
+
import cupy as cp
|
|
36
|
+
import numpy as np
|
|
37
|
+
import pandas as pd
|
|
38
|
+
import time
|
|
39
|
+
from pathlib import Path
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def gpu_preprocessing(adata, min_genes=200, min_cells=3,
                      n_top_genes=2000, target_sum=10000):
    """
    GPU preprocessing with rapids-singlecell.

    Steps: move data to the GPU, QC metrics, cell/gene filtering, highly
    variable gene (HVG) annotation, total-count normalisation, log1p and
    scaling.

    Parameters:
        adata: AnnData — input data (presumably raw counts in .X, since
            normalisation is applied here — verify against the caller)
        min_genes: int — minimum number of genes per cell
        min_cells: int — minimum number of cells per gene
        n_top_genes: int — number of HVGs to flag
        target_sum: float — normalisation target per cell

    Returns:
        AnnData — the object that was passed in; every step is applied to
        it directly.
    """
    t0 = time.time()

    # Transfer the data to GPU memory.
    rsc.get.anndata_to_GPU(adata)

    # QC
    rsc.pp.calculate_qc_metrics(adata)
    rsc.pp.filter_cells(adata, min_genes=min_genes)
    rsc.pp.filter_genes(adata, min_cells=min_cells)

    # HVG selection runs before normalisation because the "seurat_v3"
    # flavor expects count data rather than log-normalised values.
    rsc.pp.highly_variable_genes(
        adata,
        n_top_genes=n_top_genes,
        flavor="seurat_v3",
    )

    # Normalisation
    rsc.pp.normalize_total(adata, target_sum=target_sum)
    rsc.pp.log1p(adata)

    # Scaling
    rsc.pp.scale(adata, max_value=10)

    elapsed = time.time() - t0
    print(f"GPU preprocessing: {adata.n_obs} cells × {adata.n_vars} genes "
          f"({elapsed:.1f}s)")
    return adata
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## 2. GPU PCA & 近傍グラフ
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
def gpu_pca_neighbors(adata, n_comps=50, n_neighbors=15):
    """
    Run GPU PCA and then build the GPU nearest-neighbour graph.

    Parameters:
        adata: AnnData — preprocessed data (on the GPU)
        n_comps: int — number of principal components; also passed to the
            neighbour search as n_pcs
        n_neighbors: int — number of neighbours for the kNN graph

    Returns:
        AnnData — the object that was passed in.
    """
    started = time.time()

    # PCA first: the neighbour search below is given the same number of
    # components.
    rsc.pp.pca(adata, n_comps=n_comps)
    rsc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_comps)

    seconds = time.time() - started
    summary = f"GPU PCA + kNN: {n_comps} PCs, k={n_neighbors} ({seconds:.1f}s)"
    print(summary)
    return adata
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## 3. GPU クラスタリング (Leiden/Louvain)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
def gpu_clustering(adata, method="leiden", resolution=1.0):
    """
    GPU Leiden / Louvain clustering via rapids-singlecell.

    Parameters:
        adata: AnnData — data with a neighbour graph
        method: str — "leiden" or "louvain"
        resolution: float — clustering resolution

    Returns:
        AnnData — the object that was passed in; the cluster labels are
        read back from adata.obs[method].

    Raises:
        ValueError — when method is neither "leiden" nor "louvain".
    """
    # Reject unknown names up front: previously any other value silently
    # ran Louvain and then failed on the adata.obs[method] lookup.
    if method not in ("leiden", "louvain"):
        raise ValueError(
            f'method must be "leiden" or "louvain", got {method!r}'
        )

    t0 = time.time()

    if method == "leiden":
        rsc.tl.leiden(adata, resolution=resolution)
    else:
        rsc.tl.louvain(adata, resolution=resolution)

    n_clusters = adata.obs[method].nunique()
    elapsed = time.time() - t0

    print(f"GPU {method}: {n_clusters} clusters, "
          f"resolution={resolution} ({elapsed:.1f}s)")
    return adata
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## 4. GPU UMAP / t-SNE
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
def gpu_embedding(adata, method="umap", n_components=2, **kwargs):
    """
    GPU UMAP / t-SNE embedding via rapids-singlecell.

    Parameters:
        adata: AnnData — data with a neighbour graph
        method: str — "umap"; any other value runs t-SNE
        n_components: int — output dimensionality (UMAP only)
        **kwargs — forwarded unchanged to rsc.tl.umap / rsc.tl.tsne
            (for t-SNE this is where n_pcs can be supplied)

    Returns:
        AnnData — the object that was passed in.
    """
    import warnings

    t0 = time.time()

    if method == "umap":
        rsc.tl.umap(adata, n_components=n_components, **kwargs)
        dims = n_components
    else:
        # n_components is the output dimensionality, whereas n_pcs selects
        # how many principal components are fed into t-SNE. Forwarding one
        # as the other made t-SNE run on only two PCs by default, so the
        # value is no longer passed on.
        if n_components != 2:
            warnings.warn(
                "n_components is ignored for t-SNE; the embedding is "
                "2-dimensional",
                stacklevel=2,
            )
        rsc.tl.tsne(adata, **kwargs)
        # NOTE(review): assumes rsc.tl.tsne always yields a 2-D embedding
        # (it takes no output-dimension argument) — confirm against the
        # installed rapids-singlecell version.
        dims = 2

    elapsed = time.time() - t0
    print(f"GPU {method.upper()}: {dims}D ({elapsed:.1f}s)")
    return adata
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## 5. CPU vs GPU ベンチマーク
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
def benchmark_cpu_vs_gpu(adata_path, n_top_genes=2000, n_comps=50):
    """
    Time the same workflow on CPU (scanpy) and GPU (rapids-singlecell).

    Both branches run normalize_total → log1p → HVG selection → HVG
    subset → scale → PCA → neighbours → Leiden → UMAP on a fresh copy of
    the file. Reading the file is excluded from both timings; the
    host-to-GPU transfer is included in the GPU timing.

    Parameters:
        adata_path: str — path to an h5ad file
        n_top_genes: int — number of HVGs
        n_comps: int — number of PCA components

    Returns:
        dict — cpu_seconds, cpu_clusters, gpu_seconds, gpu_clusters,
        speedup (cpu_seconds / gpu_seconds) and n_cells.
    """
    results = {}

    # === CPU (scanpy) ===
    adata_cpu = sc.read_h5ad(adata_path)
    t0 = time.time()
    sc.pp.normalize_total(adata_cpu, target_sum=1e4)
    sc.pp.log1p(adata_cpu)
    sc.pp.highly_variable_genes(adata_cpu, n_top_genes=n_top_genes)
    adata_cpu = adata_cpu[:, adata_cpu.var["highly_variable"]].copy()
    sc.pp.scale(adata_cpu, max_value=10)
    sc.pp.pca(adata_cpu, n_comps=n_comps)
    sc.pp.neighbors(adata_cpu)
    sc.tl.leiden(adata_cpu)
    sc.tl.umap(adata_cpu)
    cpu_time = time.time() - t0
    results["cpu_seconds"] = cpu_time
    results["cpu_clusters"] = adata_cpu.obs["leiden"].nunique()

    # === GPU (rapids-singlecell) ===
    adata_gpu = sc.read_h5ad(adata_path)
    t0 = time.time()
    rsc.get.anndata_to_GPU(adata_gpu)
    rsc.pp.normalize_total(adata_gpu, target_sum=1e4)
    rsc.pp.log1p(adata_gpu)
    # Use the default HVG flavor, as the CPU branch does: the two runs
    # then perform the same computation, and the selection is no longer
    # a count-based flavor applied to log-normalised values.
    rsc.pp.highly_variable_genes(adata_gpu, n_top_genes=n_top_genes)
    adata_gpu = adata_gpu[:, adata_gpu.var["highly_variable"]].copy()
    rsc.pp.scale(adata_gpu, max_value=10)
    rsc.pp.pca(adata_gpu, n_comps=n_comps)
    rsc.pp.neighbors(adata_gpu)
    rsc.tl.leiden(adata_gpu)
    rsc.tl.umap(adata_gpu)
    # Wait for any outstanding GPU work before stopping the clock so the
    # measured time covers the whole computation.
    cp.cuda.Stream.null.synchronize()
    gpu_time = time.time() - t0
    results["gpu_seconds"] = gpu_time
    results["gpu_clusters"] = adata_gpu.obs["leiden"].nunique()

    # Guard against a zero reading from a coarse clock.
    results["speedup"] = cpu_time / gpu_time if gpu_time > 0 else float("inf")
    results["n_cells"] = adata_cpu.n_obs

    print(f"Benchmark ({results['n_cells']} cells):")
    print(f"  CPU: {cpu_time:.1f}s ({results['cpu_clusters']} clusters)")
    print(f"  GPU: {gpu_time:.1f}s ({results['gpu_clusters']} clusters)")
    print(f"  Speedup: {results['speedup']:.1f}x")
    return results
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## 6. GPU シングルセル統合パイプライン
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
def gpu_singlecell_pipeline(input_files, output_dir="results",
                            n_top_genes=3000, resolution=1.0):
    """
    Large-scale GPU single-cell pipeline across several input files.

    Loads and concatenates the inputs, runs the GPU preprocessing, PCA +
    kNN, clustering and embedding stages, then moves the data back to the
    CPU for marker detection and writes the results.

    Parameters:
        input_files: list[str] — h5ad files, one per sample
        output_dir: str — directory for the outputs
        n_top_genes: int — number of HVGs
        resolution: float — Leiden resolution

    Returns:
        AnnData — the processed, clustered object.

    Side effects:
        Writes gpu_singlecell.h5ad and markers.csv into output_dir.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    started = time.time()

    # 1) Load every file, tag its cells with a per-file sample label and
    #    concatenate on the genes shared by all inputs.
    loaded = []
    for idx, path in enumerate(input_files):
        sample = sc.read_h5ad(path)
        sample.obs["sample"] = f"sample_{idx}"
        loaded.append(sample)
    adata = ad.concat(loaded, join="inner")
    print(f"Combined: {adata.n_obs} cells from {len(input_files)} samples")

    # 2)-5) GPU stages, applied in order; each one returns the AnnData
    #       that is handed to the next.
    stages = (
        (gpu_preprocessing, {"n_top_genes": n_top_genes}),
        (gpu_pca_neighbors, {}),
        (gpu_clustering, {"resolution": resolution}),
        (gpu_embedding, {}),
    )
    for stage, options in stages:
        adata = stage(adata, **options)

    # 6) Back on the CPU: rank marker genes per Leiden cluster.
    # NOTE(review): the ranking runs on whatever .X holds after
    # preprocessing — confirm that is the intended input.
    rsc.get.anndata_to_CPU(adata)
    sc.tl.rank_genes_groups(adata, groupby="leiden", method="wilcoxon")

    # Persist the processed object.
    adata.write(output_dir / "gpu_singlecell.h5ad")

    # Export the marker table for all clusters.
    marker_table = sc.get.rank_genes_groups_df(adata, group=None)
    marker_table.to_csv(output_dir / "markers.csv", index=False)

    n_clusters = adata.obs['leiden'].nunique()
    seconds = time.time() - started
    print(f"GPU pipeline: {adata.n_obs} cells, "
          f"{n_clusters} clusters ({seconds:.1f}s)")
    return adata
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## パイプライン統合
|
|
278
|
+
|
|
279
|
+
```
|
|
280
|
+
single-cell-genomics → gpu-singlecell → scvi-integration
|
|
281
|
+
(scanpy 標準) (GPU 高速化) (深層学習統合)
|
|
282
|
+
│ │ ↓
|
|
283
|
+
batch-correction ─────────┘ cell-type-annotation
|
|
284
|
+
(Harmony/scVI) │ (自動アノテーション)
|
|
285
|
+
↓
|
|
286
|
+
atlas-construction
|
|
287
|
+
(大規模アトラス)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## パイプライン出力
|
|
291
|
+
|
|
292
|
+
| ファイル | 説明 | 次スキル |
|
|
293
|
+
|---------|------|---------|
|
|
294
|
+
| `results/gpu_singlecell.h5ad` | GPU 処理済み AnnData | → scvi-integration |
|
|
295
|
+
| `results/markers.csv` | マーカー遺伝子 | → cell-type-annotation |
|
|
296
|
+
| `results/benchmark.json` | CPU/GPU 比較結果 (`benchmark_cpu_vs_gpu()` の戻り値を JSON として保存した場合) | → atlas-construction |
|