@nahisaho/satori 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -22
- package/package.json +1 -1
- package/src/.github/skills/scientific-data-submission/SKILL.md +357 -0
- package/src/.github/skills/scientific-gpu-singlecell/SKILL.md +296 -0
- package/src/.github/skills/scientific-marine-ecology/SKILL.md +429 -0
- package/src/.github/skills/scientific-nci60-screening/SKILL.md +307 -0
- package/src/.github/skills/scientific-plant-biology/SKILL.md +321 -0
- package/src/.github/skills/scientific-rrna-taxonomy/SKILL.md +379 -0
- package/src/.github/skills/scientific-scatac-signac/SKILL.md +300 -0
- package/src/.github/skills/scientific-toxicology-env/SKILL.md +309 -0
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-rrna-taxonomy
|
|
3
|
+
description: |
|
|
4
|
+
rRNA リファレンス・分類学スキル。SILVA SSU/LSU rRNA データベース・
|
|
5
|
+
Greengenes2 系統分類・MGnify メタゲノム解析・QIIME2 分類器・
|
|
6
|
+
scikit-bio 配列解析・系統分類パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific rRNA Taxonomy
|
|
10
|
+
|
|
11
|
+
SILVA / Greengenes2 / MGnify を活用した rRNA リファレンスおよび
|
|
12
|
+
分類学的アノテーションパイプラインを提供する。16S/18S/ITS
|
|
13
|
+
アンプリコン配列の分類学的帰属と系統解析。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 16S rRNA アンプリコン配列の分類学的帰属を行うとき
|
|
18
|
+
- SILVA/Greengenes2 リファレンスで分類器を訓練するとき
|
|
19
|
+
- MGnify からメタゲノム解析結果を取得するとき
|
|
20
|
+
- 18S/ITS 真核生物分類を行うとき
|
|
21
|
+
- ASV/OTU の分類学的コンセンサスを判定するとき
|
|
22
|
+
- QIIME2 カスタム分類器パイプラインを構築するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. SILVA rRNA リファレンス取得
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import requests
|
|
32
|
+
import pandas as pd
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from io import StringIO
|
|
35
|
+
|
|
36
|
+
SILVA_BASE = "https://www.arb-silva.de/api"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _download_atomic(url, dest):
    """Download *url* to *dest*, publishing the file only once it is complete."""
    import urllib.request

    partial = dest.with_name(dest.name + ".part")
    try:
        urllib.request.urlretrieve(url, str(partial))
        partial.replace(dest)
    finally:
        # A failed transfer must not leave a truncated file behind, otherwise
        # the next run would treat it as a valid cached download.
        if partial.exists():
            partial.unlink()


def download_silva_reference(version="138.1", subunit="SSU",
                             output_dir="references"):
    """
    Download the SILVA NR99 reference sequences and taxonomy table.

    Files that already exist in ``output_dir`` are reused; nothing is
    re-downloaded for them.

    Parameters:
        version: str — SILVA release (e.g. "138.1")
        subunit: str — "SSU" (16S/18S) or "LSU" (23S/28S)
        output_dir: str — directory the files are stored in

    Returns:
        dict — {"fasta": path, "taxonomy": path, "n_seqs": number of FASTA records}
    """
    import gzip

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # The release directory on the SILVA download area is spelled with
    # underscores (release_138_1) while the file names keep the dotted version.
    release_dir = "release_" + version.replace(".", "_")
    base_url = f"https://www.arb-silva.de/fileadmin/silva_databases/{release_dir}/Exports"
    fasta_url = f"{base_url}/SILVA_{version}_{subunit}Ref_NR99_tax_silva.fasta.gz"
    tax_url = f"{base_url}/taxonomy/tax_slv_{subunit.lower()}_{version}.txt.gz"

    fasta_path = output_dir / f"silva_{version}_{subunit}_NR99.fasta.gz"
    tax_path = output_dir / f"silva_{version}_{subunit}_taxonomy.txt.gz"

    for url, dest in ((fasta_url, fasta_path), (tax_url, tax_path)):
        if not dest.exists():
            _download_atomic(url, dest)
            print(f"Downloaded: {dest}")

    # Every FASTA record starts with a ">" header line.
    with gzip.open(str(fasta_path), "rt") as handle:
        n_seqs = sum(1 for line in handle if line.startswith(">"))

    print(f"SILVA {version} {subunit}: {n_seqs} reference sequences")
    return {"fasta": str(fasta_path), "taxonomy": str(tax_path), "n_seqs": n_seqs}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## 2. Greengenes2 分類学取得
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
def download_greengenes2(version="2024.09", output_dir="references"):
    """
    Download the Greengenes2 taxonomy, backbone tree and sequences.

    Downloads are best-effort: a file that cannot be fetched is reported
    with a warning and left out of the returned mapping, so every returned
    path points at a file that exists on disk.

    Parameters:
        version: str — Greengenes2 release
        output_dir: str — directory the files are stored in

    Returns:
        dict — {"taxonomy" | "backbone" | "seqs": local path} for available files
    """
    import urllib.request

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    gg2_base = f"https://ftp.microbio.me/greengenes_release/{version}"
    # NOTE(review): file names are kept exactly as in the original snippet;
    # confirm them against the actual release directory listing.
    files = {
        "taxonomy": f"{gg2_base}/taxonomy.tsv.gz",
        "backbone": f"{gg2_base}/gg2-backbone.nwk.gz",
        "seqs": f"{gg2_base}/gg2-seqs.fna.gz",
    }

    paths = {}
    for name, url in files.items():
        out_path = output_dir / Path(url).name
        if not out_path.exists():
            # Download to a temp name first so an interrupted transfer is
            # never mistaken for a cached file on the next run.
            partial = out_path.with_name(out_path.name + ".part")
            try:
                urllib.request.urlretrieve(url, str(partial))
                partial.replace(out_path)
            except Exception as e:  # deliberate best-effort boundary
                print(f"Warning: {name} download failed: {e}")
                continue
            finally:
                if partial.exists():
                    partial.unlink()
            print(f"Downloaded: {out_path}")
        paths[name] = str(out_path)

    print(f"Greengenes2 {version}: {len(paths)} files downloaded")
    return paths
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## 3. MGnify メタゲノム解析結果取得
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
MGNIFY_BASE = "https://www.ebi.ac.uk/metagenomics/api/v1"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def mgnify_study_search(query, biome=None, limit=25):
    """
    Search MGnify studies and return them as a DataFrame.

    Parameters:
        query: str — free-text search query
        biome: str — biome lineage filter (e.g. "root:Environmental:Aquatic")
        limit: int — maximum number of studies (sent as the page size)

    Returns:
        pandas.DataFrame — columns study_id, name, abstract (first 200
        characters), biome, samples_count; empty but with the same columns
        when nothing matches.

    Raises:
        requests.HTTPError — on a non-2xx response.

    ToolUniverse config key: mgnify
    """
    params = {"search": query, "page_size": limit}
    if biome:
        params["lineage"] = biome

    resp = requests.get(f"{MGNIFY_BASE}/studies", params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    results = []
    for study in data.get("data", []):
        attrs = study.get("attributes") or {}
        results.append({
            "study_id": study["id"],
            # `or` also covers JSON null, which .get()'s default does not:
            # a null abstract would otherwise fail on the slice below.
            "name": attrs.get("study-name") or "",
            "abstract": (attrs.get("study-abstract") or "")[:200],
            "biome": attrs.get("biome-name") or "",
            "samples_count": attrs.get("samples-count") or 0,
        })

    columns = ["study_id", "name", "abstract", "biome", "samples_count"]
    df = pd.DataFrame(results, columns=columns)
    print(f"MGnify: '{query}' → {len(df)} studies")
    return df
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def mgnify_taxonomy(analysis_id):
    """
    Fetch the SSU taxonomic annotation of one MGnify analysis.

    Only the first page (up to 100 entries) of the endpoint is read.

    Parameters:
        analysis_id: str — MGnify analysis accession

    Returns:
        pandas.DataFrame — columns lineage, count, rank, sorted by count in
        descending order; empty but with the same columns when the analysis
        has no SSU taxa.

    Raises:
        requests.HTTPError — on a non-2xx response.

    ToolUniverse config key: mgnify
    """
    url = f"{MGNIFY_BASE}/analyses/{analysis_id}/taxonomy/ssu"
    resp = requests.get(url, params={"page_size": 100}, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    taxa = []
    for entry in data.get("data", []):
        attrs = entry.get("attributes") or {}
        hierarchy = attrs.get("hierarchy") or {}
        taxa.append({
            "lineage": attrs.get("lineage", ""),
            "count": attrs.get("count", 0),
            # NOTE(review): the original read the rank from hierarchy["rank"];
            # the API appears to expose it as a top-level "rank" attribute, so
            # that is used as a fallback — confirm against a live response.
            "rank": hierarchy.get("rank") or attrs.get("rank") or "",
        })

    # Explicit columns keep the sort below valid when no taxa were returned.
    df = pd.DataFrame(taxa, columns=["lineage", "count", "rank"])
    df = df.sort_values("count", ascending=False)
    print(f"MGnify taxonomy ({analysis_id}): {len(df)} taxa")
    return df
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## 4. QIIME2 分類器パイプライン
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
def qiime2_classify_sklearn(sequences_path, reference_seqs, reference_tax,
                            classifier_output="classifier.qza"):
    """
    Train a QIIME2 naive-Bayes classifier and classify the query sequences.

    All intermediate artifacts (ref-seqs.qza, ref-taxonomy.qza,
    query-seqs.qza, taxonomy.qza) are written next to ``classifier_output``;
    with the default value that is the current working directory.

    Parameters:
        sequences_path: str — query sequences (FASTA or QZA)
        reference_seqs: str — reference sequence file
        reference_tax: str — reference taxonomy file (headerless TSV)
        classifier_output: str — path of the trained classifier artifact

    Returns:
        str — path of the resulting taxonomy artifact (taxonomy.qza)

    Raises:
        subprocess.CalledProcessError — if any ``qiime`` command fails.
    """
    import subprocess
    from pathlib import Path

    def run_qiime(*args):
        # List form (no shell) so paths with spaces are passed through safely.
        subprocess.run(["qiime", *args], check=True)

    work_dir = Path(classifier_output).parent
    work_dir.mkdir(parents=True, exist_ok=True)
    ref_seqs_qza = str(work_dir / "ref-seqs.qza")
    ref_tax_qza = str(work_dir / "ref-taxonomy.qza")
    taxonomy_qza = str(work_dir / "taxonomy.qza")

    # 1) Import the references
    run_qiime(
        "tools", "import",
        "--type", "FeatureData[Sequence]",
        "--input-path", reference_seqs,
        "--output-path", ref_seqs_qza,
    )
    run_qiime(
        "tools", "import",
        "--type", "FeatureData[Taxonomy]",
        "--input-format", "HeaderlessTSVTaxonomyFormat",
        "--input-path", reference_tax,
        "--output-path", ref_tax_qza,
    )

    # 2) Train the classifier
    run_qiime(
        "feature-classifier", "fit-classifier-naive-bayes",
        "--i-reference-reads", ref_seqs_qza,
        "--i-reference-taxonomy", ref_tax_qza,
        "--o-classifier", classifier_output,
    )

    # 3) classify-sklearn only accepts artifacts, so a FASTA query is
    #    imported first; an existing .qza is used as is.
    reads_qza = str(sequences_path)
    if not reads_qza.endswith(".qza"):
        reads_qza = str(work_dir / "query-seqs.qza")
        run_qiime(
            "tools", "import",
            "--type", "FeatureData[Sequence]",
            "--input-path", str(sequences_path),
            "--output-path", reads_qza,
        )

    run_qiime(
        "feature-classifier", "classify-sklearn",
        "--i-classifier", classifier_output,
        "--i-reads", reads_qza,
        "--o-classification", taxonomy_qza,
    )

    print(f"QIIME2 classification complete: {classifier_output}")
    return taxonomy_qza
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## 5. 分類学的コンセンサス解析
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
import numpy as np
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def taxonomy_consensus(classifications, confidence_threshold=0.8):
    """
    Build a rank-by-rank consensus lineage from several classifiers.

    For each feature the ";"-separated lineages of all methods are compared
    rank by rank. A rank is accepted while the share of methods naming the
    most common taxon (among the methods that reach that rank) is at least
    ``confidence_threshold``; the lineage is cut at the first rank that
    fails.

    Parameters:
        classifications: dict — {method: DataFrame(feature_id, taxon, confidence)};
            the confidence column is optional and defaults to 1.0
        confidence_threshold: float — minimum agreement fraction per rank

    Returns:
        pandas.DataFrame — one row per feature, in first-seen order, with
        columns feature_id, consensus_taxon, depth, methods_agree (number of
        methods that classified the feature) and mean_confidence.
    """
    from collections import Counter

    columns = ["feature_id", "consensus_taxon", "depth",
               "methods_agree", "mean_confidence"]

    # Index every method once instead of scanning its DataFrame per feature.
    per_method = []
    feature_order = {}  # dict used as an ordered set: first-seen order
    for method_df in classifications.values():
        feature_order.update(dict.fromkeys(method_df["feature_id"].tolist()))
        # Like the original `.iloc[0]`, only the first row per feature counts.
        first_rows = method_df.drop_duplicates(subset="feature_id", keep="first")
        taxon_by_id = dict(zip(first_rows["feature_id"], first_rows["taxon"]))
        if "confidence" in first_rows.columns:
            conf_by_id = dict(zip(first_rows["feature_id"],
                                  first_rows["confidence"]))
        else:
            conf_by_id = {}
        per_method.append((taxon_by_id, conf_by_id))

    consensus = []
    for feat_id in feature_order:
        lineages = []
        confidences = []
        for taxon_by_id, conf_by_id in per_method:
            if feat_id in taxon_by_id:
                lineages.append(taxon_by_id[feat_id])
                confidences.append(conf_by_id.get(feat_id, 1.0))

        split_lineages = [lineage.split(";") for lineage in lineages]
        max_depth = max(len(parts) for parts in split_lineages)

        consensus_lineage = []
        for rank_idx in range(max_depth):
            rank_taxa = [parts[rank_idx] for parts in split_lineages
                         if rank_idx < len(parts)]
            # Counter breaks ties by first occurrence, so the result does not
            # depend on set iteration order.
            most_common, votes = Counter(rank_taxa).most_common(1)[0]
            if votes / len(rank_taxa) < confidence_threshold:
                break
            consensus_lineage.append(most_common)

        consensus.append({
            "feature_id": feat_id,
            "consensus_taxon": ";".join(consensus_lineage),
            "depth": len(consensus_lineage),
            "methods_agree": len(lineages),
            "mean_confidence": float(np.mean(confidences)),
        })

    # Explicit columns keep the schema stable for empty input.
    df = pd.DataFrame(consensus, columns=columns)
    mean_depth = df["depth"].mean() if len(df) else 0.0
    print(f"Consensus: {len(df)} features, "
          f"mean depth={mean_depth:.1f}")
    return df
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## 6. rRNA 分類統合パイプライン
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
def rrna_taxonomy_pipeline(input_fasta, output_dir="results",
                           silva_version="138.1", use_greengenes=True):
    """
    Run the combined SILVA (+ optional Greengenes2) rRNA taxonomy pipeline.

    Parameters:
        input_fasta: str — input 16S rRNA sequences
        output_dir: str — output directory
        silva_version: str — SILVA release
        use_greengenes: bool — also download the Greengenes2 reference

    Returns:
        dict — {"silva": {...}, "greengenes2": {...}} reference information;
        the "silva" entry additionally carries the path of the QIIME2
        classification artifact under "classification".
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Download the references
    silva_ref = download_silva_reference(
        version=silva_version, output_dir=str(output_dir / "refs")
    )

    refs = {"silva": silva_ref}
    if use_greengenes:
        gg2_ref = download_greengenes2(
            output_dir=str(output_dir / "refs")
        )
        refs["greengenes2"] = gg2_ref

    # 2) QIIME2 classification against SILVA
    # NOTE(review): the SILVA files are passed on exactly as downloaded
    # (gzip-compressed); confirm that `qiime tools import` accepts them in
    # that form or decompress/convert them first.
    silva_taxonomy = qiime2_classify_sklearn(
        input_fasta,
        silva_ref["fasta"],
        silva_ref["taxonomy"],
        classifier_output=str(output_dir / "silva_classifier.qza"),
    )
    # Keep the classification result reachable for the caller instead of
    # dropping it with the local variable.
    silva_ref["classification"] = silva_taxonomy

    # 3) MGnify comparison (fetch if analysed data exist there) — not implemented

    print(f"Pipeline complete: {len(refs)} references used")
    return refs
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## パイプライン統合
|
|
354
|
+
|
|
355
|
+
```
|
|
356
|
+
microbiome-metagenomics → rrna-taxonomy → phylogenetics
|
|
357
|
+
(DADA2 ASV パイプライン) (SILVA/GG2 分類) (ETE3 系統樹)
|
|
358
|
+
│ │ ↓
|
|
359
|
+
environmental-ecology ─────────┘ population-genetics
|
|
360
|
+
(α/β 多様性) │ (Fst/ADMIXTURE)
|
|
361
|
+
↓
|
|
362
|
+
pathway-enrichment
|
|
363
|
+
(微生物機能濃縮)
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
## パイプライン出力
|
|
367
|
+
|
|
368
|
+
| ファイル | 説明 | 次スキル |
|
|
369
|
+
|---------|------|---------|
|
|
370
|
+
| `results/taxonomy.csv` | 分類学的帰属結果 (QIIME2 の出力 `taxonomy.qza` から別途エクスポートして作成) | → microbiome-metagenomics |
|
|
371
|
+
| `results/consensus.csv` | コンセンサス分類 | → phylogenetics |
|
|
372
|
+
| `results/refs/` | SILVA/GG2 リファレンス | — |
|
|
373
|
+
| `results/mgnify_taxonomy.csv` | MGnify 分類結果 | → environmental-ecology |
|
|
374
|
+
|
|
375
|
+
### 利用可能ツール (ToolUniverse SMCP)
|
|
376
|
+
|
|
377
|
+
| Config Key | ツール数 | 主要機能 |
|
|
378
|
+
|-----------|---------|---------|
|
|
379
|
+
| `mgnify` | 3+ | メタゲノム研究検索・分類学的プロファイル・機能アノテーション |
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-scatac-signac
|
|
3
|
+
description: |
|
|
4
|
+
scATAC-seq 解析スキル (Signac/SnapATAC2/episcanpy)。
|
|
5
|
+
ピークコーリング・モチーフ解析・Gene Activity スコア・
|
|
6
|
+
RNA+ATAC マルチモーダル統合 (WNN)。K-Dense: signac。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific scATAC-seq / Signac
|
|
10
|
+
|
|
11
|
+
Signac / SnapATAC2 / episcanpy を活用した scATAC-seq (single-cell
|
|
12
|
+
ATAC-seq) 解析パイプラインを提供する。クロマチンアクセシビリティ
|
|
13
|
+
解析、モチーフエンリッチメント、マルチモーダル統合。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- scATAC-seq データの前処理とピークコーリングを行うとき
|
|
18
|
+
- 転写因子モチーフのエンリッチメント解析を実行するとき
|
|
19
|
+
- Gene Activity スコアで scATAC を発現レベルで解釈するとき
|
|
20
|
+
- scRNA-seq + scATAC-seq のマルチモーダル統合を行うとき
|
|
21
|
+
- クロマチンアクセシビリティの細胞型特異的パターンを同定するとき
|
|
22
|
+
- エピゲノム (ヒストン修飾/DNAメチル化) とクロマチンを統合するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. scATAC-seq 前処理 (SnapATAC2)
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import snapatac2 as snap
|
|
32
|
+
import anndata as ad
|
|
33
|
+
import numpy as np
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def scatac_preprocessing(fragment_file, genome="hg38",
                         min_tsse=5, min_fragments=1000):
    """
    SnapATAC2 scATAC-seq preprocessing: import, QC metrics, cell filtering.

    Parameters:
        fragment_file: str — path to fragments.tsv.gz
        genome: str — reference genome name ("hg38", "mm10")
        min_tsse: float — minimum TSS enrichment score
        min_fragments: int — minimum number of fragments per cell

    Returns:
        AnnData — the imported and filtered object

    Raises:
        ValueError — if ``snap.genome`` has no attribute named ``genome``.
    """
    # `snap.genome` is a module, not a callable: genomes are looked up as
    # attributes on it (snap.genome.hg38, snap.genome.mm10, ...).
    genome_ref = getattr(snap.genome, genome, None)
    if genome_ref is None:
        raise ValueError(f"Unsupported genome for SnapATAC2: {genome!r}")

    # Read the fragment file
    adata = snap.pp.import_data(
        fragment_file,
        chrom_sizes=genome_ref,
        sorted_by_barcode=False,
    )

    # QC metrics
    snap.metrics.tsse(adata, genome_ref)
    snap.metrics.frag_size_distr(adata)

    # Filtering
    snap.pp.filter_cells(
        adata,
        min_counts=min_fragments,
        min_tsse=min_tsse,
    )

    print(f"scATAC preprocessing: {adata.n_obs} cells, "
          f"TSS enrichment ≥ {min_tsse}")
    return adata
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## 2. ピークコーリング & Tile Matrix
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
def scatac_peak_calling(adata, genome="hg38", peak_method="macs2",
                        n_features=50000):
    """
    Build the accessibility matrix, then reduce, embed and cluster the cells.

    Parameters:
        adata: AnnData — preprocessed scATAC data
        genome: str — reference genome (not used by the body)
        peak_method: str — "tile" for a 500 bp tile matrix; any other value
            takes the peak-matrix branch
        n_features: int — number of features kept by feature selection

    Returns:
        AnnData — the same object that was passed in
    """
    use_tiles = peak_method == "tile"
    if not use_tiles:
        # NOTE(review): the return value is not captured and no peak-calling
        # step precedes this call — confirm this is what SnapATAC2 expects.
        snap.pp.make_peak_matrix(adata)
    else:
        # 500 bp genomic bins
        snap.pp.add_tile_matrix(adata, bin_size=500)

    # Feature selection
    snap.pp.select_features(adata, n_features=n_features)

    # Dimensionality reduction and embedding
    snap.tl.spectral(adata)
    snap.tl.umap(adata)

    # Neighbour graph and clustering
    snap.pp.knn(adata)
    snap.tl.leiden(adata)

    cluster_labels = adata.obs["leiden"]
    print(f"Peak calling ({peak_method}): {cluster_labels.nunique()} clusters, "
          f"{n_features} features")
    return adata
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## 3. モチーフエンリッチメント (chromVAR)
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def motif_enrichment(adata, genome="hg38", motif_db="JASPAR2022"):
    """
    Motif enrichment and per-cluster top motifs.

    After the motif scan, the mean of ``adata.obsm["chromvar"]`` is taken
    over the cells of each ``leiden`` cluster and the ten motifs with the
    highest mean are reported, rank 1 being the highest.

    Parameters:
        adata: AnnData — scATAC data with a peak matrix and leiden clusters
        genome: str — reference genome
        motif_db: str — motif database

    Returns:
        pandas.DataFrame — columns cluster, rank, motif; empty (same columns)
        when ``adata.obsm`` has no "chromvar" entry.
    """
    # Motif scan
    # NOTE(review): call kept exactly as in the original snippet; confirm the
    # argument names against the installed SnapATAC2 version.
    snap.tl.motif_enrichment(
        adata,
        motifs=motif_db,
        genome=genome,
    )

    # Per-cluster top motifs (only possible when deviations are available)
    cluster_motifs = {}
    if "chromvar" in adata.obsm:
        deviations = adata.obsm["chromvar"]
        motif_names = adata.uns["motif_names"]
        labels = adata.obs["leiden"]
        for cluster in labels.unique():
            mask = (labels == cluster).to_numpy()
            mean_dev = np.asarray(deviations[mask].mean(axis=0)).ravel()
            # argsort is ascending: reverse it so that rank 1 is the motif
            # with the highest mean deviation.
            top_idx = np.argsort(mean_dev)[::-1][:10]
            cluster_motifs[cluster] = [motif_names[i] for i in top_idx]

    results = [
        {"cluster": cluster, "rank": rank, "motif": motif}
        for cluster, motifs in cluster_motifs.items()
        for rank, motif in enumerate(motifs, 1)
    ]

    df = pd.DataFrame(results, columns=["cluster", "rank", "motif"])
    print(f"Motif enrichment: {len(cluster_motifs)} clusters, "
          f"{len(df)} motif-cluster pairs")
    return df
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## 4. Gene Activity スコア
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
def gene_activity_score(adata, genome="hg38", upstream=2000, body=True):
    """
    Compute log-normalised gene activity scores (ATAC → pseudo-expression).

    Each cell is scaled to 10,000 total counts and log1p-transformed. Cells
    with zero total counts stay all-zero instead of becoming NaN.

    Parameters:
        adata: AnnData — scATAC data
        genome: str — reference genome name ("hg38", "mm10")
        upstream: int — promoter distance upstream of the gene (bp)
        body: bool — whether the gene body is included

    Returns:
        cells × genes matrix of log1p(CP10k) values; sparse input yields a
        sparse result, dense input a numpy array.

    Raises:
        ValueError — if ``snap.genome`` has no attribute named ``genome``.
    """
    # `snap.genome` is a module; genomes are attributes on it.
    genome_ref = getattr(snap.genome, genome, None)
    if genome_ref is None:
        raise ValueError(f"Unsupported genome for SnapATAC2: {genome!r}")

    # NOTE(review): keyword names are kept from the original snippet; confirm
    # `include_body` against the installed SnapATAC2 version.
    gene_matrix = snap.pp.make_gene_matrix(
        adata,
        gene_anno=genome_ref,
        upstream=upstream,
        include_body=body,
    )

    # Prefer the matrix returned by make_gene_matrix; the original fallbacks
    # are kept for the case where nothing is returned.
    if gene_matrix is not None:
        gene_act = gene_matrix.X
    elif hasattr(adata, "uns") and "gene_activity" in adata.uns:
        gene_act = adata.uns["gene_activity"]
    else:
        gene_act = adata.X  # Gene matrix mode

    # Normalisation: sum() on a sparse matrix does not accept `keepdims`, so
    # the row totals are reshaped explicitly. This works for dense input too.
    row_totals = np.asarray(gene_act.sum(axis=1), dtype=float).reshape(-1, 1)
    row_totals[row_totals == 0] = 1.0  # avoid 0/0 for empty cells
    scale = 10000.0 / row_totals

    if hasattr(gene_act, "multiply") and hasattr(gene_act, "tocsr"):
        # Sparse path: stay sparse to avoid a dense cells × genes array.
        gene_act_log = gene_act.multiply(scale).tocsr().log1p()
    else:
        gene_act_log = np.log1p(np.asarray(gene_act) * scale)

    print(f"Gene activity: {gene_act_log.shape[1]} genes, "
          f"{gene_act_log.shape[0]} cells")
    return gene_act_log
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## 5. RNA + ATAC マルチモーダル統合 (WNN)
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
import scanpy as sc
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def multimodal_wnn(adata_atac, adata_rna, n_neighbors=20):
    """
    Integrate RNA and ATAC with weighted nearest neighbours (WNN).

    Cells are restricted to the barcodes present in both modalities (in the
    order of ``adata_atac``). If muon is not installed, the two modalities
    are concatenated feature-wise and clustered with scanpy instead.

    Parameters:
        adata_atac: AnnData — scATAC data with an "X_spectral" embedding
        adata_rna: AnnData — scRNA data with an "X_pca" embedding
        n_neighbors: int — number of neighbours for the kNN graphs

    Returns:
        MuData with WNN graph, UMAP and leiden clusters, or (fallback) the
        concatenated AnnData with leiden clusters.

    Raises:
        ValueError — if the two objects share no barcodes.
    """
    # Shared barcodes, in a deterministic order (a plain set intersection
    # would reorder the cells differently from run to run).
    rna_barcodes = set(adata_rna.obs_names)
    common_bc = [bc for bc in adata_atac.obs_names if bc in rna_barcodes]
    if not common_bc:
        raise ValueError("adata_atac and adata_rna share no barcodes")

    atac_sub = adata_atac[common_bc].copy()
    rna_sub = adata_rna[common_bc].copy()

    print(f"Common barcodes: {len(common_bc)}")

    # Per-modality kNN graphs
    sc.pp.neighbors(rna_sub, n_neighbors=n_neighbors,
                    use_rep="X_pca", key_added="rna")
    sc.pp.neighbors(atac_sub, n_neighbors=n_neighbors,
                    use_rep="X_spectral", key_added="atac")

    # Only the import is guarded, so an ImportError raised deeper inside the
    # muon workflow is not mistaken for "muon not installed".
    try:
        import muon as mu
    except ImportError:
        mu = None

    if mu is not None:
        mdata = mu.MuData({"rna": rna_sub, "atac": atac_sub})
        # The modality graphs above were stored under custom keys, so muon
        # has to be told where to find them.
        mu.pp.neighbors(
            mdata,
            n_neighbors=n_neighbors,
            neighbor_keys={"rna": "rna", "atac": "atac"},
            key_added="wnn",
        )
        mu.tl.umap(mdata, neighbors_key="wnn")
        mu.tl.leiden(mdata, neighbors_key="wnn")

        n_clusters = mdata.obs["leiden"].nunique()
        print(f"WNN integration: {n_clusters} clusters")
        return mdata

    print("muon not installed — falling back to concatenation")
    # Fallback: plain feature-wise concatenation
    combined = ad.concat([rna_sub, atac_sub], axis=1, merge="same")
    sc.pp.neighbors(combined, n_neighbors=n_neighbors)
    sc.tl.leiden(combined)
    return combined
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## 6. scATAC-seq 統合パイプライン
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
def scatac_pipeline(fragment_file, rna_h5ad=None, genome="hg38",
                    output_dir="results"):
    """
    End-to-end scATAC-seq analysis: preprocessing, clustering, motifs,
    gene activity and (optionally) RNA+ATAC integration.

    Parameters:
        fragment_file: str — fragments.tsv.gz
        rna_h5ad: str — scRNA-seq h5ad for multimodal integration (optional)
        genome: str — reference genome
        output_dir: str — output directory

    Returns:
        AnnData — the clustered scATAC object
    """
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # 1) Preprocessing, then 2) peaks and clustering
    preprocessed = scatac_preprocessing(fragment_file, genome=genome)
    adata = scatac_peak_calling(preprocessed, genome=genome)
    adata.write(out_root / "scatac_clustered.h5ad")

    # 3) Motifs
    motif_table = motif_enrichment(adata, genome=genome)
    motif_table.to_csv(out_root / "motif_enrichment.csv", index=False)

    # 4) Gene activity (result is computed but not stored by this pipeline)
    gene_activity_score(adata, genome=genome)

    # 5) Multimodal integration, only when an RNA dataset was supplied
    if rna_h5ad:
        integrated = multimodal_wnn(adata, sc.read_h5ad(rna_h5ad))
        integrated.write(out_root / "multimodal_wnn.h5mu")

    print(f"scATAC pipeline: {out_root}")
    return adata
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## パイプライン統合
|
|
282
|
+
|
|
283
|
+
```
|
|
284
|
+
epigenomics-chromatin → scatac-signac → single-cell-genomics
|
|
285
|
+
(ChIP/ATAC bulk) (scATAC-seq) (scRNA 統合)
|
|
286
|
+
│ │ ↓
|
|
287
|
+
peak-annotation ──────────┘ spatial-transcriptomics
|
|
288
|
+
(ENCODE/ChIPAtlas) │ (Visium/MERFISH)
|
|
289
|
+
↓
|
|
290
|
+
gene-regulatory-network
|
|
291
|
+
(GRN 推定)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
## パイプライン出力
|
|
295
|
+
|
|
296
|
+
| ファイル | 説明 | 次スキル |
|
|
297
|
+
|---------|------|---------|
|
|
298
|
+
| `results/scatac_clustered.h5ad` | クラスタリング済み scATAC | → single-cell-genomics |
|
|
299
|
+
| `results/motif_enrichment.csv` | モチーフエンリッチメント | → gene-regulatory-network |
|
|
300
|
+
| `results/multimodal_wnn.h5mu` | RNA+ATAC 統合 (`rna_h5ad` 指定時のみ出力) | → spatial-transcriptomics |
|