@nahisaho/satori 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -41
- package/package.json +1 -1
- package/src/.github/skills/scientific-alphafold-structures/SKILL.md +256 -0
- package/src/.github/skills/scientific-arrayexpress-expression/SKILL.md +264 -0
- package/src/.github/skills/scientific-crossref-metadata/SKILL.md +313 -0
- package/src/.github/skills/scientific-encode-screen/SKILL.md +315 -0
- package/src/.github/skills/scientific-environmental-geodata/SKILL.md +255 -0
- package/src/.github/skills/scientific-geo-expression/SKILL.md +274 -0
- package/src/.github/skills/scientific-gtex-tissue-expression/SKILL.md +271 -0
- package/src/.github/skills/scientific-gwas-catalog/SKILL.md +267 -0
- package/src/.github/skills/scientific-human-cell-atlas/SKILL.md +294 -0
- package/src/.github/skills/scientific-icgc-cancer-data/SKILL.md +351 -0
- package/src/.github/skills/scientific-metabolic-atlas/SKILL.md +263 -0
- package/src/.github/skills/scientific-paleobiology/SKILL.md +265 -0
- package/src/.github/skills/scientific-parasite-genomics/SKILL.md +280 -0
- package/src/.github/skills/scientific-pharmgkb-pgx/SKILL.md +306 -0
- package/src/.github/skills/scientific-semantic-scholar/SKILL.md +298 -0
- package/src/.github/skills/scientific-squidpy-advanced/SKILL.md +251 -0
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-gwas-catalog
|
|
3
|
+
description: |
|
|
4
|
+
GWAS カタログスキル。NHGRI-EBI GWAS Catalog REST API によるゲノム
|
|
5
|
+
ワイド関連研究メタデータ・関連シグナル・形質・遺伝子座検索。
|
|
6
|
+
ToolUniverse 連携: gwas。
|
|
7
|
+
tu_tools:
|
|
8
|
+
- key: gwas
|
|
9
|
+
name: GWAS Catalog
|
|
10
|
+
description: GWAS 関連シグナル・形質・遺伝子座検索
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Scientific GWAS Catalog
|
|
14
|
+
|
|
15
|
+
NHGRI-EBI GWAS Catalog REST API を活用した GWAS メタデータ
|
|
16
|
+
解析・遺伝子座レベル解釈パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- GWAS Catalog から疾患/形質の関連バリアントを検索するとき
|
|
21
|
+
- 遺伝的関連シグナルのエフェクトサイズ・P値を取得するとき
|
|
22
|
+
- 特定遺伝子座の LD ブロック情報を解析するとき
|
|
23
|
+
- 多形質 PheWAS-like 解析を実施するとき
|
|
24
|
+
- GWAS サマリ統計量を下流解析に準備するとき
|
|
25
|
+
- 公開 GWAS データから PRS ウェイトを抽出するとき
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
## 1. GWAS 関連シグナル検索
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import requests
|
|
35
|
+
import pandas as pd
|
|
36
|
+
import numpy as np
|
|
37
|
+
|
|
38
|
+
GWAS_BASE = "https://www.ebi.ac.uk/gwas/rest/api"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def gwas_search_associations(trait=None, gene=None, variant=None,
                             p_upper=5e-8, limit=100):
    """
    Search GWAS Catalog associations and return them as a DataFrame.

    One endpoint is chosen in the priority order ``trait`` > ``gene`` >
    ``variant``; when none is given the unfiltered ``/associations``
    listing is requested.

    Parameters:
        trait: str — EFO trait ID or name, inserted verbatim into the
            ``efoTraits/{trait}/associations`` URL path (e.g. "EFO_0001645")
        gene: str — gene name, sent as the ``geneName`` query parameter
            (e.g. "BRCA1")
        variant: str — rsID (e.g. "rs1234567")
        p_upper: float — associations whose p-value exceeds this are dropped;
            associations with a null p-value are kept
        limit: int — page size sent as the ``size`` query parameter

    Returns:
        pandas.DataFrame — one row per association, sorted ascending by
        ``p_value``; an empty DataFrame when nothing matched.

    Raises:
        requests.HTTPError — when the API answers with an error status.
    """
    import math

    if trait:
        url = f"{GWAS_BASE}/efoTraits/{trait}/associations"
    elif gene:
        url = f"{GWAS_BASE}/associations/search/findByGene"
    elif variant:
        url = f"{GWAS_BASE}/singleNucleotidePolymorphisms/{variant}/associations"
    else:
        url = f"{GWAS_BASE}/associations"

    params = {"size": limit}
    if gene:
        params["geneName"] = gene

    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # ``or`` fallbacks guard against keys that are present but null.
    associations = (data.get("_embedded") or {}).get("associations") or []
    results = []
    for assoc in associations:
        # A missing key defaults to 1.0 (and is therefore filtered out);
        # an explicit null is kept as "unknown".
        raw_p = assoc.get("pvalue", 1.0)
        p_value = float(raw_p) if raw_p is not None else None
        if p_value is not None and p_value > p_upper:
            continue

        # -log10(p): prefer mantissa/exponent so very small p-values that
        # underflow to 0.0 still get a finite value.
        mantissa = assoc.get("pvalueMantissa")
        exponent = assoc.get("pvalueExponent")
        if (mantissa is not None and exponent is not None
                and float(mantissa) > 0):
            p_value_mlog = -(math.log10(float(mantissa)) + float(exponent))
        elif p_value is not None and p_value > 0:
            p_value_mlog = -math.log10(p_value)
        else:
            p_value_mlog = None

        genes = []
        for locus in assoc.get("loci") or []:
            for gene_info in (locus or {}).get("authorReportedGenes") or []:
                genes.append((gene_info or {}).get("geneName") or "")

        snps = []
        for snp_info in assoc.get("snps") or []:
            snps.append((snp_info or {}).get("rsId") or "")

        efo_traits = assoc.get("efoTraits") or []
        trait_name = (efo_traits[0] or {}).get("trait", "") if efo_traits else ""

        results.append({
            "association_id": assoc.get("associationId", ""),
            "p_value": p_value,
            "p_value_mlog": p_value_mlog,
            "or_beta": assoc.get("orPerCopyNum", None),
            "beta_num": assoc.get("betaNum", None),
            "beta_direction": assoc.get("betaDirection", ""),
            "ci": assoc.get("range", ""),
            "risk_allele_freq": assoc.get("riskFrequency", ""),
            "snps": "; ".join(snps),
            "genes": "; ".join(genes),
            "trait": trait_name,
            "study_accession": (assoc.get("study") or {}).get(
                "accessionId", ""),
        })

    df = pd.DataFrame(results)
    print(f"GWAS associations: {len(df)} results "
          f"(trait={trait}, gene={gene}, p<{p_upper})")
    return df.sort_values("p_value") if not df.empty else df
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## 2. GWAS 研究メタデータ検索
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def gwas_search_studies(query=None, efo_trait=None, limit=50):
    """
    Search GWAS Catalog study metadata and return it as a DataFrame.

    With ``efo_trait`` the studies under ``efoTraits/{efo_trait}/studies``
    are requested; otherwise the ``studies/search/findByDiseaseTrait``
    endpoint is used. ``query`` (when given) is sent as ``diseaseTrait``
    in either case.

    Parameters:
        query: str — free-text disease/trait search string
        efo_trait: str — EFO trait ID
        limit: int — page size sent as the ``size`` query parameter

    Returns:
        pandas.DataFrame — one row per study (accession, title, publication
        details, sample sizes, ancestries).

    Raises:
        requests.HTTPError — when the API answers with an error status.
    """
    if efo_trait:
        url = f"{GWAS_BASE}/efoTraits/{efo_trait}/studies"
    else:
        url = f"{GWAS_BASE}/studies/search/findByDiseaseTrait"

    params = {"size": limit}
    if query:
        params["diseaseTrait"] = query

    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # ``or`` fallbacks guard against keys that are present but null.
    studies = (data.get("_embedded") or {}).get("studies") or []
    results = []
    for s in studies:
        # Look the nested publication record up once; it may be null.
        pub = s.get("publicationInfo") or {}
        author = pub.get("author") or {}
        results.append({
            "accession": s.get("accessionId", ""),
            "title": s.get("title", ""),
            "pubmed_id": pub.get("pubmedId", ""),
            "author": author.get("fullname", ""),
            "journal": pub.get("publication", ""),
            "date": pub.get("publicationDate", ""),
            "initial_sample_size": s.get("initialSampleSize", ""),
            "replication_sample_size": s.get(
                "replicationSampleSize", ""),
            "ancestry": s.get("ancestries", []),
        })

    df = pd.DataFrame(results)
    print(f"GWAS studies: {len(df)} results")
    return df
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## 3. GWAS 形質検索・PheWAS
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
def gwas_phewas(variant_rsid, p_threshold=5e-8, limit=500):
    """
    PheWAS-style lookup: list every trait associated with one variant.

    Parameters:
        variant_rsid: str — rsID (e.g. "rs7903146")
        p_threshold: float — associations whose p-value exceeds this are
            dropped; associations with a null p-value are kept
        limit: int — page size sent as the ``size`` query parameter
            (defaults to the previously hard-coded 500)

    Returns:
        pandas.DataFrame — one row per (association, EFO trait) pair,
        sorted ascending by ``p_value``; empty when nothing matched.

    Raises:
        requests.HTTPError — when the API answers with an error status.
    """
    url = (f"{GWAS_BASE}/singleNucleotidePolymorphisms/"
           f"{variant_rsid}/associations")
    resp = requests.get(url, params={"size": limit}, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # ``or`` fallbacks guard against keys that are present but null.
    associations = (data.get("_embedded") or {}).get("associations") or []
    results = []
    for assoc in associations:
        # A missing key defaults to 1.0 (filtered out); an explicit null
        # is kept as "unknown". A p-value of 0 stays 0.0 rather than None
        # so it sorts first instead of last.
        raw_p = assoc.get("pvalue", 1.0)
        p_val = float(raw_p) if raw_p is not None else None
        if p_val is not None and p_val > p_threshold:
            continue

        study_accession = (assoc.get("study") or {}).get("accessionId", "")
        for trait in assoc.get("efoTraits") or []:
            results.append({
                "variant": variant_rsid,
                "trait": trait.get("trait", ""),
                "efo_uri": trait.get("shortForm", ""),
                "p_value": p_val,
                "or_beta": assoc.get("orPerCopyNum", None),
                "study": study_accession,
            })

    df = pd.DataFrame(results)
    if not df.empty:
        df = df.sort_values("p_value")
    print(f"PheWAS {variant_rsid}: {len(df)} trait associations")
    return df
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 4. GWAS 統合パイプライン
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
def gwas_catalog_pipeline(trait_query, output_dir="results", efo_trait=None):
    """
    Run the GWAS Catalog pipeline: studies, associations, then PheWAS.

    Writes ``gwas_studies.csv`` and ``gwas_associations.csv`` into
    ``output_dir``, plus ``phewas.csv`` when at least one rsID was found
    among the associations.

    Parameters:
        trait_query: str — trait/disease name used for the study search
        output_dir: str — output directory (created if missing)
        efo_trait: str — optional EFO trait ID used for the association
            lookup; when omitted ``trait_query`` is used there as well

    Returns:
        dict — ``{"studies": DataFrame, "associations": DataFrame}``.
    """
    from pathlib import Path
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Study search
    studies = gwas_search_studies(query=trait_query)
    studies.to_csv(output_dir / "gwas_studies.csv", index=False)

    # 2) Association signals for the requested trait. The lookup was
    #    previously called with no trait/gene at all, which returned
    #    associations unrelated to ``trait_query``.
    assocs = gwas_search_associations(trait=efo_trait or trait_query)
    assocs.to_csv(output_dir / "gwas_associations.csv", index=False)

    # 3) PheWAS for the top variants. ``assocs`` is sorted by p-value, so
    #    the first unique rsIDs are the strongest signals. Filter to rsIDs
    #    before taking five so non-rs entries do not use up the slots.
    if not assocs.empty:
        snp_ids = assocs["snps"].str.split("; ").explode().unique()
        top_snps = [
            rsid for rsid in snp_ids
            if isinstance(rsid, str) and rsid.startswith("rs")
        ][:5]
        phewas_all = [gwas_phewas(rsid) for rsid in top_snps]
        if phewas_all:
            phewas_df = pd.concat(phewas_all, ignore_index=True)
            phewas_df.to_csv(output_dir / "phewas.csv", index=False)

    print(f"GWAS pipeline: {output_dir}")
    return {"studies": studies, "associations": assocs}
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## ToolUniverse 連携
|
|
243
|
+
|
|
244
|
+
| TU Key | ツール名 | 連携内容 |
|
|
245
|
+
|--------|---------|---------|
|
|
246
|
+
| `gwas` | GWAS Catalog | 関連シグナル・形質・研究メタデータ検索 |
|
|
247
|
+
|
|
248
|
+
## パイプライン統合
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
disease-research → gwas-catalog → variant-interpretation
|
|
252
|
+
(DisGeNET/OMIM) (GWAS Catalog) (ACMG/AMP)
|
|
253
|
+
│ │ ↓
|
|
254
|
+
population-genetics ──┘ variant-effect-prediction
|
|
255
|
+
(Fst/PCA) │ (CADD/SpliceAI)
|
|
256
|
+
↓
|
|
257
|
+
precision-oncology
|
|
258
|
+
(臨床的意義判定)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## パイプライン出力
|
|
262
|
+
|
|
263
|
+
| ファイル | 説明 | 次スキル |
|
|
264
|
+
|---------|------|---------|
|
|
265
|
+
| `results/gwas_studies.csv` | GWAS 研究メタデータ | → literature-search |
|
|
266
|
+
| `results/gwas_associations.csv` | 関連シグナル | → variant-interpretation |
|
|
267
|
+
| `results/phewas.csv` | PheWAS 結果 | → disease-research |
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-human-cell-atlas
|
|
3
|
+
description: |
|
|
4
|
+
Human Cell Atlas (HCA) データポータルスキル。HCA Data Portal API
|
|
5
|
+
プロジェクト検索・ファイルダウンロード・CELLxGENE Census 統合・
|
|
6
|
+
細胞型アノテーション・アトラス構築。ToolUniverse 連携: hca_tools。
|
|
7
|
+
tu_tools:
|
|
8
|
+
- key: hca_tools
|
|
9
|
+
name: Human Cell Atlas Tools
|
|
10
|
+
description: HCA データポータル プロジェクト・バンドル・ファイル検索
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Scientific Human Cell Atlas
|
|
14
|
+
|
|
15
|
+
HCA Data Portal / CELLxGENE Census を活用した大規模シングルセル
|
|
16
|
+
アトラスデータアクセス・解析パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- HCA Data Portal からプロジェクト・実験データを検索するとき
|
|
21
|
+
- CELLxGENE Census で大規模シングルセルアトラスを照会するとき
|
|
22
|
+
- 特定組織/疾患の細胞型構成を調べるとき
|
|
23
|
+
- 複数 HCA プロジェクト間で細胞型を比較するとき
|
|
24
|
+
- シングルセルアトラスのリファレンスマッピングを行うとき
|
|
25
|
+
- 希少細胞型の発見・アノテーションを実施するとき
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
## 1. HCA Data Portal プロジェクト検索
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import requests
|
|
35
|
+
import pandas as pd
|
|
36
|
+
import json
|
|
37
|
+
|
|
38
|
+
HCA_BASE = "https://service.azul.data.humancellatlas.org"
|
|
39
|
+
HCA_CATALOG = "dcp44"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _hca_first(entries):
    """Return the first element of *entries*, or ``{}`` if empty or null."""
    if entries and entries[0] is not None:
        return entries[0]
    return {}


def _hca_join(values):
    """Join the non-null items of *values* with ", "."""
    return ", ".join(str(v) for v in (values or []) if v is not None)


def hca_search_projects(keyword=None, organ=None, disease=None,
                        species="Homo sapiens", limit=25):
    """
    Search projects on the HCA Data Portal and return them as a DataFrame.

    Parameters:
        keyword: str — keyword, sent as the ``q`` query parameter
        organ: str — organ filter (e.g. "lung", "heart")
        disease: str — disease filter (e.g. "COVID-19")
        species: str — species filter (``genusSpecies``)
        limit: int — page size sent as the ``size`` query parameter

    Returns:
        pandas.DataFrame — one row per hit, built from the first entry of
        the hit's ``projects``, ``samples``, ``protocols`` and
        ``cellSuspensions`` lists.

    Raises:
        requests.HTTPError — when the API answers with an error status.
    """
    url = f"{HCA_BASE}/index/projects"
    params = {"catalog": HCA_CATALOG, "size": limit}

    filters = {}
    if organ:
        filters["organ"] = {"is": [organ]}
    if disease:
        filters["disease"] = {"is": [disease]}
    if species:
        filters["genusSpecies"] = {"is": [species]}
    if keyword:
        params["q"] = keyword
    if filters:
        params["filters"] = json.dumps(filters)

    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    projects = []
    for hit in data.get("hits") or []:
        # The helpers tolerate empty lists and null entries, which would
        # otherwise raise IndexError / TypeError.
        proj = _hca_first(hit.get("projects"))
        samples = _hca_first(hit.get("samples"))
        protocols = _hca_first(hit.get("protocols"))
        suspensions = _hca_first(hit.get("cellSuspensions"))
        projects.append({
            "project_id": proj.get("projectId", ""),
            "title": proj.get("projectTitle", ""),
            "organ": _hca_join(samples.get("organ")),
            "disease": _hca_join(samples.get("disease")),
            "species": _hca_join(samples.get("genusSpecies")),
            "cell_count": suspensions.get("totalCells", 0),
            "library_method": _hca_join(protocols.get(
                "libraryConstructionApproach")),
            "donor_count": samples.get("donorCount", 0),
        })

    df = pd.DataFrame(projects)
    print(f"HCA: {len(df)} projects found")
    return df
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## 2. HCA ファイル取得
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
def hca_get_project_files(project_id, file_format=None):
    """
    List the files of one HCA project as a DataFrame.

    Parameters:
        project_id: str — project UUID
        file_format: str — optional file format filter
            (e.g. "h5ad", "loom", "csv")

    Returns:
        pandas.DataFrame — one row per file entry with its id, name,
        format, size in bytes and URL. Only a single page of up to 100
        hits is requested.

    Raises:
        requests.HTTPError — when the API answers with an error status.
    """
    query_filters = {"projectId": {"is": [project_id]}}
    if file_format:
        query_filters["fileFormat"] = {"is": [file_format]}

    response = requests.get(
        f"{HCA_BASE}/index/files",
        params={
            "catalog": HCA_CATALOG,
            "filters": json.dumps(query_filters),
            "size": 100,
        },
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # Flatten every file entry of every hit into one record list.
    records = [
        {
            "file_id": entry.get("uuid", ""),
            "name": entry.get("name", ""),
            "format": entry.get("format", ""),
            "size_bytes": entry.get("size", 0),
            "url": entry.get("url", ""),
        }
        for hit in payload.get("hits", [])
        for entry in hit.get("files", [])
    ]

    df = pd.DataFrame(records)
    print(f"HCA files ({project_id[:8]}): {len(df)} files")
    return df
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## 3. CELLxGENE Census アトラスクエリ
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
def cellxgene_census_query(organism="homo_sapiens", tissue=None,
                           disease=None, cell_type=None,
                           gene_list=None, max_cells=50000):
    """
    Query the CELLxGENE Census and return the matching cells as AnnData.

    Parameters:
        organism: str — organism passed to ``get_anndata``
        tissue: str — optional equality filter on ``tissue``
        disease: str — optional equality filter on ``disease``
        cell_type: str — optional equality filter on ``cell_type``
        gene_list: list[str] — optional gene names matched against
            ``feature_name``
        max_cells: int — when more cells are returned, a random subset of
            this size (sampled without replacement) is kept

    Returns:
        AnnData — the fetched (and possibly subsampled) cells.
    """
    import cellxgene_census

    # Build the observation (cell) filter; conditions are AND-ed together.
    obs_filters = []
    if tissue:
        obs_filters.append(f"tissue == '{tissue}'")
    if disease:
        obs_filters.append(f"disease == '{disease}'")
    if cell_type:
        obs_filters.append(f"cell_type == '{cell_type}'")

    obs_filter = " and ".join(obs_filters) if obs_filters else None

    # Build the variable (gene) filter.
    var_filter = None
    if gene_list:
        genes_str = "', '".join(gene_list)
        var_filter = f"feature_name in ['{genes_str}']"

    # Open the Census as a context manager so the handle is closed even
    # when the query raises.
    with cellxgene_census.open_soma() as census:
        adata = cellxgene_census.get_anndata(
            census,
            organism=organism,
            obs_value_filter=obs_filter,
            var_value_filter=var_filter,
            obs_column_names=[
                "cell_type", "tissue", "disease",
                "donor_id", "dataset_id", "assay",
            ],
        )

    if adata.n_obs > max_cells:
        import numpy as np
        idx = np.random.choice(adata.n_obs, max_cells, replace=False)
        adata = adata[idx].copy()

    print(f"CELLxGENE Census: {adata.n_obs} cells × {adata.n_vars} genes")
    return adata
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## 4. 細胞型構成解析
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
import scanpy as sc
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def cell_type_composition(adata, groupby="tissue", cell_type_col="cell_type"):
    """
    Compute the percentage of each cell type within each group.

    Parameters:
        adata: AnnData — single-cell data; only ``adata.obs`` is read
        groupby: str — ``obs`` column that defines the groups (rows)
        cell_type_col: str — ``obs`` column holding the cell type (columns)

    Returns:
        pandas.DataFrame — groups × cell types, each row summing to 100.
    """
    # observed=True restricts categorical groupers to categories that are
    # actually present. Otherwise unused categories become all-zero rows
    # whose percentages are NaN (0 / 0).
    composition = (
        adata.obs.groupby([groupby, cell_type_col], observed=True)
        .size()
        .unstack(fill_value=0)
    )
    composition_pct = composition.div(composition.sum(axis=1), axis=0) * 100

    print(f"Cell type composition: {composition.shape[0]} groups × "
          f"{composition.shape[1]} cell types")
    return composition_pct
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## 5. HCA 統合パイプライン
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
def hca_atlas_pipeline(organ, disease=None, output_dir="results"):
    """
    Run the combined HCA + CELLxGENE atlas pipeline for one organ.

    Steps: search HCA projects, query the CELLxGENE Census, run the scanpy
    preprocessing chain (normalize, log1p, HVG selection, PCA, neighbors,
    UMAP), compute the cell type composition, and write all outputs.

    Parameters:
        organ: str — target organ; also used as the Census tissue filter
        disease: str — optional target disease
        output_dir: str — output directory (created if missing)

    Returns:
        dict — ``{"projects": DataFrame, "adata": AnnData,
        "composition": DataFrame}``.
    """
    from pathlib import Path

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # 1) HCA project search
    project_table = hca_search_projects(organ=organ, disease=disease)
    project_table.to_csv(out / "hca_projects.csv", index=False)

    # 2) CELLxGENE Census query
    atlas = cellxgene_census_query(tissue=organ, disease=disease)

    # 3) Preprocessing: normalize, log-transform, keep the 2000 most
    #    variable genes, then embed.
    sc.pp.normalize_total(atlas, target_sum=1e4)
    sc.pp.log1p(atlas)
    sc.pp.highly_variable_genes(atlas, n_top_genes=2000)
    hvg_mask = atlas.var["highly_variable"]
    atlas = atlas[:, hvg_mask].copy()
    sc.pp.pca(atlas)
    sc.pp.neighbors(atlas)
    sc.tl.umap(atlas)

    # 4) Cell type composition
    pct_table = cell_type_composition(atlas)
    pct_table.to_csv(out / "cell_type_composition.csv")

    # 5) Save the processed atlas
    atlas.write(out / "hca_atlas.h5ad")

    print(f"HCA atlas pipeline: {out}")
    return {"projects": project_table, "adata": atlas, "composition": pct_table}
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## ToolUniverse 連携
|
|
270
|
+
|
|
271
|
+
| TU Key | ツール名 | 連携内容 |
|
|
272
|
+
|--------|---------|---------|
|
|
273
|
+
| `hca_tools` | Human Cell Atlas Tools | プロジェクト検索・ファイルダウンロード |
|
|
274
|
+
|
|
275
|
+
## パイプライン統合
|
|
276
|
+
|
|
277
|
+
```
|
|
278
|
+
single-cell-genomics → human-cell-atlas → scvi-integration
|
|
279
|
+
(scanpy 標準) (HCA/CELLxGENE) (scVI 統合)
|
|
280
|
+
│ │ ↓
|
|
281
|
+
spatial-transcriptomics ───┘ cell-type-annotation
|
|
282
|
+
(Visium/MERFISH) │ (リファレンスマッピング)
|
|
283
|
+
↓
|
|
284
|
+
gpu-singlecell
|
|
285
|
+
(大規模処理)
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## パイプライン出力
|
|
289
|
+
|
|
290
|
+
| ファイル | 説明 | 次スキル |
|
|
291
|
+
|---------|------|---------|
|
|
292
|
+
| `results/hca_projects.csv` | HCA プロジェクト一覧 | → single-cell-genomics |
|
|
293
|
+
| `results/hca_atlas.h5ad` | アトラス AnnData | → scvi-integration |
|
|
294
|
+
| `results/cell_type_composition.csv` | 細胞型構成比 | → spatial-transcriptomics |
|