@nahisaho/satori 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -41
- package/package.json +1 -1
- package/src/.github/skills/scientific-alphafold-structures/SKILL.md +256 -0
- package/src/.github/skills/scientific-arrayexpress-expression/SKILL.md +264 -0
- package/src/.github/skills/scientific-crossref-metadata/SKILL.md +313 -0
- package/src/.github/skills/scientific-encode-screen/SKILL.md +315 -0
- package/src/.github/skills/scientific-environmental-geodata/SKILL.md +255 -0
- package/src/.github/skills/scientific-geo-expression/SKILL.md +274 -0
- package/src/.github/skills/scientific-gtex-tissue-expression/SKILL.md +271 -0
- package/src/.github/skills/scientific-gwas-catalog/SKILL.md +267 -0
- package/src/.github/skills/scientific-human-cell-atlas/SKILL.md +294 -0
- package/src/.github/skills/scientific-icgc-cancer-data/SKILL.md +351 -0
- package/src/.github/skills/scientific-metabolic-atlas/SKILL.md +263 -0
- package/src/.github/skills/scientific-paleobiology/SKILL.md +265 -0
- package/src/.github/skills/scientific-parasite-genomics/SKILL.md +280 -0
- package/src/.github/skills/scientific-pharmgkb-pgx/SKILL.md +306 -0
- package/src/.github/skills/scientific-semantic-scholar/SKILL.md +298 -0
- package/src/.github/skills/scientific-squidpy-advanced/SKILL.md +251 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-geo-expression
|
|
3
|
+
description: |
|
|
4
|
+
GEO (Gene Expression Omnibus) 発現プロファイルスキル。GEO REST
|
|
5
|
+
API データセット検索・サンプル情報・発現マトリクス取得・バルク
|
|
6
|
+
RNA-seq/マイクロアレイ差次的発現解析。ToolUniverse 連携: geo。
|
|
7
|
+
tu_tools:
|
|
8
|
+
- key: geo
|
|
9
|
+
name: GEO (Gene Expression Omnibus)
|
|
10
|
+
description: GEO データセット・サンプル情報・発現データ検索
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Scientific GEO Expression
|
|
14
|
+
|
|
15
|
+
NCBI E-utilities と GEOparse を活用した GEO トランスクリプトーム発現プロファイル
|
|
16
|
+
解析パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- GEO データセット (GDS/GSE) を検索・ダウンロードするとき
|
|
21
|
+
- マイクロアレイ/RNA-seq 発現マトリクスを取得するとき
|
|
22
|
+
- 条件間差次的発現解析 (DEG) を実行するとき
|
|
23
|
+
- 複数 GEO データセットを横断比較するとき
|
|
24
|
+
- GEO メタデータから実験条件を構造化するとき
|
|
25
|
+
- 再解析パイプラインで GEO データを再利用するとき
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
## 1. GEO データセット検索
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import requests
|
|
35
|
+
import pandas as pd
|
|
36
|
+
import GEOparse
|
|
37
|
+
from io import StringIO
|
|
38
|
+
|
|
39
|
+
# Base URL of the NCBI Entrez E-utilities; geo_search_datasets appends
# "/esearch.fcgi" and "/esummary.fcgi" to it.
GEO_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def geo_search_datasets(query, organism="Homo sapiens",
                        study_type=None, limit=20):
    """
    Search GEO DataSets through NCBI E-utilities and tabulate the hits.

    Runs ESearch against the "gds" database, then a single ESummary call for
    the returned UIDs, and keeps a fixed set of summary fields per record.

    Parameters:
        query: str — free-text search query (e.g. "breast cancer RNA-seq")
        organism: str — organism name, appended as an [Organism] filter
        study_type: str — optional [Study Type] filter
            (e.g. "Expression profiling by array")
        limit: int — maximum number of UIDs requested (ESearch retmax)

    Returns:
        pd.DataFrame — one row per UID with the columns accession, title,
        summary (first 200 characters), organism, platform, sample_count,
        series_type and pub_date. An empty DataFrame when nothing matches.

    Raises:
        requests.HTTPError — when either request returns an error status.
    """
    term = f"{query} AND {organism}[Organism]"
    if study_type:
        term = f'{term} AND "{study_type}"[Study Type]'

    # Step 1 (ESearch): resolve the query to a list of UIDs.
    search_resp = requests.get(
        f"{GEO_BASE}/esearch.fcgi",
        params={
            "db": "gds",
            "term": term,
            "retmax": limit,
            "retmode": "json",
        },
        timeout=30,
    )
    search_resp.raise_for_status()
    uid_list = search_resp.json().get("esearchresult", {}).get("idlist", [])

    if not uid_list:
        print("No GEO datasets found")
        return pd.DataFrame()

    # Step 2 (ESummary): fetch the summary documents of all UIDs in one call.
    summary_resp = requests.get(
        f"{GEO_BASE}/esummary.fcgi",
        params={"db": "gds", "id": ",".join(uid_list), "retmode": "json"},
        timeout=30,
    )
    summary_resp.raise_for_status()
    by_uid = summary_resp.json().get("result", {})

    def _row(uid):
        # A UID missing from the ESummary payload yields a row of defaults.
        doc = by_uid.get(uid, {})
        return {
            "accession": doc.get("accession", ""),
            "title": doc.get("title", ""),
            "summary": doc.get("summary", "")[:200],
            "organism": doc.get("taxon", ""),
            "platform": doc.get("gpl", ""),
            "sample_count": doc.get("n_samples", 0),
            "series_type": doc.get("gdstype", ""),
            "pub_date": doc.get("pdat", ""),
        }

    table = pd.DataFrame([_row(uid) for uid in uid_list])
    print(f"GEO search: {len(table)} datasets")
    return table
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 2. GEO 発現マトリクス取得
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def geo_get_expression_matrix(gse_id, log2_transform=True):
    """
    Download a GEO series with GEOparse and return its sample metadata and
    the pivoted "VALUE" matrix.

    Parameters:
        gse_id: str — GSE accession (e.g. "GSE12345")
        log2_transform: bool — when True, apply log2(x + 1) to the matrix

    Returns:
        tuple (sample_df, matrix):
            sample_df — pd.DataFrame with one row per GSM and the columns
                sample_id, title, source, characteristics, platform.
            matrix — the result of gse.pivot_samples("VALUE"), optionally
                log2-transformed; an empty DataFrame when the pivot is empty.
    """
    import tempfile

    import numpy as np

    def _first(meta, key):
        # Metadata values are indexed as lists; fall back to "" when the key
        # is missing or its list is empty instead of raising IndexError.
        values = meta.get(key) or [""]
        return values[0]

    # Download into the platform temp directory rather than a hard-coded
    # "/tmp", which does not exist on every operating system.
    gse = GEOparse.get_GEO(geo=gse_id, destdir=tempfile.gettempdir(),
                           silent=True)

    # Sample metadata
    samples = [
        {
            "sample_id": gsm_name,
            "title": _first(gsm.metadata, "title"),
            "source": _first(gsm.metadata, "source_name_ch1"),
            "characteristics": "; ".join(
                gsm.metadata.get("characteristics_ch1", [])),
            "platform": _first(gsm.metadata, "platform_id"),
        }
        for gsm_name, gsm in gse.gsms.items()
    ]
    sample_df = pd.DataFrame(samples)

    # Expression matrix
    pivoted = gse.pivot_samples("VALUE")
    if pivoted.empty:
        print(f"No expression data in {gse_id}")
        return sample_df, pd.DataFrame()

    if log2_transform:
        # NOTE(review): assumes VALUE holds raw, non-negative intensities or
        # counts. Series that are already log-scaled, or that contain values
        # <= -1, should be fetched with log2_transform=False — confirm per
        # dataset.
        pivoted = np.log2(pivoted.astype(float) + 1)

    print(f"GEO {gse_id}: {pivoted.shape[0]} probes × "
          f"{pivoted.shape[1]} samples")
    return sample_df, pivoted
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## 3. 差次的発現解析
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from scipy import stats
|
|
146
|
+
import numpy as np
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def geo_differential_expression(expr_matrix, group_a_samples,
                                group_b_samples, method="ttest",
                                fdr_threshold=0.05, lfc_threshold=1.0):
    """
    Per-gene two-group differential expression test with BH correction.

    Parameters:
        expr_matrix: pd.DataFrame — expression matrix (genes × samples)
        group_a_samples: list[str] — column names of group A
        group_b_samples: list[str] — column names of group B
        method: str — "ttest" selects scipy.stats.ttest_ind; any other value
            (documented as "wilcoxon") selects a two-sided
            scipy.stats.mannwhitneyu
        fdr_threshold: float — FDR cut-off for calling a DEG
        lfc_threshold: float — absolute log2FC cut-off for calling a DEG

    Returns:
        pd.DataFrame sorted by p_value with the columns gene, log2fc,
        mean_a, mean_b, statistic, p_value, fdr and is_deg. Genes with fewer
        than two non-missing values in either group are left out. When no
        gene can be tested, an empty DataFrame with those columns is
        returned.

    Notes:
        log2fc is computed as mean(B) - mean(A).
        NOTE(review): that is a log2 fold change only if the input is
        already on a log2 scale — confirm with the caller.
    """
    columns = ["gene", "log2fc", "mean_a", "mean_b", "statistic", "p_value"]

    a_data = expr_matrix[group_a_samples]
    b_data = expr_matrix[group_b_samples]

    results = []
    for gene in expr_matrix.index:
        a_vals = a_data.loc[gene].dropna().values
        b_vals = b_data.loc[gene].dropna().values

        # A two-sample test needs at least two observations per group.
        if len(a_vals) < 2 or len(b_vals) < 2:
            continue

        mean_a = a_vals.mean()
        mean_b = b_vals.mean()

        if method == "ttest":
            stat, pval = stats.ttest_ind(a_vals, b_vals)
        else:
            stat, pval = stats.mannwhitneyu(a_vals, b_vals,
                                            alternative="two-sided")

        results.append({
            "gene": gene,
            "log2fc": mean_b - mean_a,
            "mean_a": mean_a,
            "mean_b": mean_b,
            "statistic": stat,
            "p_value": pval,
        })

    if not results:
        # Previously an empty result produced a column-less DataFrame and the
        # FDR step failed with KeyError: 'p_value'.
        print(f"DEG: 0 genes (↑0 / ↓0), "
              f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
        return pd.DataFrame(columns=columns + ["fdr", "is_deg"])

    df = pd.DataFrame(results, columns=columns)

    # FDR correction (Benjamini-Hochberg) on the defined p-values only.
    # Constant genes give NaN p-values; those rows keep fdr = NaN and are
    # excluded from the correction so they cannot affect the other genes.
    df["fdr"] = np.nan
    tested = df["p_value"].notna()
    if tested.any():
        from statsmodels.stats.multitest import multipletests
        df.loc[tested, "fdr"] = multipletests(
            df.loc[tested, "p_value"], method="fdr_bh")[1]

    # DEG filter (comparisons against NaN are False, so untested rows are
    # never flagged).
    df["is_deg"] = (df["fdr"] < fdr_threshold) & (df["log2fc"].abs() > lfc_threshold)
    n_deg = df["is_deg"].sum()
    n_up = ((df["is_deg"]) & (df["log2fc"] > 0)).sum()
    n_down = ((df["is_deg"]) & (df["log2fc"] < 0)).sum()

    print(f"DEG: {n_deg} genes (↑{n_up} / ↓{n_down}), "
          f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
    return df.sort_values("p_value")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## 4. GEO 発現プロファイリングパイプライン
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
def geo_expression_pipeline(gse_id, group_col="condition",
                            group_a="control", group_b="treatment",
                            output_dir="results"):
    """
    End-to-end GEO expression profiling: fetch, group samples, test, save.

    Parameters:
        gse_id: str — GSE accession
        group_col: str — sample-metadata column searched for the group
            patterns; when that column does not exist (the default
            "condition" is not produced by geo_get_expression_matrix) the
            "source" column is used, which is what earlier versions always
            did
        group_a: str — pattern identifying control samples
            (case-insensitive, interpreted by pandas str.contains)
        group_b: str — pattern identifying treated samples
        output_dir: str — directory receiving samples.csv and
            deg_results.csv (created if missing)

    Returns:
        dict with keys "samples", "expression" and "deg".

    Raises:
        ValueError — when the series has no samples or expression values,
            or when either group matches no sample with expression data.
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Fetch data
    sample_df, expr = geo_get_expression_matrix(gse_id)
    sample_df.to_csv(output_dir / "samples.csv", index=False)

    if sample_df.empty or expr.empty:
        raise ValueError(
            f"{gse_id}: no samples or no expression values to analyse")

    # 2) Split samples into groups. group_col used to be ignored entirely;
    # honour it when present and keep the old "source" behaviour otherwise.
    label_col = group_col if group_col in sample_df.columns else "source"
    labels = sample_df[label_col]

    def _members(pattern):
        # na=False keeps missing labels out of both groups; samples without
        # a column in the matrix cannot be tested and are dropped.
        hit = labels.str.contains(pattern, case=False, na=False)
        return [sid for sid in sample_df.loc[hit, "sample_id"]
                if sid in expr.columns]

    a_samples = _members(group_a)
    b_samples = _members(group_b)
    if not a_samples or not b_samples:
        raise ValueError(
            f"{gse_id}: column '{label_col}' matched {len(a_samples)} "
            f"sample(s) for '{group_a}' and {len(b_samples)} for "
            f"'{group_b}'; both groups need at least one sample")

    # 3) Differential expression
    deg = geo_differential_expression(expr, a_samples, b_samples)
    deg.to_csv(output_dir / "deg_results.csv", index=False)

    print(f"GEO pipeline: {output_dir}")
    return {"samples": sample_df, "expression": expr, "deg": deg}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## ToolUniverse 連携
|
|
251
|
+
|
|
252
|
+
| TU Key | ツール名 | 連携内容 |
|
|
253
|
+
|--------|---------|---------|
|
|
254
|
+
| `geo` | GEO | データセット検索・サンプル情報・発現データ |
|
|
255
|
+
|
|
256
|
+
## パイプライン統合
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
ebi-databases → geo-expression → gene-expression-transcriptomics
|
|
260
|
+
(ENA/EBI Search) (GEO データ) (DESeq2/GTEx)
|
|
261
|
+
│ │ ↓
|
|
262
|
+
literature-search ────┘ pathway-enrichment
|
|
263
|
+
(PubMed/OpenAlex) │ (KEGG/Reactome/GO)
|
|
264
|
+
↓
|
|
265
|
+
multi-omics
|
|
266
|
+
(統合解析)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## パイプライン出力
|
|
270
|
+
|
|
271
|
+
| ファイル | 説明 | 次スキル |
|
|
272
|
+
|---------|------|---------|
|
|
273
|
+
| `results/samples.csv` | サンプルメタデータ | → gene-expression-transcriptomics |
|
|
274
|
+
| `results/deg_results.csv` | 差次的発現結果 | → pathway-enrichment |
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-gtex-tissue-expression
|
|
3
|
+
description: |
|
|
4
|
+
GTEx 組織発現スキル。GTEx Portal REST API v2 による
|
|
5
|
+
組織特異的遺伝子発現パターン解析・eQTL ルックアップ・
|
|
6
|
+
多組織比較。直接 API (ToolUniverse 非連携)。
|
|
7
|
+
tu_tools: []
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific GTEx Tissue Expression
|
|
11
|
+
|
|
12
|
+
GTEx (Genotype-Tissue Expression) Portal REST API v2 を活用した
|
|
13
|
+
組織特異的遺伝子発現解析・eQTL 検索・多組織比較パイプライン
|
|
14
|
+
を提供する。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 遺伝子の組織特異的発現パターンを調べるとき
|
|
19
|
+
- 特定組織における eQTL (発現量的形質遺伝子座) を検索するとき
|
|
20
|
+
- 複数組織間で遺伝子発現レベルを比較するとき
|
|
21
|
+
- TPM (Transcripts Per Million) 発現データを取得するとき
|
|
22
|
+
- バリアントが遺伝子発現に与える影響を評価するとき
|
|
23
|
+
- 組織間の遺伝子共発現パターンを分析するとき
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. 組織特異的遺伝子発現取得
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import requests
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
# Root of the GTEx Portal API v2; endpoint paths such as
# "/expression/medianGeneExpression" are appended to it.
GTEX_BASE = "https://gtexportal.org/api/v2"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def gtex_gene_expression(gene_id, tissue=None):
    """
    Fetch per-tissue median expression for one gene from GTEx (gtex_v8).

    Parameters:
        gene_id: str — value sent as the "gencodeId" query parameter
            (documented here as a gene symbol or Ensembl ID, e.g. "BRCA1",
            "ENSG00000012048")
            NOTE(review): confirm the endpoint accepts bare symbols and
            unversioned IDs rather than only versioned GENCODE IDs.
        tissue: str — tissue ID sent as "tissueSiteDetailId"
            (e.g. "Breast_Mammary_Tissue"); None requests all tissues

    Returns:
        pd.DataFrame with the columns gene_symbol, gencode_id, tissue,
        tissue_name, median_tpm and sample_count, sorted by median_tpm in
        descending order. Empty when the response has no "data" items.

    Raises:
        requests.HTTPError — when the request returns an error status.
    """
    query = {"gencodeId": gene_id, "datasetId": "gtex_v8"}
    if tissue:
        query["tissueSiteDetailId"] = tissue

    response = requests.get(
        f"{GTEX_BASE}/expression/medianGeneExpression",
        params=query,
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # (output column, response field, default when the field is absent)
    field_map = (
        ("gene_symbol", "geneSymbol", ""),
        ("gencode_id", "gencodeId", ""),
        ("tissue", "tissueSiteDetailId", ""),
        ("tissue_name", "tissueSiteDetail", ""),
        ("median_tpm", "median", 0),
        ("sample_count", "numSamples", 0),
    )
    rows = [
        {column: record.get(field, default)
         for column, field, default in field_map}
        for record in payload.get("data", [])
    ]

    table = pd.DataFrame(rows)
    if not table.empty:
        table = table.sort_values("median_tpm", ascending=False)

    print(f"GTEx expression: {gene_id} → "
          f"{len(table)} tissues")
    return table
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def gtex_top_tissues(gene_id, top_n=10):
    """
    Print and return the tissues with the highest expression of a gene.

    Parameters:
        gene_id: str — gene identifier passed on to gtex_gene_expression
        top_n: int — number of leading rows to keep

    Returns:
        pd.DataFrame — the first top_n rows of the table returned by
        gtex_gene_expression, or that table unchanged when it is empty.
    """
    expression = gtex_gene_expression(gene_id)
    if expression.empty:
        leading = expression
    else:
        leading = expression.head(top_n)

    print(f"GTEx top {top_n} tissues for {gene_id}:")
    for _, record in leading.iterrows():
        name = record['tissue_name']
        tpm = record['median_tpm']
        count = record['sample_count']
        print(f" {name}: "
              f"{tpm:.2f} TPM "
              f"(n={count})")
    return leading
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## 2. eQTL 検索
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
def gtex_eqtl_lookup(gene_id, tissue, variant_id=None):
    """
    Look up single-tissue eQTL associations for a gene in GTEx (gtex_v8).

    Parameters:
        gene_id: str — value sent as the "gencodeId" query parameter
        tissue: str — tissue ID sent as "tissueSiteDetailId"
            (e.g. "Liver", "Whole_Blood")
        variant_id: str — optional value sent as "variantId"
            (e.g. "rs12345")

    Returns:
        pd.DataFrame with the columns gene_symbol, variant_id, tissue,
        pvalue, nes (normalized effect size), maf, ref and alt, sorted by
        pvalue in ascending order. The tissue column echoes the argument.
        Empty when the response has no "data" items.

    Raises:
        requests.HTTPError — when the request returns an error status.
    """
    query = {
        "gencodeId": gene_id,
        "tissueSiteDetailId": tissue,
        "datasetId": "gtex_v8",
    }
    if variant_id:
        query["variantId"] = variant_id

    response = requests.get(
        f"{GTEX_BASE}/association/singleTissueEqtl",
        params=query,
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    def _association(record):
        # pvalue / nes / maf stay None when the field is absent.
        return {
            "gene_symbol": record.get("geneSymbol", ""),
            "variant_id": record.get("variantId", ""),
            "tissue": tissue,
            "pvalue": record.get("pValue"),
            "nes": record.get("nes"),
            "maf": record.get("maf"),
            "ref": record.get("ref", ""),
            "alt": record.get("alt", ""),
        }

    table = pd.DataFrame([_association(r) for r in payload.get("data", [])])
    if not table.empty:
        table = table.sort_values("pvalue")

    print(f"GTEx eQTL: {gene_id} in {tissue} → "
          f"{len(table)} associations")
    return table
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## 3. 多組織比較
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
def gtex_multi_gene_comparison(gene_ids, tissues=None):
    """
    Compare median expression of several genes across tissues.

    Parameters:
        gene_ids: list[str] — genes to fetch with gtex_gene_expression
        tissues: list[str] — tissue IDs to keep (None keeps every tissue)

    Returns:
        pd.DataFrame — a pivot table (rows = tissue_name, columns =
        gene_symbol, values = median_tpm) when any data was collected;
        otherwise an empty DataFrame. Genes whose fetch or filtering raises
        are reported with a warning and skipped.
    """
    frames = []
    for gene in gene_ids:
        try:
            per_gene = gtex_gene_expression(gene)
            if tissues:
                keep = per_gene["tissue"].isin(tissues)
                per_gene = per_gene[keep]
        except Exception as e:
            print(f" Warning: {gene} — {e}")
        else:
            frames.append(per_gene)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    if combined.empty:
        return combined

    # Pivot table: rows = tissues, columns = genes, values = TPM
    matrix = combined.pivot_table(
        index="tissue_name",
        columns="gene_symbol",
        values="median_tpm",
        aggfunc="first",
    )
    print(f"GTEx comparison: {len(gene_ids)} genes × "
          f"{len(matrix)} tissues")
    return matrix
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## 4. GTEx 統合パイプライン
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
def gtex_pipeline(gene_ids, tissues=None,
                  output_dir="results"):
    """
    Integrated GTEx pipeline: per-gene expression, comparison matrix, eQTLs.

    Parameters:
        gene_ids: list[str] — genes to process
        tissues: list[str] — tissue IDs for the comparison matrix
            (None keeps every tissue)
        output_dir: str — directory receiving expression_<gene>.csv,
            expression_matrix.csv and eqtl_results.csv (created if missing)

    Returns:
        dict with keys:
            "expression" — list of per-gene expression DataFrames
            "matrix" — result of gtex_multi_gene_comparison
            "eqtl" — combined eQTL DataFrame (empty when none were found)

    Genes or tissues whose request fails are reported with a warning and
    skipped; the pipeline itself does not abort.
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Tissue expression for every gene
    all_expr = []
    expr_by_gene = {}
    for gid in gene_ids:
        try:
            df = gtex_gene_expression(gid)
            df.to_csv(output_dir / f"expression_{gid}.csv",
                      index=False)
        except Exception as e:
            # Best effort: report the failure instead of hiding it.
            print(f" Warning: {gid} — {e}")
            continue
        all_expr.append(df)
        expr_by_gene[gid] = df

    # 2) Multi-tissue comparison matrix
    pivot = gtex_multi_gene_comparison(gene_ids, tissues)
    if isinstance(pivot, pd.DataFrame) and not pivot.empty:
        pivot.to_csv(output_dir / "expression_matrix.csv")

    # 3) eQTL lookup in each gene's own three leading tissues. The earlier
    # code read all_expr[-1], i.e. the last gene's table, for every gene.
    eqtl_results = []
    for gid in gene_ids:
        gene_expr = expr_by_gene.get(gid)
        if gene_expr is None or gene_expr.empty:
            continue
        for tissue_id in gene_expr.head(3)["tissue"]:
            try:
                eqtl_results.append(gtex_eqtl_lookup(gid, tissue_id))
            except Exception as e:
                print(f" Warning: {gid} / {tissue_id} — {e}")
                continue

    if eqtl_results:
        eqtl_combined = pd.concat(eqtl_results,
                                  ignore_index=True)
        eqtl_combined.to_csv(output_dir / "eqtl_results.csv",
                             index=False)
    else:
        eqtl_combined = pd.DataFrame()

    print(f"GTEx pipeline: {output_dir}")
    return {"expression": all_expr, "matrix": pivot, "eqtl": eqtl_combined}
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## ToolUniverse 連携
|
|
247
|
+
|
|
248
|
+
| TU Key | ツール名 | 連携内容 |
|
|
249
|
+
|--------|---------|---------|
|
|
250
|
+
| (direct) | GTEx Portal API v2 | 直接 REST API — TU 非連携 |
|
|
251
|
+
|
|
252
|
+
## パイプライン統合
|
|
253
|
+
|
|
254
|
+
```
|
|
255
|
+
gene-expression-transcriptomics → gtex-tissue-expression → variant-interpretation
|
|
256
|
+
(DESeq2/edgeR 差分発現) (組織別 TPM + eQTL) (臨床変異評価)
|
|
257
|
+
│ │ ↓
|
|
258
|
+
arrayexpress-expression ──────────┘ gwas-catalog
|
|
259
|
+
(ArrayExpress データ) │ (GWAS 関連解析)
|
|
260
|
+
↓
|
|
261
|
+
disease-research
|
|
262
|
+
(疾患関連遺伝子)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## パイプライン出力
|
|
266
|
+
|
|
267
|
+
| ファイル | 説明 | 次スキル |
|
|
268
|
+
|---------|------|---------|
|
|
269
|
+
| `results/expression_*.csv` | 遺伝子別組織発現 | → disease-research |
|
|
270
|
+
| `results/expression_matrix.csv` | 多遺伝子比較 | → pathway-enrichment |
|
|
271
|
+
| `results/eqtl_results.csv` | eQTL 関連 | → variant-interpretation |
|