@nahisaho/satori 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -18
- package/package.json +1 -1
- package/src/.github/skills/scientific-encode-screen/SKILL.md +315 -0
- package/src/.github/skills/scientific-environmental-geodata/SKILL.md +255 -0
- package/src/.github/skills/scientific-geo-expression/SKILL.md +274 -0
- package/src/.github/skills/scientific-human-cell-atlas/SKILL.md +294 -0
- package/src/.github/skills/scientific-metabolic-atlas/SKILL.md +263 -0
- package/src/.github/skills/scientific-paleobiology/SKILL.md +265 -0
- package/src/.github/skills/scientific-parasite-genomics/SKILL.md +280 -0
- package/src/.github/skills/scientific-squidpy-advanced/SKILL.md +251 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-environmental-geodata
|
|
3
|
+
description: |
|
|
4
|
+
環境地理空間データスキル。SoilGrids REST API による土壌特性
|
|
5
|
+
取得、WorldClim/CHELSA 気候データ、生物多様性-環境モデリング
|
|
6
|
+
統合。直接 REST API 連携 (TU 外)。
|
|
7
|
+
tu_tools: []
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Environmental Geodata
|
|
11
|
+
|
|
12
|
+
SoilGrids・WorldClim 等の地球観測/環境データ API を活用した
|
|
13
|
+
生態学的環境モデリングパイプラインを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- グローバル土壌特性 (pH, SOC, 粘土含量) を取得するとき
|
|
18
|
+
- バイオクリマティック変数 (BIO1-BIO19) を取得するとき
|
|
19
|
+
- 種分布モデル (SDM) の環境変数を準備するとき
|
|
20
|
+
- 気候変動シナリオの生息地適性を評価するとき
|
|
21
|
+
- 環境ニッチモデリングを実施するとき
|
|
22
|
+
- 土壌-植生-気候の相互作用を解析するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. SoilGrids 土壌特性取得
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import requests
|
|
32
|
+
import pandas as pd
|
|
33
|
+
import numpy as np
|
|
34
|
+
|
|
35
|
+
# Base URL of the ISRIC SoilGrids v2.0 REST API; endpoint paths are appended to it.
SOILGRIDS_BASE = "https://rest.isric.org/soilgrids/v2.0"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def soilgrids_get_properties(lat, lon, properties=None,
                             depths=None, values=None):
    """
    SoilGrids — fetch soil properties for a single point.

    Sends one GET request to ``{SOILGRIDS_BASE}/properties/query`` and
    flattens the returned layers into one row per (property, depth) pair.

    Parameters:
        lat: float — latitude
        lon: float — longitude
        properties: list[str] — soil properties (e.g. ["phh2o", "soc", "clay"]);
            when None, eight default properties are requested
        depths: list[str] — depth intervals (e.g. ["0-5cm", "5-15cm"]);
            when None, the four intervals from 0 to 60 cm are requested
        values: list[str] — statistics (e.g. ["mean", "Q0.05", "Q0.95"])

    Returns:
        pd.DataFrame with the columns "property", "depth" and "unit"
        (the layer's ``mapped_units`` string) plus one column per value
        key present in the response.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    properties = (["phh2o", "soc", "clay", "sand", "nitrogen",
                   "bdod", "cec", "ocd"]
                  if properties is None else properties)
    depths = (["0-5cm", "5-15cm", "15-30cm", "30-60cm"]
              if depths is None else depths)
    values = ["mean", "Q0.05", "Q0.95"] if values is None else values

    response = requests.get(
        f"{SOILGRIDS_BASE}/properties/query",
        params={
            "lat": lat,
            "lon": lon,
            "property": properties,
            "depth": depths,
            "value": values,
        },
        timeout=30,
    )
    response.raise_for_status()
    layers = response.json().get("properties", {}).get("layers", [])

    records = []
    for layer in layers:
        name = layer.get("name", "")
        mapped_units = layer.get("unit_measure", {}).get("mapped_units", "")
        for depth_entry in layer.get("depths", []):
            # Fixed columns first, then every statistic reported for this depth.
            records.append({
                "property": name,
                "depth": depth_entry.get("label", ""),
                "unit": mapped_units,
                **depth_entry.get("values", {}),
            })

    table = pd.DataFrame(records)
    print(f"SoilGrids ({lat}, {lon}): {len(table)} records, "
          f"{len(properties)} properties")
    return table
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## 2. WorldClim バイオクリマティック変数
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
import rasterio
|
|
95
|
+
from rasterio.sample import sample_gen
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def worldclim_get_bioclim(lat, lon, resolution="2.5m",
                          data_dir="worldclim"):
    """
    WorldClim — sample the 19 bioclimatic variables at a single point.

    Reads local GeoTIFF files named
    ``<data_dir>/wc2.1_<resolution>_bio/wc2.1_<resolution>_bio_<N>.tif``
    for N = 1..19. Files that do not exist are skipped, so the result may
    contain fewer than 19 rows (or none at all).

    Parameters:
        lat: float — latitude
        lon: float — longitude
        resolution: str — spatial resolution ("30s", "2.5m", "5m", "10m")
        data_dir: str — directory holding the WorldClim data

    Returns:
        pd.DataFrame with the columns "variable" ("BIO<N>"), "name" and
        "value". "value" is None when the sampled cell is masked
        (nodata), instead of the raster's raw nodata sentinel.
    """
    from pathlib import Path

    import numpy as np

    bio_dir = Path(data_dir) / f"wc2.1_{resolution}_bio"

    bioclim_names = {
        1: "Annual Mean Temperature",
        2: "Mean Diurnal Range",
        3: "Isothermality",
        4: "Temperature Seasonality",
        5: "Max Temperature Warmest Month",
        6: "Min Temperature Coldest Month",
        7: "Temperature Annual Range",
        8: "Mean Temperature Wettest Quarter",
        9: "Mean Temperature Driest Quarter",
        10: "Mean Temperature Warmest Quarter",
        11: "Mean Temperature Coldest Quarter",
        12: "Annual Precipitation",
        13: "Precipitation Wettest Month",
        14: "Precipitation Driest Month",
        15: "Precipitation Seasonality",
        16: "Precipitation Wettest Quarter",
        17: "Precipitation Driest Quarter",
        18: "Precipitation Warmest Quarter",
        19: "Precipitation Coldest Quarter",
    }

    results = []
    for bio_num, bio_name in bioclim_names.items():
        tif_path = bio_dir / f"wc2.1_{resolution}_bio_{bio_num}.tif"
        if not tif_path.exists():
            continue
        with rasterio.open(tif_path) as src:
            # sample_gen is lazy, so it must be consumed while the dataset
            # is still open. masked=True lets rasterio flag nodata cells
            # rather than handing back the sentinel value as real data.
            samples = list(sample_gen(src, [(lon, lat)], masked=True))

        value = None
        if samples:
            band_values = samples[0]
            # getmaskarray always yields a full boolean array, even when
            # the masked array carries the scalar `nomask`.
            if not np.ma.getmaskarray(band_values)[0]:
                value = band_values[0]

        results.append({
            "variable": f"BIO{bio_num}",
            "name": bio_name,
            "value": value,
        })

    df = pd.DataFrame(results)
    print(f"WorldClim ({lat}, {lon}): {len(df)} bioclim variables")
    return df
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## 3. 種分布モデル環境変数統合
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
def sdm_environmental_stack(occurrences_df, lat_col="latitude",
                            lon_col="longitude", buffer_deg=0.5):
    """
    SDM — build an environmental-variable stack for occurrence records.

    For every row, queries SoilGrids (pH, SOC, clay) and keeps the "mean"
    of the "0-5cm" depth as ``soil_<property>``, then adds whatever
    WorldClim variables are available locally as ``BIO<N>`` columns.

    Parameters:
        occurrences_df: pd.DataFrame — species occurrence records
        lat_col: str — latitude column name
        lon_col: str — longitude column name
        buffer_deg: float — buffer distance in degrees. Currently unused;
            kept so existing callers keep working.

    Returns:
        pd.DataFrame with one row per occurrence: the two coordinate
        columns followed by the soil and bioclim columns.

    Notes:
        The WorldClim step is best-effort: a failure is reported on stdout
        and the row is kept with soil variables only. SoilGrids errors are
        not caught and propagate to the caller.
    """
    results = []
    for _, row in occurrences_df.iterrows():
        lat, lon = row[lat_col], row[lon_col]

        # SoilGrids: topsoil (0-5 cm) mean of each requested property.
        soil = soilgrids_get_properties(lat, lon,
                                        properties=["phh2o", "soc", "clay"])
        soil_mean = {
            f"soil_{s['property']}": s.get("mean", None)
            for _, s in soil.iterrows()
            if s.get("depth") == "0-5cm"
        }

        # WorldClim (if available): optional, so failures must not abort
        # the run, but they are reported instead of being swallowed.
        bioclim = {}
        try:
            bio_df = worldclim_get_bioclim(lat, lon)
            bioclim = {r["variable"]: r["value"]
                       for _, r in bio_df.iterrows()}
        except Exception as exc:
            print(f"WorldClim lookup failed at ({lat}, {lon}): {exc}")

        results.append({
            lat_col: lat,
            lon_col: lon,
            **soil_mean,
            **bioclim,
        })

    df = pd.DataFrame(results)
    print(f"SDM env stack: {len(df)} points, {len(df.columns)} variables")
    return df
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 4. 環境地理空間統合パイプライン
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
def environmental_geodata_pipeline(occurrences_csv, output_dir="results"):
    """
    Integrated environmental geodata pipeline.

    Reads occurrence records, builds the environmental stack with
    ``sdm_environmental_stack`` and writes two CSV files into
    ``output_dir``: ``env_stack.csv`` (the stack) and ``env_summary.csv``
    (the transposed ``describe()`` of the stack).

    Parameters:
        occurrences_csv: str — path to the species occurrence CSV
        output_dir: str — output directory (created if missing)

    Returns:
        dict with the keys "occurrences", "env_stack" and "summary"
        (all pd.DataFrame).
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    occ = pd.read_csv(occurrences_csv)
    print(f"Occurrences: {len(occ)} records")

    # Environmental variable stack
    env_df = sdm_environmental_stack(occ)
    env_df.to_csv(output_dir / "env_stack.csv", index=False)

    # Environmental space summary. describe() raises ValueError on a frame
    # without columns (which is what zero occurrence rows produce), so an
    # empty summary is written in that case.
    if len(env_df.columns):
        summary = env_df.describe().T
    else:
        summary = pd.DataFrame()
    summary.to_csv(output_dir / "env_summary.csv")

    print(f"Environmental pipeline: {output_dir}")
    return {"occurrences": occ, "env_stack": env_df, "summary": summary}
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## ToolUniverse 連携
|
|
234
|
+
|
|
235
|
+
直接 REST API 使用 (SoilGrids, WorldClim は ToolUniverse 外)。
|
|
236
|
+
|
|
237
|
+
## パイプライン統合
|
|
238
|
+
|
|
239
|
+
```
|
|
240
|
+
environmental-ecology → environmental-geodata → marine-ecology
|
|
241
|
+
(GBIF/iNaturalist) (SoilGrids/WorldClim) (OBIS/WoRMS)
|
|
242
|
+
│ │ ↓
|
|
243
|
+
phylogenetics ───────────────┘ biodiversity-indices
|
|
244
|
+
(系統情報) │ (多様性指標)
|
|
245
|
+
↓
|
|
246
|
+
species-distribution-model
|
|
247
|
+
(SDM 統合)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## パイプライン出力
|
|
251
|
+
|
|
252
|
+
| ファイル | 説明 | 次スキル |
|
|
253
|
+
|---------|------|---------|
|
|
254
|
+
| `results/env_stack.csv` | 環境変数スタック | → species-distribution-model |
|
|
255
|
+
| `results/env_summary.csv` | 環境空間要約 | → environmental-ecology |
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-geo-expression
|
|
3
|
+
description: |
|
|
4
|
+
GEO (Gene Expression Omnibus) 発現プロファイルスキル。GEO REST
|
|
5
|
+
API データセット検索・サンプル情報・発現マトリクス取得・バルク
|
|
6
|
+
RNA-seq/マイクロアレイ差次的発現解析。ToolUniverse 連携: geo。
|
|
7
|
+
tu_tools:
|
|
8
|
+
- key: geo
|
|
9
|
+
name: GEO (Gene Expression Omnibus)
|
|
10
|
+
description: GEO データセット・サンプル情報・発現データ検索
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Scientific GEO Expression
|
|
14
|
+
|
|
15
|
+
GEO REST API を活用したトランスクリプトーム発現プロファイル
|
|
16
|
+
解析パイプラインを提供する。
|
|
17
|
+
|
|
18
|
+
## When to Use
|
|
19
|
+
|
|
20
|
+
- GEO データセット (GDS/GSE) を検索・ダウンロードするとき
|
|
21
|
+
- マイクロアレイ/RNA-seq 発現マトリクスを取得するとき
|
|
22
|
+
- 条件間差次的発現解析 (DEG) を実行するとき
|
|
23
|
+
- 複数 GEO データセットを横断比較するとき
|
|
24
|
+
- GEO メタデータから実験条件を構造化するとき
|
|
25
|
+
- 再解析パイプラインで GEO データを再利用するとき
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
## 1. GEO データセット検索
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import requests
|
|
35
|
+
import pandas as pd
|
|
36
|
+
import GEOparse
|
|
37
|
+
from io import StringIO
|
|
38
|
+
|
|
39
|
+
# Base URL of the NCBI E-utilities; the esearch/esummary endpoints are appended to it.
GEO_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def geo_search_datasets(query, organism="Homo sapiens",
                        study_type=None, limit=20):
    """
    GEO — dataset search through the NCBI E-utilities.

    Runs ESearch against the ``gds`` database, then ESummary for the
    returned IDs, and tabulates the summaries.

    Parameters:
        query: str — search query (e.g. "breast cancer RNA-seq")
        organism: str — organism filter; skipped when empty or None
        study_type: str — dataset type ("Expression profiling by array"
            etc.), matched against the GEO ``[DataSet Type]`` field
        limit: int — maximum number of results

    Returns:
        pd.DataFrame with the columns accession, title, summary (first
        200 characters), organism, platform, sample_count, series_type
        and pub_date. An empty DataFrame when nothing matches.

    Raises:
        requests.HTTPError: if either request returns an error status.
    """
    # Build the Entrez term from the parts that are actually present, so a
    # missing organism does not leave a dangling "AND [Organism]" clause.
    terms = [query]
    if organism:
        terms.append(f"{organism}[Organism]")
    if study_type:
        # GEO DataSets indexes this under "DataSet Type"; "Study Type" is
        # not one of its search fields.
        terms.append(f'"{study_type}"[DataSet Type]')
    search_term = " AND ".join(terms)

    # ESearch
    url = f"{GEO_BASE}/esearch.fcgi"
    params = {
        "db": "gds",
        "term": search_term,
        "retmax": limit,
        "retmode": "json",
    }
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    ids = resp.json().get("esearchresult", {}).get("idlist", [])

    if not ids:
        print("No GEO datasets found")
        return pd.DataFrame()

    # ESummary
    url = f"{GEO_BASE}/esummary.fcgi"
    params = {"db": "gds", "id": ",".join(ids), "retmode": "json"}
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    summaries = resp.json().get("result", {})

    results = []
    for gds_id in ids:
        info = summaries.get(gds_id, {})
        results.append({
            "accession": info.get("accession", ""),
            "title": info.get("title", ""),
            "summary": info.get("summary", "")[:200],
            "organism": info.get("taxon", ""),
            "platform": info.get("gpl", ""),
            "sample_count": info.get("n_samples", 0),
            "series_type": info.get("gdstype", ""),
            "pub_date": info.get("pdat", ""),
        })

    df = pd.DataFrame(results)
    print(f"GEO search: {len(df)} datasets")
    return df
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 2. GEO 発現マトリクス取得
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def geo_get_expression_matrix(gse_id, log2_transform=True):
    """
    GEO — fetch the expression matrix of a GSE series via GEOparse.

    Parameters:
        gse_id: str — GSE accession (e.g. "GSE12345")
        log2_transform: bool — whether to apply ``log2(x + 1)``

    Returns:
        (sample_df, expr): ``sample_df`` has one row per GSM with the
        columns sample_id, title, source, characteristics and platform;
        ``expr`` is the "VALUE" column pivoted across the samples, or an
        empty DataFrame when the series has no expression data.

    Notes:
        The transform is applied unconditionally when requested: data that
        is already log-scaled gets transformed again, and values <= -1
        become NaN or -inf.
    """
    import tempfile

    import numpy as np

    # Download into the platform temp directory instead of a hard-coded
    # "/tmp", which is not the temp location on every OS.
    gse = GEOparse.get_GEO(geo=gse_id, destdir=tempfile.gettempdir(),
                           silent=True)

    def _first(meta, key):
        # Metadata values are read as lists here; fall back to "" when the
        # key is missing or its list is empty.
        entries = meta.get(key) or [""]
        return entries[0]

    # Sample metadata
    samples = []
    for gsm_name, gsm in gse.gsms.items():
        meta = gsm.metadata
        samples.append({
            "sample_id": gsm_name,
            "title": _first(meta, "title"),
            "source": _first(meta, "source_name_ch1"),
            "characteristics": "; ".join(
                meta.get("characteristics_ch1", [])),
            "platform": _first(meta, "platform_id"),
        })
    sample_df = pd.DataFrame(samples)

    # Expression matrix
    pivoted = gse.pivot_samples("VALUE")
    if pivoted.empty:
        print(f"No expression data in {gse_id}")
        return sample_df, pd.DataFrame()

    if log2_transform:
        pivoted = np.log2(pivoted.astype(float) + 1)

    print(f"GEO {gse_id}: {pivoted.shape[0]} probes × "
          f"{pivoted.shape[1]} samples")
    return sample_df, pivoted
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## 3. 差次的発現解析
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from scipy import stats
|
|
146
|
+
import numpy as np
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def geo_differential_expression(expr_matrix, group_a_samples,
                                group_b_samples, method="ttest",
                                fdr_threshold=0.05, lfc_threshold=1.0):
    """
    GEO — differential expression between two sample groups.

    Tests each row of ``expr_matrix`` separately and adjusts the p-values
    with Benjamini-Hochberg. Rows with fewer than 2 non-missing values in
    either group are skipped.

    Parameters:
        expr_matrix: pd.DataFrame — expression matrix (genes × samples)
        group_a_samples: list[str] — group A sample IDs
        group_b_samples: list[str] — group B sample IDs
        method: str — "ttest" selects ``scipy.stats.ttest_ind``; any other
            value (e.g. "wilcoxon") selects the two-sided
            ``scipy.stats.mannwhitneyu``
        fdr_threshold: float — FDR threshold
        lfc_threshold: float — threshold on |log2fc|

    Returns:
        pd.DataFrame sorted by p_value with the columns gene, log2fc
        (mean of B minus mean of A), mean_a, mean_b, statistic, p_value,
        fdr and is_deg. When no row is testable, an empty DataFrame with
        those same columns.

    Raises:
        KeyError: if a sample ID is not a column of ``expr_matrix``.
    """
    columns = ["gene", "log2fc", "mean_a", "mean_b",
               "statistic", "p_value", "fdr", "is_deg"]

    a_data = expr_matrix[group_a_samples]
    b_data = expr_matrix[group_b_samples]

    results = []
    for gene in expr_matrix.index:
        a_vals = a_data.loc[gene].dropna().values
        b_vals = b_data.loc[gene].dropna().values

        if len(a_vals) < 2 or len(b_vals) < 2:
            continue

        mean_a = a_vals.mean()
        mean_b = b_vals.mean()

        if method == "ttest":
            stat, pval = stats.ttest_ind(a_vals, b_vals)
        else:
            stat, pval = stats.mannwhitneyu(a_vals, b_vals,
                                            alternative="two-sided")

        results.append({
            "gene": gene,
            "log2fc": mean_b - mean_a,
            "mean_a": mean_a,
            "mean_b": mean_b,
            "statistic": stat,
            "p_value": pval,
        })

    if not results:
        # Without this guard the "p_value" lookup below raises KeyError on
        # the column-less frame.
        print(f"DEG: 0 genes (↑0 / ↓0), "
              f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(results)

    # FDR correction (Benjamini-Hochberg). Only finite p-values are passed
    # in: a NaN p-value (e.g. from a row that is constant in both groups)
    # would otherwise contaminate the adjusted values of the other genes.
    from statsmodels.stats.multitest import multipletests
    df["fdr"] = float("nan")
    testable = df["p_value"].notna()
    if testable.any():
        df.loc[testable, "fdr"] = multipletests(
            df.loc[testable, "p_value"], method="fdr_bh")[1]

    # DEG filter; comparisons against a NaN fdr evaluate to False.
    df["is_deg"] = (df["fdr"] < fdr_threshold) & (df["log2fc"].abs() > lfc_threshold)
    n_deg = df["is_deg"].sum()
    n_up = ((df["is_deg"]) & (df["log2fc"] > 0)).sum()
    n_down = ((df["is_deg"]) & (df["log2fc"] < 0)).sum()

    print(f"DEG: {n_deg} genes (↑{n_up} / ↓{n_down}), "
          f"FDR<{fdr_threshold}, |LFC|>{lfc_threshold}")
    return df.sort_values("p_value")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## 4. GEO 発現プロファイリングパイプライン
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
def geo_expression_pipeline(gse_id, group_col="condition",
                            group_a="control", group_b="treatment",
                            output_dir="results"):
    """
    Integrated GEO expression-profiling pipeline.

    Downloads the series, splits the samples into two groups by a
    case-insensitive pattern match on a metadata column, runs the
    differential expression analysis and writes ``samples.csv`` and
    ``deg_results.csv`` into ``output_dir``.

    Parameters:
        gse_id: str — GSE accession
        group_col: str — sample metadata column used for grouping; when
            the sample table has no such column, "source" is used
        group_a: str — pattern identifying the control group
        group_b: str — pattern identifying the treatment group
        output_dir: str — output directory (created if missing)

    Returns:
        dict with the keys "samples", "expression" and "deg" (all
        pd.DataFrame). "deg" is empty when the series has no expression
        data or a group has fewer than 2 samples.
    """
    from pathlib import Path

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1) Fetch data
    sample_df, expr = geo_get_expression_matrix(gse_id)
    sample_df.to_csv(output_dir / "samples.csv", index=False)

    # 2) Split into groups. Only samples that are columns of the
    # expression matrix are kept, so the later column lookup cannot fail.
    a_samples, b_samples = [], []
    if not sample_df.empty:
        label_col = group_col if group_col in sample_df.columns else "source"
        labels = sample_df[label_col]
        available = set(expr.columns)

        def _matching(pattern):
            hits = sample_df.loc[
                labels.str.contains(pattern, case=False, na=False),
                "sample_id",
            ]
            return [sid for sid in hits if sid in available]

        a_samples = _matching(group_a)
        b_samples = _matching(group_b)

    # 3) Differential expression. The analysis needs at least 2 values
    # per group, so smaller groups cannot produce a result.
    if expr.empty or len(a_samples) < 2 or len(b_samples) < 2:
        print(f"GEO pipeline: skipping DEG for {gse_id} "
              f"(group A: {len(a_samples)} samples, "
              f"group B: {len(b_samples)} samples)")
        deg = pd.DataFrame()
    else:
        deg = geo_differential_expression(expr, a_samples, b_samples)
    deg.to_csv(output_dir / "deg_results.csv", index=False)

    print(f"GEO pipeline: {output_dir}")
    return {"samples": sample_df, "expression": expr, "deg": deg}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## ToolUniverse 連携
|
|
251
|
+
|
|
252
|
+
| TU Key | ツール名 | 連携内容 |
|
|
253
|
+
|--------|---------|---------|
|
|
254
|
+
| `geo` | GEO | データセット検索・サンプル情報・発現データ |
|
|
255
|
+
|
|
256
|
+
## パイプライン統合
|
|
257
|
+
|
|
258
|
+
```
|
|
259
|
+
ebi-databases → geo-expression → gene-expression-transcriptomics
|
|
260
|
+
(ENA/EBI Search) (GEO データ) (DESeq2/GTEx)
|
|
261
|
+
│ │ ↓
|
|
262
|
+
literature-search ────┘ pathway-enrichment
|
|
263
|
+
(PubMed/OpenAlex) │ (KEGG/Reactome/GO)
|
|
264
|
+
↓
|
|
265
|
+
multi-omics
|
|
266
|
+
(統合解析)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## パイプライン出力
|
|
270
|
+
|
|
271
|
+
| ファイル | 説明 | 次スキル |
|
|
272
|
+
|---------|------|---------|
|
|
273
|
+
| `results/samples.csv` | サンプルメタデータ | → gene-expression-transcriptomics |
|
|
274
|
+
| `results/deg_results.csv` | 差次的発現結果 | → pathway-enrichment |
|