@nahisaho/satori 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +134 -43
- package/package.json +1 -1
- package/src/.github/skills/scientific-advanced-imaging/SKILL.md +382 -0
- package/src/.github/skills/scientific-biomedical-pubtator/SKILL.md +331 -0
- package/src/.github/skills/scientific-cell-line-resources/SKILL.md +258 -0
- package/src/.github/skills/scientific-chembl-assay-mining/SKILL.md +509 -0
- package/src/.github/skills/scientific-deep-chemistry/SKILL.md +350 -0
- package/src/.github/skills/scientific-ebi-databases/SKILL.md +280 -0
- package/src/.github/skills/scientific-ensembl-genomics/SKILL.md +378 -0
- package/src/.github/skills/scientific-expression-comparison/SKILL.md +303 -0
- package/src/.github/skills/scientific-md-simulation/SKILL.md +315 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +329 -0
- package/src/.github/skills/scientific-ontology-enrichment/SKILL.md +340 -0
- package/src/.github/skills/scientific-perturbation-analysis/SKILL.md +297 -0
- package/src/.github/skills/scientific-phylogenetics/SKILL.md +297 -0
- package/src/.github/skills/scientific-preprint-archive/SKILL.md +476 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +322 -0
- package/src/.github/skills/scientific-regulatory-genomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-reinforcement-learning/SKILL.md +280 -0
- package/src/.github/skills/scientific-scvi-integration/SKILL.md +344 -0
- package/src/.github/skills/scientific-string-network-api/SKILL.md +376 -0
- package/src/.github/skills/scientific-symbolic-mathematics/SKILL.md +277 -0
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-deep-chemistry
|
|
3
|
+
description: |
|
|
4
|
+
深層学習分子特性予測スキル。DeepChem による GCN/MPNN/AttentiveFP
|
|
5
|
+
分子特性予測・MoleculeNet ベンチマーク・ChemBERTa/GROVER
|
|
6
|
+
事前学習モデル・分子フィンガープリントフィーチャライザ。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Deep Chemistry
|
|
10
|
+
|
|
11
|
+
DeepChem を活用した深層学習ベース分子特性予測パイプラインを提供する。
|
|
12
|
+
グラフニューラルネットワーク (GCN/MPNN/AttentiveFP)、MoleculeNet
|
|
13
|
+
ベンチマーク、事前学習モデル (ChemBERTa/GROVER)。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 分子の ADMET/物性を深層学習で予測するとき
|
|
18
|
+
- MoleculeNet ベンチマークデータセットを使うとき
|
|
19
|
+
- GCN / MPNN / AttentiveFP モデルを訓練するとき
|
|
20
|
+
- ChemBERTa で分子表現学習を行うとき
|
|
21
|
+
- 毒性予測 (Tox21, ToxCast) を行うとき
|
|
22
|
+
- 薬理活性予測の分子特徴量を生成するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. MoleculeNet データセット読込み
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import deepchem as dc
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_moleculenet(dataset_name="delaney", featurizer="GraphConv",
                     split="scaffold"):
    """
    Load a MoleculeNet benchmark dataset through DeepChem.

    Parameters:
        dataset_name: str — dataset key, one of
            ("delaney", "tox21", "bbbp", "hiv", "muv", "pcba",
             "sider", "clintox", "freesolv", "lipo")
        featurizer: str — featurizer forwarded unchanged to the loader
            ("GraphConv", "ECFP", "Weave", "MolGraphConv")
        split: str — splitter forwarded unchanged to the loader
            ("scaffold", "random", "stratified")

    Returns:
        tuple — (tasks, (train, valid, test), transformers) as produced by
        the selected ``dc.molnet.load_*`` function.

    Raises:
        ValueError — if ``dataset_name`` is not one of the keys above.

    K-Dense: deepchem
    """
    supported = ("delaney", "tox21", "bbbp", "hiv", "muv", "pcba",
                 "sider", "clintox", "freesolv", "lipo")
    # Every loader is resolved up front, so the table is complete before
    # the requested name is looked up.
    loaders = {key: getattr(dc.molnet, "load_" + key) for key in supported}

    loader = loaders.get(dataset_name)
    if loader is None:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    tasks, splits, transformers = loader(featurizer=featurizer,
                                         splitter=split)
    train, valid, test = splits

    # Short summary of what was loaded.
    print(f"MoleculeNet '{dataset_name}':")
    print(f"  Tasks: {len(tasks)}")
    print(f"  Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")
    print(f"  Featurizer: {featurizer}, Split: {split}")
    return tasks, (train, valid, test), transformers
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## 2. GCN モデル訓練
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
def train_gcn(train_data, valid_data, tasks, n_epochs=50,
              learning_rate=0.001, batch_size=64, mode=None):
    """
    Train a Graph Convolutional Network (``dc.models.GraphConvModel``).

    Parameters:
        train_data: dc.data.Dataset — training data
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        batch_size: int — batch size passed to the model
        mode: str | None — "classification" or "regression". When None the
            mode is guessed from the task count (more than one task means
            classification). That guess is wrong for single-task
            classification sets such as "bbbp" or "hiv", so pass the mode
            explicitly for those.

    Returns:
        The trained model.

    Raises:
        ValueError — if ``mode`` is given but is not a supported value.
    """
    if mode is None:
        # Backward-compatible heuristic used when the caller does not say.
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(
            f"mode must be 'classification' or 'regression', got {mode!r}"
        )

    model = dc.models.GraphConvModel(
        n_tasks=len(tasks),
        mode=mode,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )

    # The metric depends only on the mode, so build it once rather than at
    # every reporting step.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )

    for epoch in range(1, n_epochs + 1):
        # One epoch per call so progress can be reported in between.
        model.fit(train_data, nb_epoch=1)
        if epoch % 10 == 0:
            train_score = model.evaluate(train_data, [metric])
            valid_score = model.evaluate(valid_data, [metric])
            print(f"  Epoch {epoch}: "
                  f"train={list(train_score.values())[0]:.4f}, "
                  f"valid={list(valid_score.values())[0]:.4f}")

    return model
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## 3. MPNN モデル訓練
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
def train_mpnn(train_data, valid_data, tasks, n_epochs=50,
               learning_rate=0.001, mode=None):
    """
    Train a Message Passing Neural Network (``dc.models.MPNNModel``).

    Parameters:
        train_data: dc.data.Dataset — training data.
            NOTE(review): DeepChem's MPNNModel examples featurize with
            ``MolGraphConvFeaturizer(use_edges=True)``, not "GraphConv";
            confirm the caller supplies matching features.
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        mode: str | None — "classification" or "regression". When None the
            mode is guessed from the task count (more than one task means
            classification), which is wrong for single-task classification
            sets such as "bbbp" or "hiv".

    Returns:
        The trained model.

    Raises:
        ValueError — if ``mode`` is given but is not a supported value.
    """
    if mode is None:
        # Backward-compatible heuristic used when the caller does not say.
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(
            f"mode must be 'classification' or 'regression', got {mode!r}"
        )

    model = dc.models.MPNNModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        node_out_feats=64,
        edge_hidden_feats=128,
        num_step_message_passing=3,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    # ROC-AUC for classification, squared Pearson correlation otherwise.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"MPNN: valid score = {list(valid_score.values())[0]:.4f}")
    return model
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## 4. AttentiveFP モデル訓練
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
def train_attentivefp(train_data, valid_data, tasks, n_epochs=50,
                      learning_rate=0.001, num_layers=2, mode=None):
    """
    Train an AttentiveFP (attention-based fingerprint) model.

    Parameters:
        train_data: dc.data.Dataset — training data.
            NOTE(review): DeepChem's AttentiveFPModel examples featurize
            with ``MolGraphConvFeaturizer(use_edges=True)``; confirm the
            caller supplies matching features.
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        num_layers: int — number of graph attention layers
        mode: str | None — "classification" or "regression". When None the
            mode is guessed from the task count (more than one task means
            classification), which is wrong for single-task classification
            sets such as "bbbp" or "hiv".

    Returns:
        The trained model.

    Raises:
        ValueError — if ``mode`` is given but is not a supported value.
    """
    if mode is None:
        # Backward-compatible heuristic used when the caller does not say.
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(
            f"mode must be 'classification' or 'regression', got {mode!r}"
        )

    model = dc.models.AttentiveFPModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        num_layers=num_layers,
        graph_feat_size=200,
        num_timesteps=2,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    # ROC-AUC for classification, squared Pearson correlation otherwise.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"AttentiveFP: valid score = {list(valid_score.values())[0]:.4f}")
    return model
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## 5. ChemBERTa 分子表現学習
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
def chemberta_embeddings(smiles_list, model_name="seyonec/ChemBERTa-zinc-base-v1",
                         batch_size=32):
    """
    Embed SMILES strings with a ChemBERTa model.

    Each molecule is represented by the hidden state of the first token of
    the model's last layer.

    Parameters:
        smiles_list: list — SMILES strings; a single string is treated as
            one molecule, and any other iterable is materialised as a list
        model_name: str — HuggingFace model name
        batch_size: int — number of molecules tokenized per forward pass

    Returns:
        numpy.ndarray — one row per molecule. For empty input the result
        has shape (0, hidden_size) instead of raising.

    Raises:
        ValueError — if ``batch_size`` is smaller than 1.
    """
    from transformers import AutoTokenizer, AutoModel
    import torch

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string would otherwise be sliced character by character.
    if isinstance(smiles_list, str):
        smiles = [smiles_list]
    else:
        smiles = list(smiles_list)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    if not smiles:
        # np.vstack([]) raises, so build the empty result explicitly.
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    else:
        chunks = []
        for start in range(0, len(smiles), batch_size):
            batch = smiles[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True,
                               max_length=512, return_tensors="pt")

            with torch.no_grad():
                outputs = model(**inputs)

            # First-token embedding; .cpu() keeps .numpy() valid if the
            # tensor is not on the CPU.
            first_token = outputs.last_hidden_state[:, 0, :]
            chunks.append(first_token.cpu().numpy())

        embeddings = np.vstack(chunks)

    print(f"ChemBERTa: {len(smiles)} molecules → "
          f"{embeddings.shape[1]}D embeddings")
    return embeddings
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## 6. モデル比較ベンチマーク
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
def benchmark_models(dataset_name="tox21", models_to_test=None,
                     n_epochs=30):
    """
    Train several models on one MoleculeNet dataset and compare test scores.

    Parameters:
        dataset_name: str — MoleculeNet dataset key
        models_to_test: list — model names to run; supported names are
            "GCN", "MPNN" and "AttentiveFP" (default: all three). Any other
            name is reported and skipped.
        n_epochs: int — number of training epochs per model

    Returns:
        dict — model name mapped to its test score (ROC-AUC for
        classification, squared Pearson correlation for regression).
    """
    if models_to_test is None:
        models_to_test = ["GCN", "MPNN", "AttentiveFP"]

    supported_models = ("GCN", "MPNN", "AttentiveFP")
    classification_sets = ("tox21", "bbbp", "hiv", "sider", "clintox")
    results = {}

    for model_name in models_to_test:
        if model_name not in supported_models:
            # Skip before loading so no dataset is featurized for nothing.
            print(f"  Skipping unsupported model: {model_name}")
            continue

        # GraphConvModel takes ConvMol ("GraphConv") features. DeepChem's
        # MPNN and AttentiveFP models take graph features with edges.
        if model_name == "GCN":
            featurizer = "GraphConv"
        else:
            featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

        tasks, (train, valid, test), transformers = load_moleculenet(
            dataset_name, featurizer=featurizer
        )

        # NOTE(review): the train_* helpers pick the model mode from
        # len(tasks) by default, so for single-task classification sets the
        # metric chosen here may not match the mode the model was trained in.
        is_classification = (len(tasks) > 1
                             or dataset_name in classification_sets)

        if model_name == "GCN":
            model = train_gcn(train, valid, tasks, n_epochs=n_epochs)
        elif model_name == "MPNN":
            model = train_mpnn(train, valid, tasks, n_epochs=n_epochs)
        else:
            model = train_attentivefp(train, valid, tasks, n_epochs=n_epochs)

        metric = dc.metrics.Metric(
            dc.metrics.roc_auc_score if is_classification
            else dc.metrics.pearson_r2_score
        )
        test_score = model.evaluate(test, [metric])
        results[model_name] = list(test_score.values())[0]

    # Best score first.
    print(f"\nBenchmark on '{dataset_name}':")
    for name, score in sorted(results.items(), key=lambda x: -x[1]):
        print(f"  {name}: {score:.4f}")
    return results
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## 7. 分子特性予測パイプライン
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
def molecular_prediction_pipeline(smiles_list, property_name="solubility",
                                  model_type="AttentiveFP"):
    """
    End-to-end pipeline: train on a benchmark set, then predict for SMILES.

    Parameters:
        smiles_list: list — SMILES strings to predict
        property_name: str — target property; one of "solubility",
            "toxicity", "bbb_penetration", "hiv_activity", "lipophilicity",
            "solvation_energy". Unknown names fall back to the "delaney"
            (solubility) dataset.
        model_type: str — "GCN" or "AttentiveFP"; any other value trains
            the MPNN model.

    Returns:
        pandas.DataFrame — one row per input SMILES with the columns
        "smiles", "prediction", "property" and "model".
    """
    # Property name -> MoleculeNet dataset key.
    property_dataset = {
        "solubility": "delaney",
        "toxicity": "tox21",
        "bbb_penetration": "bbbp",
        "hiv_activity": "hiv",
        "lipophilicity": "lipo",
        "solvation_energy": "freesolv",
    }

    dataset_name = property_dataset.get(property_name, "delaney")

    # One featurizer object serves both training and prediction, so the
    # model sees the same representation in both phases. GraphConvModel
    # takes ConvMol features; the MPNN and AttentiveFP models take graph
    # features with edges.
    if model_type == "GCN":
        featurizer = dc.feat.ConvMolFeaturizer()
    else:
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # 1) Train on the benchmark data.
    tasks, (train, valid, test), transformers = load_moleculenet(
        dataset_name, featurizer=featurizer
    )

    if model_type == "GCN":
        model = train_gcn(train, valid, tasks)
    elif model_type == "AttentiveFP":
        model = train_attentivefp(train, valid, tasks)
    else:
        model = train_mpnn(train, valid, tasks)

    # 2) Predict the new molecules. Passing the loader's transformers lets
    #    DeepChem undo label transforms (e.g. normalisation), so values come
    #    back in the dataset's original units.
    features = featurizer.featurize(smiles_list)
    pred_dataset = dc.data.NumpyDataset(X=features)
    predictions = model.predict(pred_dataset, transformers)

    results = [
        {
            "smiles": smi,
            "prediction": _first_prediction_value(pred),
            "property": property_name,
            "model": model_type,
        }
        for smi, pred in zip(smiles_list, predictions)
    ]

    df = pd.DataFrame(results)
    print(f"Predictions: {len(df)} molecules, property='{property_name}'")
    return df


def _first_prediction_value(pred):
    """Reduce one molecule's model output to a single float."""
    arr = np.asarray(pred, dtype=float)
    if arr.ndim == 0:
        return float(arr)
    if arr.ndim == 1:
        # One value per task: report the first task.
        return float(arr[0])
    # Presumably (n_tasks, n_classes) class probabilities: report the last
    # class of the first task — TODO confirm that is the positive class.
    return float(arr[0].ravel()[-1])
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
---
|
|
329
|
+
|
|
330
|
+
## パイプライン統合
|
|
331
|
+
|
|
332
|
+
```
|
|
333
|
+
cheminformatics → deep-chemistry → drug-target-profiling
|
|
334
|
+
(RDKit/SMILES) (GCN/MPNN/FP) (ChEMBL/標的)
|
|
335
|
+
│ │ ↓
|
|
336
|
+
molecular-docking ───────┘ admet-pharmacokinetics
|
|
337
|
+
(AutoDock/Vina) │ (ADMET予測)
|
|
338
|
+
↓
|
|
339
|
+
md-simulation
|
|
340
|
+
(分子動力学検証)
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## パイプライン出力
|
|
344
|
+
|
|
345
|
+
| ファイル | 説明 | 次スキル |
|
|
346
|
+
|---------|------|---------|
|
|
347
|
+
| `results/predictions.csv` | 分子特性予測値 | → drug-target-profiling |
|
|
348
|
+
| `results/benchmark.json` | モデルベンチマーク結果 | — |
|
|
349
|
+
| `results/embeddings.npy` | ChemBERTa 埋込み | → cheminformatics |
|
|
350
|
+
| `results/model/` | 訓練済みモデル | → admet-pharmacokinetics |
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-ebi-databases
|
|
3
|
+
description: |
|
|
4
|
+
EBI データベース群統合アクセススキル。EBI Search 横断検索、ENA Browser
|
|
5
|
+
ヌクレオチドアーカイブ、BioStudies 研究データ、dbfetch エントリ取得、
|
|
6
|
+
MetaboLights メタボロミクスリポジトリの統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific EBI Databases
|
|
10
|
+
|
|
11
|
+
EBI Search / ENA Browser / BioStudies / dbfetch / MetaboLights を統合した
|
|
12
|
+
EBI データベース群アクセスパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- EBI Search で複数データベースを横断検索するとき
|
|
17
|
+
- ENA (European Nucleotide Archive) で配列データを検索するとき
|
|
18
|
+
- BioStudies で研究プロジェクトデータを探すとき
|
|
19
|
+
- dbfetch でエントリを一括取得するとき
|
|
20
|
+
- MetaboLights でメタボロミクス実験データにアクセスするとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. EBI Search 横断検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
# Base URL that search_ebi appends the search domain to.
EBI_SEARCH_API = "https://www.ebi.ac.uk/ebisearch/ws/rest"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def search_ebi(query, domain="allebi", size=25, fields=None, timeout=30):
    """
    EBI Search: query several EBI databases in one request.

    Parameters:
        query: str — search query
        domain: str — search domain ("allebi", "uniprot", "pdb", "ena", etc.)
        size: int — maximum number of entries to return
        fields: list — fields to return; a single comma-separated string is
            also accepted
        timeout: float — seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame — one row per entry with "id", "source" and the
        first value of every returned field.

    Raises:
        requests.HTTPError — on a non-success HTTP status.

    ToolUniverse:
        EBI_Search_query(query=query, domain=domain)
        EBI_Search_get_entry(domain=domain, entry_id=entry_id)
    """
    params = {
        "query": query,
        "size": size,
        "format": "json",
    }
    if fields:
        # Joining a plain string would insert a comma between every character.
        params["fields"] = fields if isinstance(fields, str) else ",".join(fields)

    # Without a timeout an unresponsive server blocks the caller forever.
    resp = requests.get(f"{EBI_SEARCH_API}/{domain}", params=params,
                        timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    results = []
    for entry in data.get("entries", []):
        row = {"id": entry.get("id", ""), "source": entry.get("source", "")}
        for name, values in (entry.get("fields") or {}).items():
            # Keep only the first value of each field.
            row[name] = values[0] if values else ""
        results.append(row)

    df = pd.DataFrame(results)
    total = data.get("hitCount", 0)
    print(f"EBI Search [{domain}] '{query}': {total} total hits, {len(df)} returned")
    return df
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 2. ENA (European Nucleotide Archive) 配列検索
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Base URL used by search_ena and get_ena_entry.
ENA_API = "https://www.ebi.ac.uk/ena/browser/api"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def search_ena(query, result_type="sequence", limit=100, timeout=30):
    """
    Search the European Nucleotide Archive (ENA).

    Parameters:
        query: str — search query or taxon ID
        result_type: str — "sequence", "read_run", "analysis", "study"
        limit: int — maximum number of records to return
        timeout: float — seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame — one row per record; empty when the response has
        no body or is not a JSON list.

    Raises:
        requests.HTTPError — on a non-success HTTP status.

    ToolUniverse:
        ENA_search(query=query, result=result_type)
        ENA_get_entry(accession=accession)
    """
    params = {
        "query": query,
        "result": result_type,
        "limit": limit,
        "format": "json",
    }
    # NOTE(review): ENA documents its search endpoint under the Portal API
    # (/ena/portal/api/search); confirm that "{ENA_API}/search" is intended.
    resp = requests.get(f"{ENA_API}/search", params=params, timeout=timeout)
    resp.raise_for_status()

    # An empty body (e.g. HTTP 204 for "no results") is not valid JSON, so
    # treat it as zero records instead of letting resp.json() raise.
    data = resp.json() if resp.content else []

    df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame()
    print(f"ENA search '{query}' [{result_type}]: {len(df)} entries")
    return df
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_ena_entry(accession, display="json", timeout=30):
    """
    Fetch a single ENA entry by accession.

    Parameters:
        accession: str — ENA accession (e.g., "ERS000001", "ERR000001")
        display: str — value sent as the "display" query parameter; the
            response is parsed as JSON only when this is "json"
        timeout: float — seconds to wait for the server before giving up

    Returns:
        Parsed JSON when ``display == "json"``, otherwise the response text.

    Raises:
        requests.HTTPError — on a non-success HTTP status.
    """
    # NOTE(review): confirm this path and the "display" parameter against
    # the current ENA Browser API before relying on it.
    resp = requests.get(
        f"{ENA_API}/entry/{accession}",
        params={"display": display},
        timeout=timeout,
    )
    resp.raise_for_status()
    print(f"ENA entry {accession}: retrieved")
    return resp.json() if display == "json" else resp.text
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## 3. BioStudies 研究データ検索
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Base URL used by search_biostudies.
BIOSTUDIES_API = "https://www.ebi.ac.uk/biostudies/api/v1"
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def search_biostudies(query, page_size=25, timeout=30):
    """
    Search BioStudies for research project records.

    Parameters:
        query: str — search query
        page_size: int — number of hits requested per page
        timeout: float — seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame — one row per hit with the columns "accession",
        "title", "author", "release_date", "type", "files" and "links".

    Raises:
        requests.HTTPError — on a non-success HTTP status.

    ToolUniverse:
        BioStudies_search(query=query)
        BioStudies_get_study(accession=accession)
    """
    params = {"query": query, "pageSize": page_size}
    # Without a timeout an unresponsive server blocks the caller forever.
    resp = requests.get(f"{BIOSTUDIES_API}/search", params=params,
                        timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # (output column, response key, default when the key is missing)
    columns = (
        ("accession", "accno", ""),
        ("title", "title", ""),
        ("author", "author", ""),
        ("release_date", "rtime", ""),
        ("type", "type", ""),
        ("files", "files", 0),
        ("links", "links", 0),
    )
    results = [
        {column: hit.get(key, default) for column, key, default in columns}
        for hit in data.get("hits", [])
    ]

    df = pd.DataFrame(results)
    total = data.get("totalHits", 0)
    print(f"BioStudies search '{query}': {total} total, {len(df)} returned")
    return df
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## 4. dbfetch エントリ一括取得
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
# Endpoint URL requested by dbfetch.
DBFETCH_API = "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def dbfetch(db, ids, format_type="json", style="raw", timeout=30):
    """
    dbfetch: retrieve one or more entries from an EBI database.

    Parameters:
        db: str — database name (e.g., "uniprotkb", "embl", "pdb")
        ids: list — entry IDs; a single ID string or any other iterable of
            IDs is also accepted
        format_type: str — output format ("json", "fasta", "xml")
        style: str — style ("raw", "html")
        timeout: float — seconds to wait for the server before giving up

    Returns:
        Parsed JSON when ``format_type == "json"``, otherwise the response
        text.

    Raises:
        requests.HTTPError — on a non-success HTTP status.

    ToolUniverse:
        dbfetch_get_entries(db=db, ids=ids, format=format_type)
    """
    # Normalise to a list so tuples and other iterables are joined and
    # counted the same way as lists; a lone string is a single ID.
    if isinstance(ids, str):
        id_list = [ids]
    else:
        id_list = [str(entry_id) for entry_id in ids]

    params = {
        "db": db,
        "id": ",".join(id_list),
        "format": format_type,
        "style": style,
    }
    # Without a timeout an unresponsive server blocks the caller forever.
    resp = requests.get(DBFETCH_API, params=params, timeout=timeout)
    resp.raise_for_status()

    print(f"dbfetch [{db}]: {len(id_list)} entries, "
          f"format={format_type}")
    if format_type == "json":
        return resp.json()
    return resp.text
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 5. MetaboLights メタボロミクスリポジトリ
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
# Base URL used by search_metabolights and get_metabolights_study.
METABOLIGHTS_API = "https://www.ebi.ac.uk/metabolights/ws"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def search_metabolights(query, timeout=30):
    """
    Search MetaboLights for metabolomics studies.

    Parameters:
        query: str — search query (compound name, disease name, organism)
        timeout: float — seconds to wait for the server before giving up

    Returns:
        pandas.DataFrame — one row per study with the columns "study_id",
        "title", "organism", "description" (first 200 characters),
        "submission_date" and "status".

    Raises:
        requests.HTTPError — on a non-success HTTP status.

    ToolUniverse:
        MetaboLights_search_studies(query=query)
        MetaboLights_get_study(study_id=study_id)
    """
    # Without a timeout an unresponsive server blocks the caller forever.
    resp = requests.get(
        f"{METABOLIGHTS_API}/studies/search",
        params={"query": query},
        timeout=timeout,
    )
    resp.raise_for_status()
    data = resp.json()

    results = [
        {
            "study_id": study.get("studyIdentifier", ""),
            "title": study.get("title", ""),
            "organism": study.get("organism", ""),
            # "or" guards against an explicit null description.
            "description": (study.get("description") or "")[:200],
            "submission_date": study.get("submissionDate", ""),
            "status": study.get("studyStatus", ""),
        }
        for study in data.get("content", [])
    ]

    df = pd.DataFrame(results)
    print(f"MetaboLights search '{query}': {len(df)} studies")
    return df
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def get_metabolights_study(study_id, timeout=30):
    """
    Fetch a single MetaboLights study.

    Parameters:
        study_id: str — study identifier appended to the studies URL
        timeout: float — seconds to wait for the server before giving up

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError — on a non-success HTTP status.
    """
    # Without a timeout an unresponsive server blocks the caller forever.
    resp = requests.get(f"{METABOLIGHTS_API}/studies/{study_id}",
                        timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    # "or" guards against an explicit null title, which cannot be sliced.
    print(f"MetaboLights {study_id}: {(data.get('title') or '')[:80]}")
    return data
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## 利用可能ツール
|
|
253
|
+
|
|
254
|
+
| ToolUniverse カテゴリ | 主なツール |
|
|
255
|
+
|---|---|
|
|
256
|
+
| `ebi_search` | `EBI_Search_query`, `EBI_Search_get_entry` |
|
|
257
|
+
| `ena_browser` | `ENA_search`, `ENA_get_entry` |
|
|
258
|
+
| `biostudies` | `BioStudies_search`, `BioStudies_get_study` |
|
|
259
|
+
| `dbfetch` | `dbfetch_get_entries` |
|
|
260
|
+
| `metabolights` | `MetaboLights_search_studies`, `MetaboLights_get_study` |
|
|
261
|
+
|
|
262
|
+
## パイプライン出力
|
|
263
|
+
|
|
264
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
265
|
+
|---|---|---|
|
|
266
|
+
| `results/ebi_search.csv` | EBI 横断検索結果 | → bioinformatics, literature-search |
|
|
267
|
+
| `results/ena_sequences.fasta` | ENA 配列データ | → genome-sequence-tools, sequence-analysis |
|
|
268
|
+
| `results/biostudies_metadata.json` | 研究プロジェクト情報 | → multi-omics, systematic-review |
|
|
269
|
+
| `results/metabolights_study.json` | メタボロミクスデータ | → metabolomics, metabolomics-databases |
|
|
270
|
+
|
|
271
|
+
## パイプライン統合
|
|
272
|
+
|
|
273
|
+
```
|
|
274
|
+
genome-sequence-tools ──→ ebi-databases ──→ metabolomics-databases
|
|
275
|
+
(NCBI/BLAST) (ENA/EBI Search) (HMDB/MetaCyc)
|
|
276
|
+
│
|
|
277
|
+
├──→ bioinformatics (配列データ)
|
|
278
|
+
├──→ sequence-analysis (FASTA)
|
|
279
|
+
└──→ structural-proteomics (PDBe cross-ref)
|
|
280
|
+
```
|