@nahisaho/satori 0.11.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -56
- package/package.json +1 -1
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
- package/src/.github/skills/scientific-cancer-genomics/SKILL.md +287 -0
- package/src/.github/skills/scientific-clinical-reporting/SKILL.md +324 -0
- package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
- package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
- package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
- package/src/.github/skills/scientific-literature-search/SKILL.md +443 -0
- package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
- package/src/.github/skills/scientific-metabolomics-databases/SKILL.md +288 -0
- package/src/.github/skills/scientific-molecular-docking/SKILL.md +303 -0
- package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
- package/src/.github/skills/scientific-pathway-enrichment/SKILL.md +449 -0
- package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
- package/src/.github/skills/scientific-protein-domain-family/SKILL.md +369 -0
- package/src/.github/skills/scientific-protein-interaction-network/SKILL.md +352 -0
- package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
- package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
- package/src/.github/skills/scientific-systematic-review/SKILL.md +361 -0
- package/src/.github/skills/scientific-variant-effect-prediction/SKILL.md +325 -0
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-molecular-docking
|
|
3
|
+
description: |
|
|
4
|
+
構造ベース分子ドッキングスキル。DiffDock (拡散生成モデル)、
|
|
5
|
+
AutoDock Vina (スコアリング関数)、GNINA (CNN ベーススコアリング) を統合した
|
|
6
|
+
タンパク質-リガンド結合ポーズ予測、バーチャルスクリーニング、
|
|
7
|
+
結合自由エネルギー推定、ドッキングスコア統合パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Molecular Docking
|
|
11
|
+
|
|
12
|
+
DiffDock / AutoDock Vina / GNINA の 3 大ドッキングエンジンによる
|
|
13
|
+
構造ベース仮想スクリーニング・結合ポーズ予測パイプラインを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- タンパク質-リガンド結合モードを予測するとき
|
|
18
|
+
- 化合物ライブラリのバーチャルスクリーニングが必要なとき
|
|
19
|
+
- 結合自由エネルギーを推定してリガンドをランキングするとき
|
|
20
|
+
- DiffDock で AI ベースの結合ポーズ生成を行うとき
|
|
21
|
+
- 複数のドッキング手法のコンセンサス評価が必要なとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. リガンド・受容体の準備
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import os
|
|
31
|
+
import subprocess
|
|
32
|
+
import pandas as pd
|
|
33
|
+
import numpy as np
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def prepare_receptor(pdb_file, output_dir="structures/prepared",
                     remove_water=True, add_hydrogens=True):
    """
    Prepare a receptor (protein) structure for docking.

    Converts the input PDB file to PDBQT (the format used for AutoDock
    Vina in this skill). Open Babel (``pybel``) is used when it can be
    imported; otherwise the MGLTools ``prepare_receptor4.py`` script is
    run as a subprocess.

    Parameters:
        pdb_file: str — input PDB file
        output_dir: str — directory that receives the PDBQT file
        remove_water: bool — strip water molecules
        add_hydrogens: bool — add hydrogen atoms

    Returns:
        str — path of the written file,
        ``<output_dir>/<pdb basename>_receptor.pdbqt``

    Raises:
        FileNotFoundError: ``pdb_file`` does not exist (checked before any
            directory is created or tool is started).
        ValueError: Open Babel found no molecule in ``pdb_file``.
        subprocess.CalledProcessError: the MGLTools fallback exited non-zero.
    """
    if not os.path.isfile(pdb_file):
        raise FileNotFoundError(f"Receptor PDB file not found: {pdb_file}")

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(pdb_file))[0]

    # PDB -> PDBQT conversion target (for AutoDock Vina)
    pdbqt_file = f"{output_dir}/{base_name}_receptor.pdbqt"

    # Only the import decides which backend is used; errors raised while
    # converting must not be mistaken for "Open Babel is missing".
    try:
        from openbabel import pybel
    except ImportError:
        pybel = None

    if pybel is not None:
        mol = next(pybel.readfile("pdb", pdb_file), None)
        if mol is None:
            raise ValueError(f"No molecule could be read from {pdb_file}")
        if remove_water:
            # NOTE(review): verify that the installed Open Babel build
            # exposes OBMol.DeleteWater().
            mol.OBMol.DeleteWater()
        if add_hydrogens:
            mol.addh()
        mol.write("pdbqt", pdbqt_file, overwrite=True)
        print(f"Receptor prepared: {pdbqt_file}")
        return pdbqt_file

    # Open Babel unavailable: fall back to MGLTools prepare_receptor4.
    cmd = ["prepare_receptor4.py", "-r", pdb_file, "-o", pdbqt_file]
    if add_hydrogens:
        # Previously passed unconditionally, which ignored add_hydrogens=False.
        cmd.extend(["-A", "hydrogens"])
    if remove_water:
        cmd.extend(["-U", "waters"])
    subprocess.run(cmd, check=True)
    print(f"Receptor prepared (MGLTools): {pdbqt_file}")

    return pdbqt_file
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _unique_ligand_stem(title, index, prefix, used):
    """Return a path-safe file stem for one ligand that is not in ``used`` yet."""
    stem = (title or "").strip().replace("/", "_").replace("\\", "_")
    if not stem:
        stem = f"{prefix}_{index}"
    candidate = stem
    # Duplicate titles would otherwise overwrite each other's output file.
    while candidate in used:
        candidate = f"{candidate}_{index}"
    used.add(candidate)
    return candidate


def prepare_ligands(sdf_file, output_dir="structures/ligands"):
    """
    Prepare every ligand of an SDF file for docking.

    With Open Babel available each record gets hydrogens and 3D
    coordinates and is written as ``<name>.pdbqt``. Without Open Babel,
    RDKit is used instead: hydrogens are added, a 3D conformer is embedded
    (fixed seed 42) and the molecule is written with ``MolToMolFile``.
    Those fallback files are MDL MOL files and are therefore named
    ``<name>.mol`` (they used to be mislabelled ``.mol2``).

    Parameters:
        sdf_file: str — input SDF file (one or more molecules)
        output_dir: str — directory that receives the prepared files

    Returns:
        pandas.DataFrame with one row per written ligand and the columns
        ``name`` (file stem), ``file`` (path) and ``atoms`` (atom count
        after adding hydrogens).

    Raises:
        FileNotFoundError: ``sdf_file`` does not exist.
        ImportError: neither Open Babel nor RDKit can be imported.
    """
    if not os.path.isfile(sdf_file):
        raise FileNotFoundError(f"Ligand SDF file not found: {sdf_file}")

    os.makedirs(output_dir, exist_ok=True)
    columns = ["name", "file", "atoms"]
    prepared = []
    used_stems = set()

    # Only the import selects the backend; conversion errors propagate.
    try:
        from openbabel import pybel
    except ImportError:
        pybel = None

    if pybel is not None:
        for i, mol in enumerate(pybel.readfile("sdf", sdf_file)):
            mol.addh()
            mol.make3D()
            name = _unique_ligand_stem(mol.title, i, "ligand", used_stems)
            out = f"{output_dir}/{name}.pdbqt"
            mol.write("pdbqt", out, overwrite=True)
            prepared.append({"name": name, "file": out, "atoms": len(mol.atoms)})
        print(f"Prepared {len(prepared)} ligands from {sdf_file}")
        return pd.DataFrame(prepared, columns=columns)

    print("openbabel not available, using RDKit fallback")
    from rdkit import Chem
    from rdkit.Chem import AllChem

    for i, mol in enumerate(Chem.SDMolSupplier(sdf_file)):
        if mol is None:
            # RDKit yields None for records it could not parse.
            continue
        title = mol.GetProp("_Name") if mol.HasProp("_Name") else ""
        mol = Chem.AddHs(mol)
        # EmbedMolecule reports failure with a negative conformer id; writing
        # such a molecule would produce a file without usable 3D coordinates.
        if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
            print(f"Skipping record {i}: 3D embedding failed")
            continue
        name = _unique_ligand_stem(title, i, "lig", used_stems)
        out = f"{output_dir}/{name}.mol"
        Chem.MolToMolFile(mol, out)
        prepared.append({"name": name, "file": out, "atoms": mol.GetNumAtoms()})

    print(f"Prepared {len(prepared)} ligands from {sdf_file}")
    return pd.DataFrame(prepared, columns=columns)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## 2. AutoDock Vina ドッキング
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def _read_vina_pose_scores(pdbqt_path):
    """Collect per-pose scores from the ``REMARK VINA RESULT`` lines of a docked PDBQT file."""
    rows = []
    with open(pdbqt_path, encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if not line.startswith("REMARK VINA RESULT:"):
                continue
            fields = line.split(":", 1)[1].split()[:3]
            try:
                values = [float(field) for field in fields]
            except ValueError:
                continue
            if not values:
                continue
            values += [None] * (3 - len(values))
            rows.append({
                "pose": len(rows) + 1,
                "affinity_kcal": values[0],
                "rmsd_lb": values[1],
                "rmsd_ub": values[2],
            })
    return rows


def autodock_vina_dock(receptor_pdbqt, ligand_pdbqt,
                       center, box_size,
                       exhaustiveness=32, n_poses=9):
    """
    Dock one ligand into a receptor with AutoDock Vina.

    The ``vina`` Python package is used when it can be imported; otherwise
    the ``vina`` command-line program is run. In both cases the poses are
    written next to the ligand as ``<ligand stem>_docked.pdbqt`` and the
    per-pose RMSD bounds are read back from the ``REMARK VINA RESULT``
    lines of that file.

    Parameters:
        receptor_pdbqt: str — receptor PDBQT
        ligand_pdbqt: str — ligand PDBQT
        center: tuple — (x, y, z) box centre coordinates
        box_size: tuple — (sx, sy, sz) box size (Å)
        exhaustiveness: int — search thoroughness (8-64)
        n_poses: int — number of output poses

    Returns:
        (pandas.DataFrame, str) — score table with the columns ``pose``,
        ``affinity_kcal``, ``rmsd_lb``, ``rmsd_ub`` and the path of the
        docked PDBQT file.

    Raises:
        ValueError: ``center`` or ``box_size`` does not hold three values.
        subprocess.CalledProcessError: the CLI fallback exited non-zero.
    """
    center = list(center)
    box_size = list(box_size)
    if len(center) != 3 or len(box_size) != 3:
        raise ValueError(
            "center and box_size must each contain exactly three values"
        )

    # splitext instead of str.replace: replace() returned the input path
    # unchanged for names not containing ".pdbqt" (the docked poses then
    # overwrote the ligand) and rewrote every occurrence inside the path.
    output = f"{os.path.splitext(ligand_pdbqt)[0]}_docked.pdbqt"
    columns = ["pose", "affinity_kcal", "rmsd_lb", "rmsd_ub"]

    # Only the import selects the backend; docking errors propagate.
    try:
        from vina import Vina
    except ImportError:
        Vina = None

    if Vina is not None:
        v = Vina(sf_name="vina")
        v.set_receptor(receptor_pdbqt)
        v.set_ligand_from_file(ligand_pdbqt)
        v.compute_vina_maps(center=center, box_size=box_size)
        v.dock(exhaustiveness=exhaustiveness, n_poses=n_poses)

        energies = v.energies(n_poses=n_poses)
        v.write_poses(output, n_poses=n_poses, overwrite=True)

        # Per the Vina Python API docs, energies() columns are
        # [total, inter, intra, torsions, intra best pose]; columns 1 and 2
        # are not RMSD values, so RMSD bounds come from the written file.
        from_file = _read_vina_pose_scores(output)
        results = []
        for i, e in enumerate(energies):
            parsed = from_file[i] if i < len(from_file) else {}
            results.append({
                "pose": i + 1,
                "affinity_kcal": float(e[0]),
                "rmsd_lb": parsed.get("rmsd_lb"),
                "rmsd_ub": parsed.get("rmsd_ub"),
            })
    else:
        # CLI fallback
        cmd = [
            "vina",
            "--receptor", receptor_pdbqt,
            "--ligand", ligand_pdbqt,
            "--center_x", str(center[0]),
            "--center_y", str(center[1]),
            "--center_z", str(center[2]),
            "--size_x", str(box_size[0]),
            "--size_y", str(box_size[1]),
            "--size_z", str(box_size[2]),
            "--exhaustiveness", str(exhaustiveness),
            "--num_modes", str(n_poses),
            "--out", output,
        ]
        subprocess.run(cmd, check=True)
        # The CLI path used to return an empty table, which made callers
        # drop the ligand; recover the scores from the output file instead.
        results = _read_vina_pose_scores(output)

    df = pd.DataFrame(results, columns=columns)
    if len(df) > 0:
        print(f"Vina docking: best affinity = {df['affinity_kcal'].min():.1f} kcal/mol")
    return df, output
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## 3. DiffDock AI ドッキング
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
def diffdock_predict(protein_file, ligand_file, n_poses=10,
                     output_dir="results/diffdock"):
    """
    Dock a ligand with DiffDock (diffusion generative model).

    Runs ``-m diffdock.inference`` with the interpreter that executes this
    code, then collects ``rank<N>.sdf`` poses and their optional
    ``rank<N>_confidence.txt`` scores from ``output_dir``.

    Parameters:
        protein_file: str — protein PDB file
        ligand_file: str — ligand SDF/MOL2 file
        n_poses: int — number of poses to generate
        output_dir: str — directory DiffDock writes into

    Returns:
        pandas.DataFrame with the columns ``pose``, ``file`` and
        ``confidence`` (``None`` when no score could be read). Only poses
        whose file exists are listed. The frame is empty when DiffDock is
        not installed.

    Raises:
        ValueError: ``n_poses`` is smaller than 1.
        subprocess.CalledProcessError: DiffDock started but exited
            non-zero; its stderr is printed before re-raising.
    """
    import sys

    if n_poses < 1:
        raise ValueError("n_poses must be at least 1")

    os.makedirs(output_dir, exist_ok=True)
    columns = ["pose", "file", "confidence"]
    install_hint = ("DiffDock not installed. Install from: "
                    "https://github.com/gcorso/DiffDock")

    # DiffDock-L (large model) inference.
    # sys.executable keeps the subprocess in the current environment; a bare
    # "python" may resolve to another interpreter or not exist at all.
    # NOTE(review): module path and flag names are kept from the original
    # snippet — confirm them against the installed DiffDock release.
    cmd = [
        sys.executable or "python", "-m", "diffdock.inference",
        "--protein_path", protein_file,
        "--ligand", ligand_file,
        "--out_dir", output_dir,
        "--samples_per_complex", str(n_poses),
        "--model_dir", "DiffDock-L",
        "--confidence_model_dir", "DiffDock-L",
    ]

    print(f"Running DiffDock ({n_poses} poses)...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except FileNotFoundError:
        print(install_hint)
        return pd.DataFrame(columns=columns)
    except subprocess.CalledProcessError as exc:
        stderr = exc.stderr or ""
        # A missing module makes the interpreter exit non-zero rather than
        # raise FileNotFoundError, so detect it from the captured stderr.
        module_missing = any(
            "No module named" in line and "diffdock" in line
            for line in stderr.splitlines()
        )
        if module_missing:
            print(install_hint)
            return pd.DataFrame(columns=columns)
        # Output was captured, so show it before propagating the failure.
        print(stderr)
        raise

    # Parse results.
    # NOTE(review): assumes rank<N>.sdf / rank<N>_confidence.txt are written
    # directly into output_dir — confirm against the DiffDock output layout.
    results = []
    for rank in range(1, n_poses + 1):
        pose_file = f"{output_dir}/rank{rank}.sdf"
        if not os.path.exists(pose_file):
            # Do not report poses that were never written.
            continue
        conf_file = f"{output_dir}/rank{rank}_confidence.txt"
        confidence = None
        if os.path.exists(conf_file):
            with open(conf_file) as f:
                raw = f.read().strip()
            try:
                confidence = float(raw)
            except ValueError:
                confidence = None
        results.append({
            "pose": rank,
            "file": pose_file,
            "confidence": confidence,
        })

    df = pd.DataFrame(results, columns=columns)
    if len(df) > 0:
        scores = [r["confidence"] for r in results if r["confidence"] is not None]
        best = max(scores) if scores else "n/a"
        print(f"DiffDock: {len(df)} poses, "
              f"best confidence = {best}")
    return df
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## 4. バーチャルスクリーニング
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
def virtual_screening(receptor_pdbqt, ligand_library,
                      center, box_size,
                      method="vina", top_n=20):
    """
    Virtual screening of a compound library.

    Docks every ligand with ``autodock_vina_dock`` (exhaustiveness 16,
    3 poses) and ranks the ligands by their best (lowest) affinity.

    Parameters:
        receptor_pdbqt: str — receptor PDBQT
        ligand_library: list[str] — ligand PDBQT files
        center/box_size: docking box parameters
        method: "vina" (the only engine implemented here)
        top_n: int — number of top candidates to keep

    Returns:
        pandas.DataFrame with the columns ``ligand``, ``best_affinity`` and
        ``n_poses``, sorted by ascending affinity and cut to ``top_n`` rows.
        The frame is empty (but keeps these columns) when no ligand
        produced a pose.

    Raises:
        NotImplementedError: ``method="diffdock"`` (previously it was
            silently ignored and the function then failed while sorting).
        ValueError: any other unknown ``method``.
    """
    if method == "diffdock":
        raise NotImplementedError(
            "virtual_screening does not implement method='diffdock' yet"
        )
    if method != "vina":
        raise ValueError(f"Unknown docking method: {method!r}")

    columns = ["ligand", "best_affinity", "n_poses"]
    ligand_library = list(ligand_library)
    total = len(ligand_library)
    all_results = []

    for i, ligand in enumerate(ligand_library, start=1):
        lig_name = os.path.splitext(os.path.basename(ligand))[0]
        print(f"  [{i}/{total}] Docking {lig_name}...", end=" ")

        df, _ = autodock_vina_dock(
            receptor_pdbqt, ligand, center, box_size,
            exhaustiveness=16, n_poses=3
        )
        if len(df) == 0:
            # Terminate the progress line that was opened with end=" ".
            print("no poses")
            continue

        # Pick the minimum explicitly instead of relying on row order.
        best = df.loc[df["affinity_kcal"].idxmin()]
        all_results.append({
            "ligand": lig_name,
            "best_affinity": best["affinity_kcal"],
            "n_poses": len(df),
        })
        print(f"{best['affinity_kcal']:.1f} kcal/mol")

    # Explicit columns keep sort_values() from raising KeyError when nothing
    # was docked successfully (or the library is empty).
    results_df = pd.DataFrame(all_results, columns=columns)
    results_df = results_df.sort_values("best_affinity").head(top_n)

    print(f"\nVirtual screening: {total} compounds → "
          f"top {len(results_df)} candidates")
    return results_df
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## References
|
|
274
|
+
|
|
275
|
+
### Output Files
|
|
276
|
+
|
|
277
|
+
| ファイル | 形式 |
|
|
278
|
+
|---|---|
|
|
279
|
+
| `structures/prepared/*_receptor.pdbqt` | PDBQT |
|
|
280
|
+
| `structures/ligands/*.pdbqt` | PDBQT |
|
|
281
|
+
| `results/docking_results.csv` | CSV |
|
|
282
|
+
| `results/diffdock/rank*.sdf` | SDF |
|
|
283
|
+
| `results/virtual_screening.csv` | CSV |
|
|
284
|
+
| `figures/docking_scores.png` | PNG |
|
|
285
|
+
|
|
286
|
+
### 利用可能ツール
|
|
287
|
+
|
|
288
|
+
> このスキルは主に K-Dense-AI/claude-scientific-skills の diffdock スキルを参照しています。ToolUniverse SMCP には専用ドッキングツールは含まれませんが、タンパク質構造は PDB/AlphaFold ツール経由で取得可能です。
|
|
289
|
+
|
|
290
|
+
### 参照スキル
|
|
291
|
+
|
|
292
|
+
| スキル | 関連 |
|
|
293
|
+
|---|---|
|
|
294
|
+
| `scientific-protein-structure-analysis` | 受容体構造取得・結合部位検出 |
|
|
295
|
+
| `scientific-drug-target-profiling` | 標的選定 → ドッキング |
|
|
296
|
+
| `scientific-cheminformatics` | リガンド記述子・フィルタリング |
|
|
297
|
+
| `scientific-admet-pharmacokinetics` | ドッキング → ADMET |
|
|
298
|
+
| `scientific-drug-repurposing` | リポジショニング候補ドッキング |
|
|
299
|
+
| `scientific-protein-interaction-network` | PPI → ドッキング界面 |
|
|
300
|
+
|
|
301
|
+
### 依存パッケージ
|
|
302
|
+
|
|
303
|
+
`vina` (AutoDock Vina), `rdkit`, `openbabel` (optional), `numpy`, `pandas`
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-noncoding-rna
|
|
3
|
+
description: |
|
|
4
|
+
非コード RNA (ncRNA) 解析スキル。Rfam RNA ファミリー検索、
|
|
5
|
+
RNAcentral 統合 ncRNA データベース、共分散モデル、構造マッピング、
|
|
6
|
+
系統樹解析パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Noncoding RNA
|
|
10
|
+
|
|
11
|
+
Rfam および RNAcentral を活用した ncRNA ファミリー検索、
|
|
12
|
+
配列アノテーション、構造マッピングのパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- RNA ファミリー (miRNA, lncRNA, rRNA, tRNA 等) を分類するとき
|
|
17
|
+
- Rfam 共分散モデルで RNA 配列を検索するとき
|
|
18
|
+
- RNAcentral で ncRNA のクロスリファレンスを取得するとき
|
|
19
|
+
- RNA 二次構造・構造マッピング情報を取得するとき
|
|
20
|
+
- RNA ファミリーの系統樹情報を調べるとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. Rfam ファミリー検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
RFAM_API = "https://rfam.org/family"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_rfam_family(rfam_acc):
    """
    Fetch the detailed record of one Rfam RNA family.

    Parameters:
        rfam_acc: str — Rfam accession (e.g., "RF00001") or ID

    Returns:
        The decoded JSON document returned by Rfam.

    Raises:
        ValueError: ``rfam_acc`` is empty.
        requests.HTTPError: Rfam answered with an error status.

    ToolUniverse:
        Rfam_get_family(rfam_acc=rfam_acc)
        Rfam_id_to_accession(rfam_id=rfam_id)
    """
    if not rfam_acc or not str(rfam_acc).strip():
        raise ValueError("rfam_acc must be a non-empty Rfam accession or ID")

    url = f"{RFAM_API}/{rfam_acc}?content-type=application/json"
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    family_id = data.get("rfam", {}).get("id", "?")
    print(f"Rfam {rfam_acc}: {family_id}")
    return data
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## 2. Rfam 配列検索 (Infernal cmscan)
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
import time
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _flatten_rfam_hits(hits):
    """Normalise the ``hits`` member of an Rfam search result to a flat list."""
    if isinstance(hits, list):
        return hits
    if isinstance(hits, dict):
        if "hit" in hits:
            hit = hits["hit"]
            return hit if isinstance(hit, list) else [hit]
        # Otherwise treat the dict as {family: [hit, ...]} and concatenate.
        flat = []
        for value in hits.values():
            flat.extend(value if isinstance(value, list) else [value])
        return flat
    return []


def rfam_sequence_search(sequence, email=None, poll_interval=10,
                         max_polls=30, timeout=30):
    """
    Submit an RNA sequence to the Rfam sequence search (Infernal cmscan)
    and return the matching RNA families.

    The search is asynchronous: the sequence is POSTed, then the returned
    ``resultURL`` is polled until a finished result arrives or the polling
    budget is used up.

    Parameters:
        sequence: str — RNA sequence
        email: unused; kept so existing callers keep working
        poll_interval: seconds to wait before each poll
        max_polls: maximum number of polls
        timeout: per-request timeout in seconds

    Returns:
        list of hit records; ``[]`` when polling runs out. If the
        submission response has no ``resultURL`` the decoded submission
        response is returned as-is.

    Raises:
        ValueError: ``sequence`` is empty.
        requests.HTTPError: the submission was rejected.

    ToolUniverse:
        Rfam_search_sequence(sequence=sequence)
    """
    if not sequence or not sequence.strip():
        raise ValueError("sequence must be a non-empty RNA sequence")

    url = "https://rfam.org/search/sequence"
    # Ask for JSON explicitly; the code below decodes every response as JSON.
    headers = {"Accept": "application/json"}

    payload = {
        "seq": sequence,
        "output": "json",
    }
    resp = requests.post(url, data=payload, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # Async job -> poll
    submission = resp.json()
    job_url = submission.get("resultURL", "")
    if not job_url:
        return submission

    for _ in range(max_polls):
        time.sleep(poll_interval)
        result = requests.get(job_url, headers=headers, timeout=timeout)
        if result.status_code != 200:
            continue
        try:
            data = result.json()
        except ValueError:
            # Not JSON yet (e.g. an interim page); try again.
            continue
        if not isinstance(data, dict):
            continue
        # NOTE(review): a present "status" other than "DONE" is treated as
        # still running; a 200 response without "status" is treated as
        # finished — confirm against the Rfam API documentation.
        if "status" in data and data.get("status", "") != "DONE":
            continue
        hits = _flatten_rfam_hits(data.get("hits", {}))
        print(f"Rfam cmscan: {len(hits)} family hits")
        return hits

    print("Rfam cmscan: timeout")
    return []
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## 3. Rfam 構造マッピング
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
def _rfam_json_or_empty(url, timeout=30):
    """GET ``url`` and return its decoded JSON, or ``[]`` on a non-200 or non-JSON reply."""
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        return []
    try:
        return resp.json()
    except ValueError:
        # A 200 reply with a non-JSON body used to crash the caller.
        return []


def get_rfam_structure_mapping(rfam_acc):
    """
    Fetch the PDB structure mapping and the sequence regions of an Rfam
    family.

    Both look-ups are best-effort: a non-200 status or a body that is not
    JSON yields an empty list for that part instead of an exception.

    Parameters:
        rfam_acc: str — Rfam accession

    Returns:
        (structures, regions) — the decoded JSON of the two endpoints.

    ToolUniverse:
        Rfam_get_structure_mapping(rfam_acc=rfam_acc)
        Rfam_get_covariance_model(rfam_acc=rfam_acc)
        Rfam_get_tree_data(rfam_acc=rfam_acc)
        Rfam_get_sequence_regions(rfam_acc=rfam_acc)
    """
    # Structure mapping
    structures = _rfam_json_or_empty(
        f"https://rfam.org/family/{rfam_acc}/structures"
        "?content-type=application/json"
    )

    # Sequence regions
    # NOTE(review): confirm that this endpoint can serve JSON; if it only
    # serves text/XML, regions will always come back as [].
    regions = _rfam_json_or_empty(
        f"https://rfam.org/family/{rfam_acc}/regions"
        "?content-type=application/json"
    )

    print(f"Rfam {rfam_acc}: {len(structures)} PDB structures, "
          f"{len(regions) if isinstance(regions, list) else '?'} regions")
    return structures, regions
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## 4. RNAcentral ncRNA 検索
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
RNACENTRAL_API = "https://rnacentral.org/api/v1"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def rnacentral_search(query, page_size=10):
    """
    Search RNAcentral for ncRNA entries.

    Parameters:
        query: str — search term (gene name, accession, keyword)
        page_size: int — number of entries requested (must be >= 1)

    Returns:
        pandas.DataFrame with the columns ``rnacentral_id``,
        ``description``, ``rna_type``, ``length`` and ``num_xrefs``; the
        columns are present even when nothing was found.

    Raises:
        ValueError: ``page_size`` is smaller than 1.
        requests.HTTPError: RNAcentral answered with an error status.

    ToolUniverse:
        RNAcentral_search(query=query)
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    columns = ["rnacentral_id", "description", "rna_type", "length", "num_xrefs"]

    url = f"{RNACENTRAL_API}/rna/"
    # NOTE(review): confirm that the /rna/ endpoint honours a "query" filter;
    # if it does not, the unfiltered first page is returned.
    params = {"query": query, "page_size": page_size}
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    entries = [
        {
            "rnacentral_id": r.get("rnacentral_id", ""),
            "description": r.get("description", ""),
            "rna_type": r.get("rna_type", ""),
            "length": r.get("length", 0),
            "num_xrefs": r.get("xref_count", 0),
        }
        for r in data.get("results", [])
    ]

    df = pd.DataFrame(entries, columns=columns)
    print(f"RNAcentral '{query}': {data.get('count', 0)} total, "
          f"{len(df)} returned")
    return df
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def rnacentral_get_by_accession(accession):
    """
    Fetch the detailed ncRNA record for one RNAcentral accession.

    Parameters:
        accession: str — RNAcentral accession

    Returns:
        The decoded JSON document returned by RNAcentral.

    Raises:
        ValueError: ``accession`` is empty (the URL would otherwise point
            at the collection endpoint instead of a single record).
        requests.HTTPError: RNAcentral answered with an error status.

    ToolUniverse:
        RNAcentral_get_by_accession(accession=accession)
    """
    if not accession or not str(accession).strip():
        raise ValueError("accession must be a non-empty RNAcentral accession")

    url = f"{RNACENTRAL_API}/rna/{accession}/"
    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    print(f"RNAcentral {accession}: {data.get('description', '')}")
    return data
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## 5. ncRNA 統合解析パイプライン
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
def ncRNA_integrated_search(sequence, rfam_acc=None):
    """
    Sequence-based integrated ncRNA analysis.

    Steps:
        1. Identify Rfam families with ``rfam_sequence_search``.
        2. Fetch the family record for the top hit's accession; when the
           search gives no accession, ``rfam_acc`` is used instead (it was
           previously ignored whenever the search returned no hits).
        3. Query RNAcentral with the first 30 characters of the sequence.

    Parameters:
        sequence: str — RNA sequence
        rfam_acc: str | None — fallback Rfam accession

    Returns:
        dict with ``sequence_length``, ``rfam_hits``, ``rnacentral_hits``
        and, when a family was resolved, ``rfam_family`` (accession) and
        ``rfam_family_data`` (the family record).

    Raises:
        ValueError: ``sequence`` is empty.

    ToolUniverse (cross-tool):
        Rfam_search_sequence(sequence) → Rfam_get_family(rfam_acc)
        RNAcentral_search(query)
    """
    if not sequence or not sequence.strip():
        raise ValueError("sequence must be a non-empty RNA sequence")

    pipeline = {"sequence_length": len(sequence)}

    # Step 1: Rfam family identification
    rfam_hits = rfam_sequence_search(sequence)
    # The search can also hand back a non-list payload; count only real hits.
    hit_list = rfam_hits if isinstance(rfam_hits, list) else []
    pipeline["rfam_hits"] = len(hit_list)

    # Step 2: resolve an accession and get the family details
    top_acc = ""
    if hit_list and isinstance(hit_list[0], dict):
        top_acc = hit_list[0].get("acc", "")
    top_acc = top_acc or rfam_acc or ""
    if top_acc:
        # Keep the record; it used to be fetched and then thrown away.
        pipeline["rfam_family_data"] = get_rfam_family(top_acc)
        pipeline["rfam_family"] = top_acc

    # Step 3: RNAcentral search
    rna_df = rnacentral_search(sequence[:30])  # truncate for search
    pipeline["rnacentral_hits"] = len(rna_df)

    print(f"ncRNA pipeline: Rfam={pipeline.get('rfam_family', 'none')}, "
          f"RNAcentral={pipeline['rnacentral_hits']} hits")
    return pipeline
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## References
|
|
227
|
+
|
|
228
|
+
### Output Files
|
|
229
|
+
|
|
230
|
+
| ファイル | 形式 |
|
|
231
|
+
|---|---|
|
|
232
|
+
| `results/rfam_family.json` | JSON |
|
|
233
|
+
| `results/rfam_cmscan_hits.json` | JSON |
|
|
234
|
+
| `results/rfam_structures.json` | JSON |
|
|
235
|
+
| `results/rnacentral_search.csv` | CSV |
|
|
236
|
+
|
|
237
|
+
### 利用可能ツール
|
|
238
|
+
|
|
239
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
240
|
+
|---|---|---|
|
|
241
|
+
| Rfam | `Rfam_get_family` | ファミリー情報 |
|
|
242
|
+
| Rfam | `Rfam_search_sequence` | 配列→ファミリー同定 |
|
|
243
|
+
| Rfam | `Rfam_get_covariance_model` | 共分散モデル |
|
|
244
|
+
| Rfam | `Rfam_get_structure_mapping` | PDB マッピング |
|
|
245
|
+
| Rfam | `Rfam_get_tree_data` | 系統樹 |
|
|
246
|
+
| Rfam | `Rfam_get_sequence_regions` | 配列領域 |
|
|
247
|
+
| Rfam | `Rfam_id_to_accession` | ID→アクセッション変換 |
|
|
248
|
+
| RNAcentral | `RNAcentral_search` | ncRNA 検索 |
|
|
249
|
+
| RNAcentral | `RNAcentral_get_by_accession` | 詳細取得 |
|
|
250
|
+
|
|
251
|
+
### 参照スキル
|
|
252
|
+
|
|
253
|
+
| スキル | 関連 |
|
|
254
|
+
|---|---|
|
|
255
|
+
| `scientific-gene-expression-transcriptomics` | 転写産物解析 |
|
|
256
|
+
| `scientific-genome-sequence-tools` | 配列取得 |
|
|
257
|
+
| `scientific-structural-proteomics` | RNA 構造 |
|
|
258
|
+
| `scientific-biothings-idmapping` | ID マッピング |
|
|
259
|
+
|
|
260
|
+
### 依存パッケージ
|
|
261
|
+
|
|
262
|
+
`requests`, `pandas`
|