@nahisaho/satori 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +87 -47
- package/package.json +1 -1
- package/src/.github/skills/scientific-cancer-genomics/SKILL.md +287 -0
- package/src/.github/skills/scientific-clinical-reporting/SKILL.md +324 -0
- package/src/.github/skills/scientific-literature-search/SKILL.md +443 -0
- package/src/.github/skills/scientific-metabolomics-databases/SKILL.md +288 -0
- package/src/.github/skills/scientific-molecular-docking/SKILL.md +303 -0
- package/src/.github/skills/scientific-pathway-enrichment/SKILL.md +449 -0
- package/src/.github/skills/scientific-protein-domain-family/SKILL.md +369 -0
- package/src/.github/skills/scientific-protein-interaction-network/SKILL.md +352 -0
- package/src/.github/skills/scientific-systematic-review/SKILL.md +361 -0
- package/src/.github/skills/scientific-variant-effect-prediction/SKILL.md +325 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-protein-domain-family
|
|
3
|
+
description: |
|
|
4
|
+
タンパク質ドメイン・ファミリー解析スキル。InterPro アノテーション検索、
|
|
5
|
+
InterProScan によるシーケンスベースドメイン予測、Pfam/SMART/CDD
|
|
6
|
+
ドメイン分類、ドメインアーキテクチャ可視化、ファミリー系統樹構築。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Protein Domain & Family Analysis
|
|
10
|
+
|
|
11
|
+
InterPro / InterProScan を中心としたタンパク質ドメイン解析
|
|
12
|
+
およびファミリー分類パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 未知タンパク質のドメイン構成を同定するとき
|
|
17
|
+
- InterPro/Pfam アノテーションでファミリー分類するとき
|
|
18
|
+
- ドメインアーキテクチャを比較・可視化するとき
|
|
19
|
+
- InterProScan でバッチシーケンス解析するとき
|
|
20
|
+
- 進化的なドメイン保存性を評価するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. InterPro ドメイン検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import json
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Base URL of the InterPro REST API; the functions in this file append
# resource paths such as /entry/... to it.
INTERPRO_API = "https://www.ebi.ac.uk/interpro/api"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def search_interpro_domains(query, db_filter=None):
    """
    Search entries by keyword through the InterPro REST API.

    Parameters:
        query: str — domain name or keyword (e.g., "kinase", "SH3")
        db_filter: str — optional database to search instead of InterPro
            itself ("pfam", "smart", "cdd", "prosite", ...)

    Returns:
        pandas.DataFrame — one row per entry on the first result page
        (page size 25) with columns accession, name, type, source_database,
        description (first description, cut to 200 characters) and
        member_databases. Empty, without columns, when nothing matches.

    Raises:
        requests.HTTPError — when the API answers with an error status.

    ToolUniverse:
        InterPro_search_domains(query=query)
    """
    database = db_filter if db_filter else "interpro"
    url = f"{INTERPRO_API}/entry/{database}"
    params = {"search": query, "page_size": 25}

    # A timeout keeps a stalled connection from blocking the caller forever.
    resp = requests.get(url, params=params, timeout=60)
    resp.raise_for_status()

    # "No matching data" is reported as HTTP 204 with an empty body, which
    # resp.json() cannot decode; treat it as an empty result set.
    if resp.status_code == 204 or not resp.content:
        data = {}
    else:
        data = resp.json()

    results = []
    for entry in data.get("results", []):
        meta = entry.get("metadata", {})

        # The description is read as a list whose first item holds the text.
        # Accept both {"text": ...} objects and plain strings so a payload
        # variation does not abort the whole search.
        descriptions = meta.get("description") or []
        first = descriptions[0] if descriptions else ""
        if isinstance(first, dict):
            text = first.get("text") or ""
        else:
            text = str(first)

        results.append({
            "accession": meta.get("accession", ""),
            "name": meta.get("name", ""),
            "type": meta.get("type", ""),
            "source_database": meta.get("source_database", "interpro"),
            "description": text[:200],
            "member_databases": meta.get("member_databases", {}),
        })

    df = pd.DataFrame(results)
    print(f"InterPro search '{query}': {len(df)} entries found")
    return df
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. タンパク質のドメイン組成取得
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
def get_protein_domains(uniprot_id):
    """
    Fetch the InterPro entries matched to a UniProt protein, with coordinates.

    Parameters:
        uniprot_id: str — UniProt accession (e.g., "P04637")

    Returns:
        pandas.DataFrame — one row per matched fragment with columns
        accession, name, type, start, end and source, sorted by start.
        Empty, without columns, when the API reports no entries. Only the
        first page (page size 50) is read.

    Raises:
        requests.HTTPError — when the API answers with an error status.

    ToolUniverse:
        InterPro_get_protein_domains(uniprot_id=uniprot_id)
    """
    # Entries matched to a protein are served under
    # /entry/<db>/protein/uniprot/<id>. The bare /protein/uniprot/<id>
    # resource returns only the protein's own metadata and has no "results"
    # list, so the parsing loop below would never see a single domain.
    url = f"{INTERPRO_API}/entry/interpro/protein/uniprot/{uniprot_id}"
    params = {"page_size": 50}

    # A timeout keeps a stalled connection from blocking the caller forever.
    resp = requests.get(url, params=params, timeout=60)
    resp.raise_for_status()

    # "No matching data" is reported as HTTP 204 with an empty body, which
    # resp.json() cannot decode; treat it as "no domains".
    if resp.status_code == 204 or not resp.content:
        data = {}
    else:
        data = resp.json()

    domains = []
    for entry in data.get("results", []):
        meta = entry.get("metadata", {})
        # Each entry lists the proteins it matches; every match location is
        # made of one or more fragments carrying the residue coordinates.
        for protein_info in entry.get("proteins", []):
            for loc in protein_info.get("entry_protein_locations", []):
                for frag in loc.get("fragments", []):
                    domains.append({
                        "accession": meta.get("accession", ""),
                        "name": meta.get("name", ""),
                        "type": meta.get("type", ""),
                        "start": frag.get("start", 0),
                        "end": frag.get("end", 0),
                        "source": meta.get("source_database", "interpro"),
                    })

    df = pd.DataFrame(domains)
    if not df.empty:
        df = df.sort_values("start").reset_index(drop=True)

    print(f"Protein {uniprot_id}: {len(df)} domain annotations")
    return df
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## 3. InterProScan シーケンスベース予測
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
import time
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Base URL of the EBI InterProScan 5 REST service; the /run, /status/<job>
# and /result/<job>/json paths are appended to it when submitting a job.
IPRSCAN_API = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5"
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def submit_interproscan(sequence, email="user@example.com",
                        applications=None, max_wait=600, poll_interval=30):
    """
    Run a sequence-based domain prediction through the InterProScan REST API.

    The job is submitted, its status is polled until it finishes, and the
    JSON result is flattened into one row per match location.

    Parameters:
        sequence: str — amino-acid sequence (FASTA or raw)
        email: str — e-mail address sent in the job submission payload
        applications: list — member databases to run, e.g.
            ["Pfam", "SMART", "CDD", "ProSitePatterns"]; defaults to those
            four plus "SUPERFAMILY" and "Gene3D"
        max_wait: int — maximum number of seconds to wait for the job
        poll_interval: int — seconds to sleep between status checks

    Returns:
        tuple (DataFrame, str) — the match table (columns accession, name,
        description, database, start, end, score, evalue,
        interpro_accession, interpro_name) and the job identifier.

    Raises:
        ValueError — when poll_interval is not positive.
        RuntimeError — when the service reports ERROR, FAILURE or NOT_FOUND.
        TimeoutError — when the job has not finished after max_wait seconds.
        requests.HTTPError — when submission or result download fails.

    ToolUniverse:
        InterProScan_scan_sequence(sequence=sequence)
        InterProScan_get_job_status(job_id=job_id)
        InterProScan_get_job_results(job_id=job_id)
    """
    if poll_interval <= 0:
        # A non-positive interval would never advance the elapsed counter.
        raise ValueError("poll_interval must be a positive number of seconds")

    if applications is None:
        applications = ["Pfam", "SMART", "CDD", "ProSitePatterns",
                        "SUPERFAMILY", "Gene3D"]

    # Per-request network timeout, independent of the overall job wait.
    request_timeout = 60

    # 1. Submit job
    payload = {
        "email": email,
        "sequence": sequence,
        "appl": ",".join(applications),
        "goterms": "true",
        "pathways": "true",
    }
    resp = requests.post(f"{IPRSCAN_API}/run", data=payload,
                         timeout=request_timeout)
    resp.raise_for_status()
    job_id = resp.text.strip()
    print(f"InterProScan job submitted: {job_id}")

    # 2. Poll for results
    elapsed = 0
    while True:
        status_resp = requests.get(f"{IPRSCAN_API}/status/{job_id}",
                                   timeout=request_timeout)
        status = status_resp.text.strip()
        if status == "FINISHED":
            break
        if status in ("ERROR", "FAILURE", "NOT_FOUND"):
            raise RuntimeError(f"InterProScan job {job_id}: {status}")
        if elapsed >= max_wait:
            # Stop here rather than requesting results of an unfinished job.
            raise TimeoutError(
                f"InterProScan job {job_id} not finished after {elapsed}s "
                f"(last status: {status})"
            )
        time.sleep(poll_interval)
        elapsed += poll_interval
        print(f" Waiting... ({elapsed}s, status={status})")

    # 3. Get results
    result_resp = requests.get(f"{IPRSCAN_API}/result/{job_id}/json",
                               timeout=request_timeout)
    result_resp.raise_for_status()
    results = result_resp.json()

    # Parse matches. When the payload has no "results" list, the payload
    # itself is treated as the single result object.
    matches = []
    for res in results.get("results", [results]):
        for match in res.get("matches", []):
            sig = match.get("signature", {})
            # Either object may be missing or null; fall back to an empty
            # dict so the lookups below return "".
            library = sig.get("signatureLibraryRelease") or {}
            entry = sig.get("entry") or {}
            for loc in match.get("locations", []):
                matches.append({
                    "accession": sig.get("accession", ""),
                    "name": sig.get("name", ""),
                    "description": sig.get("description", ""),
                    "database": library.get("library", ""),
                    "start": loc.get("start", 0),
                    "end": loc.get("end", 0),
                    "score": loc.get("score", None),
                    "evalue": loc.get("evalue", None),
                    "interpro_accession": entry.get("accession", ""),
                    "interpro_name": entry.get("name", ""),
                })

    df = pd.DataFrame(matches)
    print(f"InterProScan: {len(df)} domain matches from {len(applications)} DBs")
    return df, job_id
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## 4. ドメインアーキテクチャ可視化
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import matplotlib
|
|
214
|
+
matplotlib.use("Agg")
|
|
215
|
+
import matplotlib.pyplot as plt
|
|
216
|
+
import matplotlib.patches as mpatches
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def visualize_domain_architecture(domains_df, protein_length=None,
                                  output="figures/domain_architecture.png"):
    """
    Draw a linear domain-architecture figure and save it to a file.

    The protein is drawn as a horizontal backbone line; every row of
    domains_df becomes a rounded box from start to end, coloured by its
    source and labelled with its name (or accession).

    Parameters:
        domains_df: DataFrame — columns: [name, start, end, source]
        protein_length: int — full length (None: max(end) + 10 is used)
        output: str — output file path

    Returns:
        str — the output path that was written.

    Raises:
        ValueError — when domains_df is empty and protein_length is None,
            because no backbone length can be derived.
    """
    import os

    # dirname() is "" for a bare file name, and makedirs("") raises.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if protein_length is None:
        if domains_df.empty:
            raise ValueError(
                "protein_length is required when domains_df is empty"
            )
        protein_length = int(domains_df["end"].max()) + 10

    # Colour palette: one colour per source, cycling if there are more
    # sources than palette entries.
    colors = plt.cm.Set3.colors
    unique_sources = domains_df["source"].unique() if "source" in domains_df.columns else [""]
    source_color = {s: colors[i % len(colors)] for i, s in enumerate(unique_sources)}

    fig, ax = plt.subplots(figsize=(14, 3))
    try:
        # Backbone
        ax.plot([0, protein_length], [0.5, 0.5], "k-", linewidth=2)

        # Domain boxes
        for _, row in domains_df.iterrows():
            start = row["start"]
            end = row["end"]
            source = row.get("source", "")
            color = source_color.get(source, "steelblue")

            rect = mpatches.FancyBboxPatch(
                (start, 0.2), end - start, 0.6,
                boxstyle="round,pad=0.02",
                facecolor=color, edgecolor="black", linewidth=1
            )
            ax.add_patch(rect)

            # Label, shortened so that it fits inside small boxes
            mid = (start + end) / 2
            label = row.get("name", row.get("accession", ""))
            if label is None or label != label:  # None or NaN (NaN != NaN)
                label = ""
            label = str(label)
            if len(label) > 12:
                label = label[:10] + ".."
            ax.text(mid, 0.5, label, ha="center", va="center",
                    fontsize=7, fontweight="bold")

        ax.set_xlim(-10, protein_length + 10)
        ax.set_ylim(-0.5, 1.5)
        ax.set_xlabel("Residue position")
        ax.set_title("Domain Architecture")
        ax.set_yticks([])

        fig.tight_layout()
        fig.savefig(output, dpi=150, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    print(f"Domain architecture: {output}")
    return output
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## 5. ドメインファミリー比較
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
def compare_domain_architectures(protein_list):
    """
    Compare the domain content of several proteins.

    Domain annotations are fetched per protein with get_protein_domains; a
    protein whose lookup raises is reported with a warning and treated as
    having no domains.

    Parameters:
        protein_list: list of str — UniProt accessions

    Returns:
        tuple (dict, dict) — a summary (proteins_analyzed,
        proteins_with_domains, common_domains, total_unique_domains,
        per_protein) and the per-protein annotation DataFrames keyed by
        accession.
    """
    tables = {}
    for uniprot_id in protein_list:
        try:
            tables[uniprot_id] = get_protein_domains(uniprot_id)
        except Exception as err:
            print(f" Warning: {uniprot_id} failed — {err}")
            tables[uniprot_id] = pd.DataFrame()

    # Set of domain accessions seen in each protein
    accessions = {
        prot: set() if table.empty else set(table["accession"].unique())
        for prot, table in tables.items()
    }

    # Domains shared by every protein, and the pool of all domains seen
    shared, pooled = set(), set()
    if accessions:
        groups = list(accessions.values())
        shared = set.intersection(*groups)
        pooled = set.union(*groups)

    with_domains = sum(not table.empty for table in tables.values())
    per_protein = {}
    for prot, found in accessions.items():
        per_protein[prot] = {"count": len(found), "domains": sorted(found)}

    summary = {
        "proteins_analyzed": len(protein_list),
        "proteins_with_domains": with_domains,
        "common_domains": sorted(shared),
        "total_unique_domains": len(pooled),
        "per_protein": per_protein,
    }

    print(f"Domain comparison: {len(protein_list)} proteins, "
          f"{len(shared)} common domains, "
          f"{len(pooled)} unique domains total")
    return summary, tables
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
## References
|
|
335
|
+
|
|
336
|
+
### Output Files
|
|
337
|
+
|
|
338
|
+
| ファイル | 形式 |
|
|
339
|
+
|---|---|
|
|
340
|
+
| `results/interpro_search.csv` | CSV |
|
|
341
|
+
| `results/protein_domains.csv` | CSV |
|
|
342
|
+
| `results/interproscan_results.csv` | CSV |
|
|
343
|
+
| `results/domain_comparison.json` | JSON |
|
|
344
|
+
| `figures/domain_architecture.png` | PNG |
|
|
345
|
+
|
|
346
|
+
### 利用可能ツール
|
|
347
|
+
|
|
348
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
349
|
+
|---|---|---|
|
|
350
|
+
| InterPro | `InterPro_search_domains` | ドメイン検索 |
|
|
351
|
+
| InterPro | `InterPro_get_protein_domains` | タンパク質ドメイン取得 |
|
|
352
|
+
| InterPro | `InterPro_get_domain_details` | ドメイン詳細 |
|
|
353
|
+
| InterProScan | `InterProScan_scan_sequence` | 配列ベース予測 |
|
|
354
|
+
| InterProScan | `InterProScan_get_job_status` | ジョブステータス |
|
|
355
|
+
| InterProScan | `InterProScan_get_job_results` | 結果取得 |
|
|
356
|
+
|
|
357
|
+
### 参照スキル
|
|
358
|
+
|
|
359
|
+
| スキル | 関連 |
|
|
360
|
+
|---|---|
|
|
361
|
+
| `scientific-protein-structure-analysis` | 3D 構造解析 |
|
|
362
|
+
| `scientific-protein-interaction-network` | PPI ネットワーク |
|
|
363
|
+
| `scientific-sequence-alignment` | 多重配列アラインメント |
|
|
364
|
+
| `scientific-phylogenetics` | 系統樹構築 |
|
|
365
|
+
| `scientific-gene-expression-transcriptomics` | 発現相関 |
|
|
366
|
+
|
|
367
|
+
### 依存パッケージ
|
|
368
|
+
|
|
369
|
+
`requests`, `pandas`, `matplotlib`, `json` (stdlib), `time` (stdlib)
|