@nahisaho/satori 0.11.1 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,369 @@
1
+ ---
2
+ name: scientific-protein-domain-family
3
+ description: |
4
+ タンパク質ドメイン・ファミリー解析スキル。InterPro アノテーション検索、
5
+ InterProScan によるシーケンスベースドメイン予測、Pfam/SMART/CDD
6
+ ドメイン分類、ドメインアーキテクチャ可視化、ファミリー系統樹構築。
7
+ ---
8
+
9
+ # Scientific Protein Domain & Family Analysis
10
+
11
+ InterPro / InterProScan を中心としたタンパク質ドメイン解析
12
+ およびファミリー分類パイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - 未知タンパク質のドメイン構成を同定するとき
17
+ - InterPro/Pfam アノテーションでファミリー分類するとき
18
+ - ドメインアーキテクチャを比較・可視化するとき
19
+ - InterProScan でバッチシーケンス解析するとき
20
+ - 進化的なドメイン保存性を評価するとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. InterPro ドメイン検索
27
+
28
+ ```python
29
+ import requests
30
+ import pandas as pd
31
+ import json
32
+
33
+
34
# Base URL of the InterPro REST API; endpoint paths are appended to it.
INTERPRO_API = "https://www.ebi.ac.uk/interpro/api"


def _first_description_text(description, limit=200):
    """Return the first description paragraph, cut to *limit* characters.

    Items are normally dicts carrying a ``"text"`` key; a plain-string item
    is accepted defensively instead of raising ``AttributeError``.
    """
    if not description:
        return ""
    first = description[0]
    if isinstance(first, dict):
        first = first.get("text", "")
    return str(first or "")[:limit]


def search_interpro_domains(query, db_filter=None):
    """
    Search domain/family entries through the InterPro REST API.

    Parameters:
        query: str — domain name or keyword (e.g., "kinase", "SH3")
        db_filter: str — member database to search instead of InterPro
            itself, e.g. "pfam", "smart", "cdd", "prosite"

    Returns:
        pandas.DataFrame with the columns accession, name, type,
        source_database, description and member_databases. The columns are
        present even when nothing matched, so callers can rely on them.

    Raises:
        requests.HTTPError: the API answered with an error status.
        requests.Timeout: the API did not answer within the timeout.

    ToolUniverse:
        InterPro_search_domains(query=query)
    """
    columns = ["accession", "name", "type", "source_database",
               "description", "member_databases"]

    url = f"{INTERPRO_API}/entry/{db_filter or 'interpro'}"
    params = {"search": query, "page_size": 25}

    # Without a timeout a stalled connection would block forever.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    # An empty body (e.g. HTTP 204 No Content) cannot be decoded as JSON;
    # treat it as "no results" rather than crashing.
    data = resp.json() if resp.content else {}

    results = []
    for entry in data.get("results", []):
        meta = entry.get("metadata", {})
        results.append({
            "accession": meta.get("accession", ""),
            "name": meta.get("name", ""),
            "type": meta.get("type", ""),
            "source_database": meta.get("source_database", "interpro"),
            "description": _first_description_text(meta.get("description")),
            "member_databases": meta.get("member_databases", {}),
        })

    df = pd.DataFrame(results, columns=columns)
    print(f"InterPro search '{query}': {len(df)} entries found")
    return df
74
+ ```
75
+
76
+ ## 2. タンパク質のドメイン組成取得
77
+
78
+ ```python
79
def get_protein_domains(uniprot_id):
    """
    Fetch the InterPro entry annotations located on a UniProt protein.

    The entry-by-protein listing (``/entry/interpro/protein/uniprot/<id>``)
    is queried because its payload has the
    ``results[].proteins[].entry_protein_locations[].fragments[]`` layout
    that the parsing below walks. The bare ``/protein/uniprot/<id>``
    endpoint does not provide a ``results`` list, which made the previous
    implementation report zero domains.

    Parameters:
        uniprot_id: str — UniProt accession (e.g., "P04637")

    Returns:
        pandas.DataFrame sorted by start position, one row per located
        fragment, with the columns accession, name, type, start, end and
        source. The columns are present even when nothing was found.

    Raises:
        requests.HTTPError: the API answered with an error status.
        requests.Timeout: the API did not answer within the timeout.

    ToolUniverse:
        InterPro_get_protein_domains(uniprot_id=uniprot_id)
    """
    columns = ["accession", "name", "type", "start", "end", "source"]

    url = f"{INTERPRO_API}/entry/interpro/protein/uniprot/{uniprot_id}"
    params = {"page_size": 50}

    domains = []
    while url:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        if not resp.content:
            # Empty body (e.g. HTTP 204): the protein has no matching entry.
            break
        data = resp.json()

        for entry in data.get("results", []):
            meta = entry.get("metadata", {})
            # Position information of every fragment of every match.
            for protein_info in entry.get("proteins", []):
                for loc in protein_info.get("entry_protein_locations", []):
                    for frag in loc.get("fragments", []):
                        domains.append({
                            "accession": meta.get("accession", ""),
                            "name": meta.get("name", ""),
                            "type": meta.get("type", ""),
                            "start": frag.get("start", 0),
                            "end": frag.get("end", 0),
                            "source": meta.get("source_database", "interpro"),
                        })

        # Follow the pagination link so proteins with more than one page of
        # entries are not truncated; the link already carries the query
        # string, so the initial params must not be sent again.
        url = data.get("next")
        params = None

    df = pd.DataFrame(domains, columns=columns)
    if not df.empty:
        df = df.sort_values("start").reset_index(drop=True)

    print(f"Protein {uniprot_id}: {len(df)} domain annotations")
    return df
118
+ ```
119
+
120
+ ## 3. InterProScan シーケンスベース予測
121
+
122
+ ```python
123
+ import time
124
+
125
+
126
# Base URL of the EBI InterProScan 5 REST service.
IPRSCAN_API = "https://www.ebi.ac.uk/Tools/services/rest/iprscan5"


def _parse_interproscan_matches(results):
    """Flatten an InterProScan JSON document into one dict per location."""
    matches = []
    # A document without a "results" list is treated as a single result.
    for res in results.get("results", [results]):
        for match in res.get("matches", []):
            sig = match.get("signature", {})
            # Both values may be JSON null; fall back to empty dicts.
            library = sig.get("signatureLibraryRelease") or {}
            entry = sig.get("entry") or {}
            for loc in match.get("locations", []):
                matches.append({
                    "accession": sig.get("accession", ""),
                    "name": sig.get("name", ""),
                    "description": sig.get("description", ""),
                    "database": library.get("library", ""),
                    "start": loc.get("start", 0),
                    "end": loc.get("end", 0),
                    "score": loc.get("score", None),
                    "evalue": loc.get("evalue", None),
                    "interpro_accession": entry.get("accession", ""),
                    "interpro_name": entry.get("name", ""),
                })
    return matches


def submit_interproscan(sequence, email="user@example.com",
                        applications=None):
    """
    Run a sequence-based domain prediction through the InterProScan REST API.

    The job is submitted, polled every 30 seconds for up to 10 minutes, and
    its JSON result is flattened into a table.

    Parameters:
        sequence: str — amino-acid sequence (FASTA or raw)
        email: str — contact e-mail sent along with the job
        applications: list — member databases to run, e.g.
            ["Pfam", "SMART", "CDD", "ProSitePatterns"]; defaults to those
            four plus "SUPERFAMILY" and "Gene3D"

    Returns:
        (DataFrame, str) — the matches (columns accession, name,
        description, database, start, end, score, evalue,
        interpro_accession, interpro_name) and the job identifier.

    Raises:
        RuntimeError: the service reported ERROR, FAILURE or NOT_FOUND.
        TimeoutError: the job was still unfinished after the polling window.
        requests.HTTPError: submission or result download failed.

    ToolUniverse:
        InterProScan_scan_sequence(sequence=sequence)
        InterProScan_get_job_status(job_id=job_id)
        InterProScan_get_job_results(job_id=job_id)
    """
    if applications is None:
        applications = ["Pfam", "SMART", "CDD", "ProSitePatterns",
                        "SUPERFAMILY", "Gene3D"]

    request_timeout = 60   # seconds per HTTP call, so a stalled call fails
    poll_interval = 30     # seconds between status checks
    max_wait = 600         # 10 min

    # 1. Submit job
    # NOTE(review): presumably the service accepts these application names
    # comma-joined in a single "appl" field — verify against the service's
    # parameter details.
    payload = {
        "email": email,
        "sequence": sequence,
        "appl": ",".join(applications),
        "goterms": "true",
        "pathways": "true",
    }
    resp = requests.post(f"{IPRSCAN_API}/run", data=payload,
                         timeout=request_timeout)
    resp.raise_for_status()
    job_id = resp.text.strip()
    print(f"InterProScan job submitted: {job_id}")

    # 2. Poll for results
    elapsed = 0
    while elapsed < max_wait:
        status_resp = requests.get(f"{IPRSCAN_API}/status/{job_id}",
                                   timeout=request_timeout)
        status = status_resp.text.strip()
        if status == "FINISHED":
            break
        if status in ("ERROR", "FAILURE", "NOT_FOUND"):
            raise RuntimeError(f"InterProScan job {job_id}: {status}")
        time.sleep(poll_interval)
        elapsed += poll_interval
        print(f" Waiting... ({elapsed}s, status={status})")
    else:
        # The loop ran out without seeing FINISHED: report the timeout
        # instead of requesting the result of an unfinished job.
        raise TimeoutError(
            f"InterProScan job {job_id} did not finish within {max_wait}s"
        )

    # 3. Get results
    result_resp = requests.get(f"{IPRSCAN_API}/result/{job_id}/json",
                               timeout=request_timeout)
    result_resp.raise_for_status()
    results = result_resp.json()

    # Parse matches
    columns = ["accession", "name", "description", "database", "start",
               "end", "score", "evalue", "interpro_accession",
               "interpro_name"]
    df = pd.DataFrame(_parse_interproscan_matches(results), columns=columns)
    print(f"InterProScan: {len(df)} domain matches from {len(applications)} DBs")
    return df, job_id
208
+ ```
209
+
210
+ ## 4. ドメインアーキテクチャ可視化
211
+
212
+ ```python
213
+ import matplotlib
214
+ matplotlib.use("Agg")
215
+ import matplotlib.pyplot as plt
216
+ import matplotlib.patches as mpatches
217
+
218
+
219
def visualize_domain_architecture(domains_df, protein_length=None,
                                  output="figures/domain_architecture.png"):
    """
    Draw a linear domain-architecture diagram and save it as an image.

    The protein is drawn as a horizontal backbone; each row of
    ``domains_df`` becomes a rounded box spanning start..end, coloured by
    its ``source`` value and labelled with its name (or accession).

    Parameters:
        domains_df: DataFrame — columns: [name, start, end, source]
        protein_length: int — full length (None: max(end) + 10 is used)
        output: str — output file path; a bare file name is allowed

    Returns:
        str — the path the figure was written to (same as ``output``).

    Raises:
        ValueError: ``domains_df`` is empty and ``protein_length`` was not
            given, so there is nothing to derive the axis length from.
    """
    import os

    # dirname() is "" for a bare file name and os.makedirs("") would raise.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if protein_length is None:
        if domains_df.empty:
            raise ValueError(
                "protein_length is required when domains_df is empty"
            )
        protein_length = int(domains_df["end"].max()) + 10

    # Colour palette: one colour per annotation source.
    colors = plt.cm.Set3.colors
    unique_sources = domains_df["source"].unique() if "source" in domains_df.columns else [""]
    source_color = {s: colors[i % len(colors)] for i, s in enumerate(unique_sources)}

    fig, ax = plt.subplots(figsize=(14, 3))

    # Backbone
    ax.plot([0, protein_length], [0.5, 0.5], "k-", linewidth=2)

    # Domain boxes
    for _, row in domains_df.iterrows():
        start = row["start"]
        end = row["end"]
        source = row.get("source", "")
        color = source_color.get(source, "steelblue")

        rect = mpatches.FancyBboxPatch(
            (start, 0.2), end - start, 0.6,
            boxstyle="round,pad=0.02",
            facecolor=color, edgecolor="black", linewidth=1
        )
        ax.add_patch(rect)

        # Label: prefer the name, fall back to the accession; anything that
        # is not a non-empty string (None, NaN, ...) is ignored so len()
        # and slicing below are always valid.
        label = row.get("name")
        if not isinstance(label, str) or not label:
            label = row.get("accession")
        if not isinstance(label, str):
            label = ""
        if len(label) > 12:
            label = label[:10] + ".."
        mid = (start + end) / 2
        ax.text(mid, 0.5, label, ha="center", va="center",
                fontsize=7, fontweight="bold")

    ax.set_xlim(-10, protein_length + 10)
    ax.set_ylim(-0.5, 1.5)
    ax.set_xlabel("Residue position")
    ax.set_title("Domain Architecture")
    ax.set_yticks([])

    fig.tight_layout()
    fig.savefig(output, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    print(f"Domain architecture: {output}")
    return output
278
+ ```
279
+
280
+ ## 5. ドメインファミリー比較
281
+
282
+ ```python
283
def compare_domain_architectures(protein_list):
    """
    Compare the domain architectures of several proteins.

    Each accession is looked up with ``get_protein_domains``; a lookup that
    raises is reported on stdout and recorded as an empty table, so one bad
    accession does not abort the comparison.

    Parameters:
        protein_list: list of str — UniProt accessions

    Returns:
        (dict, dict) — a summary (proteins_analyzed, proteins_with_domains,
        common_domains, total_unique_domains, per_protein) and the mapping
        of accession to its domain DataFrame.
    """
    frames = {}
    for accession in protein_list:
        try:
            frames[accession] = get_protein_domains(accession)
        except Exception as e:
            print(f" Warning: {accession} failed — {e}")
            frames[accession] = pd.DataFrame()

    # Set of entry accessions per protein (empty for empty tables).
    accession_sets = {
        prot: set() if frame.empty else set(frame["accession"].unique())
        for prot, frame in frames.items()
    }

    # Domains shared by every protein, and the pool of all domains seen.
    common = set()
    all_unique = set()
    if accession_sets:
        groups = list(accession_sets.values())
        common = set.intersection(*groups)
        all_unique = set.union(*groups)

    with_domains = 0
    for frame in frames.values():
        if not frame.empty:
            with_domains += 1

    per_protein = {}
    for prot, members in accession_sets.items():
        per_protein[prot] = {"count": len(members), "domains": sorted(members)}

    summary = {
        "proteins_analyzed": len(protein_list),
        "proteins_with_domains": with_domains,
        "common_domains": sorted(common),
        "total_unique_domains": len(all_unique),
        "per_protein": per_protein,
    }

    print(f"Domain comparison: {len(protein_list)} proteins, "
          f"{len(common)} common domains, "
          f"{len(all_unique)} unique domains total")
    return summary, frames
332
+ ```
333
+
334
+ ## References
335
+
336
+ ### Output Files
337
+
338
+ | ファイル | 形式 |
339
+ |---|---|
340
+ | `results/interpro_search.csv` | CSV |
341
+ | `results/protein_domains.csv` | CSV |
342
+ | `results/interproscan_results.csv` | CSV |
343
+ | `results/domain_comparison.json` | JSON |
344
+ | `figures/domain_architecture.png` | PNG |
345
+
346
+ ### 利用可能ツール
347
+
348
+ | カテゴリ | 主要ツール | 用途 |
349
+ |---|---|---|
350
+ | InterPro | `InterPro_search_domains` | ドメイン検索 |
351
+ | InterPro | `InterPro_get_protein_domains` | タンパク質ドメイン取得 |
352
+ | InterPro | `InterPro_get_domain_details` | ドメイン詳細 |
353
+ | InterProScan | `InterProScan_scan_sequence` | 配列ベース予測 |
354
+ | InterProScan | `InterProScan_get_job_status` | ジョブステータス |
355
+ | InterProScan | `InterProScan_get_job_results` | 結果取得 |
356
+
357
+ ### 参照スキル
358
+
359
+ | スキル | 関連 |
360
+ |---|---|
361
+ | `scientific-protein-structure-analysis` | 3D 構造解析 |
362
+ | `scientific-protein-interaction-network` | PPI ネットワーク |
363
+ | `scientific-sequence-alignment` | 多重配列アラインメント |
364
+ | `scientific-phylogenetics` | 系統樹構築 |
365
+ | `scientific-gene-expression-transcriptomics` | 発現相関 |
366
+
367
+ ### 依存パッケージ
368
+
369
+ `requests`, `pandas`, `matplotlib`, `json` (stdlib), `time` (stdlib)