@nahisaho/satori 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -29
- package/package.json +1 -1
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
- package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
- package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
- package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
- package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
- package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
- package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
- package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
- package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-compound-screening
|
|
3
|
+
description: |
|
|
4
|
+
化合物スクリーニングスキル。ZINC データベースを活用した購入可能化合物検索、
|
|
5
|
+
SMILES/名前ベースの類似性検索、カタログフィルタリング、
|
|
6
|
+
バーチャルスクリーニング前処理パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Compound Screening
|
|
10
|
+
|
|
11
|
+
ZINC データベースを活用した化合物ライブラリ検索・
|
|
12
|
+
バーチャルスクリーニング前処理パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 購入可能な化合物ライブラリを検索するとき
|
|
17
|
+
- SMILES 構造式から類似化合物を探すとき
|
|
18
|
+
- 化合物名からデータベースレコードを取得するとき
|
|
19
|
+
- ベンダーカタログの絞り込みを行うとき
|
|
20
|
+
- バーチャルスクリーニング用の化合物セットを準備するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. ZINC 化合物名検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
# Base URL of the ZINC15 web service; the ZINC helper functions in this
# skill build their request URLs by appending a path to it.
ZINC_API = "https://zinc15.docking.org"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def zinc_search_by_name(name, max_results=20):
    """
    Search the ZINC database for substances by compound name.

    Parameters:
        name: str — compound name (e.g., "aspirin")
        max_results: int — value sent as the ``count`` query parameter

    Returns:
        pandas.DataFrame with the columns zinc_id, name, smiles, mwt, logp
        and purchasable; a field missing from a record becomes "".

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        ZINC_search_by_name(name=name)
    """
    response = requests.get(
        f"{ZINC_API}/substances/search",
        params={"q": name, "count": max_results},
    )
    response.raise_for_status()

    # Output column -> key read from each API record. Only "purchasable"
    # is renamed (it is read from "purchasability").
    column_sources = {
        "zinc_id": "zinc_id",
        "name": "name",
        "smiles": "smiles",
        "mwt": "mwt",
        "logp": "logp",
        "purchasable": "purchasability",
    }
    rows = [
        {column: record.get(source, "")
         for column, source in column_sources.items()}
        for record in response.json()
    ]

    hits = pd.DataFrame(rows)
    print(f"ZINC search '{name}': {len(hits)} compounds")
    return hits
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## 2. ZINC SMILES 類似性検索
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
def zinc_search_by_smiles(smiles, similarity=0.7, max_results=20):
    """
    Run a ZINC similarity search starting from a SMILES string.

    Parameters:
        smiles: str — SMILES string
        similarity: float — Tanimoto similarity threshold (0-1)
        max_results: int — value sent as the ``count`` query parameter

    Returns:
        pandas.DataFrame with the columns zinc_id, smiles, similarity, mwt,
        logp and purchasable; a field missing from a record becomes "".

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        ZINC_search_by_smiles(smiles=smiles)
    """
    query = {
        "smiles": smiles,
        "similarity": similarity,
        "count": max_results,
    }
    response = requests.get(f"{ZINC_API}/substances/search", params=query)
    response.raise_for_status()

    # Output column -> key read from each API record. Only "purchasable"
    # is renamed (it is read from "purchasability").
    column_sources = {
        "zinc_id": "zinc_id",
        "smiles": "smiles",
        "similarity": "similarity",
        "mwt": "mwt",
        "logp": "logp",
        "purchasable": "purchasability",
    }
    rows = [
        {column: record.get(source, "")
         for column, source in column_sources.items()}
        for record in response.json()
    ]

    hits = pd.DataFrame(rows)
    print(f"ZINC SMILES search: {len(hits)} similar compounds "
          f"(threshold={similarity})")
    return hits
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## 3. ZINC 化合物詳細取得
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
def zinc_get_substance(zinc_id):
    """
    Fetch the full record of one ZINC substance by its ZINC ID.

    Parameters:
        zinc_id: str — ZINC ID (e.g., "ZINC000000000001")

    Returns:
        (info, data) — ``info`` is a flat dict of selected properties
        (missing fields become ""), ``data`` is the decoded JSON response
        exactly as returned by the service.

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        ZINC_get_substance(zinc_id=zinc_id)
    """
    response = requests.get(f"{ZINC_API}/substances/{zinc_id}.json")
    response.raise_for_status()
    payload = response.json()

    # Fields copied under the same key they have in the payload.
    same_name_fields = (
        "zinc_id",
        "name",
        "smiles",
        "inchikey",
        "mwt",
        "logp",
        "num_rotatable_bonds",
        "num_hba",
        "num_hbd",
        "tpsa",
    )
    summary = {field: payload.get(field, "") for field in same_name_fields}
    # The payload key "purchasability" is exposed as "purchasable".
    summary["purchasable"] = payload.get("purchasability", "")

    print(f"ZINC {zinc_id}: {summary['name']} (MW={summary['mwt']})")
    return summary, payload
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## 4. ZINC カタログ一覧
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
def zinc_get_catalogs():
    """
    List the catalogs (vendors) available in ZINC.

    Returns:
        pandas.DataFrame with the columns catalog_name, short_name,
        num_substances and url. Missing text fields become "" and a missing
        substance count becomes 0.

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        ZINC_get_catalogs()
    """
    response = requests.get(f"{ZINC_API}/catalogs.json")
    response.raise_for_status()

    rows = [
        {
            "catalog_name": entry.get("name", ""),
            "short_name": entry.get("short_name", ""),
            "num_substances": entry.get("num_substances", 0),
            "url": entry.get("url", ""),
        }
        for entry in response.json()
    ]

    catalogs = pd.DataFrame(rows)
    print(f"ZINC catalogs: {len(catalogs)} vendors")
    return catalogs
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## 5. バーチャルスクリーニング前処理パイプライン
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
def virtual_screening_prep(query_smiles, lipinski=True, max_compounds=100,
                           similarity=0.6):
    """
    Prepare a compound set for virtual screening.

    Searches ZINC for compounds similar to ``query_smiles``, optionally
    applies a Lipinski-style filter and returns the hits ordered by
    decreasing similarity.

    Parameters:
        query_smiles: str — SMILES string of the query compound
        lipinski: bool — when True, keep only rows with mwt <= 500 and
            logp <= 5. Only these two Rule-of-Five criteria are applied
            because the search results carry no H-bond donor/acceptor
            counts. Rows whose mwt or logp is not numeric are dropped.
        max_compounds: int — maximum number of search hits to request
        similarity: float — Tanimoto threshold forwarded to
            ``zinc_search_by_smiles`` (default 0.6, the value that was
            previously hard-coded)

    Returns:
        pandas.DataFrame — filtered hits sorted by ``similarity``
        (descending); the empty search result is returned unchanged when
        nothing was found.

    ToolUniverse (cross-tool):
        ZINC_search_by_smiles(smiles=query_smiles) → ZINC_get_substance(zinc_id)
    """
    # Step 1: Similar compound search
    df = zinc_search_by_smiles(query_smiles, similarity=similarity,
                               max_results=max_compounds)

    if df.empty:
        print("No similar compounds found")
        return df

    # Step 2: Lipinski filter
    if lipinski:
        df["mwt"] = pd.to_numeric(df["mwt"], errors="coerce")
        df["logp"] = pd.to_numeric(df["logp"], errors="coerce")
        before = len(df)
        # NaN compares False, so rows with non-numeric values are removed.
        keep = (df["mwt"] <= 500) & (df["logp"] <= 5)
        # .copy() detaches the filtered rows from the original frame so the
        # column assignment below is not a chained assignment on a slice
        # (which raises SettingWithCopyWarning).
        df = df.loc[keep].copy()
        print(f"Lipinski filter: {before} → {len(df)} compounds")

    # Step 3: Sort by similarity
    df["similarity"] = pd.to_numeric(df["similarity"], errors="coerce")
    df = df.sort_values("similarity", ascending=False)

    print(f"VS prep: {len(df)} compounds ready for screening")
    return df
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## References
|
|
213
|
+
|
|
214
|
+
### Output Files
|
|
215
|
+
|
|
216
|
+
| ファイル | 形式 |
|
|
217
|
+
|---|---|
|
|
218
|
+
| `results/zinc_search.csv` | CSV |
|
|
219
|
+
| `results/zinc_similar.csv` | CSV |
|
|
220
|
+
| `results/zinc_substance.json` | JSON |
|
|
221
|
+
| `results/zinc_catalogs.csv` | CSV |
|
|
222
|
+
| `results/vs_library.csv` | CSV |
|
|
223
|
+
|
|
224
|
+
### 利用可能ツール
|
|
225
|
+
|
|
226
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
227
|
+
|---|---|---|
|
|
228
|
+
| ZINC | `ZINC_search_by_name` | 化合物名検索 |
|
|
229
|
+
| ZINC | `ZINC_search_by_smiles` | SMILES 類似性検索 |
|
|
230
|
+
| ZINC | `ZINC_get_substance` | 化合物詳細 |
|
|
231
|
+
| ZINC | `ZINC_get_catalogs` | カタログ一覧 |
|
|
232
|
+
|
|
233
|
+
### 参照スキル
|
|
234
|
+
|
|
235
|
+
| スキル | 関連 |
|
|
236
|
+
|---|---|
|
|
237
|
+
| `scientific-compound-similarity` | 化合物類似性 |
|
|
238
|
+
| `scientific-pharmacology-targets` | 薬理学ターゲット |
|
|
239
|
+
| `scientific-molecular-docking` | 分子ドッキング |
|
|
240
|
+
| `scientific-drug-target-interaction` | DTI 解析 |
|
|
241
|
+
| `scientific-admet-toxicity` | ADMET 毒性 |
|
|
242
|
+
|
|
243
|
+
### 依存パッケージ
|
|
244
|
+
|
|
245
|
+
`requests`, `pandas`
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-genome-sequence-tools
|
|
3
|
+
description: |
|
|
4
|
+
ゲノム配列解析総合スキル。Ensembl ゲノムブラウザ、dbSNP 変異データ、
|
|
5
|
+
BLAST 相同性検索、NCBI Nucleotide 配列取得、GDC がんゲノミクスデータの
|
|
6
|
+
統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Genome Sequence Tools
|
|
10
|
+
|
|
11
|
+
公的ゲノムデータベース (Ensembl, dbSNP, BLAST, NCBI, GDC) を横断した
|
|
12
|
+
配列検索・変異アノテーション・がんゲノミクスパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- ゲノム配列・エクソン構造を Ensembl から取得するとき
|
|
17
|
+
- rsID から変異のアレル頻度を調べるとき
|
|
18
|
+
- BLAST で塩基/アミノ酸配列の相同性検索を行うとき
|
|
19
|
+
- NCBI Nucleotide から配列をフェッチするとき
|
|
20
|
+
- GDC がんゲノミクスデータ (体細胞変異, CNV, 発現) を取得するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. dbSNP 変異情報取得
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_dbsnp_variant(rsid):
    """
    Fetch variant information (including allele frequencies) for one rsID
    from the NCBI Variation Services RefSNP endpoint.

    Parameters:
        rsid: str — RefSNP identifier, e.g. "rs7412". The "rs" prefix is
            optional and case-insensitive; the remainder must be digits.

    Returns:
        (info, df_freq) — ``info`` is a dict with rsid, create_date and
        update_date; ``df_freq`` is a pandas.DataFrame with one row per
        frequency entry (study, allele, count, total).

    Raises:
        ValueError — when ``rsid`` is not an rs number.
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        dbsnp_get_variant_by_rsid(rsid=rsid)
        dbsnp_get_frequencies(rsid=rsid)
        dbsnp_search_by_gene(gene_symbol=gene_symbol)
    """
    import re

    # str.lstrip("rs") removes any run of leading 'r'/'s' characters, so an
    # identifier such as "ss123" would silently be looked up as RefSNP 123.
    # Parse the prefix explicitly and reject anything that is not an rs number.
    parsed = re.fullmatch(r"(?:rs)?([0-9]+)", str(rsid).strip(),
                          flags=re.IGNORECASE)
    if parsed is None:
        raise ValueError(f"Invalid rsID: {rsid!r}")
    refsnp_number = parsed.group(1)

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{refsnp_number}"
    # A timeout keeps the call from blocking forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    # Extract primary info
    info = {
        "rsid": f"rs{data.get('refsnp_id', '')}",
        "create_date": data.get("create_date", ""),
        "update_date": data.get("update_date", ""),
    }

    # Allele frequencies
    # NOTE(review): the key names read below are taken as-is; confirm them
    # against the RefSNP JSON schema.
    alleles = data.get("primary_snapshot_data", {}).get(
        "allele_annotations", []
    )
    freq_data = [
        {
            "study": freq_entry.get("study_name", ""),
            "allele": freq_entry.get("allele", ""),
            "count": freq_entry.get("allele_count", 0),
            "total": freq_entry.get("total_count", 0),
        }
        for allele in alleles
        for freq_entry in allele.get("frequency", [])
    ]

    df_freq = pd.DataFrame(freq_data)
    print(f"dbSNP {info['rsid']}: {len(df_freq)} frequency entries")
    return info, df_freq
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. BLAST 相同性検索
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import time
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def blast_search(sequence, program="blastn", database="nt", max_hits=10,
                 poll_interval=10, max_polls=60):
    """
    Run a homology search through the NCBI BLAST URL API.

    The query is submitted with CMD=Put, then the request ID (RID) is
    polled with CMD=Get until the job is no longer reported as WAITING.

    Parameters:
        sequence: str — query sequence (nucleotide or protein)
        program: str — "blastn", "blastp", "blastx", "tblastn"
        database: str — "nt", "nr", "refseq_rna", etc.
        max_hits: int — value sent as HITLIST_SIZE
        poll_interval: float — seconds to sleep before each status check
        max_polls: int — maximum number of status checks

    Returns:
        The decoded JSON result when the response Content-Type starts with
        "application/json", otherwise the raw response text.

    Raises:
        ValueError — empty query, or no RID found in the submission reply.
        RuntimeError — the service reports Status=FAILED or Status=UNKNOWN.
        TimeoutError — the job is still WAITING after ``max_polls`` checks.
        requests.HTTPError — an HTTP error on submission or while polling.

    ToolUniverse:
        BLAST_nucleotide_search(sequence=sequence, database=database)
        BLAST_protein_search(sequence=sequence, database=database)
    """
    import re

    # Fail before touching the network; an empty submission yields no RID
    # and would end in the same ValueError anyway.
    if not sequence or not sequence.strip():
        raise ValueError("BLAST query sequence is empty")

    put_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    params = {
        "CMD": "Put",
        "PROGRAM": program,
        "DATABASE": database,
        "QUERY": sequence,
        "FORMAT_TYPE": "JSON2",
        "HITLIST_SIZE": max_hits,
    }
    resp = requests.post(put_url, data=params, timeout=60)
    resp.raise_for_status()

    # Extract RID
    rid_match = re.search(r"RID = (\S+)", resp.text)
    if not rid_match:
        raise ValueError("BLAST RID not found")
    rid = rid_match.group(1)
    print(f"BLAST submitted: RID={rid}")

    # Poll for results
    # NOTE(review): NCBI usage guidelines ask clients not to poll a single
    # RID too frequently; confirm that the default interval is acceptable.
    for _ in range(max_polls):
        time.sleep(poll_interval)
        check = requests.get(put_url, params={
            "CMD": "Get", "RID": rid, "FORMAT_TYPE": "JSON2"
        }, timeout=60)
        # Without this an HTTP error page would be returned as the result.
        check.raise_for_status()
        if "Status=WAITING" in check.text:
            continue
        if "Status=FAILED" in check.text:
            raise RuntimeError(f"BLAST search failed: RID={rid}")
        if "Status=UNKNOWN" in check.text:
            raise RuntimeError(f"BLAST RID expired or unknown: RID={rid}")
        break
    else:
        # Polling budget exhausted: do not return the WAITING page as data.
        raise TimeoutError(
            f"BLAST search still running after {max_polls} polls: RID={rid}"
        )

    return check.json() if check.headers.get(
        "Content-Type", ""
    ).startswith("application/json") else check.text
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 3. NCBI Nucleotide 配列フェッチ
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
def fetch_ncbi_sequence(accession, rettype="fasta"):
    """
    Download a sequence record from NCBI Nucleotide via E-utilities efetch.

    Parameters:
        accession: str — NCBI accession (e.g., "NM_000546.6")
        rettype: str — "fasta", "gb", "gbwithparts"

    Returns:
        str — the response body as text.

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        NCBI_search_nucleotide(query=query)
        NCBI_fetch_accessions(accessions=accessions)
        NCBI_get_sequence(accession=accession)
    """
    query = dict(
        db="nucleotide",
        id=accession,
        rettype=rettype,
        retmode="text",
    )
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params=query,
    )
    response.raise_for_status()

    record_text = response.text
    print(f"NCBI Nucleotide '{accession}': {len(record_text)} chars ({rettype})")
    return record_text
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## 4. GDC がんゲノミクスデータ
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
def get_gdc_mutations(gene_symbol, project_id=None, size=100):
    """
    Fetch simple somatic mutations (SSMs) for a gene from the NCI GDC
    (Genomic Data Commons) ``/ssms`` endpoint.

    Parameters:
        gene_symbol: str — e.g. "TP53"
        project_id: str | None — e.g. "TCGA-BRCA"; when given, results are
            additionally restricted to that project
        size: int — maximum number of SSM hits requested (default 100, the
            value that was previously hard-coded)

    Returns:
        pandas.DataFrame with the columns ssm_id, gene, aa_change,
        consequence_type and genomic_dna_change. There is one row per
        transcript consequence of each hit, so a single SSM can occupy
        several rows.

    Raises:
        requests.HTTPError — when the service answers with an error status.

    ToolUniverse:
        GDC_get_ssm_by_gene(gene_symbol=gene_symbol)
        GDC_get_mutation_frequency(project_id=project_id)
        GDC_get_gene_expression(gene_id=gene_id, project_id=project_id)
        GDC_get_cnv_data(gene_id=gene_id)
        GDC_list_projects()
        GDC_search_cases(filters=filters)
        GDC_list_files(filters=filters)
    """
    import json

    url = "https://api.gdc.cancer.gov/ssms"
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {
                "field": "consequence.transcript.gene.symbol",
                "value": [gene_symbol],
            }},
        ],
    }
    if project_id:
        filters["content"].append({
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id],
            },
        })

    params = {
        # The filter tree is sent as a JSON-encoded query parameter.
        "filters": json.dumps(filters),
        "fields": ("ssm_id,consequence.transcript.gene.symbol,"
                   "consequence.transcript.aa_change,"
                   "consequence.transcript.consequence_type,"
                   "genomic_dna_change"),
        "size": size,
        "format": "json",
    }
    # A timeout keeps the call from blocking forever on a stalled connection.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    hits = resp.json().get("data", {}).get("hits", [])

    results = []
    for hit in hits:
        for csq in hit.get("consequence", []):
            tx = csq.get("transcript", {})
            results.append({
                "ssm_id": hit.get("ssm_id", ""),
                "gene": tx.get("gene", {}).get("symbol", ""),
                "aa_change": tx.get("aa_change", ""),
                "consequence_type": tx.get("consequence_type", ""),
                "genomic_dna_change": hit.get("genomic_dna_change", ""),
            })

    df = pd.DataFrame(results)
    print(f"GDC SSMs '{gene_symbol}'"
          f"{f' ({project_id})' if project_id else ''}: {len(df)} mutations")
    return df
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## 5. 統合ゲノム変異パイプライン
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
def integrated_variant_pipeline(rsid, gene_symbol=None):
    """
    Combine a dbSNP lookup with an optional GDC somatic-mutation summary.

    Parameters:
        rsid: str — RefSNP identifier passed to ``get_dbsnp_variant``
        gene_symbol: str | None — when truthy, GDC mutations for this gene
            are fetched and summarised

    Returns:
        dict with the keys "rsid" and "dbsnp"; when ``gene_symbol`` is
        given it also holds "gdc_mutation_count" (number of rows returned
        by ``get_gdc_mutations``) and "gdc_top_consequences" (the five most
        frequent consequence types with their counts, or {} when there are
        no rows).

    ToolUniverse (cross-tool):
        dbsnp_get_variant_by_rsid(rsid) → GDC_get_ssm_by_gene(gene_symbol)
    """
    # Step 1: dbSNP — only the summary dict is kept; the frequency table
    # returned alongside it is not used by this pipeline.
    dbsnp_info, _unused_frequencies = get_dbsnp_variant(rsid)
    summary = {"rsid": rsid, "dbsnp": dbsnp_info}

    # Step 2: GDC somatic mutations (if gene provided)
    if gene_symbol:
        mutations = get_gdc_mutations(gene_symbol)
        if mutations.empty:
            top_consequences = {}
        else:
            consequence_counts = mutations["consequence_type"].value_counts()
            top_consequences = consequence_counts.head(5).to_dict()
        summary["gdc_mutation_count"] = len(mutations)
        summary["gdc_top_consequences"] = top_consequences

    gdc_count = summary.get("gdc_mutation_count", "N/A")
    print(f"Integrated variant: {rsid} | GDC={gdc_count}")
    return summary
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## References
|
|
261
|
+
|
|
262
|
+
### Output Files
|
|
263
|
+
|
|
264
|
+
| ファイル | 形式 |
|
|
265
|
+
|---|---|
|
|
266
|
+
| `results/dbsnp_variant.json` | JSON |
|
|
267
|
+
| `results/dbsnp_frequencies.csv` | CSV |
|
|
268
|
+
| `results/blast_results.json` | JSON |
|
|
269
|
+
| `results/ncbi_sequence.fasta` | FASTA |
|
|
270
|
+
| `results/gdc_mutations.csv` | CSV |
|
|
271
|
+
|
|
272
|
+
### 利用可能ツール
|
|
273
|
+
|
|
274
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
275
|
+
|---|---|---|
|
|
276
|
+
| dbSNP | `dbsnp_get_variant_by_rsid` | rsID 変異情報 |
|
|
277
|
+
| dbSNP | `dbsnp_get_frequencies` | アレル頻度 |
|
|
278
|
+
| dbSNP | `dbsnp_search_by_gene` | 遺伝子→変異 |
|
|
279
|
+
| BLAST | `BLAST_nucleotide_search` | 核酸相同性検索 |
|
|
280
|
+
| BLAST | `BLAST_protein_search` | タンパク質相同性検索 |
|
|
281
|
+
| NCBI | `NCBI_search_nucleotide` | 配列検索 |
|
|
282
|
+
| NCBI | `NCBI_fetch_accessions` | アクセッション取得 |
|
|
283
|
+
| NCBI | `NCBI_get_sequence` | 配列フェッチ |
|
|
284
|
+
| GDC | `GDC_get_ssm_by_gene` | 体細胞変異 |
|
|
285
|
+
| GDC | `GDC_get_mutation_frequency` | 変異頻度 |
|
|
286
|
+
| GDC | `GDC_get_gene_expression` | 発現データ |
|
|
287
|
+
| GDC | `GDC_get_cnv_data` | CNV データ |
|
|
288
|
+
| GDC | `GDC_list_projects` | プロジェクト一覧 |
|
|
289
|
+
| GDC | `GDC_search_cases` | 症例検索 |
|
|
290
|
+
| GDC | `GDC_list_files` | ファイル一覧 |
|
|
291
|
+
|
|
292
|
+
### 参照スキル
|
|
293
|
+
|
|
294
|
+
| スキル | 関連 |
|
|
295
|
+
|---|---|
|
|
296
|
+
| `scientific-variant-interpretation` | 変異アノテーション |
|
|
297
|
+
| `scientific-population-genetics` | 集団遺伝学 |
|
|
298
|
+
| `scientific-cancer-genomics` | がんゲノミクス |
|
|
299
|
+
| `scientific-rare-disease-genetics` | 希少疾患遺伝学 |
|
|
300
|
+
| `scientific-biothings-idmapping` | ID マッピング |
|
|
301
|
+
|
|
302
|
+
### 依存パッケージ
|
|
303
|
+
|
|
304
|
+
`requests`, `pandas`
|