@nahisaho/satori 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -54
- package/package.json +1 -1
- package/src/.github/skills/scientific-biomedical-pubtator/SKILL.md +331 -0
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
- package/src/.github/skills/scientific-cell-line-resources/SKILL.md +258 -0
- package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
- package/src/.github/skills/scientific-ebi-databases/SKILL.md +280 -0
- package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
- package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
- package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
- package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
- package/src/.github/skills/scientific-ontology-enrichment/SKILL.md +340 -0
- package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
- package/src/.github/skills/scientific-phylogenetics/SKILL.md +297 -0
- package/src/.github/skills/scientific-preprint-archive/SKILL.md +476 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +322 -0
- package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
- package/src/.github/skills/scientific-regulatory-genomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-reinforcement-learning/SKILL.md +280 -0
- package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
- package/src/.github/skills/scientific-symbolic-mathematics/SKILL.md +277 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-ebi-databases
|
|
3
|
+
description: |
|
|
4
|
+
EBI データベース群統合アクセススキル。EBI Search 横断検索、ENA Browser
|
|
5
|
+
ヌクレオチドアーカイブ、BioStudies 研究データ、dbfetch エントリ取得、
|
|
6
|
+
MetaboLights メタボロミクスリポジトリの統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific EBI Databases
|
|
10
|
+
|
|
11
|
+
EBI Search / ENA Browser / BioStudies / dbfetch / MetaboLights を統合した
|
|
12
|
+
EBI データベース群アクセスパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- EBI Search で複数データベースを横断検索するとき
|
|
17
|
+
- ENA (European Nucleotide Archive) で配列データを検索するとき
|
|
18
|
+
- BioStudies で研究プロジェクトデータを探すとき
|
|
19
|
+
- dbfetch でエントリを一括取得するとき
|
|
20
|
+
- MetaboLights でメタボロミクス実験データにアクセスするとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. EBI Search 横断検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
# Base URL of the EBI Search REST service; search_ebi appends "/<domain>" to it.
EBI_SEARCH_API = "https://www.ebi.ac.uk/ebisearch/ws/rest"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def search_ebi(query, domain="allebi", size=25, fields=None):
    """
    Run an EBI Search query against one search domain and tabulate the hits.

    Parameters:
        query: str — search query
        domain: str — search domain ("allebi", "uniprot", "pdb", "ena", etc.)
        size: int — maximum number of entries to request
        fields: list — names of the fields to return for each entry

    Returns:
        pandas.DataFrame with one row per entry: "id", "source", plus one
        column per returned field (first value only, "" when the field is
        empty).

    Raises:
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        EBI_Search_query(query=query, domain=domain)
        EBI_Search_get_entry(domain=domain, entry_id=entry_id)
    """
    request_params = {"query": query, "size": size, "format": "json"}
    if fields:
        request_params["fields"] = ",".join(fields)

    response = requests.get(f"{EBI_SEARCH_API}/{domain}", params=request_params)
    response.raise_for_status()
    payload = response.json()

    rows = []
    for entry in payload.get("entries", []):
        row = {"id": entry.get("id", ""), "source": entry.get("source", "")}
        # Each field maps to a list of values; keep only the first one.
        row.update(
            (name, values[0] if values else "")
            for name, values in entry.get("fields", {}).items()
        )
        rows.append(row)

    frame = pd.DataFrame(rows)
    hit_count = payload.get("hitCount", 0)
    print(f"EBI Search [{domain}] '{query}': {hit_count} total hits, {len(frame)} returned")
    return frame
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 2. ENA (European Nucleotide Archive) 配列検索
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
# Base URL shared by search_ena ("/search") and get_ena_entry ("/entry/<accession>").
ENA_API = "https://www.ebi.ac.uk/ena/browser/api"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def search_ena(query, result_type="sequence", limit=100):
    """
    Search the European Nucleotide Archive and return the matches as a table.

    Parameters:
        query: str — search query or taxon ID
        result_type: str — "sequence", "read_run", "analysis", "study"
        limit: int — maximum number of records to request

    Returns:
        pandas.DataFrame built from the JSON list in the response; an empty
        DataFrame when the response body is not a list.

    Raises:
        requests.HTTPError: if the service answers with an error status.

    NOTE(review): verify that the Browser API base in ENA_API actually serves
    a "/search" route; ENA's query service may live under a separate Portal
    API base.

    ToolUniverse:
        ENA_search(query=query, result=result_type)
        ENA_get_entry(accession=accession)
    """
    response = requests.get(
        f"{ENA_API}/search",
        params={
            "query": query,
            "result": result_type,
            "limit": limit,
            "format": "json",
        },
    )
    response.raise_for_status()
    payload = response.json()

    if isinstance(payload, list):
        frame = pd.DataFrame(payload)
    else:
        frame = pd.DataFrame()

    print(f"ENA search '{query}' [{result_type}]: {len(frame)} entries")
    return frame
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_ena_entry(accession, display="json"):
    """
    Fetch a single ENA entry by accession number.

    Parameters:
        accession: str — ENA accession (e.g., "ERS000001", "ERR000001")
        display: str — value sent as the "display" query parameter

    Returns:
        The parsed JSON body when display is "json", otherwise the raw
        response text.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    entry_url = f"{ENA_API}/entry/{accession}"
    response = requests.get(entry_url, params={"display": display})
    response.raise_for_status()
    print(f"ENA entry {accession}: retrieved")

    if display == "json":
        return response.json()
    return response.text
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## 3. BioStudies 研究データ検索
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
# Base URL of the BioStudies REST API (v1); search_biostudies appends "/search".
BIOSTUDIES_API = "https://www.ebi.ac.uk/biostudies/api/v1"
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def search_biostudies(query, page_size=25):
    """
    Search BioStudies for research project records.

    Parameters:
        query: str — search query
        page_size: int — number of hits requested per page

    Returns:
        pandas.DataFrame with columns accession, title, author, release_date,
        type, files, links (one row per hit; empty when nothing matched).

    Raises:
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        BioStudies_search(query=query)
        BioStudies_get_study(accession=accession)
    """
    resp = requests.get(
        f"{BIOSTUDIES_API}/search",
        params={"query": query, "pageSize": page_size},
    )
    resp.raise_for_status()
    data = resp.json()

    results = [
        {
            # NOTE(review): search hits appear to expose "accession" and
            # "release_date"; the previously used "accno"/"rtime" keys are
            # kept as fallbacks so either response shape populates the column.
            "accession": hit.get("accession") or hit.get("accno", ""),
            "title": hit.get("title", ""),
            "author": hit.get("author", ""),
            "release_date": hit.get("release_date") or hit.get("rtime", ""),
            "type": hit.get("type", ""),
            "files": hit.get("files", 0),
            "links": hit.get("links", 0),
        }
        # "hits" may be present but null; treat that like "no hits".
        for hit in data.get("hits") or []
    ]

    df = pd.DataFrame(results)
    total = data.get("totalHits", 0)
    print(f"BioStudies search '{query}': {total} total, {len(df)} returned")
    return df
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## 4. dbfetch エントリ一括取得
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
# Endpoint that dbfetch() queries; db, id, format and style are sent as query parameters.
DBFETCH_API = "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def dbfetch(db, ids, format_type="json", style="raw"):
    """
    Fetch one or more entries from an EBI database through dbfetch.

    Parameters:
        db: str — database name (e.g., "uniprotkb", "embl", "pdb")
        ids: list | tuple | str — entry IDs; a string may hold one ID or a
            comma-separated list, and non-string IDs are converted with str()
        format_type: str — output format ("json", "fasta", "xml")
        style: str — style ("raw", "html")

    Returns:
        The parsed JSON body when format_type is "json", otherwise the raw
        response text.

    Raises:
        ValueError: if no non-empty ID is supplied (checked before any
            request is made).
        requests.HTTPError: if the service answers with an error status.

    NOTE(review): not every dbfetch database necessarily offers a "json"
    format — confirm the default suits the target database.

    ToolUniverse:
        dbfetch_get_entries(db=db, ids=ids, format=format_type)
    """
    # Normalise every accepted input shape to a flat list of ID strings so
    # tuples and integer IDs no longer bypass or break the join.
    if isinstance(ids, str):
        candidates = ids.split(",")
    elif isinstance(ids, (list, tuple)):
        candidates = list(ids)
    else:
        candidates = [ids]
    id_list = [str(item).strip() for item in candidates if item is not None]
    id_list = [item for item in id_list if item]
    if not id_list:
        raise ValueError("dbfetch requires at least one entry ID")

    params = {
        "db": db,
        "id": ",".join(id_list),
        "format": format_type,
        "style": style,
    }
    resp = requests.get(DBFETCH_API, params=params)
    resp.raise_for_status()

    print(f"dbfetch [{db}]: {len(id_list)} entries, "
          f"format={format_type}")
    if format_type == "json":
        return resp.json()
    return resp.text
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 5. MetaboLights メタボロミクスリポジトリ
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
# Base URL of the MetaboLights web service; the functions below append "/studies/...".
METABOLIGHTS_API = "https://www.ebi.ac.uk/metabolights/ws"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def search_metabolights(query):
    """
    Search MetaboLights for metabolomics studies.

    Parameters:
        query: str — search query (compound name, disease name, organism)

    Returns:
        pandas.DataFrame with columns study_id, title, organism, description
        (truncated to 200 characters), submission_date, status.

    Raises:
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        MetaboLights_search_studies(query=query)
        MetaboLights_get_study(study_id=study_id)
    """
    response = requests.get(
        f"{METABOLIGHTS_API}/studies/search",
        params={"query": query},
    )
    response.raise_for_status()
    studies = response.json().get("content", [])

    rows = [
        {
            "study_id": study.get("studyIdentifier", ""),
            "title": study.get("title", ""),
            "organism": study.get("organism", ""),
            # A null description is treated as empty before truncating.
            "description": (study.get("description") or "")[:200],
            "submission_date": study.get("submissionDate", ""),
            "status": study.get("studyStatus", ""),
        }
        for study in studies
    ]

    frame = pd.DataFrame(rows)
    print(f"MetaboLights search '{query}': {len(frame)} studies")
    return frame
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def get_metabolights_study(study_id):
    """
    Fetch a single MetaboLights study record.

    Parameters:
        study_id: str — study identifier appended to the "/studies/" route

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    resp = requests.get(f"{METABOLIGHTS_API}/studies/{study_id}")
    resp.raise_for_status()
    data = resp.json()
    # A JSON null title would make slicing fail; coerce it to "" the same way
    # search_metabolights handles a null description.
    title = data.get("title") or ""
    print(f"MetaboLights {study_id}: {title[:80]}")
    return data
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## 利用可能ツール
|
|
253
|
+
|
|
254
|
+
| ToolUniverse カテゴリ | 主なツール |
|
|
255
|
+
|---|---|
|
|
256
|
+
| `ebi_search` | `EBI_Search_query`, `EBI_Search_get_entry` |
|
|
257
|
+
| `ena_browser` | `ENA_search`, `ENA_get_entry` |
|
|
258
|
+
| `biostudies` | `BioStudies_search`, `BioStudies_get_study` |
|
|
259
|
+
| `dbfetch` | `dbfetch_get_entries` |
|
|
260
|
+
| `metabolights` | `MetaboLights_search_studies`, `MetaboLights_get_study` |
|
|
261
|
+
|
|
262
|
+
## パイプライン出力
|
|
263
|
+
|
|
264
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
265
|
+
|---|---|---|
|
|
266
|
+
| `results/ebi_search.csv` | EBI 横断検索結果 | → bioinformatics, literature-search |
|
|
267
|
+
| `results/ena_sequences.fasta` | ENA 配列データ | → genome-sequence-tools, sequence-analysis |
|
|
268
|
+
| `results/biostudies_metadata.json` | 研究プロジェクト情報 | → multi-omics, systematic-review |
|
|
269
|
+
| `results/metabolights_study.json` | メタボロミクスデータ | → metabolomics, metabolomics-databases |
|
|
270
|
+
|
|
271
|
+
## パイプライン統合
|
|
272
|
+
|
|
273
|
+
```
|
|
274
|
+
genome-sequence-tools ──→ ebi-databases ──→ metabolomics-databases
|
|
275
|
+
(NCBI/BLAST) (ENA/EBI Search) (HMDB/MetaCyc)
|
|
276
|
+
│
|
|
277
|
+
├──→ bioinformatics (配列データ)
|
|
278
|
+
├──→ sequence-analysis (FASTA)
|
|
279
|
+
└──→ structural-proteomics (PDBe cross-ref)
|
|
280
|
+
```
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-genome-sequence-tools
|
|
3
|
+
description: |
|
|
4
|
+
ゲノム配列解析総合スキル。Ensembl ゲノムブラウザ、dbSNP 変異データ、
|
|
5
|
+
BLAST 相同性検索、NCBI Nucleotide 配列取得、GDC がんゲノミクスデータの
|
|
6
|
+
統合パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Genome Sequence Tools
|
|
10
|
+
|
|
11
|
+
公的ゲノムデータベース (Ensembl, dbSNP, BLAST, NCBI, GDC) を横断した
|
|
12
|
+
配列検索・変異アノテーション・がんゲノミクスパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- ゲノム配列・エクソン構造を Ensembl から取得するとき
|
|
17
|
+
- rsID から変異のアレル頻度を調べるとき
|
|
18
|
+
- BLAST で塩基/アミノ酸配列の相同性検索を行うとき
|
|
19
|
+
- NCBI Nucleotide から配列をフェッチするとき
|
|
20
|
+
- GDC がんゲノミクスデータ (体細胞変異, CNV, 発現) を取得するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. dbSNP 変異情報取得
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_dbsnp_variant(rsid):
    """
    Fetch rsID-based variant information (including allele frequencies) from dbSNP.

    Parameters:
        rsid: str — e.g. "rs7412"; the "rs" prefix is optional and
            case-insensitive

    Returns:
        (info, df_freq) where info is a dict with rsid, create_date and
        update_date, and df_freq is a pandas.DataFrame with columns study,
        allele, count, total (one row per frequency record).

    Raises:
        ValueError: if rsid is not "rs" followed by digits (or bare digits);
            checked before any request is made.
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        dbsnp_get_variant_by_rsid(rsid=rsid)
        dbsnp_get_frequencies(rsid=rsid)
        dbsnp_search_by_gene(gene_symbol=gene_symbol)
    """
    # Strip the prefix explicitly: str.lstrip("rs") removes a *character set*,
    # not a prefix, and would not handle an upper-case "RS".
    raw_id = str(rsid).strip()
    numeric_id = raw_id[2:] if raw_id.lower().startswith("rs") else raw_id
    if not (numeric_id.isascii() and numeric_id.isdigit()):
        raise ValueError(f"Invalid dbSNP rsID: {rsid!r}")

    url = f"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{numeric_id}"
    resp = requests.get(url)
    resp.raise_for_status()
    data = resp.json()

    # Extract primary info
    info = {
        "rsid": f"rs{data.get('refsnp_id', '')}",
        "create_date": data.get("create_date", ""),
        "update_date": data.get("update_date", ""),
    }

    # Allele frequencies
    snapshot = data.get("primary_snapshot_data") or {}
    freq_data = []
    for annotation in snapshot.get("allele_annotations") or []:
        for freq_entry in annotation.get("frequency") or []:
            # NOTE(review): frequency records appear to carry the allele under
            # observation.inserted_sequence rather than a top-level "allele"
            # key; the original key is still honoured when present.
            observation = freq_entry.get("observation") or {}
            freq_data.append({
                "study": freq_entry.get("study_name", ""),
                "allele": (freq_entry.get("allele")
                           or observation.get("inserted_sequence", "")),
                "count": freq_entry.get("allele_count", 0),
                "total": freq_entry.get("total_count", 0),
            })

    df_freq = pd.DataFrame(freq_data)
    print(f"dbSNP {info['rsid']}: {len(df_freq)} frequency entries")
    return info, df_freq
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 2. BLAST 相同性検索
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import time
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def blast_search(sequence, program="blastn", database="nt", max_hits=10,
                 poll_interval=10, max_polls=60):
    """
    Run a homology search through the NCBI BLAST URL API.

    Submits the query, extracts the request ID (RID) from the response, then
    polls until the job is no longer waiting.

    Parameters:
        sequence: str — query sequence (nucleotide or protein)
        program: str — "blastn", "blastp", "blastx", "tblastn"
        database: str — "nt", "nr", "refseq_rna", etc.
        max_hits: int — value sent as HITLIST_SIZE
        poll_interval: int | float — seconds to sleep before each status poll
        max_polls: int — maximum number of status polls before giving up

    Returns:
        The parsed JSON body when the final response declares an
        application/json content type, otherwise the response text.

    Raises:
        ValueError: if the sequence is empty, or no RID is found in the
            submission response.
        RuntimeError: if the service reports the search as failed or the RID
            as unknown.
        TimeoutError: if the job is still waiting after max_polls polls.
        requests.HTTPError: if a request answers with an error status.

    ToolUniverse:
        BLAST_nucleotide_search(sequence=sequence, database=database)
        BLAST_protein_search(sequence=sequence, database=database)
    """
    import re

    query = (sequence or "").strip()
    if not query:
        # Fail fast instead of spending a submission on an empty query.
        raise ValueError("BLAST query sequence is empty")

    put_url = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    params = {
        "CMD": "Put",
        "PROGRAM": program,
        "DATABASE": database,
        "QUERY": query,
        "FORMAT_TYPE": "JSON2",
        "HITLIST_SIZE": max_hits,
    }
    resp = requests.post(put_url, data=params)
    resp.raise_for_status()

    # Extract RID
    rid_match = re.search(r"RID = (\S+)", resp.text)
    if not rid_match:
        raise ValueError("BLAST RID not found")
    rid = rid_match.group(1)
    print(f"BLAST submitted: RID={rid}")

    # Poll for results
    for _ in range(max_polls):
        time.sleep(poll_interval)
        check = requests.get(put_url, params={
            "CMD": "Get", "RID": rid, "FORMAT_TYPE": "JSON2"
        })
        check.raise_for_status()
        status_text = check.text
        if "Status=WAITING" in status_text:
            continue
        # Surface terminal error states instead of returning the status page
        # as if it were a result.
        if "Status=FAILED" in status_text:
            raise RuntimeError(f"BLAST search failed: RID={rid}")
        if "Status=UNKNOWN" in status_text:
            raise RuntimeError(f"BLAST RID unknown or expired: RID={rid}")
        break
    else:
        # Every poll still reported WAITING: do not hand back the waiting page.
        raise TimeoutError(
            f"BLAST search not finished after {max_polls} polls: RID={rid}"
        )

    return check.json() if check.headers.get(
        "Content-Type", ""
    ).startswith("application/json") else check.text
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 3. NCBI Nucleotide 配列フェッチ
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
def fetch_ncbi_sequence(accession, rettype="fasta"):
    """
    Retrieve a sequence record from NCBI Nucleotide via E-utilities efetch.

    Parameters:
        accession: str — NCBI accession (e.g., "NM_000546.6")
        rettype: str — "fasta", "gb", "gbwithparts"

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        NCBI_search_nucleotide(query=query)
        NCBI_fetch_accessions(accessions=accessions)
        NCBI_get_sequence(accession=accession)
    """
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params={
            "db": "nucleotide",
            "id": accession,
            "rettype": rettype,
            "retmode": "text",
        },
    )
    response.raise_for_status()

    record_text = response.text
    print(f"NCBI Nucleotide '{accession}': {len(record_text)} chars ({rettype})")
    return record_text
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## 4. GDC がんゲノミクスデータ
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
def get_gdc_mutations(gene_symbol, project_id=None, size=100):
    """
    Retrieve simple somatic mutation (SSM) data from the NCI GDC
    (Genomic Data Commons).

    Parameters:
        gene_symbol: str — e.g. "TP53"
        project_id: str | None — e.g. "TCGA-BRCA"; adds a project filter
            when given
        size: int — maximum number of SSM hits requested (default 100, the
            previously hard-coded value)

    Returns:
        pandas.DataFrame with one row per (hit, consequence) pair and columns
        ssm_id, gene, aa_change, consequence_type, genomic_dna_change.

    Raises:
        requests.HTTPError: if the service answers with an error status.

    ToolUniverse:
        GDC_get_ssm_by_gene(gene_symbol=gene_symbol)
        GDC_get_mutation_frequency(project_id=project_id)
        GDC_get_gene_expression(gene_id=gene_id, project_id=project_id)
        GDC_get_cnv_data(gene_id=gene_id)
        GDC_list_projects()
        GDC_search_cases(filters=filters)
        GDC_list_files(filters=filters)
    """
    import json

    url = "https://api.gdc.cancer.gov/ssms"
    clauses = [
        {"op": "in", "content": {
            "field": "consequence.transcript.gene.symbol",
            "value": [gene_symbol],
        }},
    ]
    if project_id:
        clauses.append({
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": [project_id],
            },
        })
    filters = {"op": "and", "content": clauses}

    params = {
        # The filter tree is sent as a JSON-encoded query parameter.
        "filters": json.dumps(filters),
        "fields": ("ssm_id,consequence.transcript.gene.symbol,"
                   "consequence.transcript.aa_change,"
                   "consequence.transcript.consequence_type,"
                   "genomic_dna_change"),
        "size": size,
        "format": "json",
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()
    hits = (resp.json().get("data") or {}).get("hits") or []

    results = []
    for hit in hits:
        for csq in hit.get("consequence") or []:
            # Nested objects may be null; fall back to empty dicts so a
            # sparse record yields blank cells instead of an AttributeError.
            tx = csq.get("transcript") or {}
            results.append({
                "ssm_id": hit.get("ssm_id", ""),
                "gene": (tx.get("gene") or {}).get("symbol", ""),
                "aa_change": tx.get("aa_change", ""),
                "consequence_type": tx.get("consequence_type", ""),
                "genomic_dna_change": hit.get("genomic_dna_change", ""),
            })

    df = pd.DataFrame(results)
    project_suffix = f" ({project_id})" if project_id else ""
    print(f"GDC SSMs '{gene_symbol}'{project_suffix}: {len(df)} mutations")
    return df
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## 5. 統合ゲノム変異パイプライン
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
def integrated_variant_pipeline(rsid, gene_symbol=None):
    """
    Genome variant analysis pipeline combining dbSNP and GDC.

    Parameters:
        rsid: str — dbSNP identifier passed to get_dbsnp_variant
        gene_symbol: str | None — when given, GDC somatic mutations for this
            gene are fetched and summarised

    Returns:
        dict with keys:
            "rsid" — the input rsid
            "dbsnp" — the info dict returned by get_dbsnp_variant
            "dbsnp_frequency_count" — number of dbSNP frequency rows
            "gdc_mutation_count", "gdc_top_consequences" — only when
                gene_symbol is given; the latter maps the five most common
                consequence types to their counts ({} when nothing was found)

    ToolUniverse (cross-tool):
        dbsnp_get_variant_by_rsid(rsid) → GDC_get_ssm_by_gene(gene_symbol)
    """
    pipeline_result = {"rsid": rsid}

    # Step 1: dbSNP
    info, freq_df = get_dbsnp_variant(rsid)
    pipeline_result["dbsnp"] = info
    # The frequency table was previously fetched and then dropped; report its
    # size so the dbSNP step's frequency data is reflected in the result.
    pipeline_result["dbsnp_frequency_count"] = len(freq_df)

    # Step 2: GDC somatic mutations (if gene provided)
    if gene_symbol:
        gdc_df = get_gdc_mutations(gene_symbol)
        pipeline_result["gdc_mutation_count"] = len(gdc_df)
        if gdc_df.empty:
            # An empty frame has no "consequence_type" column to count.
            pipeline_result["gdc_top_consequences"] = {}
        else:
            pipeline_result["gdc_top_consequences"] = (
                gdc_df["consequence_type"].value_counts().head(5).to_dict()
            )

    print(f"Integrated variant: {rsid}"
          f" | GDC={pipeline_result.get('gdc_mutation_count', 'N/A')}")
    return pipeline_result
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## References
|
|
261
|
+
|
|
262
|
+
### Output Files
|
|
263
|
+
|
|
264
|
+
| ファイル | 形式 |
|
|
265
|
+
|---|---|
|
|
266
|
+
| `results/dbsnp_variant.json` | JSON |
|
|
267
|
+
| `results/dbsnp_frequencies.csv` | CSV |
|
|
268
|
+
| `results/blast_results.json` | JSON |
|
|
269
|
+
| `results/ncbi_sequence.fasta` | FASTA |
|
|
270
|
+
| `results/gdc_mutations.csv` | CSV |
|
|
271
|
+
|
|
272
|
+
### 利用可能ツール
|
|
273
|
+
|
|
274
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
275
|
+
|---|---|---|
|
|
276
|
+
| dbSNP | `dbsnp_get_variant_by_rsid` | rsID 変異情報 |
|
|
277
|
+
| dbSNP | `dbsnp_get_frequencies` | アレル頻度 |
|
|
278
|
+
| dbSNP | `dbsnp_search_by_gene` | 遺伝子→変異 |
|
|
279
|
+
| BLAST | `BLAST_nucleotide_search` | 核酸相同性検索 |
|
|
280
|
+
| BLAST | `BLAST_protein_search` | タンパク質相同性検索 |
|
|
281
|
+
| NCBI | `NCBI_search_nucleotide` | 配列検索 |
|
|
282
|
+
| NCBI | `NCBI_fetch_accessions` | アクセッション取得 |
|
|
283
|
+
| NCBI | `NCBI_get_sequence` | 配列フェッチ |
|
|
284
|
+
| GDC | `GDC_get_ssm_by_gene` | 体細胞変異 |
|
|
285
|
+
| GDC | `GDC_get_mutation_frequency` | 変異頻度 |
|
|
286
|
+
| GDC | `GDC_get_gene_expression` | 発現データ |
|
|
287
|
+
| GDC | `GDC_get_cnv_data` | CNV データ |
|
|
288
|
+
| GDC | `GDC_list_projects` | プロジェクト一覧 |
|
|
289
|
+
| GDC | `GDC_search_cases` | 症例検索 |
|
|
290
|
+
| GDC | `GDC_list_files` | ファイル一覧 |
|
|
291
|
+
|
|
292
|
+
### 参照スキル
|
|
293
|
+
|
|
294
|
+
| スキル | 関連 |
|
|
295
|
+
|---|---|
|
|
296
|
+
| `scientific-variant-interpretation` | 変異アノテーション |
|
|
297
|
+
| `scientific-population-genetics` | 集団遺伝学 |
|
|
298
|
+
| `scientific-cancer-genomics` | がんゲノミクス |
|
|
299
|
+
| `scientific-rare-disease-genetics` | 希少疾患遺伝学 |
|
|
300
|
+
| `scientific-biothings-idmapping` | ID マッピング |
|
|
301
|
+
|
|
302
|
+
### 依存パッケージ
|
|
303
|
+
|
|
304
|
+
`requests`, `pandas`
|