@nahisaho/satori 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -54
- package/package.json +1 -1
- package/src/.github/skills/scientific-biomedical-pubtator/SKILL.md +331 -0
- package/src/.github/skills/scientific-biothings-idmapping/SKILL.md +298 -0
- package/src/.github/skills/scientific-cell-line-resources/SKILL.md +258 -0
- package/src/.github/skills/scientific-compound-screening/SKILL.md +245 -0
- package/src/.github/skills/scientific-ebi-databases/SKILL.md +280 -0
- package/src/.github/skills/scientific-genome-sequence-tools/SKILL.md +304 -0
- package/src/.github/skills/scientific-healthcare-ai/SKILL.md +273 -0
- package/src/.github/skills/scientific-human-protein-atlas/SKILL.md +244 -0
- package/src/.github/skills/scientific-metabolic-modeling/SKILL.md +288 -0
- package/src/.github/skills/scientific-noncoding-rna/SKILL.md +262 -0
- package/src/.github/skills/scientific-ontology-enrichment/SKILL.md +340 -0
- package/src/.github/skills/scientific-pharmacology-targets/SKILL.md +323 -0
- package/src/.github/skills/scientific-phylogenetics/SKILL.md +297 -0
- package/src/.github/skills/scientific-preprint-archive/SKILL.md +476 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +322 -0
- package/src/.github/skills/scientific-rare-disease-genetics/SKILL.md +327 -0
- package/src/.github/skills/scientific-regulatory-genomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-reinforcement-learning/SKILL.md +280 -0
- package/src/.github/skills/scientific-structural-proteomics/SKILL.md +317 -0
- package/src/.github/skills/scientific-symbolic-mathematics/SKILL.md +277 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-biomedical-pubtator
|
|
3
|
+
description: |
|
|
4
|
+
バイオメディカルテキストマイニングスキル。PubTator3 API による
|
|
5
|
+
遺伝子・疾患・化合物・変異・種のエンティティ認識、関係抽出、
|
|
6
|
+
バイオメディカル文献アノテーション自動化パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Biomedical PubTator
|
|
10
|
+
|
|
11
|
+
PubTator3 API を活用したバイオメディカル文献エンティティ認識・
|
|
12
|
+
関係抽出パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- PubMed 論文から遺伝子・疾患・化合物のエンティティを自動抽出するとき
|
|
17
|
+
- バイオメディカル NER (Named Entity Recognition) を実行するとき
|
|
18
|
+
- 遺伝子-疾患・薬物-標的の関係を文献から抽出するとき
|
|
19
|
+
- 大規模文献コーパスのバイオアノテーションを行うとき
|
|
20
|
+
- テキストマイニング結果を知識グラフに統合するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. PubTator3 エンティティアノテーション
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import json
|
|
32
|
+
import time
|
|
33
|
+
|
|
34
|
+
# Base URL that the PubTator3 request helpers in this file prepend to endpoint paths.
PUBTATOR_API = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def annotate_pmids(pmids, concepts=None):
    """
    PubTator3 — annotate a list of PMIDs with biomedical entities.

    Parameters:
        pmids: list — PMIDs (e.g., [12345678, 23456789])
        concepts: list — entity types to request, any of
            "gene", "disease", "chemical", "mutation", "species", "cellline"
            (defaults to the first five)

    Returns:
        pandas.DataFrame — one row per annotation with the columns
        pmid, text, type, identifier, offset, length, passage_type.
        The columns exist even when nothing was annotated, and an empty
        `pmids` yields that empty frame without calling the API.

    Raises:
        requests.HTTPError — when the API answers with an error status.

    ToolUniverse:
        PubTator_annotate(pmids=pmids, concepts=concepts)
        PubTator_search(query=query)
    """
    columns = [
        "pmid", "text", "type", "identifier",
        "offset", "length", "passage_type",
    ]
    pmids = list(pmids)
    if concepts is None:
        concepts = ["gene", "disease", "chemical", "mutation", "species"]

    docs = []
    if pmids:
        params = {
            "pmids": ",".join(str(p) for p in pmids),
            "concepts": ",".join(concepts),
            "format": "biocjson",
        }
        resp = requests.get(
            f"{PUBTATOR_API}/publications/export/biocjson", params=params
        )
        resp.raise_for_status()
        data = resp.json()

        # Accept the three shapes the payload may take: a wrapper object,
        # a bare list of documents, or a single document object.
        if isinstance(data, list):
            docs = data
        elif isinstance(data, dict):
            if "PubTator3" in data:
                docs = data.get("PubTator3") or []
            elif "passages" in data:
                docs = [data]

    # Parse annotations
    all_annotations = []
    for doc in docs:
        if not isinstance(doc, dict):
            continue
        pmid = doc.get("pmid", doc.get("id", ""))
        for passage in doc.get("passages") or []:
            passage_type = (passage.get("infons") or {}).get("type", "")
            for annotation in passage.get("annotations") or []:
                infons = annotation.get("infons") or {}
                # An annotation may carry no location at all; fall back to
                # an empty one instead of indexing into an empty list.
                first_location = (annotation.get("locations") or [{}])[0]
                all_annotations.append({
                    "pmid": pmid,
                    "text": annotation.get("text", ""),
                    "type": infons.get("type", ""),
                    "identifier": infons.get("identifier", ""),
                    "offset": first_location.get("offset", ""),
                    "length": first_location.get("length", ""),
                    "passage_type": passage_type,
                })

    # Fixed columns keep the schema stable for callers on empty results.
    df = pd.DataFrame(all_annotations, columns=columns)
    type_counts = df["type"].value_counts().to_dict()
    print(f"PubTator annotation: {len(pmids)} PMIDs → "
          f"{len(df)} entities {type_counts}")
    return df
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 2. PubTator3 テキスト検索
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
def search_pubtator(query, max_results=100):
    """
    PubTator3 text search — find articles together with their entity lists.

    Parameters:
        query: str — search query (gene, disease or chemical name)
        max_results: int — requested number of hits; the page size sent to
            the API is capped at 100

    Returns:
        pandas.DataFrame — one row per hit with pmid, title, journal, year,
        score, genes, diseases, chemicals and mutations.
    """
    response = requests.get(
        f"{PUBTATOR_API}/search/",
        params={
            "text": query,
            "sort": "score",
            "page_size": min(max_results, 100),
        },
    )
    response.raise_for_status()
    payload = response.json()

    text_fields = ("pmid", "title", "journal", "year")
    list_fields = ("genes", "diseases", "chemicals", "mutations")

    rows = []
    for hit in payload.get("results", []):
        row = {name: hit.get(name, "") for name in text_fields}
        row["score"] = hit.get("score", 0)
        for name in list_fields:
            row[name] = hit.get(name, [])
        rows.append(row)

    df = pd.DataFrame(rows)
    total = payload.get("count", 0)
    print(f"PubTator search '{query}': {total} total, {len(df)} returned")
    return df
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## 3. エンティティ関係抽出
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def extract_entity_relations(pmids, relation_types=None):
    """
    PubTator3 — derive candidate entity relations from article co-mentions.

    Every mention of one entity type is paired with every mention of the
    other type found in the same article (same PMID). The rows are
    co-occurrence candidates, not curated relations, and repeated mentions
    produce repeated rows.

    Parameters:
        pmids: list — PMIDs
        relation_types: list — relation codes to keep; None or empty keeps all.
            Produced here: "GDA" (gene-disease), "CGA" (chemical-gene),
            "CDA" (chemical-disease). Other codes such as "PPI" are accepted
            but yield no rows.

    Returns:
        pandas.DataFrame — columns pmid, relation_type, entity1_type,
        entity1_text, entity1_id, entity2_type, entity2_text, entity2_id
        (present even when no relation is found).
    """
    columns = [
        "pmid", "relation_type",
        "entity1_type", "entity1_text", "entity1_id",
        "entity2_type", "entity2_text", "entity2_id",
    ]
    # (relation code, first entity type, second entity type) in output order.
    pair_specs = [
        ("GDA", "Gene", "Disease"),
        ("CGA", "Chemical", "Gene"),
        ("CDA", "Chemical", "Disease"),
    ]
    wanted = [
        spec for spec in pair_specs
        if not relation_types or spec[0] in relation_types
    ]

    df_annotations = annotate_pmids(pmids)

    relations = []
    # An empty annotation frame may have no columns, so guard before indexing.
    if not df_annotations.empty:
        for pmid in df_annotations["pmid"].unique():
            pmid_df = df_annotations[df_annotations["pmid"] == pmid]
            mentions = {}
            for entity_type in ("Gene", "Disease", "Chemical"):
                subset = pmid_df[pmid_df["type"] == entity_type]
                mentions[entity_type] = list(
                    zip(subset["text"], subset["identifier"])
                )

            for code, first_type, second_type in wanted:
                for first_text, first_id in mentions[first_type]:
                    for second_text, second_id in mentions[second_type]:
                        relations.append({
                            "pmid": pmid,
                            "relation_type": code,
                            "entity1_type": first_type,
                            "entity1_text": first_text,
                            "entity1_id": first_id,
                            "entity2_type": second_type,
                            "entity2_text": second_text,
                            "entity2_id": second_id,
                        })

    rel_df = pd.DataFrame(relations, columns=columns)
    rel_counts = rel_df["relation_type"].value_counts().to_dict()
    print(f"Entity relations: {len(rel_df)} total {rel_counts}")
    return rel_df
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## 4. バイオアノテーション集計ダッシュボード
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
def annotation_summary_dashboard(pmids, output_prefix="pubtator"):
    """
    Aggregate and visualise PubTator annotations for a set of articles.

    Writes `figures/{output_prefix}_dashboard.png` and
    `results/{output_prefix}_annotations.csv`, creating the target
    directories when they do not exist yet.

    Parameters:
        pmids: list — PMIDs
        output_prefix: str — prefix for the output file names

    Returns:
        dict — total_pmids, total_annotations, entity_types and
        unique_entities; an empty dict when no annotation was found.
    """
    import os

    import matplotlib.pyplot as plt

    # Get annotations
    df = annotate_pmids(pmids)
    if df.empty:
        print("No annotations found")
        return {}

    fig_path = f"figures/{output_prefix}_dashboard.png"
    csv_path = f"results/{output_prefix}_annotations.csv"
    # Saving fails with FileNotFoundError if the folders are missing.
    for path in (fig_path, csv_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    try:
        # Panel 1: entity type counts
        type_counts = df["type"].value_counts()
        type_counts.plot(kind="bar", ax=axes[0], color="#2196F3")
        axes[0].set_title("Entity Type Distribution")
        axes[0].set_ylabel("Count")

        # Console report: most frequent surface forms per entity type
        for entity_type in ["Gene", "Disease", "Chemical"]:
            sub = df[df["type"] == entity_type]
            top = sub["text"].value_counts().head(10)
            if not top.empty:
                print(f"\nTop {entity_type}s: {top.to_dict()}")

        # Panel 2: number of annotations per article
        per_article = df.groupby("pmid")["type"].count()
        per_article.hist(ax=axes[1], bins=20, color="#4CAF50")
        axes[1].set_title("Entities per Article")
        axes[1].set_xlabel("Number of entities")

        # Panel 3: per-article counts broken down by entity type
        pivot = df.groupby(["pmid", "type"]).size().unstack(fill_value=0)
        pivot.plot(kind="box", ax=axes[2])
        axes[2].set_title("Entity Types per Article")

        fig.tight_layout()
        fig.savefig(fig_path, dpi=150, bbox_inches="tight")
    finally:
        # Release the figure even when plotting or saving raises.
        plt.close(fig)

    # Save results
    df.to_csv(csv_path, index=False)

    summary = {
        "total_pmids": df["pmid"].nunique(),
        "total_annotations": len(df),
        "entity_types": type_counts.to_dict(),
        "unique_entities": df.groupby("type")["text"].nunique().to_dict(),
    }
    print(f"\nSummary: {summary}")
    return summary
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## 5. 知識グラフ構築用エンティティネットワーク
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
def build_entity_network(pmids, min_cooccurrence=2):
    """
    Build an entity co-occurrence network from PubTator relations.

    Nodes are labelled "<entity type>:<entity text>". An undirected edge is
    added for each pair whose relation rows number at least
    `min_cooccurrence`; the row count is stored as the edge `weight`.

    Parameters:
        pmids: list — PMIDs
        min_cooccurrence: int — minimum number of co-occurrences for an edge

    Returns:
        networkx.Graph — empty when no relation was extracted.
    """
    import networkx as nx
    from collections import Counter

    relations = extract_entity_relations(pmids)
    if relations.empty:
        return nx.Graph()

    # Tally every unordered node pair; sorting makes (a, b) and (b, a) equal.
    pair_counts = Counter(
        tuple(sorted([
            f"{rel['entity1_type']}:{rel['entity1_text']}",
            f"{rel['entity2_type']}:{rel['entity2_text']}",
        ]))
        for _, rel in relations.iterrows()
    )

    # Keep only the pairs that reach the threshold.
    network = nx.Graph()
    for (left, right), hits in pair_counts.items():
        if hits < min_cooccurrence:
            continue
        network.add_edge(left, right, weight=hits)

    n_nodes = network.number_of_nodes()
    n_edges = network.number_of_edges()
    print(f"Entity network: {n_nodes} nodes, {n_edges} edges (min cooccurrence = {min_cooccurrence})")
    return network
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
307
|
+
## 利用可能ツール
|
|
308
|
+
|
|
309
|
+
| ToolUniverse カテゴリ | 主なツール |
|
|
310
|
+
|---|---|
|
|
311
|
+
| `pubtator` | `PubTator_annotate`, `PubTator_search` |
|
|
312
|
+
|
|
313
|
+
## パイプライン出力
|
|
314
|
+
|
|
315
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
316
|
+
|---|---|---|
|
|
317
|
+
| `results/pubtator_annotations.csv` | エンティティアノテーション | → text-mining-nlp, knowledge-graph |
|
|
318
|
+
| `results/entity_relations.csv` | エンティティ間関係 | → network-analysis, disease-research |
|
|
319
|
+
| `results/entity_network.graphml` | エンティティ共起ネットワーク | → graph-neural-networks |
|
|
320
|
+
| `figures/pubtator_dashboard.png` | アノテーション集計 | → publication-figures |
|
|
321
|
+
|
|
322
|
+
## パイプライン統合
|
|
323
|
+
|
|
324
|
+
```
|
|
325
|
+
literature-search ──→ biomedical-pubtator ──→ text-mining-nlp
|
|
326
|
+
(PubMed/OpenAlex) (PubTator NER) (KG 構築)
|
|
327
|
+
│
|
|
328
|
+
├──→ disease-research (GDA 関係)
|
|
329
|
+
├──→ drug-target-profiling (CGA 関係)
|
|
330
|
+
└──→ preprint-archive (プレプリント NER)
|
|
331
|
+
```
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-biothings-idmapping
|
|
3
|
+
description: |
|
|
4
|
+
BioThings API (MyGene.info, MyVariant.info, MyChem.info) を活用した
|
|
5
|
+
遺伝子・変異・化合物の横断的 ID マッピングおよびアノテーション統合スキル。
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Scientific BioThings ID Mapping
|
|
9
|
+
|
|
10
|
+
BioThings API スイート (MyGene, MyVariant, MyChem) を活用した
|
|
11
|
+
多データベース横断の ID 変換・アノテーション取得パイプラインを提供する。
|
|
12
|
+
|
|
13
|
+
## When to Use
|
|
14
|
+
|
|
15
|
+
- 遺伝子 ID 間の変換 (Entrez ↔ Ensembl ↔ Symbol ↔ UniProt) を行うとき
|
|
16
|
+
- 変異 ID のアノテーション (ClinVar, dbSNP, CADD 等) を取得するとき
|
|
17
|
+
- 化合物 ID の変換 (DrugBank ↔ ChEMBL ↔ InChIKey ↔ PubChem) を行うとき
|
|
18
|
+
- バッチクエリで多数の ID を一括アノテーションするとき
|
|
19
|
+
- 複数データベースのメタ情報を統合するとき
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
## 1. MyGene.info 遺伝子アノテーション
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import requests
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
# Base URL (v3) that the MyGene.info helpers in this file prepend to endpoint paths.
MYGENE_API = "https://mygene.info/v3"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def mygene_query(query, fields=None, species="human", size=10):
    """
    Search genes on MyGene.info.

    Parameters:
        query: str — gene symbol, Entrez ID, or keyword
        fields: str | None — comma-separated fields to return
        species: str — "human", "mouse", etc.
        size: int — maximum number of hits requested

    Returns:
        list — the "hits" entries of the response (empty when absent).

    ToolUniverse:
        MyGene_query_genes(q=query, fields=fields, species=species)
    """
    request_args = dict(q=query, species=species, size=size)
    if fields:
        request_args["fields"] = fields

    response = requests.get(f"{MYGENE_API}/query", params=request_args)
    response.raise_for_status()
    body = response.json()

    matches = body.get("hits", [])
    total = body.get("total", 0)
    print(f"MyGene query '{query}': {total} total, {len(matches)} returned")
    return matches
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def mygene_get_gene(gene_id, fields=None):
    """
    Fetch the detailed MyGene.info annotation of a single gene.

    Parameters:
        gene_id: gene identifier placed in the request path
        fields: str | None — comma-separated fields to return

    Returns:
        dict — the decoded JSON annotation.

    ToolUniverse:
        MyGene_get_gene_annotation(gene_id=gene_id, fields=fields)
    """
    request_args = {"fields": fields} if fields else {}

    response = requests.get(f"{MYGENE_API}/gene/{gene_id}", params=request_args)
    response.raise_for_status()
    record = response.json()

    symbol = record.get("symbol", "?")
    full_name = record.get("name", "")
    print(f"MyGene gene {gene_id}: {symbol} ({full_name})")
    return record
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def mygene_batch_query(gene_ids, fields=None, species="human"):
    """
    Batch gene annotation via MyGene.info.

    The ids are posted in chunks because the service only processes a
    limited number of ids per POST and ignores the remainder. An empty
    input returns an empty list without sending a request.

    Parameters:
        gene_ids: iterable — gene identifiers
        fields: str | None — comma-separated fields to return
        species: str — "human", "mouse", etc.

    Returns:
        list — annotation records of all chunks, in request order.

    Raises:
        requests.HTTPError — when any chunk is answered with an error status.

    ToolUniverse:
        MyGene_batch_query(ids=gene_ids, fields=fields, species=species)
    """
    max_ids_per_post = 1000  # documented per-request limit of the batch endpoint
    ids = [str(g) for g in gene_ids]

    data = []
    for start in range(0, len(ids), max_ids_per_post):
        payload = {
            "ids": ",".join(ids[start:start + max_ids_per_post]),
            "species": species,
        }
        if fields:
            payload["fields"] = fields

        resp = requests.post(f"{MYGENE_API}/gene", json=payload)
        resp.raise_for_status()
        chunk = resp.json()
        # The batch endpoint answers with a list; keep anything else intact.
        if isinstance(chunk, list):
            data.extend(chunk)
        else:
            data.append(chunk)

    print(f"MyGene batch: {len(ids)} queried → {len(data)} results")
    return data
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 2. MyVariant.info 変異アノテーション
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# Base URL (v1) that the MyVariant.info helpers in this file prepend to endpoint paths.
MYVARIANT_API = "https://myvariant.info/v1"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def myvariant_get(variant_id, fields=None):
    """
    Fetch the MyVariant.info annotation of a single variant.

    Parameters:
        variant_id: str — HGVS notation (e.g., "chr17:g.7674220C>T")
        fields: str | None — comma-separated fields to return

    Returns:
        dict — the decoded JSON annotation, unchanged.

    Raises:
        requests.HTTPError — when the API answers with an error status.

    ToolUniverse:
        MyVariant_get_variant_annotation(variant_id=variant_id, fields=fields)
    """
    params = {}
    if fields:
        params["fields"] = fields

    resp = requests.get(f"{MYVARIANT_API}/variant/{variant_id}", params=params)
    resp.raise_for_status()
    data = resp.json()

    clinvar = data.get("clinvar") or {}
    cadd = data.get("cadd") or {}

    # Prefer a top-level value, then fall back to the per-record entries
    # under "rcv", which may be a single object or a list of objects.
    significance = None
    if isinstance(clinvar, dict):
        significance = clinvar.get("clinical_significance")
        if significance is None:
            rcv = clinvar.get("rcv") or []
            if isinstance(rcv, dict):
                rcv = [rcv]
            labels = [
                str(entry["clinical_significance"])
                for entry in rcv
                if isinstance(entry, dict) and entry.get("clinical_significance")
            ]
            # dict.fromkeys removes duplicates while keeping first-seen order.
            significance = ", ".join(dict.fromkeys(labels)) or None
    if significance is None:
        significance = "N/A"

    phred = cadd.get("phred", "N/A") if isinstance(cadd, dict) else "N/A"
    print(f"MyVariant {variant_id}: "
          f"ClinVar={significance}, "
          f"CADD={phred}")
    return data
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def myvariant_query(query, fields=None, size=10):
    """
    Search variants on MyVariant.info.

    Parameters:
        query: str — query string sent as `q`
        fields: str | None — comma-separated fields to return
        size: int — maximum number of hits requested

    Returns:
        list — the "hits" entries of the response (empty when absent).

    ToolUniverse:
        MyVariant_query_variants(q=query, fields=fields, size=size)
    """
    request_args = dict(q=query, size=size)
    if fields:
        request_args["fields"] = fields

    response = requests.get(f"{MYVARIANT_API}/query", params=request_args)
    response.raise_for_status()
    body = response.json()

    matches = body.get("hits", [])
    total = body.get("total", 0)
    print(f"MyVariant query '{query}': {total} total")
    return matches
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## 3. MyChem.info 化合物アノテーション
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
# Base URL (v1) that the MyChem.info helpers in this file prepend to endpoint paths.
MYCHEM_API = "https://mychem.info/v1"
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def mychem_get(chem_id, fields=None):
    """
    Fetch the MyChem.info annotation of a single compound.

    Parameters:
        chem_id: str — InChIKey, DrugBank ID, ChEMBL ID, etc.
        fields: str | None — comma-separated fields to return

    Returns:
        dict — the decoded JSON annotation.

    ToolUniverse:
        MyChem_get_chemical_annotation(chem_id=chem_id, fields=fields)
    """
    request_args = {"fields": fields} if fields else {}

    response = requests.get(f"{MYCHEM_API}/chem/{chem_id}", params=request_args)
    response.raise_for_status()
    record = response.json()

    # The console summary shows the DrugBank name when the record has one.
    drug_name = record.get("drugbank", {}).get("name", "N/A")
    print(f"MyChem {chem_id}: {drug_name}")
    return record
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def mychem_query(query, fields=None, size=10):
    """
    Search compounds on MyChem.info.

    Parameters:
        query: str — query string sent as `q`
        fields: str | None — comma-separated fields to return
        size: int — maximum number of hits requested

    Returns:
        list — the "hits" entries of the response (empty when absent).

    ToolUniverse:
        MyChem_query_chemicals(q=query, fields=fields, size=size)
    """
    request_args = dict(q=query, size=size)
    if fields:
        request_args["fields"] = fields

    response = requests.get(f"{MYCHEM_API}/query", params=request_args)
    response.raise_for_status()
    body = response.json()

    matches = body.get("hits", [])
    total = body.get("total", 0)
    print(f"MyChem query '{query}': {total} total")
    return matches
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## 4. クロスデータベース ID マッピング
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
def cross_db_id_mapping(gene_symbol):
    """
    Look up Entrez, Ensembl, UniProt (Swiss-Prot) and RefSeq RNA identifiers
    for a gene symbol in one MyGene.info query.

    Parameters:
        gene_symbol: str — query passed to `mygene_query`

    Returns:
        pandas.DataFrame — one row per hit with symbol, name, entrez_id,
        ensembl_gene, uniprot_swissprot and refseq_rna.

    ToolUniverse (cross-database):
        MyGene_query_genes(q=gene_symbol, fields="entrezgene,ensembl.gene,uniprot,refseq")
    """
    fields = "entrezgene,ensembl.gene,uniprot.Swiss-Prot,refseq.rna,symbol,name"

    def _to_row(hit):
        # "ensembl" may arrive as a list of mappings; use the first one.
        ensembl_entry = hit.get("ensembl", {})
        if isinstance(ensembl_entry, list):
            ensembl_entry = ensembl_entry[0] if ensembl_entry else {}
        uniprot_entry = hit.get("uniprot", {})
        refseq_entry = hit.get("refseq", {})
        return {
            "symbol": hit.get("symbol", ""),
            "name": hit.get("name", ""),
            "entrez_id": hit.get("entrezgene", ""),
            "ensembl_gene": ensembl_entry.get("gene", ""),
            "uniprot_swissprot": uniprot_entry.get("Swiss-Prot", ""),
            "refseq_rna": refseq_entry.get("rna", []),
        }

    rows = [_to_row(hit) for hit in mygene_query(gene_symbol, fields=fields)]
    mapping = pd.DataFrame(rows)
    print(f"ID mapping '{gene_symbol}': {len(mapping)} entries")
    return mapping
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## 5. バッチ統合アノテーション
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
def batch_integrated_annotation(gene_symbols, include_variants=False):
    """
    Collect the top MyGene.info hit for each of several gene symbols.

    Each symbol is queried separately and only its first hit is kept;
    symbols without hits contribute no row.

    Parameters:
        gene_symbols: iterable — gene symbols to look up
        include_variants: bool — accepted but not used by this function

    Returns:
        pandas.DataFrame — one row per symbol that produced a hit.

    ToolUniverse (cross-database):
        MyGene_batch_query(ids=entrez_ids, fields=fields)
        MyVariant_query_variants(q=gene_query) [optional]
    """
    # Step 1: gene annotation, keeping the best hit per symbol
    top_hits = []
    for symbol in gene_symbols:
        symbol_hits = mygene_query(symbol, fields="entrezgene,symbol,name,summary")
        top_hits += symbol_hits[:1]

    annotated = pd.DataFrame(top_hits)
    print(f"Batch annotation: {len(gene_symbols)} genes → {len(annotated)} results")
    return annotated
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## References
|
|
264
|
+
|
|
265
|
+
### Output Files
|
|
266
|
+
|
|
267
|
+
| ファイル | 形式 |
|
|
268
|
+
|---|---|
|
|
269
|
+
| `results/mygene_annotation.json` | JSON |
|
|
270
|
+
| `results/myvariant_annotation.json` | JSON |
|
|
271
|
+
| `results/mychem_annotation.json` | JSON |
|
|
272
|
+
| `results/id_mapping.csv` | CSV |
|
|
273
|
+
|
|
274
|
+
### 利用可能ツール
|
|
275
|
+
|
|
276
|
+
| カテゴリ | 主要ツール | 用途 |
|
|
277
|
+
|---|---|---|
|
|
278
|
+
| BioThings | `MyGene_query_genes` | 遺伝子検索 |
|
|
279
|
+
| BioThings | `MyGene_get_gene_annotation` | 遺伝子詳細 |
|
|
280
|
+
| BioThings | `MyGene_batch_query` | バッチアノテーション |
|
|
281
|
+
| BioThings | `MyVariant_get_variant_annotation` | 変異アノテーション |
|
|
282
|
+
| BioThings | `MyVariant_query_variants` | 変異検索 |
|
|
283
|
+
| BioThings | `MyChem_get_chemical_annotation` | 化合物アノテーション |
|
|
284
|
+
| BioThings | `MyChem_query_chemicals` | 化合物検索 |
|
|
285
|
+
|
|
286
|
+
### 参照スキル
|
|
287
|
+
|
|
288
|
+
| スキル | 関連 |
|
|
289
|
+
|---|---|
|
|
290
|
+
| `scientific-variant-interpretation` | 変異アノテーション |
|
|
291
|
+
| `scientific-gene-expression-transcriptomics` | 遺伝子発現 |
|
|
292
|
+
| `scientific-drug-target-interaction` | DTI 解析 |
|
|
293
|
+
| `scientific-rare-disease-genetics` | 希少疾患 |
|
|
294
|
+
| `scientific-pathway-enrichment` | パスウェイ解析 |
|
|
295
|
+
|
|
296
|
+
### 依存パッケージ
|
|
297
|
+
|
|
298
|
+
`requests`, `pandas`
|