@nahisaho/satori 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +134 -43
- package/package.json +1 -1
- package/src/.github/skills/scientific-advanced-imaging/SKILL.md +382 -0
- package/src/.github/skills/scientific-biomedical-pubtator/SKILL.md +331 -0
- package/src/.github/skills/scientific-cell-line-resources/SKILL.md +258 -0
- package/src/.github/skills/scientific-chembl-assay-mining/SKILL.md +509 -0
- package/src/.github/skills/scientific-deep-chemistry/SKILL.md +350 -0
- package/src/.github/skills/scientific-ebi-databases/SKILL.md +280 -0
- package/src/.github/skills/scientific-ensembl-genomics/SKILL.md +378 -0
- package/src/.github/skills/scientific-expression-comparison/SKILL.md +303 -0
- package/src/.github/skills/scientific-md-simulation/SKILL.md +315 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +329 -0
- package/src/.github/skills/scientific-ontology-enrichment/SKILL.md +340 -0
- package/src/.github/skills/scientific-perturbation-analysis/SKILL.md +297 -0
- package/src/.github/skills/scientific-phylogenetics/SKILL.md +297 -0
- package/src/.github/skills/scientific-preprint-archive/SKILL.md +476 -0
- package/src/.github/skills/scientific-public-health-data/SKILL.md +322 -0
- package/src/.github/skills/scientific-regulatory-genomics/SKILL.md +274 -0
- package/src/.github/skills/scientific-reinforcement-learning/SKILL.md +280 -0
- package/src/.github/skills/scientific-scvi-integration/SKILL.md +344 -0
- package/src/.github/skills/scientific-string-network-api/SKILL.md +376 -0
- package/src/.github/skills/scientific-symbolic-mathematics/SKILL.md +277 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-biomedical-pubtator
|
|
3
|
+
description: |
|
|
4
|
+
バイオメディカルテキストマイニングスキル。PubTator3 API による
|
|
5
|
+
遺伝子・疾患・化合物・変異・種のエンティティ認識、関係抽出、
|
|
6
|
+
バイオメディカル文献アノテーション自動化パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Biomedical PubTator
|
|
10
|
+
|
|
11
|
+
PubTator3 API を活用したバイオメディカル文献エンティティ認識・
|
|
12
|
+
関係抽出パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- PubMed 論文から遺伝子・疾患・化合物のエンティティを自動抽出するとき
|
|
17
|
+
- バイオメディカル NER (Named Entity Recognition) を実行するとき
|
|
18
|
+
- 遺伝子-疾患・薬物-標的の関係を文献から抽出するとき
|
|
19
|
+
- 大規模文献コーパスのバイオアノテーションを行うとき
|
|
20
|
+
- テキストマイニング結果を知識グラフに統合するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. PubTator3 エンティティアノテーション
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import json
|
|
32
|
+
import time
|
|
33
|
+
|
|
34
|
+
# Base URL of the PubTator3 API; the functions in this file append endpoint paths to it.
PUBTATOR_API = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def annotate_pmids(pmids, concepts=None):
    """
    Fetch PubTator3 annotations for a list of PMIDs and flatten them into a table.

    Sends one GET request to ``{PUBTATOR_API}/publications/export/biocjson``
    and walks every document -> passage -> annotation of the JSON reply.

    Parameters:
        pmids: list — PMIDs (e.g., [12345678, 23456789]); each is passed through str().
        concepts: list — entity types sent as the "concepts" query parameter, e.g.
            "gene", "disease", "chemical", "mutation", "species", "cellline".
            Defaults to the first five when None.

    Returns:
        pandas.DataFrame with one row per annotation and the columns
        pmid, text, type, identifier, offset, length, passage_type.
        The columns exist even when no annotation was found, so callers can
        index them without checking for emptiness first.

    Raises:
        requests.HTTPError: from raise_for_status() on an unsuccessful reply.

    ToolUniverse:
        PubTator_annotate(pmids=pmids, concepts=concepts)
        PubTator_search(query=query)
    """
    columns = [
        "pmid", "text", "type", "identifier",
        "offset", "length", "passage_type",
    ]

    if concepts is None:
        concepts = ["gene", "disease", "chemical", "mutation", "species"]

    if len(pmids) == 0:
        # Nothing to look up: skip the network round-trip entirely.
        documents = []
    else:
        params = {
            "pmids": ",".join(str(p) for p in pmids),
            "concepts": ",".join(concepts),
            "format": "biocjson",
        }
        resp = requests.get(
            f"{PUBTATOR_API}/publications/export/biocjson", params=params
        )
        resp.raise_for_status()
        data = resp.json()

        # The reply is read from the "PubTator3" key when it is an object.
        # A bare JSON array is treated as the document list itself (the
        # original wrapped it in another list and then failed on doc.get).
        if isinstance(data, dict):
            documents = data.get("PubTator3", [])
        elif isinstance(data, list):
            documents = data
        else:
            documents = []

    # Parse annotations
    all_annotations = []
    for doc in documents:
        if not isinstance(doc, dict):
            continue
        pmid = doc.get("pmid", doc.get("id", ""))
        for passage in doc.get("passages", []):
            passage_type = passage.get("infons", {}).get("type", "")
            for annotation in passage.get("annotations", []):
                infons = annotation.get("infons", {})
                # "locations" may be missing, null or an empty list; fall back
                # to an empty mapping so offset/length default to "".
                locations = annotation.get("locations") or [{}]
                first_location = locations[0]
                all_annotations.append({
                    "pmid": pmid,
                    "text": annotation.get("text", ""),
                    "type": infons.get("type", ""),
                    "identifier": infons.get("identifier", ""),
                    "offset": first_location.get("offset", ""),
                    "length": first_location.get("length", ""),
                    "passage_type": passage_type,
                })

    df = pd.DataFrame(all_annotations, columns=columns)
    type_counts = df["type"].value_counts().to_dict() if not df.empty else {}
    print(f"PubTator annotation: {len(pmids)} PMIDs → "
          f"{len(df)} entities {type_counts}")
    return df
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## 2. PubTator3 テキスト検索
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
def search_pubtator(query, max_results=100):
    """
    Run a PubTator3 text search and return the hits as a table.

    Issues one GET request to ``{PUBTATOR_API}/search/`` sorted by score,
    with a page size of at most 100.

    Parameters:
        query: str — search text (gene, disease or chemical name)
        max_results: int — requested number of hits; values above 100 are
            capped because only a single page is fetched

    Returns:
        pandas.DataFrame with the columns pmid, title, journal, year, score,
        genes, diseases, chemicals, mutations (one row per entry of the
        reply's "results" list).

    Raises:
        requests.HTTPError: from raise_for_status() on an unsuccessful reply.
    """
    text_fields = ("pmid", "title", "journal", "year")
    entity_fields = ("genes", "diseases", "chemicals", "mutations")

    response = requests.get(
        f"{PUBTATOR_API}/search/",
        params={
            "text": query,
            "sort": "score",
            "page_size": min(max_results, 100),
        },
    )
    response.raise_for_status()
    payload = response.json()

    rows = []
    for hit in payload.get("results", []):
        # Missing text fields default to "", score to 0, entity lists to [].
        row = {field: hit.get(field, "") for field in text_fields}
        row["score"] = hit.get("score", 0)
        for field in entity_fields:
            row[field] = hit.get(field, [])
        rows.append(row)

    total = payload.get("count", 0)
    df = pd.DataFrame(rows)
    print(f"PubTator search '{query}': {total} total, {len(df)} returned")
    return df
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## 3. エンティティ関係抽出
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def extract_entity_relations(pmids, relation_types=None):
    """
    Build candidate entity relations from PubTator3 annotations by co-occurrence.

    Annotations are fetched with annotate_pmids(); for every PMID each
    annotation of the first entity type is paired with each annotation of the
    second entity type. Pairing is done per article (PMID), not per passage,
    and repeated mentions yield repeated rows.

    Parameters:
        pmids: list — PMIDs to annotate
        relation_types: list — optional filter; None or an empty list keeps
            every supported type. Supported codes:
            "GDA" (gene-disease), "CGA" (chemical-gene), "CDA" (chemical-disease).
            Other codes (e.g. "PPI") are accepted but produce no rows.

    Returns:
        pandas.DataFrame with the columns pmid, relation_type, entity1_type,
        entity1_text, entity1_id, entity2_type, entity2_text, entity2_id.
        The columns exist even when no relation was found.
    """
    # (relation code, entity1 type, entity2 type), in per-PMID emission order.
    pair_specs = (
        ("GDA", "Gene", "Disease"),
        ("CGA", "Chemical", "Gene"),
        ("CDA", "Chemical", "Disease"),
    )
    columns = [
        "pmid", "relation_type",
        "entity1_type", "entity1_text", "entity1_id",
        "entity2_type", "entity2_text", "entity2_id",
    ]

    df_annotations = annotate_pmids(pmids)

    relations = []
    # Guard first: an annotation table without rows may have no "pmid" column.
    if not df_annotations.empty:
        for pmid in df_annotations["pmid"].unique():
            pmid_df = df_annotations[df_annotations["pmid"] == pmid]

            # (text, identifier) pairs per entity type, computed once per PMID.
            mentions = {}
            for entity_type in ("Gene", "Disease", "Chemical"):
                subset = pmid_df[pmid_df["type"] == entity_type]
                mentions[entity_type] = list(
                    zip(subset["text"], subset["identifier"])
                )

            for code, type1, type2 in pair_specs:
                if relation_types and code not in relation_types:
                    continue
                for text1, id1 in mentions[type1]:
                    for text2, id2 in mentions[type2]:
                        relations.append({
                            "pmid": pmid,
                            "relation_type": code,
                            "entity1_type": type1,
                            "entity1_text": text1,
                            "entity1_id": id1,
                            "entity2_type": type2,
                            "entity2_text": text2,
                            "entity2_id": id2,
                        })

    rel_df = pd.DataFrame(relations, columns=columns)
    rel_counts = rel_df["relation_type"].value_counts().to_dict() if not rel_df.empty else {}
    print(f"Entity relations: {len(rel_df)} total {rel_counts}")
    return rel_df
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## 4. バイオアノテーション集計ダッシュボード
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
def annotation_summary_dashboard(pmids, output_prefix="pubtator"):
    """
    Summarize PubTator annotations for a PMID list and save a figure plus a CSV.

    Fetches annotations with annotate_pmids(), draws a three-panel figure
    (entity-type counts, entities per article, entity types per article),
    prints the ten most frequent Gene/Disease/Chemical texts, and writes:
        figures/{output_prefix}_dashboard.png
        results/{output_prefix}_annotations.csv
    The target directories are created when they do not exist yet.

    Parameters:
        pmids: list — PMIDs to annotate
        output_prefix: str — prefix of the output file names

    Returns:
        dict with total_pmids, total_annotations, entity_types and
        unique_entities; an empty dict when no annotations were found
        (nothing is written in that case).
    """
    import os

    import matplotlib.pyplot as plt

    # Get annotations
    df = annotate_pmids(pmids)
    if df.empty:
        print("No annotations found")
        return {}

    fig_path = f"figures/{output_prefix}_dashboard.png"
    csv_path = f"results/{output_prefix}_annotations.csv"
    # savefig/to_csv do not create parent directories; do it up front so a
    # fresh working directory does not fail after all the plotting work.
    for path in (fig_path, csv_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: entity type counts
    type_counts = df["type"].value_counts()
    type_counts.plot(kind="bar", ax=axes[0], color="#2196F3")
    axes[0].set_title("Entity Type Distribution")
    axes[0].set_ylabel("Count")

    # Console report: top entities per type (not plotted)
    for entity_type in ["Gene", "Disease", "Chemical"]:
        sub = df[df["type"] == entity_type]
        top = sub["text"].value_counts().head(10)
        if not top.empty:
            print(f"\nTop {entity_type}s: {top.to_dict()}")

    # Panel 2: number of annotations per article
    per_article = df.groupby("pmid")["type"].count()
    per_article.hist(ax=axes[1], bins=20, color="#4CAF50")
    axes[1].set_title("Entities per Article")
    axes[1].set_xlabel("Number of entities")

    # Panel 3: per-article counts split by entity type
    pivot = df.groupby(["pmid", "type"]).size().unstack(fill_value=0)
    pivot.plot(kind="box", ax=axes[2])
    axes[2].set_title("Entity Types per Article")

    fig.tight_layout()
    fig.savefig(fig_path, dpi=150, bbox_inches="tight")
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

    # Save results
    df.to_csv(csv_path, index=False)

    summary = {
        "total_pmids": df["pmid"].nunique(),
        "total_annotations": len(df),
        "entity_types": type_counts.to_dict(),
        "unique_entities": df.groupby("type")["text"].nunique().to_dict(),
    }
    print(f"\nSummary: {summary}")
    return summary
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## 5. 知識グラフ構築用エンティティネットワーク
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
def build_entity_network(pmids, min_cooccurrence=2):
    """
    Build an undirected entity co-occurrence graph from PubTator relations.

    Relations come from extract_entity_relations(). Nodes are labelled
    "<entity type>:<entity text>"; every relation row adds one to the count
    of its (order-independent) node pair, and pairs whose count reaches
    min_cooccurrence become edges carrying that count as "weight".

    Parameters:
        pmids: list — PMIDs to analyze
        min_cooccurrence: int — minimum pair count required for an edge

    Returns:
        networkx.Graph — empty when no relations were found.
    """
    import networkx as nx
    from collections import Counter

    relations = extract_entity_relations(pmids)
    if relations.empty:
        return nx.Graph()

    # Tally node pairs; sorting the two labels makes the key order-independent.
    label_columns = zip(
        relations["entity1_type"],
        relations["entity1_text"],
        relations["entity2_type"],
        relations["entity2_text"],
    )
    pair_counts = Counter()
    for type1, text1, type2, text2 in label_columns:
        pair = tuple(sorted([f"{type1}:{text1}", f"{type2}:{text2}"]))
        pair_counts[pair] += 1

    # Keep only pairs seen often enough.
    G = nx.Graph()
    for (left, right), seen in pair_counts.items():
        if seen < min_cooccurrence:
            continue
        G.add_edge(left, right, weight=seen)

    print(f"Entity network: {G.number_of_nodes()} nodes, "
          f"{G.number_of_edges()} edges "
          f"(min cooccurrence = {min_cooccurrence})")
    return G
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
307
|
+
## 利用可能ツール
|
|
308
|
+
|
|
309
|
+
| ToolUniverse カテゴリ | 主なツール |
|
|
310
|
+
|---|---|
|
|
311
|
+
| `pubtator` | `PubTator_annotate`, `PubTator_search` |
|
|
312
|
+
|
|
313
|
+
## パイプライン出力
|
|
314
|
+
|
|
315
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
316
|
+
|---|---|---|
|
|
317
|
+
| `results/pubtator_annotations.csv` | エンティティアノテーション | → text-mining-nlp, knowledge-graph |
|
|
318
|
+
| `results/entity_relations.csv` | エンティティ間関係 (`extract_entity_relations` の戻り値を `to_csv` で保存) | → network-analysis, disease-research |
|
|
319
|
+
| `results/entity_network.graphml` | エンティティ共起ネットワーク (`build_entity_network` の戻り値を `nx.write_graphml` で保存) | → graph-neural-networks |
|
|
320
|
+
| `figures/pubtator_dashboard.png` | アノテーション集計 | → publication-figures |
|
|
321
|
+
|
|
322
|
+
## パイプライン統合
|
|
323
|
+
|
|
324
|
+
```
|
|
325
|
+
literature-search ──→ biomedical-pubtator ──→ text-mining-nlp
|
|
326
|
+
(PubMed/OpenAlex) (PubTator NER) (KG 構築)
|
|
327
|
+
│
|
|
328
|
+
├──→ disease-research (GDA 関係)
|
|
329
|
+
├──→ drug-target-profiling (CGA 関係)
|
|
330
|
+
└──→ preprint-archive (プレプリント NER)
|
|
331
|
+
```
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-cell-line-resources
|
|
3
|
+
description: |
|
|
4
|
+
細胞株リソーススキル。Cellosaurus 細胞株データベース検索、
|
|
5
|
+
STR プロファイルマッチング、コンタミネーション検出、
|
|
6
|
+
細胞株メタデータ (由来組織・疾患・種) 取得パイプライン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Cell Line Resources
|
|
10
|
+
|
|
11
|
+
Cellosaurus を中心とした細胞株リソースデータベースアクセス
|
|
12
|
+
パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 細胞株の正式名称・アクセッション番号を確認するとき
|
|
17
|
+
- 細胞株の由来 (組織・疾患・種) を調べるとき
|
|
18
|
+
- STR プロファイルで細胞株の同一性を検証するとき
|
|
19
|
+
- 細胞株のコンタミネーション (ミスアイデンティフィケーション) を確認するとき
|
|
20
|
+
- 実験に使用する細胞株の参考文献・データベースリンクを取得するとき
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
## 1. Cellosaurus 細胞株検索
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import requests
|
|
30
|
+
import pandas as pd
|
|
31
|
+
|
|
32
|
+
# Base URL of the Cellosaurus API; the functions in this file append endpoint paths to it.
CELLOSAURUS_API = "https://api.cellosaurus.org"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def search_cellosaurus(query, limit=25):
    """
    Search Cellosaurus for cell lines and return a summary table.

    Issues one GET request to ``{CELLOSAURUS_API}/search/cell-line`` and reads
    the hits from ``result -> cellLineList -> cellLine`` of the JSON reply.
    NOTE(review): that response layout is what this code assumes — confirm it
    against the live Cellosaurus API.

    Parameters:
        query: str — cell line name (e.g., "HeLa", "MCF-7", "A549")
        limit: int — maximum number of hits, sent as the "rows" parameter

    Returns:
        pandas.DataFrame with the columns accession, name, synonyms, category,
        sex, species, diseases, derived_from_site, is_contaminated,
        is_problematic (one row per hit).

    Raises:
        requests.HTTPError: from raise_for_status() on an unsuccessful reply.

    ToolUniverse:
        Cellosaurus_search(query=query)
        Cellosaurus_get_cell_line(accession=accession)
        Cellosaurus_get_str_profile(accession=accession)
    """
    response = requests.get(
        f"{CELLOSAURUS_API}/search/cell-line",
        params={"q": query, "rows": limit, "format": "json"},
    )
    response.raise_for_status()
    payload = response.json()

    rows = []
    for hit in payload.get("result", {}).get("cellLineList", []):
        record = hit.get("cellLine", {})

        synonyms = []
        for synonym in record.get("synonymList", []):
            synonyms.append(synonym.get("value", ""))

        diseases = []
        for disease in record.get("diseaseList", []):
            diseases.append(disease.get("terminology", {}).get("value", ""))

        rows.append({
            "accession": record.get("accession", ""),
            "name": record.get("name", ""),
            "synonyms": synonyms,
            "category": record.get("category", ""),
            "sex": record.get("sex", ""),
            "species": record.get("species", {}).get("value", ""),
            "diseases": diseases,
            "derived_from_site": record.get("derivedFromSite", {}).get("value", ""),
            "is_contaminated": record.get("isContaminated", False),
            "is_problematic": record.get("isProblematic", False),
        })

    df = pd.DataFrame(rows)
    print(f"Cellosaurus search '{query}': {len(df)} cell lines")
    return df
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## 2. 細胞株詳細情報取得
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
def get_cellosaurus_entry(accession):
    """
    Fetch one Cellosaurus record and reshape it into a plain dict.

    Issues one GET request to ``{CELLOSAURUS_API}/cell-line/{accession}`` and
    reads the record from the "cellLine" key of the JSON reply.

    Parameters:
        accession: str — Cellosaurus accession (e.g., "CVCL_0030")

    Returns:
        dict with accession, name, category, sex, age, species, diseases
        (list of {name, accession}), derived_from_site, is_contaminated,
        contamination_comment, str_profile (the record's "strList" as is),
        references (list of {pmid, title}) and cross_references
        (list of {database, accession}). Missing fields fall back to "",
        False or an empty list.

    Raises:
        requests.HTTPError: from raise_for_status() on an unsuccessful reply.
    """
    url = f"{CELLOSAURUS_API}/cell-line/{accession}"
    response = requests.get(url, params={"format": "json"})
    response.raise_for_status()
    record = response.json().get("cellLine", {})

    entry = {}
    for key in ("accession", "name", "category", "sex", "age"):
        entry[key] = record.get(key, "")
    entry["species"] = record.get("species", {}).get("value", "")

    diseases = []
    for disease in record.get("diseaseList", []):
        diseases.append({
            "name": disease.get("terminology", {}).get("value", ""),
            "accession": disease.get("terminology", {}).get("accession", ""),
        })
    entry["diseases"] = diseases

    entry["derived_from_site"] = record.get("derivedFromSite", {}).get("value", "")
    entry["is_contaminated"] = record.get("isContaminated", False)
    entry["contamination_comment"] = record.get("contaminationComment", "")
    entry["str_profile"] = record.get("strList", [])

    references = []
    for reference in record.get("referenceList", []):
        references.append({
            "pmid": reference.get("pubmedId", ""),
            "title": reference.get("title", ""),
        })
    entry["references"] = references

    cross_references = []
    for xref in record.get("xrefList", []):
        cross_references.append({
            "database": xref.get("database", ""),
            "accession": xref.get("accession", ""),
        })
    entry["cross_references"] = cross_references

    print(f"Cellosaurus {accession}: {entry['name']} "
          f"({entry['species']}, {entry['category']})")
    return entry
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## 3. STR プロファイル検証
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
def check_str_profile(accession, str_data=None):
    """
    Compare a measured STR (Short Tandem Repeat) profile with the Cellosaurus reference.

    The reference profile is taken from get_cellosaurus_entry(accession). Only
    markers present in both profiles are compared; a marker counts as matched
    when its allele sets are equal. Alleles may be given as a comma-separated
    string ("9,12" or "9, 12") or as a list/tuple/set; surrounding whitespace
    and empty tokens are ignored. The profile is reported as authenticated
    when at least 80% of the compared markers match.

    Parameters:
        accession: str — Cellosaurus accession
        str_data: dict — measured STR data {marker: alleles}; when None the
            reference profile is returned instead of a comparison

    Returns:
        dict — one of:
          {"match": None, "message": ...} when the entry has no STR profile;
          {"reference_str": {marker: alleles}, "marker_count": int} when
            str_data is None;
          {"match_percentage", "matched", "total_compared",
           "is_authenticated", "details"} otherwise.
    """
    def _allele_set(alleles):
        # Normalize either a delimited string or a collection to a set of
        # trimmed tokens so formatting differences do not cause mismatches.
        if isinstance(alleles, (list, tuple, set, frozenset)):
            tokens = [str(allele) for allele in alleles]
        else:
            tokens = str(alleles).split(",")
        return {token.strip() for token in tokens if token.strip()}

    entry = get_cellosaurus_entry(accession)
    ref_str = entry.get("str_profile", [])

    if not ref_str:
        print(f"WARNING: {accession} has no STR profile in Cellosaurus")
        return {"match": None, "message": "No reference STR profile available"}

    ref_markers = {
        marker.get("marker", ""): marker.get("alleles", "")
        for marker in ref_str
    }

    if str_data is None:
        print(f"Reference STR for {accession}: {len(ref_markers)} markers")
        return {"reference_str": ref_markers, "marker_count": len(ref_markers)}

    # Calculate match percentage over the markers present in both profiles
    matched = 0
    total = 0
    details = []
    for marker, ref_alleles in ref_markers.items():
        if marker not in str_data:
            continue
        total += 1
        measured = str_data[marker]
        if _allele_set(ref_alleles) == _allele_set(measured):
            matched += 1
            details.append({"marker": marker, "match": True})
        else:
            details.append({
                "marker": marker, "match": False,
                "reference": ref_alleles, "measured": measured,
            })

    # No shared markers means nothing could be verified: report 0%.
    match_pct = (matched / total * 100) if total > 0 else 0
    result = {
        "match_percentage": match_pct,
        "matched": matched,
        "total_compared": total,
        "is_authenticated": match_pct >= 80,
        "details": details,
    }

    status = "PASS" if result["is_authenticated"] else "FAIL"
    print(f"STR verification {accession}: {match_pct:.1f}% match → {status}")
    return result
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## 4. コンタミネーション・問題細胞株チェック
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
def check_contamination_status(cell_line_names):
    """
    Look up each cell line name in Cellosaurus and report its contamination flags.

    Runs search_cellosaurus(name, limit=1) per name and uses the first hit.
    Names without a hit are kept in the output with found=False and both
    flags set to None.

    Parameters:
        cell_line_names: list — cell line names to check

    Returns:
        pandas.DataFrame with one row per input name: name, found,
        is_contaminated, is_problematic, and for found names also accession,
        official_name, species, diseases.
    """
    records = []
    for name in cell_line_names:
        hits = search_cellosaurus(name, limit=1)
        if not hits.empty:
            top_hit = hits.iloc[0]
            record = {
                "name": name,
                "found": True,
                "accession": top_hit.get("accession", ""),
                "official_name": top_hit.get("name", ""),
                "is_contaminated": top_hit.get("is_contaminated", False),
                "is_problematic": top_hit.get("is_problematic", False),
                "species": top_hit.get("species", ""),
                "diseases": top_hit.get("diseases", []),
            }
        else:
            record = {
                "name": name, "found": False,
                "is_contaminated": None, "is_problematic": None,
            }
        records.append(record)

    df = pd.DataFrame(records)

    # An empty input produces a frame without columns, hence the membership checks.
    contaminated = 0
    if "is_contaminated" in df:
        contaminated = df["is_contaminated"].sum()
    problematic = 0
    if "is_problematic" in df:
        problematic = df["is_problematic"].sum()

    print(f"Cell line check: {len(cell_line_names)} lines, "
          f"{contaminated} contaminated, {problematic} problematic")
    return df
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## 利用可能ツール
|
|
236
|
+
|
|
237
|
+
| ToolUniverse カテゴリ | 主なツール |
|
|
238
|
+
|---|---|
|
|
239
|
+
| `cellosaurus` | `Cellosaurus_search`, `Cellosaurus_get_cell_line`, `Cellosaurus_get_str_profile` |
|
|
240
|
+
|
|
241
|
+
## パイプライン出力
|
|
242
|
+
|
|
243
|
+
| 出力ファイル | 説明 | 連携先スキル |
|
|
244
|
+
|---|---|---|
|
|
245
|
+
| `results/cell_lines.csv` | 細胞株メタデータ | → cancer-genomics, precision-oncology |
|
|
246
|
+
| `results/str_verification.json` | STR 検証結果 | → lab-automation, lab-data-management |
|
|
247
|
+
| `results/contamination_report.json` | コンタミレポート | → research-methodology |
|
|
248
|
+
|
|
249
|
+
## パイプライン統合
|
|
250
|
+
|
|
251
|
+
```
|
|
252
|
+
cancer-genomics ──→ cell-line-resources ──→ lab-automation
|
|
253
|
+
(COSMIC/DepMap) (Cellosaurus STR) (プロトコル管理)
|
|
254
|
+
│
|
|
255
|
+
├──→ precision-oncology (腫瘍細胞株)
|
|
256
|
+
├──→ disease-research (疾患モデル)
|
|
257
|
+
└──→ human-protein-atlas (発現データ)
|
|
258
|
+
```
|