@nahisaho/satori 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -30
- package/package.json +1 -1
- package/src/.github/skills/scientific-advanced-imaging/SKILL.md +382 -0
- package/src/.github/skills/scientific-chembl-assay-mining/SKILL.md +509 -0
- package/src/.github/skills/scientific-data-submission/SKILL.md +357 -0
- package/src/.github/skills/scientific-deep-chemistry/SKILL.md +350 -0
- package/src/.github/skills/scientific-ensembl-genomics/SKILL.md +378 -0
- package/src/.github/skills/scientific-expression-comparison/SKILL.md +303 -0
- package/src/.github/skills/scientific-gpu-singlecell/SKILL.md +296 -0
- package/src/.github/skills/scientific-marine-ecology/SKILL.md +429 -0
- package/src/.github/skills/scientific-md-simulation/SKILL.md +315 -0
- package/src/.github/skills/scientific-model-organism-db/SKILL.md +329 -0
- package/src/.github/skills/scientific-nci60-screening/SKILL.md +307 -0
- package/src/.github/skills/scientific-perturbation-analysis/SKILL.md +297 -0
- package/src/.github/skills/scientific-plant-biology/SKILL.md +321 -0
- package/src/.github/skills/scientific-rrna-taxonomy/SKILL.md +379 -0
- package/src/.github/skills/scientific-scatac-signac/SKILL.md +300 -0
- package/src/.github/skills/scientific-scvi-integration/SKILL.md +344 -0
- package/src/.github/skills/scientific-string-network-api/SKILL.md +376 -0
- package/src/.github/skills/scientific-toxicology-env/SKILL.md +309 -0
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-data-submission
|
|
3
|
+
description: |
|
|
4
|
+
科学データ登録・アーカイブスキル。GenBank/SRA 配列登録・
|
|
5
|
+
ENA 配列アーカイブ・GEO 発現データ登録・BioProject/BioSample
|
|
6
|
+
メタデータ管理・FAIR 原則準拠データ共有。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Data Submission
|
|
10
|
+
|
|
11
|
+
GenBank / SRA / ENA / GEO / BioProject を活用した科学データの
|
|
12
|
+
登録・アーカイブパイプラインを提供する。FAIR 原則に準拠した
|
|
13
|
+
配列データ・発現データ・メタデータの公開準備。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 配列データを GenBank/DDBJ/ENA に登録するとき
|
|
18
|
+
- RNA-seq/WGS データを SRA にアーカイブするとき
|
|
19
|
+
- GEO にマイクロアレイ/RNA-seq 発現データを登録するとき
|
|
20
|
+
- BioProject/BioSample でメタデータを構造化するとき
|
|
21
|
+
- 論文投稿時にデータアクセッション番号が必要なとき
|
|
22
|
+
- FAIR 原則 (Findable, Accessible, Interoperable, Reusable) に準拠するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. BioProject/BioSample メタデータ作成
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import json
|
|
32
|
+
import pandas as pd
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from datetime import date
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def create_bioproject_metadata(title, description, organism,
                               data_type="Genome Sequencing",
                               relevance="Medical"):
    """
    Build a BioProject metadata record as a nested dictionary.

    Parameters:
        title: str — project title
        description: str — project description
        organism: str — organism name
        data_type: str — data type; stored as both the method type and
            the objective data type
        relevance: str — relevance field of the project descriptor

    Returns:
        dict — the record, rooted at the "Project" key, with the
        placeholder accession "PRJNA_PENDING".
    """
    descriptor = {
        "Title": title,
        "Description": description,
        "Relevance": relevance,
    }
    submission = {
        "Target": {
            "Organism": {"OrganismName": organism},
        },
        "Method": {"MethodType": data_type},
        "Objectives": {"Data": {"DataType": data_type}},
    }
    record = {
        "Project": {
            "ProjectID": {"ArchiveID": {"accession": "PRJNA_PENDING"}},
            "Descriptor": descriptor,
            "ProjectType": {"ProjectTypeSubmission": submission},
        }
    }

    # Print a short human-readable summary, one item per line.
    summary_lines = (
        "BioProject metadata created:",
        f" Title: {title}",
        f" Organism: {organism}",
        f" Data type: {data_type}",
    )
    print("\n".join(summary_lines))
    return record
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def create_biosample_table(samples, organism, package="Generic"):
    """
    Build a BioSample attribute table as a pandas DataFrame.

    Parameters:
        samples: list[dict] — per-sample information. The keys "name",
            "date" and "location" are renamed to the BioSample columns
            "sample_name", "collection_date" and "geo_loc_name"; every
            other key is copied through as its own column (so a key such
            as "organism" overrides the shared value for that sample).
            None is treated as an empty list.
        organism: str — organism name applied to every sample
        package: str — BioSample package name (only reported in the
            printed summary)

    Returns:
        pd.DataFrame — one row per sample. The six required columns
        always come first, and are present even when there are no
        samples.
    """
    required_fields = [
        "sample_name", "organism", "collection_date",
        "geo_loc_name", "tissue", "description",
    ]
    # Input keys that are mapped onto a differently named column above and
    # therefore must not be copied through verbatim.
    renamed_keys = {"name", "date", "location"}

    rows = []
    for s in samples or []:
        row = {
            "sample_name": s.get("name", ""),
            "organism": organism,
            # A missing collection date defaults to today's date.
            "collection_date": s.get("date", str(date.today())),
            "geo_loc_name": s.get("location", "not collected"),
            "tissue": s.get("tissue", "not applicable"),
            "description": s.get("description", ""),
        }
        row.update({k: v for k, v in s.items() if k not in renamed_keys})
        rows.append(row)

    if rows:
        df = pd.DataFrame(rows)
    else:
        # Without rows pandas cannot infer columns; emit the template header
        # so the result is still a usable (empty) submission table.
        df = pd.DataFrame(columns=required_fields)

    print(f"BioSample table: {len(df)} samples, package='{package}'")
    return df
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## 2. GenBank 配列登録準備
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def prepare_genbank_submission(sequences, annotations, output_dir="submission"):
    """
    Write the input files for a GenBank sequence submission.

    Three files are created inside ``output_dir``:
    ``sequences.fsa`` (FASTA, wrapped at 80 characters),
    ``sequences.tbl`` (feature table) and ``template.json``.

    Parameters:
        sequences: dict — {seq_id: sequence_string}
        annotations: dict — {seq_id: {gene, product, organism, ...}};
            may be empty. Entries whose seq_id has no sequence are
            skipped in the feature table because their feature span
            cannot be computed.
        output_dir: str — output directory (created if missing)

    Returns:
        dict — paths (as strings) under the keys "fasta", "tbl" and
        "template".
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # FASTA file
    fasta_path = output_dir / "sequences.fsa"
    with open(fasta_path, "w", encoding="utf-8") as f:
        for seq_id, seq in sequences.items():
            organism = annotations.get(seq_id, {}).get(
                "organism", "Unknown organism")
            f.write(f">{seq_id} [organism={organism}]\n")
            # Wrap the sequence at 80 characters per line.
            for i in range(0, len(seq), 80):
                f.write(seq[i:i + 80] + "\n")

    # Feature table
    tbl_path = output_dir / "sequences.tbl"
    with open(tbl_path, "w", encoding="utf-8") as f:
        for seq_id, ann in annotations.items():
            if seq_id not in sequences:
                print(f"Skipping annotation without sequence: {seq_id}")
                continue
            seq_len = len(sequences[seq_id])
            f.write(f">Feature {seq_id}\n")
            # Each feature spans the whole sequence (1..length).
            if "gene" in ann:
                f.write(f"1\t{seq_len}\tgene\n")
                f.write(f"\t\t\tgene\t{ann['gene']}\n")
            if "product" in ann:
                f.write(f"1\t{seq_len}\tCDS\n")
                f.write(f"\t\t\tproduct\t{ann['product']}\n")

    # Template: the source organism comes from the first annotation, or is
    # left empty when no annotations were supplied.
    first_annotation = next(iter(annotations.values()), {})
    template = {
        "source": {
            "organism": first_annotation.get("organism", ""),
            "mol_type": "genomic DNA",
        },
        "submitter": {
            "name": "AutoSubmission",
        },
    }

    template_path = output_dir / "template.json"
    with open(template_path, "w", encoding="utf-8") as f:
        json.dump(template, f, indent=2)

    print(f"GenBank submission prepared: {len(sequences)} sequences")
    print(f" FASTA: {fasta_path}")
    print(f" Feature Table: {tbl_path}")
    return {
        "fasta": str(fasta_path),
        "tbl": str(tbl_path),
        "template": str(template_path),
    }
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 3. SRA メタデータ & アップロード
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
def prepare_sra_metadata(samples, library_strategy="WGS",
                         library_source="GENOMIC",
                         platform="ILLUMINA",
                         instrument_model="Illumina NovaSeq 6000"):
    """
    Build the SRA run metadata table as a pandas DataFrame.

    Parameters:
        samples: list[dict] — per-run info; the keys read are biosample,
            title, library_id, selection, file_r1 and file_r2
        library_strategy: str — WGS/RNA-Seq/AMPLICON/etc.
        library_source: str — GENOMIC/TRANSCRIPTOMIC/etc.
        platform: str — ILLUMINA/OXFORD_NANOPORE/etc.
        instrument_model: str — sequencing instrument model

    Returns:
        pd.DataFrame — one row per entry of ``samples``.
    """

    def _run_record(sample):
        # The library ID falls back to the title; the layout is "paired"
        # only when a second read file is given.
        title = sample.get("title", "")
        layout = "single"
        if sample.get("file_r2"):
            layout = "paired"
        return {
            "biosample_accession": sample.get("biosample", "SAMN_PENDING"),
            "library_ID": sample.get("library_id", title),
            "title": title,
            "library_strategy": library_strategy,
            "library_source": library_source,
            "library_selection": sample.get("selection", "RANDOM"),
            "library_layout": layout,
            "platform": platform,
            "instrument_model": instrument_model,
            "filetype": "fastq",
            "filename": sample.get("file_r1", ""),
            "filename2": sample.get("file_r2", ""),
        }

    table = pd.DataFrame([_run_record(sample) for sample in samples])
    print(f"SRA metadata: {len(table)} runs, strategy={library_strategy}")
    return table
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def sra_upload_ascp(files, destination, ascp_key=None):
    """
    Upload files to SRA by invoking the Aspera ``ascp`` command.

    Parameters:
        files: list — files to upload, sent one at a time
        destination: str — upload destination passed to ascp
        ascp_key: str — path to the Aspera SSH key; when None, the key
            under ``~/.aspera/connect/etc`` is used

    Raises:
        subprocess.CalledProcessError: if an ascp call exits non-zero
            (the calls run with check=True).
    """
    import subprocess

    key_path = ascp_key
    if key_path is None:
        key_path = Path.home() / ".aspera/connect/etc/asperaweb_id_dsa.openssh"

    # Options shared by every transfer; only the source file varies.
    common_args = [
        "ascp", "-i", str(key_path),
        "-QT", "-l", "300m", "-k", "1",
    ]

    for source in files:
        command = [*common_args, str(source), destination]
        print(f"Uploading: {source}")
        subprocess.run(command, check=True)

    print(f"SRA upload complete: {len(files)} files")
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## 4. GEO 発現データ登録
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
def prepare_geo_submission(expression_matrix, sample_metadata,
                           platform="GPL16791", output_dir="geo_submission"):
    """
    Write the files for a GEO submission in SOFT format.

    Three files are created inside ``output_dir``:
    ``submission.soft`` (series header plus one sample section per
    matrix column), ``expression_matrix.txt`` (tab-separated matrix) and
    ``raw_files.txt`` (one ``<sample>.fastq.gz`` name per column).

    Parameters:
        expression_matrix: pd.DataFrame — gene x sample matrix; each
            column name is used as a sample ID
        sample_metadata: pd.DataFrame — sample metadata, matched to the
            matrix columns through its "sample_id" column. If that
            column is absent, no sample characteristics are written.
        platform: str — GEO platform ID
        output_dir: str — output directory (created if missing)

    Returns:
        dict — paths (as strings) under the keys "soft", "matrix" and
        "raw_files".
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    has_sample_ids = "sample_id" in sample_metadata.columns

    # SOFT template
    soft_path = output_dir / "submission.soft"
    with open(soft_path, "w", encoding="utf-8") as f:
        # Series section; title and summary are left blank in the template.
        f.write("^SERIES\n")
        f.write("!Series_title = \n")
        f.write("!Series_summary = \n")
        f.write(f"!Series_platform_id = {platform}\n")

        # Sample sections
        for col in expression_matrix.columns:
            f.write(f"\n^SAMPLE = {col}\n")
            f.write(f"!Sample_title = {col}\n")
            if not has_sample_ids:
                continue
            meta = sample_metadata[sample_metadata["sample_id"] == col]
            if len(meta) == 0:
                continue
            # Only the first matching metadata row is used.
            for key, val in meta.iloc[0].items():
                if key == "sample_id":
                    continue
                # Skip missing values instead of writing the literal "nan".
                if pd.api.types.is_scalar(val) and pd.isna(val):
                    continue
                f.write(f"!Sample_characteristics_ch1 = {key}: {val}\n")

    # Matrix file
    matrix_path = output_dir / "expression_matrix.txt"
    expression_matrix.to_csv(matrix_path, sep="\t")

    # Raw data file list
    raw_files_path = output_dir / "raw_files.txt"
    with open(raw_files_path, "w", encoding="utf-8") as f:
        f.writelines(f"{col}.fastq.gz\n" for col in expression_matrix.columns)

    print(f"GEO submission: {expression_matrix.shape[1]} samples, "
          f"{expression_matrix.shape[0]} genes")
    return {
        "soft": str(soft_path),
        "matrix": str(matrix_path),
        "raw_files": str(raw_files_path),
    }
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
## 5. FAIR データ検証チェックリスト
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
def fair_checklist(submission_package):
    """
    Evaluate a submission package against a FAIR-principles checklist.

    Most checks pass when the corresponding key of
    ``submission_package`` holds a truthy value; F3, A1.1 and A2 are
    always recorded as passed.

    Parameters:
        submission_package: dict — submission package information

    Returns:
        dict — {principle: {check_name: bool}} for the four principles
        Findable, Accessible, Interoperable and Reusable. A summary is
        also printed.
    """

    def _present(key):
        # A check passes when the package holds a truthy value for the key.
        return bool(submission_package.get(key))

    checks = {
        "Findable": {
            "F1_persistent_id": _present("accession"),
            "F2_metadata_rich": _present("metadata"),
            "F3_id_in_metadata": True,
            "F4_searchable_registry": _present("repository"),
        },
        "Accessible": {
            "A1_retrievable_protocol": _present("access_url"),
            "A1_1_open_protocol": True,
            "A2_metadata_persists": True,
        },
        "Interoperable": {
            "I1_formal_language": _present("format"),
            "I2_fair_vocabularies": _present("ontology_terms"),
            "I3_qualified_references": _present("references"),
        },
        "Reusable": {
            "R1_usage_license": _present("license"),
            "R1_1_community_standards": _present("standard"),
            "R1_2_provenance": _present("methods"),
        },
    }

    # Passed count per principle; the overall tally is derived from it.
    passed_by_principle = {
        principle: sum(items.values()) for principle, items in checks.items()
    }
    total = sum(len(items) for items in checks.values())
    passed = sum(passed_by_principle.values())

    score = passed / total * 100 if total > 0 else 0
    print(f"FAIR checklist: {passed}/{total} ({score:.0f}%)")
    for principle, items in checks.items():
        print(f" {principle}: {passed_by_principle[principle]}/{len(items)}")

    return checks
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
---
|
|
336
|
+
|
|
337
|
+
## パイプライン統合
|
|
338
|
+
|
|
339
|
+
```
|
|
340
|
+
bioinformatics → data-submission → literature-search
|
|
341
|
+
(解析完了) (データ登録) (論文投稿時)
|
|
342
|
+
│ │ ↓
|
|
343
|
+
lab-data-management ───┘ academic-writing
|
|
344
|
+
(Benchling/OMERO) │ (論文執筆)
|
|
345
|
+
↓
|
|
346
|
+
ebi-databases
|
|
347
|
+
(ENA/BioStudies 連携)
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## パイプライン出力
|
|
351
|
+
|
|
352
|
+
| ファイル | 説明 | 次スキル |
|
|
353
|
+
|---------|------|---------|
|
|
354
|
+
| `submission/sequences.fsa` | GenBank 登録用 FASTA | → bioinformatics |
|
|
355
|
+
| `submission/sra_metadata.tsv` | SRA メタデータ | → ebi-databases |
|
|
356
|
+
| `geo_submission/submission.soft` | GEO SOFT テンプレート | → gene-expression |
|
|
357
|
+
| `submission/fair_report.json` | FAIR チェックリスト結果 | → academic-writing |
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-deep-chemistry
|
|
3
|
+
description: |
|
|
4
|
+
深層学習分子特性予測スキル。DeepChem による GCN/MPNN/AttentiveFP
|
|
5
|
+
分子特性予測・MoleculeNet ベンチマーク・ChemBERTa/GROVER
|
|
6
|
+
事前学習モデル・分子フィンガープリントフィーチャライザ。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Deep Chemistry
|
|
10
|
+
|
|
11
|
+
DeepChem を活用した深層学習ベース分子特性予測パイプラインを提供する。
|
|
12
|
+
グラフニューラルネットワーク (GCN/MPNN/AttentiveFP)、MoleculeNet
|
|
13
|
+
ベンチマーク、事前学習モデル (ChemBERTa/GROVER)。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 分子の ADMET/物性を深層学習で予測するとき
|
|
18
|
+
- MoleculeNet ベンチマークデータセットを使うとき
|
|
19
|
+
- GCN / MPNN / AttentiveFP モデルを訓練するとき
|
|
20
|
+
- ChemBERTa で分子表現学習を行うとき
|
|
21
|
+
- 毒性予測 (Tox21, ToxCast) を行うとき
|
|
22
|
+
- 薬理活性予測の分子特徴量を生成するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. MoleculeNet データセット読込み
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import deepchem as dc
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_moleculenet(dataset_name="delaney", featurizer="GraphConv",
                     split="scaffold"):
    """
    Load a MoleculeNet benchmark dataset through DeepChem.

    Parameters:
        dataset_name: str — dataset name
            ("delaney", "tox21", "bbbp", "hiv", "muv", "pcba",
             "sider", "clintox", "freesolv", "lipo")
        featurizer: str — featurization method, passed to the loader
            ("GraphConv", "ECFP", "Weave", "MolGraphConv")
        split: str — split method, passed to the loader as ``splitter``
            ("scaffold", "random", "stratified")

    Returns:
        tuple — (tasks, (train, valid, test), transformers)

    Raises:
        ValueError: if ``dataset_name`` is not one of the supported names.

    K-Dense: deepchem
    """
    supported = (
        "delaney", "tox21", "bbbp", "hiv", "muv",
        "pcba", "sider", "clintox", "freesolv", "lipo",
    )
    # Each supported name maps to the dc.molnet loader called load_<name>.
    loaders = {key: getattr(dc.molnet, f"load_{key}") for key in supported}

    loader = loaders.get(dataset_name)
    if loader is None:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    tasks, datasets, transformers = loader(featurizer=featurizer,
                                           splitter=split)
    train, valid, test = datasets

    report = (
        f"MoleculeNet '{dataset_name}':",
        f" Tasks: {len(tasks)}",
        f" Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}",
        f" Featurizer: {featurizer}, Split: {split}",
    )
    print("\n".join(report))
    return tasks, (train, valid, test), transformers
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## 2. GCN モデル訓練
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
def train_gcn(train_data, valid_data, tasks, n_epochs=50,
              learning_rate=0.001, batch_size=64, mode=None):
    """
    Train a Graph Convolutional Network (``dc.models.GraphConvModel``).

    Parameters:
        train_data: dc.data.Dataset — training data
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of epochs
        learning_rate: float — learning rate passed to the model
        batch_size: int — batch size passed to the model
        mode: str | None — "classification" or "regression". When None
            the mode is guessed from the task count (more than one task
            means classification). That guess is wrong for single-task
            classification datasets, so pass the mode explicitly there.

    Returns:
        The trained model. Train and validation scores are printed
        every 10 epochs (ROC-AUC for classification, Pearson R^2 for
        regression).

    Raises:
        ValueError: if ``mode`` is neither None nor a supported mode.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.GraphConvModel(
        n_tasks=len(tasks),
        mode=mode,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )

    # The metric does not change between epochs, so build it once.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )

    for epoch in range(n_epochs):
        # Fit one epoch at a time so progress can be reported in between.
        model.fit(train_data, nb_epoch=1)
        if (epoch + 1) % 10 == 0:
            train_score = model.evaluate(train_data, [metric])
            valid_score = model.evaluate(valid_data, [metric])
            print(f" Epoch {epoch+1}: "
                  f"train={list(train_score.values())[0]:.4f}, "
                  f"valid={list(valid_score.values())[0]:.4f}")

    return model
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## 3. MPNN モデル訓練
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
def train_mpnn(train_data, valid_data, tasks, n_epochs=50,
               learning_rate=0.001, mode=None):
    """
    Train a Message Passing Neural Network (``dc.models.MPNNModel``).

    Parameters:
        train_data: dc.data.Dataset — training data. NOTE(review): the
            keyword arguments used here look like DeepChem's graph-based
            MPNNModel, which expects MolGraphConvFeaturizer(use_edges=True)
            features rather than "GraphConv" features — confirm against
            the installed DeepChem version.
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of epochs
        learning_rate: float — learning rate passed to the model
        mode: str | None — "classification" or "regression". When None
            the mode is guessed from the task count (more than one task
            means classification). That guess is wrong for single-task
            classification datasets, so pass the mode explicitly there.

    Returns:
        The trained model. The validation score is printed (ROC-AUC for
        classification, Pearson R^2 for regression).

    Raises:
        ValueError: if ``mode`` is neither None nor a supported mode.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.MPNNModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        node_out_feats=64,
        edge_hidden_feats=128,
        num_step_message_passing=3,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"MPNN: valid score = {list(valid_score.values())[0]:.4f}")
    return model
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## 4. AttentiveFP モデル訓練
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
def train_attentivefp(train_data, valid_data, tasks, n_epochs=50,
                      learning_rate=0.001, num_layers=2, mode=None):
    """
    Train an AttentiveFP model (``dc.models.AttentiveFPModel``).

    Parameters:
        train_data: dc.data.Dataset — training data. NOTE(review):
            DeepChem documents AttentiveFPModel as consuming
            MolGraphConvFeaturizer(use_edges=True) features — confirm the
            caller featurizes accordingly.
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names
        n_epochs: int — number of epochs
        learning_rate: float — learning rate passed to the model
        num_layers: int — number of GNN layers
        mode: str | None — "classification" or "regression". When None
            the mode is guessed from the task count (more than one task
            means classification). That guess is wrong for single-task
            classification datasets, so pass the mode explicitly there.

    Returns:
        The trained model. The validation score is printed (ROC-AUC for
        classification, Pearson R^2 for regression).

    Raises:
        ValueError: if ``mode`` is neither None nor a supported mode.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.AttentiveFPModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        num_layers=num_layers,
        graph_feat_size=200,
        num_timesteps=2,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"AttentiveFP: valid score = {list(valid_score.values())[0]:.4f}")
    return model
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## 5. ChemBERTa 分子表現学習
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
def chemberta_embeddings(smiles_list, model_name="seyonec/ChemBERTa-zinc-base-v1"):
    """
    Embed SMILES strings with a ChemBERTa model from HuggingFace.

    The embedding of a molecule is the last hidden state at the first
    token position (the CLS position).

    Parameters:
        smiles_list: sequence of str — SMILES strings (any iterable;
            it is copied into a list)
        model_name: str — HuggingFace model name

    Returns:
        np.ndarray — one row per input molecule. For empty input an
        array of shape (0, 0) is returned without loading the model, so
        the embedding width is not known in that case.
    """
    # Copy into a plain list: the tokenizer is given list batches, and slices
    # of other sequence types (e.g. a pandas Series) are not lists.
    smiles = list(smiles_list)
    if not smiles:
        # np.vstack([]) would raise; loading the model for nothing is wasteful.
        print("ChemBERTa: 0 molecules, no embeddings computed")
        return np.empty((0, 0), dtype=np.float32)

    from transformers import AutoTokenizer, AutoModel
    import torch

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    embeddings = []
    batch_size = 32

    for i in range(0, len(smiles), batch_size):
        batch = smiles[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**inputs)
        # Embedding at the first (CLS) token position.
        cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_emb)

    embeddings = np.vstack(embeddings)
    print(f"ChemBERTa: {len(smiles)} molecules → "
          f"{embeddings.shape[1]}D embeddings")
    return embeddings
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## 6. モデル比較ベンチマーク
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
def benchmark_models(dataset_name="tox21", models_to_test=None,
                     n_epochs=30):
    """
    Train several models on one MoleculeNet dataset and compare them.

    Parameters:
        dataset_name: str — MoleculeNet dataset
        models_to_test: list — model names to test; supported names are
            "GCN", "MPNN" and "AttentiveFP" (the default is all three).
            Unsupported names are reported and skipped.
        n_epochs: int — number of epochs

    Returns:
        dict — {model_name: test score}, ROC-AUC for classification
        datasets and Pearson R^2 otherwise. The ranking is also printed.
    """
    if models_to_test is None:
        models_to_test = ["GCN", "MPNN", "AttentiveFP"]

    supported = ("GCN", "MPNN", "AttentiveFP")
    results = {}

    for model_name in models_to_test:
        # Reject unsupported names before paying for the dataset load.
        if model_name not in supported:
            print(f"Skipping unsupported model: {model_name}")
            continue

        # GraphConvModel consumes ConvMol ("GraphConv") features, while the
        # MPNN and AttentiveFP models are documented to consume graphs from
        # MolGraphConvFeaturizer(use_edges=True).
        if model_name == "GCN":
            featurizer = "GraphConv"
        else:
            featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

        tasks, (train, valid, test), transformers = load_moleculenet(
            dataset_name, featurizer=featurizer
        )

        is_classification = len(tasks) > 1 or dataset_name in [
            "tox21", "bbbp", "hiv", "sider", "clintox"
        ]

        if model_name == "GCN":
            model = train_gcn(train, valid, tasks, n_epochs=n_epochs)
        elif model_name == "MPNN":
            model = train_mpnn(train, valid, tasks, n_epochs=n_epochs)
        else:
            model = train_attentivefp(train, valid, tasks, n_epochs=n_epochs)

        metric = dc.metrics.Metric(
            dc.metrics.roc_auc_score if is_classification
            else dc.metrics.pearson_r2_score
        )
        test_score = model.evaluate(test, [metric])
        results[model_name] = list(test_score.values())[0]

    print(f"\nBenchmark on '{dataset_name}':")
    # Best score first.
    for name, score in sorted(results.items(), key=lambda x: -x[1]):
        print(f" {name}: {score:.4f}")
    return results
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## 7. 分子特性予測パイプライン
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
def _first_task_value(pred):
    """Reduce one molecule's prediction row to a single float."""
    arr = np.asarray(pred)
    if arr.ndim == 0:
        return float(arr)
    if arr.ndim == 1:
        # One value per task: report the first task.
        return float(arr[0])
    # Presumably (tasks, classes) class probabilities: report the last class
    # of the first task — TODO confirm against the model's output layout.
    return float(arr[0, -1])


def molecular_prediction_pipeline(smiles_list, property_name="solubility",
                                  model_type="AttentiveFP"):
    """
    End-to-end pipeline: SMILES in, predicted molecular property out.

    A model is trained on the MoleculeNet dataset mapped to
    ``property_name`` and then applied to ``smiles_list``. The same
    featurizer is used for training and prediction, and the dataset's
    transformers are passed to ``model.predict`` so that they are
    undone on the output.

    Parameters:
        smiles_list: list — SMILES strings to predict
        property_name: str — target property ("solubility", "toxicity",
            "bbb_penetration", "hiv_activity", "lipophilicity",
            "solvation_energy"). Unknown names fall back to the
            solubility dataset and emit a warning.
        model_type: str — "GCN" or "AttentiveFP"; any other value
            selects MPNN

    Returns:
        pd.DataFrame — columns smiles, prediction, property, model.
        For multi-task datasets only the first task is reported.
    """
    # Property name -> MoleculeNet dataset
    property_dataset = {
        "solubility": "delaney",
        "toxicity": "tox21",
        "bbb_penetration": "bbbp",
        "hiv_activity": "hiv",
        "lipophilicity": "lipo",
        "solvation_energy": "freesolv",
    }

    if property_name not in property_dataset:
        import warnings
        warnings.warn(
            f"Unknown property '{property_name}'; "
            "falling back to the 'delaney' (solubility) dataset",
            stacklevel=2,
        )
    dataset_name = property_dataset.get(property_name, "delaney")

    # One featurizer object serves both training and prediction, so the
    # model never sees features of a different kind at inference time.
    if model_type == "GCN":
        featurizer = dc.feat.ConvMolFeaturizer()
    else:
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # 1) Train on the benchmark data
    tasks, (train, valid, test), transformers = load_moleculenet(
        dataset_name, featurizer=featurizer
    )

    if model_type == "GCN":
        model = train_gcn(train, valid, tasks)
    elif model_type == "AttentiveFP":
        model = train_attentivefp(train, valid, tasks)
    else:
        model = train_mpnn(train, valid, tasks)

    # 2) Predict the new molecules
    features = featurizer.featurize(smiles_list)
    pred_dataset = dc.data.NumpyDataset(X=features)
    predictions = model.predict(pred_dataset, transformers)

    results = [
        {
            "smiles": smi,
            "prediction": _first_task_value(pred),
            "property": property_name,
            "model": model_type,
        }
        for smi, pred in zip(smiles_list, predictions)
    ]

    df = pd.DataFrame(results)
    print(f"Predictions: {len(df)} molecules, property='{property_name}'")
    return df
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
---
|
|
329
|
+
|
|
330
|
+
## パイプライン統合
|
|
331
|
+
|
|
332
|
+
```
|
|
333
|
+
cheminformatics → deep-chemistry → drug-target-profiling
|
|
334
|
+
(RDKit/SMILES) (GCN/MPNN/FP) (ChEMBL/標的)
|
|
335
|
+
│ │ ↓
|
|
336
|
+
molecular-docking ───────┘ admet-pharmacokinetics
|
|
337
|
+
(AutoDock/Vina) │ (ADMET予測)
|
|
338
|
+
↓
|
|
339
|
+
md-simulation
|
|
340
|
+
(分子動力学検証)
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## パイプライン出力
|
|
344
|
+
|
|
345
|
+
| ファイル | 説明 | 次スキル |
|
|
346
|
+
|---------|------|---------|
|
|
347
|
+
| `results/predictions.csv` | 分子特性予測値 | → drug-target-profiling |
|
|
348
|
+
| `results/benchmark.json` | モデルベンチマーク結果 | — |
|
|
349
|
+
| `results/embeddings.npy` | ChemBERTa 埋込み | → cheminformatics |
|
|
350
|
+
| `results/model/` | 訓練済みモデル | → admet-pharmacokinetics |
|