@nahisaho/satori 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ ---
2
+ name: scientific-deep-chemistry
3
+ description: |
4
+ 深層学習分子特性予測スキル。DeepChem による GCN/MPNN/AttentiveFP
5
+ 分子特性予測・MoleculeNet ベンチマーク・ChemBERTa/GROVER
6
+ 事前学習モデル・分子フィンガープリントフィーチャライザ。
7
+ ---
8
+
9
+ # Scientific Deep Chemistry
10
+
11
+ DeepChem を活用した深層学習ベース分子特性予測パイプラインを提供する。
12
+ グラフニューラルネットワーク (GCN/MPNN/AttentiveFP)、MoleculeNet
13
+ ベンチマーク、事前学習モデル (ChemBERTa/GROVER)。
14
+
15
+ ## When to Use
16
+
17
+ - 分子の ADMET/物性を深層学習で予測するとき
18
+ - MoleculeNet ベンチマークデータセットを使うとき
19
+ - GCN / MPNN / AttentiveFP モデルを訓練するとき
20
+ - ChemBERTa で分子表現学習を行うとき
21
+ - 毒性予測 (Tox21, ToxCast) を行うとき
22
+ - 薬理活性予測の分子特徴量を生成するとき
23
+
24
+ ---
25
+
26
+ ## Quick Start
27
+
28
+ ## 1. MoleculeNet データセット読込み
29
+
30
+ ```python
31
+ import deepchem as dc
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+
36
def load_moleculenet(dataset_name="delaney", featurizer="GraphConv",
                     split="scaffold"):
    """
    Load a MoleculeNet benchmark dataset through DeepChem.

    Parameters:
        dataset_name: str — dataset key, one of
            "delaney", "tox21", "bbbp", "hiv", "muv", "pcba",
            "sider", "clintox", "freesolv", "lipo"
        featurizer: str or featurizer object — featurization to apply.
            "MolGraphConv" is expanded here to
            dc.feat.MolGraphConvFeaturizer(use_edges=True); every other
            value ("GraphConv", "ECFP", "Weave", or a featurizer
            instance) is forwarded to the loader unchanged.
        split: str — splitter name ("scaffold", "random", "stratified")

    Returns:
        tuple — (tasks, (train, valid, test), transformers) as produced
        by the DeepChem loader.

    Raises:
        ValueError: if dataset_name is not one of the supported keys.

    K-Dense: deepchem
    """
    supported_datasets = (
        "delaney", "tox21", "bbbp", "hiv", "muv", "pcba",
        "sider", "clintox", "freesolv", "lipo",
    )

    # Validate before touching DeepChem so a typo fails immediately and
    # does not depend on which loaders the installed version provides.
    if dataset_name not in supported_datasets:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    # Every supported key maps onto dc.molnet.load_<key>; resolve only the
    # loader that is actually requested.
    loader = getattr(dc.molnet, f"load_{dataset_name}")

    # NOTE(review): the MoleculeNet loaders are believed to accept only a
    # fixed set of featurizer name strings that does not include
    # "MolGraphConv" — confirm against the installed DeepChem version.
    loader_featurizer = featurizer
    if isinstance(featurizer, str) and featurizer == "MolGraphConv":
        loader_featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    tasks, datasets, transformers = loader(
        featurizer=loader_featurizer, splitter=split
    )
    train, valid, test = datasets

    print(f"MoleculeNet '{dataset_name}':")
    print(f" Tasks: {len(tasks)}")
    print(f" Train: {len(train)}, Valid: {len(valid)}, Test: {len(test)}")
    print(f" Featurizer: {featurizer}, Split: {split}")
    return tasks, (train, valid, test), transformers
77
+ ```
78
+
79
+ ## 2. GCN モデル訓練
80
+
81
+ ```python
82
def train_gcn(train_data, valid_data, tasks, n_epochs=50,
              learning_rate=0.001, batch_size=64, mode=None):
    """
    Train a Graph Convolutional Network (dc.models.GraphConvModel).

    Parameters:
        train_data: dc.data.Dataset — training data
        valid_data: dc.data.Dataset — validation data
        tasks: list — task names; its length sets n_tasks
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        batch_size: int — batch size passed to the model
        mode: str or None — "classification" or "regression". When None
            the mode is guessed as classification if there is more than
            one task, else regression. Pass it explicitly for single-task
            classification datasets, which the guess gets wrong.

    Returns:
        The trained GraphConvModel.

    Raises:
        ValueError: if mode is neither None nor a supported mode string.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.GraphConvModel(
        n_tasks=len(tasks),
        mode=mode,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )

    # The metric does not change between epochs, so build it once.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )

    # Train one epoch at a time so progress can be reported every 10 epochs.
    for epoch in range(n_epochs):
        model.fit(train_data, nb_epoch=1)
        if (epoch + 1) % 10 == 0:
            train_score = model.evaluate(train_data, [metric])
            valid_score = model.evaluate(valid_data, [metric])
            print(f" Epoch {epoch+1}: "
                  f"train={list(train_score.values())[0]:.4f}, "
                  f"valid={list(valid_score.values())[0]:.4f}")

    return model
114
+ ```
115
+
116
+ ## 3. MPNN モデル訓練
117
+
118
+ ```python
119
def train_mpnn(train_data, valid_data, tasks, n_epochs=50,
               learning_rate=0.001, mode=None):
    """
    Train a Message Passing Neural Network (dc.models.MPNNModel).

    Parameters:
        train_data: dc.data.Dataset — training data.
            NOTE(review): the keyword arguments used below belong to the
            graph-data MPNN model, which is documented to expect
            dc.feat.MolGraphConvFeaturizer(use_edges=True) features, not
            "GraphConv" (ConvMol) features — confirm with the caller.
        valid_data: dc.data.Dataset — validation data, featurized the
            same way as train_data
        tasks: list — task names; its length sets n_tasks
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        mode: str or None — "classification" or "regression". When None
            the mode is guessed as classification if there is more than
            one task, else regression. Pass it explicitly for single-task
            classification datasets, which the guess gets wrong.

    Returns:
        The trained MPNNModel.

    Raises:
        ValueError: if mode is neither None nor a supported mode string.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.MPNNModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        node_out_feats=64,
        edge_hidden_feats=128,
        num_step_message_passing=3,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    # Score with the metric that matches the mode the model was built with.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"MPNN: valid score = {list(valid_score.values())[0]:.4f}")
    return model
147
+ ```
148
+
149
+ ## 4. AttentiveFP モデル訓練
150
+
151
+ ```python
152
def train_attentivefp(train_data, valid_data, tasks, n_epochs=50,
                      learning_rate=0.001, num_layers=2, mode=None):
    """
    Train an AttentiveFP model (dc.models.AttentiveFPModel).

    Parameters:
        train_data: dc.data.Dataset — training data.
            NOTE(review): AttentiveFPModel is documented to expect
            dc.feat.MolGraphConvFeaturizer(use_edges=True) features —
            confirm the caller featurizes accordingly.
        valid_data: dc.data.Dataset — validation data, featurized the
            same way as train_data
        tasks: list — task names; its length sets n_tasks
        n_epochs: int — number of training epochs
        learning_rate: float — learning rate passed to the model
        num_layers: int — number of graph layers passed to the model
        mode: str or None — "classification" or "regression". When None
            the mode is guessed as classification if there is more than
            one task, else regression. Pass it explicitly for single-task
            classification datasets, which the guess gets wrong.

    Returns:
        The trained AttentiveFPModel.

    Raises:
        ValueError: if mode is neither None nor a supported mode string.
    """
    if mode is None:
        mode = "classification" if len(tasks) > 1 else "regression"
    elif mode not in ("classification", "regression"):
        raise ValueError(f"Unknown mode: {mode}")

    model = dc.models.AttentiveFPModel(
        n_tasks=len(tasks),
        mode=mode,
        learning_rate=learning_rate,
        num_layers=num_layers,
        graph_feat_size=200,
        num_timesteps=2,
    )

    model.fit(train_data, nb_epoch=n_epochs)

    # Score with the metric that matches the mode the model was built with.
    metric = dc.metrics.Metric(
        dc.metrics.roc_auc_score if mode == "classification"
        else dc.metrics.pearson_r2_score
    )
    valid_score = model.evaluate(valid_data, [metric])
    print(f"AttentiveFP: valid score = {list(valid_score.values())[0]:.4f}")
    return model
181
+ ```
182
+
183
+ ## 5. ChemBERTa 分子表現学習
184
+
185
+ ```python
186
def chemberta_embeddings(smiles_list, model_name="seyonec/ChemBERTa-zinc-base-v1",
                         batch_size=32, max_length=512):
    """
    Embed SMILES strings with a ChemBERTa model from HuggingFace.

    Parameters:
        smiles_list: sequence of str — SMILES strings to embed
        model_name: str — HuggingFace model identifier
        batch_size: int — number of SMILES tokenized and embedded per
            forward pass
        max_length: int — token length at which inputs are truncated

    Returns:
        np.ndarray — one row per input SMILES, holding the hidden state
        at the first token position of the model's last layer.

    Raises:
        ValueError: if smiles_list is empty or batch_size is not positive.
    """
    # Materialise once so any sequence type (tuple, array, Series) slices
    # into plain lists for the tokenizer.
    smiles = list(smiles_list)

    # Fail before loading the model: with no batches np.vstack would raise
    # an unrelated-looking error after the (slow) model load.
    if not smiles:
        raise ValueError("smiles_list must contain at least one SMILES string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    from transformers import AutoTokenizer, AutoModel
    import torch

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    embeddings = []

    for start in range(0, len(smiles), batch_size):
        batch = smiles[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           max_length=max_length, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Embedding at the first token position (the CLS-style token).
        cls_emb = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(cls_emb)

    embeddings = np.vstack(embeddings)
    print(f"ChemBERTa: {len(smiles)} molecules → "
          f"{embeddings.shape[1]}D embeddings")
    return embeddings
219
+ ```
220
+
221
+ ## 6. モデル比較ベンチマーク
222
+
223
+ ```python
224
def benchmark_models(dataset_name="tox21", models_to_test=None,
                     n_epochs=30):
    """
    Train several graph models on one MoleculeNet dataset and compare
    their test scores.

    Parameters:
        dataset_name: str — MoleculeNet dataset key
        models_to_test: list or None — model names to run; supported
            names are "GCN", "MPNN" and "AttentiveFP" (the default when
            None). Any other name is reported and skipped.
        n_epochs: int — number of training epochs per model

    Returns:
        dict — model name → test score (ROC-AUC for classification
        datasets, Pearson R² otherwise).
    """
    if models_to_test is None:
        models_to_test = ["GCN", "MPNN", "AttentiveFP"]

    supported_models = ("GCN", "MPNN", "AttentiveFP")
    results = {}
    # Datasets keyed by featurization, so models that share a
    # featurization do not trigger a second load.
    loaded = {}

    for model_name in models_to_test:
        # Skip before loading anything: unsupported names previously cost
        # a full dataset load and were then dropped silently.
        if model_name not in supported_models:
            print(f"Skipping unsupported model: {model_name}")
            continue

        # GraphConvModel consumes ConvMol ("GraphConv") features, while
        # the MPNN and AttentiveFP models are documented to need
        # graph-data features that include edge features.
        feat_key = "GraphConv" if model_name == "GCN" else "MolGraphConv"
        if feat_key not in loaded:
            if feat_key == "GraphConv":
                featurizer = "GraphConv"
            else:
                featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
            loaded[feat_key] = load_moleculenet(
                dataset_name, featurizer=featurizer
            )
        tasks, (train, valid, test), _ = loaded[feat_key]

        is_classification = len(tasks) > 1 or dataset_name in [
            "tox21", "bbbp", "hiv", "sider", "clintox"
        ]

        # NOTE(review): the train_* helpers are called without an explicit
        # task type; confirm they build a classifier for single-task
        # classification datasets such as "bbbp" and "hiv", otherwise the
        # ROC-AUC below is computed on a regression model.
        if model_name == "GCN":
            model = train_gcn(train, valid, tasks, n_epochs=n_epochs)
        elif model_name == "MPNN":
            model = train_mpnn(train, valid, tasks, n_epochs=n_epochs)
        else:
            model = train_attentivefp(train, valid, tasks, n_epochs=n_epochs)

        metric = dc.metrics.Metric(
            dc.metrics.roc_auc_score if is_classification
            else dc.metrics.pearson_r2_score
        )
        test_score = model.evaluate(test, [metric])
        results[model_name] = list(test_score.values())[0]

    print(f"\nBenchmark on '{dataset_name}':")
    # Best score first.
    for name, score in sorted(results.items(), key=lambda x: -x[1]):
        print(f" {name}: {score:.4f}")
    return results
269
+ ```
270
+
271
+ ## 7. 分子特性予測パイプライン
272
+
273
+ ```python
274
def molecular_prediction_pipeline(smiles_list, property_name="solubility",
                                  model_type="AttentiveFP"):
    """
    End-to-end pipeline: train on a MoleculeNet dataset, then predict a
    property for new SMILES.

    Parameters:
        smiles_list: sequence of str — SMILES strings to predict for
        property_name: str — property to predict; one of "solubility",
            "toxicity", "bbb_penetration", "hiv_activity",
            "lipophilicity", "solvation_energy". Unknown names fall back
            to the "delaney" (solubility) dataset.
        model_type: str — "GCN" or "AttentiveFP"; any other value trains
            the MPNN model.

    Returns:
        pd.DataFrame — columns "smiles", "prediction", "property",
        "model", one row per input SMILES. "prediction" is the value for
        the first task only (see _first_task_value).
    """
    columns = ["smiles", "prediction", "property", "model"]
    smiles = list(smiles_list)

    # Nothing to predict: skip the dataset download and model training.
    if not smiles:
        df = pd.DataFrame(columns=columns)
        print(f"Predictions: {len(df)} molecules, property='{property_name}'")
        return df

    # Property name → MoleculeNet dataset key.
    property_dataset = {
        "solubility": "delaney",
        "toxicity": "tox21",
        "bbb_penetration": "bbbp",
        "hiv_activity": "hiv",
        "lipophilicity": "lipo",
        "solvation_energy": "freesolv",
    }

    dataset_name = property_dataset.get(property_name, "delaney")

    # One featurizer for both training and prediction, chosen to match the
    # model: GraphConvModel consumes ConvMol features, while the MPNN and
    # AttentiveFP models are documented to need graph-data features that
    # include edge features.
    if model_type == "GCN":
        featurizer = dc.feat.ConvMolFeaturizer()
    else:
        featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # 1) Train on the benchmark data.
    tasks, (train, valid, test), transformers = load_moleculenet(
        dataset_name, featurizer=featurizer
    )

    if model_type == "GCN":
        model = train_gcn(train, valid, tasks)
    elif model_type == "AttentiveFP":
        model = train_attentivefp(train, valid, tasks)
    else:
        model = train_mpnn(train, valid, tasks)

    # 2) Predict for the new molecules. Passing the dataset transformers
    # lets the model undo any label transformation applied by the loader,
    # so predictions come back in the dataset's original units.
    features = featurizer.featurize(smiles)
    pred_dataset = dc.data.NumpyDataset(X=features)
    predictions = model.predict(pred_dataset, transformers)

    results = [
        {
            "smiles": smi,
            "prediction": _first_task_value(pred),
            "property": property_name,
            "model": model_type,
        }
        for smi, pred in zip(smiles, predictions)
    ]

    df = pd.DataFrame(results, columns=columns)
    print(f"Predictions: {len(df)} molecules, property='{property_name}'")
    return df


def _first_task_value(pred):
    """Reduce one molecule's prediction to a single float for the first task."""
    arr = np.asarray(pred)
    if arr.ndim == 0:
        return float(arr)
    # assumes a per-molecule layout of (n_tasks,) for regression and
    # (n_tasks, n_classes) for classification — TODO confirm. Taking the
    # last entry of the first task yields the value itself for regression
    # and the last class's probability for classification.
    first_task = np.ravel(arr[0])
    return float(first_task[-1])
326
+ ```
327
+
328
+ ---
329
+
330
+ ## パイプライン統合
331
+
332
+ ```
333
+ cheminformatics → deep-chemistry → drug-target-profiling
334
+ (RDKit/SMILES) (GCN/MPNN/FP) (ChEMBL/標的)
335
+ │ │ ↓
336
+ molecular-docking ───────┘ admet-pharmacokinetics
337
+ (AutoDock/Vina) │ (ADMET予測)
338
+
339
+ md-simulation
340
+ (分子動力学検証)
341
+ ```
342
+
343
+ ## パイプライン出力
344
+
345
+ | ファイル | 説明 | 次スキル |
346
+ |---------|------|---------|
347
+ | `results/predictions.csv` | 分子特性予測値 | → drug-target-profiling |
348
+ | `results/benchmark.json` | モデルベンチマーク結果 | — |
349
+ | `results/embeddings.npy` | ChemBERTa 埋込み | → cheminformatics |
350
+ | `results/model/` | 訓練済みモデル | → admet-pharmacokinetics |