@nahisaho/satori 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENCE +0 -0
- package/README.md +191 -0
- package/bin/satori.js +95 -0
- package/package.json +29 -0
- package/src/.github/skills/scientific-academic-writing/SKILL.md +361 -0
- package/src/.github/skills/scientific-academic-writing/assets/acs_article.md +199 -0
- package/src/.github/skills/scientific-academic-writing/assets/elsevier_article.md +244 -0
- package/src/.github/skills/scientific-academic-writing/assets/ieee_transactions.md +212 -0
- package/src/.github/skills/scientific-academic-writing/assets/imrad_standard.md +181 -0
- package/src/.github/skills/scientific-academic-writing/assets/nature_article.md +179 -0
- package/src/.github/skills/scientific-academic-writing/assets/qiita_technical_article.md +385 -0
- package/src/.github/skills/scientific-academic-writing/assets/science_research_article.md +169 -0
- package/src/.github/skills/scientific-bioinformatics/SKILL.md +220 -0
- package/src/.github/skills/scientific-biosignal-processing/SKILL.md +357 -0
- package/src/.github/skills/scientific-causal-inference/SKILL.md +347 -0
- package/src/.github/skills/scientific-cheminformatics/SKILL.md +196 -0
- package/src/.github/skills/scientific-data-preprocessing/SKILL.md +413 -0
- package/src/.github/skills/scientific-data-simulation/SKILL.md +244 -0
- package/src/.github/skills/scientific-doe/SKILL.md +360 -0
- package/src/.github/skills/scientific-eda-correlation/SKILL.md +141 -0
- package/src/.github/skills/scientific-feature-importance/SKILL.md +208 -0
- package/src/.github/skills/scientific-image-analysis/SKILL.md +310 -0
- package/src/.github/skills/scientific-materials-characterization/SKILL.md +368 -0
- package/src/.github/skills/scientific-meta-analysis/SKILL.md +352 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +326 -0
- package/src/.github/skills/scientific-ml-classification/SKILL.md +265 -0
- package/src/.github/skills/scientific-ml-regression/SKILL.md +215 -0
- package/src/.github/skills/scientific-multi-omics/SKILL.md +303 -0
- package/src/.github/skills/scientific-network-analysis/SKILL.md +257 -0
- package/src/.github/skills/scientific-pca-tsne/SKILL.md +235 -0
- package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md +331 -0
- package/src/.github/skills/scientific-process-optimization/SKILL.md +215 -0
- package/src/.github/skills/scientific-publication-figures/SKILL.md +208 -0
- package/src/.github/skills/scientific-sequence-analysis/SKILL.md +389 -0
- package/src/.github/skills/scientific-spectral-signal/SKILL.md +227 -0
- package/src/.github/skills/scientific-statistical-testing/SKILL.md +240 -0
- package/src/.github/skills/scientific-survival-clinical/SKILL.md +239 -0
- package/src/.github/skills/scientific-time-series/SKILL.md +291 -0

package/src/.github/skills/scientific-causal-inference/SKILL.md

@@ -0,0 +1,347 @@
---
name: scientific-causal-inference
description: |
  Causal inference skill. Provides propensity score matching (PSM), inverse
  probability weighting (IPW / IPTW), instrumental variables (2SLS),
  difference-in-differences (DID), regression discontinuity design (RDD),
  DAG-based covariate selection (backdoor criterion), and sensitivity
  analysis templates.
---

# Scientific Causal Inference

A statistical pipeline for estimating causal effects from observational data.
It adjusts for confounders when an RCT cannot be run, enabling a causal
interpretation.

## When to Use

- Estimating causal effects (ATE / ATT) from observational data
- Adjusting for confounders
- Matching or weighting on propensity scores
- Analyzing natural-experiment data (DID / RDD)
- Drawing a DAG to make the causal structure explicit

---

## Quick Start

## 1. Defining the DAG (Directed Acyclic Graph)

```python
import os

import networkx as nx
import matplotlib.pyplot as plt

def define_causal_dag(edges, treatment, outcome, figsize=(10, 6)):
    """
    Define and visualize a causal DAG.

    Parameters:
        edges: list of (cause, effect) tuples
        treatment: name of the treatment variable
        outcome: name of the outcome variable

    Example:
        edges = [("Age", "Treatment"), ("Age", "Outcome"),
                 ("Treatment", "Outcome"), ("Gender", "Treatment")]
    """
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # Color nodes by role
    color_map = []
    for node in G.nodes():
        if node == treatment:
            color_map.append("#FF6B6B")
        elif node == outcome:
            color_map.append("#4ECDC4")
        else:
            color_map.append("#95E1D3")

    fig, ax = plt.subplots(figsize=figsize)
    pos = nx.spring_layout(G, k=2, seed=42)
    nx.draw(G, pos, ax=ax, with_labels=True, node_color=color_map,
            node_size=2000, font_size=11, font_weight="bold",
            edge_color="gray", arrows=True, arrowsize=20, width=2)
    ax.set_title("Causal DAG", fontweight="bold", fontsize=14)
    plt.tight_layout()
    os.makedirs("figures", exist_ok=True)  # ensure the output directory exists
    plt.savefig("figures/causal_dag.png", dpi=300, bbox_inches="tight")
    plt.close()

    return G


def identify_confounders(dag, treatment, outcome):
    """
    Identify covariates via the backdoor criterion.
    Returns the variables that should be adjusted for to block backdoor
    paths from treatment to outcome.
    """
    # Simplified rule: parents of the treatment that also have a path to the outcome
    parents_of_treatment = set(dag.predecessors(treatment))
    confounders = set()

    for parent in parents_of_treatment:
        if nx.has_path(dag, parent, outcome):
            confounders.add(parent)

    return confounders
```
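
A minimal usage sketch (the variable names are illustrative): `Age` and
`Gender` both open backdoor paths, so both should come back as adjustment
candidates.

```python
# Hypothetical DAG: Age and Gender confound the Treatment -> Outcome relation
edges = [("Age", "Treatment"), ("Age", "Outcome"),
         ("Gender", "Treatment"), ("Gender", "Outcome"),
         ("Treatment", "Outcome")]
dag = define_causal_dag(edges, treatment="Treatment", outcome="Outcome")
print(identify_confounders(dag, "Treatment", "Outcome"))  # {'Age', 'Gender'}
```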

## 2. Propensity Score Matching (PSM)

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

def propensity_score_matching(df, treatment_col, covariates, outcome_col,
                              caliper=0.2):
    """
    Propensity score matching.

    Steps:
        1. Estimate the propensity score P(T=1|X) by logistic regression
        2. Nearest-neighbor matching (with a caliper constraint)
        3. Check covariate balance after matching
        4. Estimate the ATT

    Parameters:
        caliper: matching tolerance, as a multiple of the SD of the
                 propensity score
    """
    # Step 1: estimate propensity scores
    X = df[covariates].values
    T = df[treatment_col].values

    lr = LogisticRegression(max_iter=1000, random_state=42)
    lr.fit(X, T)
    ps = lr.predict_proba(X)[:, 1]
    df = df.copy()
    df["propensity_score"] = ps

    # Step 2: 1:1 nearest-neighbor matching with replacement
    # (positional indices, so a non-default DataFrame index is handled correctly)
    treated_pos = np.where(T == 1)[0]
    control_pos = np.where(T == 0)[0]

    ps_treated = ps[treated_pos]
    ps_control = ps[control_pos]

    caliper_val = caliper * np.std(ps)
    matched_pairs = []

    for i, t_pos in enumerate(treated_pos):
        distances = np.abs(ps_treated[i] - ps_control)
        within_caliper = np.where(distances <= caliper_val)[0]
        if len(within_caliper) > 0:
            best = within_caliper[np.argmin(distances[within_caliper])]
            matched_pairs.append((df.index[t_pos], df.index[control_pos[best]]))

    print(f"  Matched {len(matched_pairs)} / {len(treated_pos)} treated units")

    # Step 3: balance check (SMD)
    matched_treated = df.loc[[p[0] for p in matched_pairs]]
    matched_control = df.loc[[p[1] for p in matched_pairs]]

    balance = []
    for cov in covariates:
        smd_before = _standardized_mean_diff(
            df[df[treatment_col] == 1][cov], df[df[treatment_col] == 0][cov])
        smd_after = _standardized_mean_diff(
            matched_treated[cov], matched_control[cov])
        balance.append({
            "covariate": cov,
            "SMD_before": smd_before,
            "SMD_after": smd_after,
            "balanced": abs(smd_after) < 0.1,
        })

    balance_df = pd.DataFrame(balance)

    # Step 4: ATT estimate
    att = matched_treated[outcome_col].mean() - matched_control[outcome_col].mean()

    return {
        "ATT": att,
        "n_matched": len(matched_pairs),
        "balance": balance_df,
        "propensity_scores": ps,
        "matched_pairs": matched_pairs,
    }


def _standardized_mean_diff(x1, x2):
    """Standardized Mean Difference (SMD) = |μ1 - μ2| / sqrt((s1² + s2²)/2)"""
    return abs(x1.mean() - x2.mean()) / np.sqrt((x1.var() + x2.var()) / 2 + 1e-10)
```
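
A usage sketch on synthetic data (purely illustrative; the true treated-group
effect is built in as 2.0). A naive difference in means would be biased upward
here, because older units are both more likely to be treated and have higher
outcomes.

```python
# Synthetic confounded data: age drives both treatment uptake and the outcome
rng = np.random.default_rng(0)
n = 500
age = rng.normal(50, 10, n)
p_treat = 1 / (1 + np.exp(-(age - 50) / 10))
treated = (rng.random(n) < p_treat).astype(int)
y = 2.0 * treated + 0.1 * age + rng.normal(0, 1, n)
df = pd.DataFrame({"age": age, "treated": treated, "y": y})

res = propensity_score_matching(df, "treated", ["age"], "y")
print(res["ATT"])      # should land near the true effect of 2.0
print(res["balance"])  # SMD before vs. after matching
```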

## 3. Inverse Probability Weighting (IPW / IPTW)

```python
def inverse_probability_weighting(df, treatment_col, covariates, outcome_col):
    """
    Inverse probability weighting estimator (IPTW: Inverse Probability of
    Treatment Weighting).

    ATE = E[Y(1)] - E[Y(0)]
        = Σ (T·Y/PS) / Σ (T/PS) - Σ ((1-T)·Y/(1-PS)) / Σ ((1-T)/(1-PS))
    """
    X = df[covariates].values
    T = df[treatment_col].values
    Y = df[outcome_col].values

    # Estimate propensity scores
    lr = LogisticRegression(max_iter=1000, random_state=42)
    lr.fit(X, T)
    ps = lr.predict_proba(X)[:, 1]

    # Unstabilized inverse-probability weights (normalized below, giving a
    # Hajek-type estimator)
    w_treated = T / (ps + 1e-10)
    w_control = (1 - T) / (1 - ps + 1e-10)

    # ATE
    E_Y1 = np.sum(w_treated * Y) / np.sum(w_treated)
    E_Y0 = np.sum(w_control * Y) / np.sum(w_control)
    ate = E_Y1 - E_Y0

    # ATT: treated-group mean minus the odds-weighted control mean
    odds = ps / (1 - ps + 1e-10)
    att = (np.sum(T * Y) / np.sum(T)
           - np.sum((1 - T) * odds * Y) / np.sum((1 - T) * odds))

    return {
        "ATE": ate,
        "ATT": att,
        "E_Y1": E_Y1,
        "E_Y0": E_Y0,
        "propensity_scores": ps,
        "weights_treated": w_treated,
        "weights_control": w_control,
    }
```
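
The weights above are unstabilized, so observations with propensity scores
near 0 or 1 can dominate the estimate. A common remedy is stabilized weights,
which multiply by the marginal treatment probability; a minimal sketch, not
part of the function above:

```python
def stabilized_weights(T, ps, eps=1e-10):
    """Stabilized IPTW weights: sw_i = P(T = t_i) / P(T = t_i | X_i)."""
    p1 = T.mean()  # marginal probability of treatment
    return np.where(T == 1, p1 / (ps + eps), (1 - p1) / (1 - ps + eps))
```

Clipping the estimated propensity scores (e.g. to [0.01, 0.99]) before
weighting is another widely used safeguard.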

## 4. Difference-in-Differences (DID)

```python
import statsmodels.api as sm

def difference_in_differences(df, time_col, treatment_col, outcome_col,
                              covariates=None):
    """
    Difference-in-Differences.

    Y = β0 + β1·Post + β2·Treat + β3·(Post × Treat) + ε

    β3 is the causal effect (the DID estimand). time_col must be a 0/1
    post-period indicator and treatment_col a 0/1 group indicator;
    identification rests on the parallel-trends assumption.
    """
    df = df.copy()
    df["interaction"] = df[time_col] * df[treatment_col]

    X_cols = [time_col, treatment_col, "interaction"]
    if covariates:
        X_cols += covariates

    X = sm.add_constant(df[X_cols])
    model = sm.OLS(df[outcome_col], X).fit()

    return {
        "DID_estimate": model.params["interaction"],
        "DID_se": model.bse["interaction"],
        "DID_pvalue": model.pvalues["interaction"],
        "DID_ci_95": model.conf_int().loc["interaction"].tolist(),
        "model_summary": model.summary2().tables[1],
    }
```
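
A usage sketch on a toy two-period panel (illustrative; the true effect on the
treated group is built in as 3.0):

```python
# Each unit contributes a pre (post=0) and a post (post=1) observation
rng = np.random.default_rng(1)
n_units = 200
post = np.tile([0, 1], n_units)
treat = np.repeat(rng.integers(0, 2, n_units), 2)  # group fixed per unit
y = (1.0 + 0.5 * post + 1.0 * treat + 3.0 * post * treat
     + rng.normal(0, 1, 2 * n_units))
panel = pd.DataFrame({"post": post, "treat": treat, "y": y})

res = difference_in_differences(panel, "post", "treat", "y")
print(res["DID_estimate"], res["DID_ci_95"])  # estimate should be near 3.0
```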

## 5. Regression Discontinuity Design (RDD)

```python
def regression_discontinuity(df, running_var, outcome_col, cutoff,
                             bandwidth=None, kernel="triangular"):
    """
    Regression discontinuity design (sharp RDD).

    Estimates the treatment effect by local regression on either side of
    the cutoff.

    Parameters:
        running_var: name of the running (forcing) variable column
        cutoff: cutoff value
        bandwidth: bandwidth; None uses a crude rule of thumb
                   (1.5 × SD of the centered running variable), not the
                   Imbens-Kalyanaraman optimal bandwidth
    """
    df = df.copy()
    df["centered"] = df[running_var] - cutoff
    df["treated"] = (df[running_var] >= cutoff).astype(int)

    if bandwidth is None:
        bandwidth = 1.5 * df["centered"].std()

    # Observations within the bandwidth
    in_band = df[df["centered"].abs() <= bandwidth]

    # Kernel weights
    if kernel == "triangular":
        weights = 1 - np.abs(in_band["centered"]) / bandwidth
    else:
        weights = np.ones(len(in_band))

    # Local linear regression with separate slopes on each side of the cutoff
    X = sm.add_constant(in_band[["centered", "treated"]])
    X["interaction"] = in_band["centered"] * in_band["treated"]
    model = sm.WLS(in_band[outcome_col], X, weights=weights).fit()

    return {
        "RDD_estimate": model.params["treated"],
        "RDD_se": model.bse["treated"],
        "RDD_pvalue": model.pvalues["treated"],
        "RDD_ci_95": model.conf_int().loc["treated"].tolist(),
        "bandwidth": bandwidth,
        "n_in_bandwidth": len(in_band),
    }
```
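
A usage sketch on synthetic sharp-RDD data (illustrative; a jump of 5.0 is
built in at the cutoff). Plotting binned outcome means against the running
variable with the fitted lines on each side (`figures/rdd_plot.png` in this
skill's output convention) is the standard visual check.

```python
rng = np.random.default_rng(2)
score = rng.uniform(-10, 10, 1000)
y = 1.0 + 0.3 * score + 5.0 * (score >= 0) + rng.normal(0, 1, 1000)
df_rdd = pd.DataFrame({"score": score, "y": y})

res = regression_discontinuity(df_rdd, "score", "y", cutoff=0.0)
print(res["RDD_estimate"], res["n_in_bandwidth"])  # estimate near 5.0
```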

## 6. Sensitivity Analysis (Rosenbaum Bounds)

```python
def rosenbaum_sensitivity(matched_outcomes_treated, matched_outcomes_control,
                          gamma_range=None):
    """
    Rosenbaum sensitivity analysis.
    Evaluates how robust the causal estimate is to hidden confounding of
    strength Γ.

    Γ = 1: assumes no hidden confounding
    Γ > 1: a hidden confounder may shift the odds of treatment by a factor
           of up to Γ
    """
    from scipy.stats import norm

    if gamma_range is None:
        gamma_range = np.arange(1.0, 3.1, 0.1)

    diffs = (np.asarray(matched_outcomes_treated)
             - np.asarray(matched_outcomes_control))
    diffs = diffs[diffs != 0]  # drop ties, as in a standard sign test
    n = len(diffs)
    T_obs = np.sum(diffs > 0)

    # Worst-case sign-test bound: under Γ, the probability that a matched
    # pair shows a positive difference is at most Γ / (1 + Γ)
    results = []
    for gamma in gamma_range:
        p_upper = gamma / (1 + gamma)
        E_T = n * p_upper
        Var_T = n * p_upper * (1 - p_upper)
        z = (T_obs - E_T) / np.sqrt(Var_T + 1e-10)
        p_value = 1 - norm.cdf(z)
        results.append({"gamma": gamma, "z_statistic": z, "p_value": p_value})

    return pd.DataFrame(results)
```
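
A usage sketch chaining the PSM result from section 2 into the sensitivity
analysis (assumes `res` and `df` from the PSM example above): the largest Γ at
which p stays below 0.05 summarizes how strong hidden confounding would have
to be to overturn the finding.

```python
import os

pairs = res["matched_pairs"]  # from propensity_score_matching in section 2
y_t = df.loc[[p[0] for p in pairs], "y"].to_numpy()
y_c = df.loc[[p[1] for p in pairs], "y"].to_numpy()

sens = rosenbaum_sensitivity(y_t, y_c)
os.makedirs("results", exist_ok=True)
sens.to_csv("results/sensitivity_analysis.csv", index=False)
print(sens.loc[sens["p_value"] < 0.05, "gamma"].max())
```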

## References

### Output Files

| File | Format |
|---|---|
| `results/causal_estimates.csv` | CSV |
| `results/covariate_balance.csv` | CSV |
| `results/sensitivity_analysis.csv` | CSV |
| `figures/causal_dag.png` | PNG |
| `figures/propensity_distribution.png` | PNG |
| `figures/rdd_plot.png` | PNG |

#### Dependencies

```
statsmodels>=0.14
scikit-learn>=1.3
networkx>=3.0
```

package/src/.github/skills/scientific-cheminformatics/SKILL.md

@@ -0,0 +1,196 @@
---
name: scientific-cheminformatics
description: |
  Cheminformatics analysis skill. Use for RDKit-based molecular descriptor
  calculation, Morgan fingerprints, Tanimoto similarity, structural alert
  detection, and Lipinski Rule of 5 evaluation.
  Patterns established in Scientific Skills Exp-02 and Exp-05.
---

# Scientific Cheminformatics Analysis

A molecular analysis pipeline skill built on RDKit. Covers the drug-discovery
cheminformatics workflow from SMILES through molecular descriptors and SAR
analysis to toxicity prediction.

## When to Use

- Computing physicochemical properties of compounds
- Calculating molecular descriptors from SMILES strings
- Assessing structural similarity between compounds
- Analyzing structure-activity relationships (SAR)
- Detecting structural alerts (toxicophores)
- Evaluating Lipinski Rule of 5 / drug-likeness

## Quick Start

## Standard Pipeline

### 1. SMILES → Molecule Object Conversion

```python
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, QED
from rdkit.Chem.Scaffolds import MurckoScaffold
import pandas as pd
import numpy as np

def smiles_to_mol(smiles):
    """Create an RDKit molecule object from a SMILES string."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    return mol
```
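
A quick check with a known molecule (aspirin):

```python
mol = smiles_to_mol("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
print(mol.GetNumHeavyAtoms())                 # 13
```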

### 2. Bulk Molecular Descriptor Calculation

```python
def calculate_descriptors(smiles_list, names=None):
    """
    Compute key molecular descriptors for a list of SMILES.
    Returns a DataFrame; invalid SMILES are skipped.
    """
    records = []
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue

        record = {
            "Name": names[i] if names else f"Mol_{i}",
            "SMILES": smi,
            "MW": Descriptors.MolWt(mol),
            "LogP": Descriptors.MolLogP(mol),
            "TPSA": Descriptors.TPSA(mol),
            "HBA": Descriptors.NumHAcceptors(mol),
            "HBD": Descriptors.NumHDonors(mol),
            "RotBonds": Descriptors.NumRotatableBonds(mol),
            "AromaticRings": Descriptors.NumAromaticRings(mol),
            "HeavyAtoms": mol.GetNumHeavyAtoms(),
            "QED": QED.qed(mol),
            "Fraction_CSP3": Descriptors.FractionCSP3(mol),
        }
        records.append(record)

    return pd.DataFrame(records)
```
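
A usage sketch (aspirin and caffeine as illustrative inputs); writing to
`results/` follows the Output Files convention below:

```python
import os

smiles = ["CC(=O)Oc1ccccc1C(=O)O",       # aspirin
          "Cn1cnc2c1c(=O)n(C)c(=O)n2C"]  # caffeine
desc_df = calculate_descriptors(smiles, names=["Aspirin", "Caffeine"])
print(desc_df[["Name", "MW", "LogP", "TPSA", "QED"]])

os.makedirs("results", exist_ok=True)
desc_df.to_csv("results/molecular_properties.csv", index=False)
```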

### 3. Morgan Fingerprints & Tanimoto Similarity

```python
import os

from rdkit import DataStructs

def compute_fingerprints(smiles_list, radius=2, nBits=2048):
    """
    Generate Morgan fingerprints.
    Invalid SMILES are skipped, so the returned list may be shorter than
    the input list.
    """
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            fps.append(fp)
    return fps

def tanimoto_similarity_matrix(fps, names=None):
    """Compute the pairwise Tanimoto similarity matrix."""
    n = len(fps)
    sim_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            sim_matrix[i, j] = DataStructs.TanimotoSimilarity(fps[i], fps[j])

    if names is None:
        names = [f"Mol_{i}" for i in range(n)]

    sim_df = pd.DataFrame(sim_matrix, index=names, columns=names)
    os.makedirs("results", exist_ok=True)  # ensure the output directory exists
    sim_df.to_csv("results/tanimoto_similarity.csv")
    return sim_df
```
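
A pairwise similarity sketch (aspirin vs. paracetamol, illustrative):

```python
fps = compute_fingerprints(["CC(=O)Oc1ccccc1C(=O)O",  # aspirin
                            "CC(=O)Nc1ccc(O)cc1"])    # paracetamol
print(DataStructs.TanimotoSimilarity(fps[0], fps[1]))

sim_df = tanimoto_similarity_matrix(fps, names=["Aspirin", "Paracetamol"])
```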

### 4. Lipinski Rule of 5 Evaluation

```python
def lipinski_evaluation(desc_df):
    """Check compliance with the Lipinski Rule of 5."""
    desc_df = desc_df.copy()
    desc_df["Lipinski_MW"] = desc_df["MW"] <= 500
    desc_df["Lipinski_LogP"] = desc_df["LogP"] <= 5
    desc_df["Lipinski_HBA"] = desc_df["HBA"] <= 10
    desc_df["Lipinski_HBD"] = desc_df["HBD"] <= 5
    desc_df["Lipinski_Violations"] = 4 - (
        desc_df["Lipinski_MW"].astype(int) +
        desc_df["Lipinski_LogP"].astype(int) +
        desc_df["Lipinski_HBA"].astype(int) +
        desc_df["Lipinski_HBD"].astype(int)
    )
    desc_df["Lipinski_Pass"] = desc_df["Lipinski_Violations"] <= 1
    return desc_df
```
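
Chained with the descriptor step; aspirin passes all four rules:

```python
desc_df = calculate_descriptors(["CC(=O)Oc1ccccc1C(=O)O"], names=["Aspirin"])
lip = lipinski_evaluation(desc_df)
print(lip[["Name", "Lipinski_Violations", "Lipinski_Pass"]])  # 0, True
```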

### 5. Structural Alert (Toxicophore) Detection (Exp-05)

```python
STRUCTURAL_ALERTS = {
    "Nitro": "[N+](=O)[O-]",
    "Epoxide": "C1OC1",
    "Aldehyde": "[CH]=O",
    "Michael_Acceptor": "C=CC(=O)",
    "Acyl_Halide": "C(=O)[F,Cl,Br,I]",
    "Aniline": "c1ccccc1N",
    "Hydrazine": "NN",
    "Sulfonate": "S(=O)(=O)[O-]",
}

def detect_structural_alerts(smiles_list, names=None, alerts=None):
    """Detect structural alerts by SMARTS pattern matching."""
    if alerts is None:
        alerts = STRUCTURAL_ALERTS

    # Compile each SMARTS pattern once, not once per molecule
    patterns = {a: Chem.MolFromSmarts(s) for a, s in alerts.items()}

    results = []
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue

        name = names[i] if names else f"Mol_{i}"
        for alert_name, pattern in patterns.items():
            if mol.HasSubstructMatch(pattern):
                results.append({"Name": name, "SMILES": smi,
                                "Alert": alert_name,
                                "SMARTS": alerts[alert_name]})

    return pd.DataFrame(results)
```
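
A usage sketch: nitrobenzene triggers the Nitro alert, while benzene triggers
none of the patterns above.

```python
alerts_df = detect_structural_alerts(
    ["O=[N+]([O-])c1ccccc1", "c1ccccc1"],
    names=["Nitrobenzene", "Benzene"])
print(alerts_df)  # one row: Nitrobenzene / Nitro
```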

### 6. Murcko Scaffold Analysis

```python
def scaffold_analysis(smiles_list, names=None):
    """Extract and tabulate Murcko scaffolds."""
    scaffolds = []
    for i, smi in enumerate(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol:
            core = MurckoScaffold.GetScaffoldForMol(mol)
            scaffolds.append({
                "Name": names[i] if names else f"Mol_{i}",
                "SMILES": smi,
                "Scaffold": Chem.MolToSmiles(core),
            })
    return pd.DataFrame(scaffolds)
```
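
A usage sketch: aspirin and paracetamol both reduce to a plain benzene
scaffold once their side chains are stripped.

```python
scaf = scaffold_analysis(["CC(=O)Oc1ccccc1C(=O)O", "CC(=O)Nc1ccc(O)cc1"],
                         names=["Aspirin", "Paracetamol"])
print(scaf["Scaffold"].tolist())  # ['c1ccccc1', 'c1ccccc1']
```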

## References

### Output Files

| File | Format |
|---|---|
| `results/molecular_properties.csv` | CSV |
| `results/tanimoto_similarity.csv` | CSV |
| `results/structural_alerts.csv` | CSV |
| `figures/chemical_space_pca.png` | PNG |
| `figures/similarity_heatmap.png` | PNG |

#### Reference Experiments

- **Exp-02**: EGFR inhibitor SAR analysis (descriptors, Tanimoto, MCS, scaffolds)
- **Exp-05**: Toxicity prediction (structural alerts, Morgan fingerprint classification model)