@nahisaho/satori 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENCE +0 -0
- package/README.md +191 -0
- package/bin/satori.js +95 -0
- package/package.json +29 -0
- package/src/.github/skills/scientific-academic-writing/SKILL.md +361 -0
- package/src/.github/skills/scientific-academic-writing/assets/acs_article.md +199 -0
- package/src/.github/skills/scientific-academic-writing/assets/elsevier_article.md +244 -0
- package/src/.github/skills/scientific-academic-writing/assets/ieee_transactions.md +212 -0
- package/src/.github/skills/scientific-academic-writing/assets/imrad_standard.md +181 -0
- package/src/.github/skills/scientific-academic-writing/assets/nature_article.md +179 -0
- package/src/.github/skills/scientific-academic-writing/assets/qiita_technical_article.md +385 -0
- package/src/.github/skills/scientific-academic-writing/assets/science_research_article.md +169 -0
- package/src/.github/skills/scientific-bioinformatics/SKILL.md +220 -0
- package/src/.github/skills/scientific-biosignal-processing/SKILL.md +357 -0
- package/src/.github/skills/scientific-causal-inference/SKILL.md +347 -0
- package/src/.github/skills/scientific-cheminformatics/SKILL.md +196 -0
- package/src/.github/skills/scientific-data-preprocessing/SKILL.md +413 -0
- package/src/.github/skills/scientific-data-simulation/SKILL.md +244 -0
- package/src/.github/skills/scientific-doe/SKILL.md +360 -0
- package/src/.github/skills/scientific-eda-correlation/SKILL.md +141 -0
- package/src/.github/skills/scientific-feature-importance/SKILL.md +208 -0
- package/src/.github/skills/scientific-image-analysis/SKILL.md +310 -0
- package/src/.github/skills/scientific-materials-characterization/SKILL.md +368 -0
- package/src/.github/skills/scientific-meta-analysis/SKILL.md +352 -0
- package/src/.github/skills/scientific-metabolomics/SKILL.md +326 -0
- package/src/.github/skills/scientific-ml-classification/SKILL.md +265 -0
- package/src/.github/skills/scientific-ml-regression/SKILL.md +215 -0
- package/src/.github/skills/scientific-multi-omics/SKILL.md +303 -0
- package/src/.github/skills/scientific-network-analysis/SKILL.md +257 -0
- package/src/.github/skills/scientific-pca-tsne/SKILL.md +235 -0
- package/src/.github/skills/scientific-pipeline-scaffold/SKILL.md +331 -0
- package/src/.github/skills/scientific-process-optimization/SKILL.md +215 -0
- package/src/.github/skills/scientific-publication-figures/SKILL.md +208 -0
- package/src/.github/skills/scientific-sequence-analysis/SKILL.md +389 -0
- package/src/.github/skills/scientific-spectral-signal/SKILL.md +227 -0
- package/src/.github/skills/scientific-statistical-testing/SKILL.md +240 -0
- package/src/.github/skills/scientific-survival-clinical/SKILL.md +239 -0
- package/src/.github/skills/scientific-time-series/SKILL.md +291 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-doe
|
|
3
|
+
description: |
|
|
4
|
+
実験計画法(DOE)スキル。直交配列表(L9/L16/L27)、中心複合計画(CCD)、
|
|
5
|
+
Box-Behnken 設計、D-最適計画、応答曲面法(RSM)、交互作用解析、
|
|
6
|
+
ベイズ最適化(Gaussian Process)、効果プロット(主効果/交互作用/pareto)の
|
|
7
|
+
テンプレートを提供。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Design of Experiments (DOE)
|
|
11
|
+
|
|
12
|
+
体系的な実験計画と最適化のためのスキル。直交表による因子スクリーニングから
|
|
13
|
+
RSM による最適条件探索、ベイズ最適化による逐次最適化まで、実験の各段階に
|
|
14
|
+
対応するテンプレートを提供する。
|
|
15
|
+
|
|
16
|
+
## When to Use
|
|
17
|
+
|
|
18
|
+
- 多因子実験の計画(因子・水準の設計)が必要なとき
|
|
19
|
+
- 直交表やCCD で実験回数を最小化したいとき
|
|
20
|
+
- 主効果・交互作用の寄与率を定量化するとき
|
|
21
|
+
- 応答曲面で最適条件を探索するとき
|
|
22
|
+
- ベイズ最適化で逐次実験を行いたいとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. 因子設計テンプレート
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
|
|
34
|
+
def define_factors(factor_dict):
    """Summarise a factor definition as a table, print it, and return it.

    Parameters:
        factor_dict: Mapping of factor name to a spec dict with the keys
            ``"levels"`` (sequence of level values), ``"unit"`` and ``"type"``.
            Example::

                {
                    "Temperature": {"levels": [200, 250, 300], "unit": "°C", "type": "continuous"},
                    "Pressure": {"levels": [1, 5, 10], "unit": "mTorr", "type": "continuous"},
                    "Gas_Ratio": {"levels": [0.2, 0.5, 0.8], "unit": "-", "type": "continuous"},
                    "Material": {"levels": ["ZnO", "ITO", "TiO2"], "unit": "-", "type": "categorical"},
                }

    Returns:
        DataFrame with one row per factor and the columns
        ``Factor``, ``Levels`` (number of levels), ``Values`` (the level list
        rendered with ``str``), ``Unit`` and ``Type``.
    """
    rows = []
    for name, spec in factor_dict.items():
        level_values = spec["levels"]
        rows.append({
            "Factor": name,
            "Levels": len(level_values),
            "Values": str(level_values),
            "Unit": spec["unit"],
            "Type": spec["type"],
        })

    summary = pd.DataFrame(rows)

    # Echo the table so the design is visible in notebook / console output.
    print("=== Factor Design ===")
    print(summary.to_string(index=False))
    return summary
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## 2. 直交配列表
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
# Taguchi L9 orthogonal array as used here: 9 runs x 3 columns, each entry is
# a 0-based level index (0, 1 or 2).
L9 = np.array([
    [0, 0, 0],
    [0, 1, 1],
    [0, 2, 2],
    [1, 0, 1],
    [1, 1, 2],
    [1, 2, 0],
    [2, 0, 2],
    [2, 1, 0],
    [2, 2, 1],
])


def generate_taguchi_design(factor_dict, array="L9"):
    """Build an experimental plan from a Taguchi orthogonal array.

    Only the arrays defined in this function are available: ``"L9"``
    (9 runs, 3 columns, 3 levels) and ``"L4"`` (4 runs, 3 columns, 2 levels).

    Parameters:
        factor_dict: Mapping of factor name to a spec dict containing a
            ``"levels"`` sequence. Factors are assigned to array columns in
            insertion order.
        array: Name of the orthogonal array. Any other name falls back to
            L9 (with a warning).

    Returns:
        DataFrame with one row per run and one column per assigned factor,
        indexed 1..n_runs with the index named ``"Run"``.

    Warns:
        UserWarning: When ``array`` is unknown (L9 is used instead), when
            there are more factors than array columns (the surplus factors
            are left out), or when a factor's number of levels differs from
            the array's (level indices are wrapped with modulo, so the design
            is no longer balanced for that factor).
    """
    import warnings

    arrays = {
        "L9": L9,
        "L4": np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0]]),
    }
    if array not in arrays:
        warnings.warn(
            f"Unknown orthogonal array {array!r}; falling back to 'L9'. "
            f"Available arrays: {sorted(arrays)}",
            stacklevel=2,
        )
    oa = arrays.get(array, L9)

    factors = list(factor_dict.keys())
    n_columns = oa.shape[1]
    assigned = factors[:n_columns]
    dropped = factors[n_columns:]
    if dropped:
        warnings.warn(
            f"The selected array has only {n_columns} columns; "
            f"factors not included in the design: {dropped}",
            stacklevel=2,
        )

    # Level indices in the array run from 0 to max, so max + 1 is the number
    # of levels the array was built for.
    array_levels = int(oa.max()) + 1
    for factor in assigned:
        n_levels = len(factor_dict[factor]["levels"])
        if n_levels != array_levels:
            warnings.warn(
                f"Factor {factor!r} has {n_levels} levels but the array is "
                f"built for {array_levels}; level indices are wrapped with "
                f"modulo and the design is not balanced for this factor",
                stacklevel=2,
            )

    runs = []
    for row in oa:
        run = {}
        for column, factor in enumerate(assigned):
            levels = factor_dict[factor]["levels"]
            # Modulo keeps the index valid when the factor has fewer levels
            # than the array encodes.
            run[factor] = levels[row[column] % len(levels)]
        runs.append(run)

    design_df = pd.DataFrame(runs)
    design_df.index.name = "Run"
    design_df.index += 1  # 1-based run numbers
    return design_df
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## 3. 中心複合計画 (CCD)
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from itertools import product
|
|
103
|
+
|
|
104
|
+
def central_composite_design(factor_dict, alpha="rotatable", center_points=3):
    """Generate a central composite design (CCD) in real factor units.

    Only factors whose spec has ``"type" == "continuous"`` are used. For each
    of them the design centre is the midpoint of ``min(levels)`` and
    ``max(levels)`` and the half-range is half their difference.

    The runs are emitted in this order:
        - 2^k cube points (every -1/+1 combination),
        - 2k axial (star) points at centre +/- alpha * half-range,
        - ``center_points`` replicated centre points.

    Parameters:
        factor_dict: Mapping of factor name to a spec dict with ``"levels"``
            and ``"type"`` keys.
        alpha: ``"rotatable"`` for alpha = (2^k)^(1/4), ``"face"`` for
            alpha = 1, or any value accepted by ``float()``.
        center_points: Number of centre-point runs.

    Returns:
        DataFrame with one column per continuous factor plus a ``_type``
        column (``"cube"``, ``"axial"`` or ``"center"``), indexed from 1 with
        the index named ``"Run"``.
    """
    names = [name for name, spec in factor_dict.items()
             if spec["type"] == "continuous"]
    k = len(names)

    # Resolve the axial distance in coded units.
    presets = {"rotatable": (2 ** k) ** 0.25, "face": 1.0}
    if isinstance(alpha, str) and alpha in presets:
        alpha_val = presets[alpha]
    else:
        alpha_val = float(alpha)

    # Centre and half-range of every factor, used to decode -1/0/+1.
    center = {}
    spread = {}
    for name in names:
        level_values = factor_dict[name]["levels"]
        low, high = min(level_values), max(level_values)
        center[name] = (high + low) / 2
        spread[name] = (high - low) / 2

    runs = []

    # Cube points: all sign combinations.
    for signs in product([-1, 1], repeat=k):
        point = {name: center[name] + sign * spread[name]
                 for name, sign in zip(names, signs)}
        point["_type"] = "cube"
        runs.append(point)

    # Axial points: one factor moved to +/- alpha, the others at the centre.
    for name in names:
        for direction in (-1, 1):
            point = dict(center)
            point[name] = center[name] + direction * alpha_val * spread[name]
            point["_type"] = "axial"
            runs.append(point)

    # Replicated centre points.
    runs.extend({**center, "_type": "center"} for _ in range(center_points))

    run_numbers = pd.RangeIndex(1, len(runs) + 1, name="Run")
    return pd.DataFrame(runs, index=run_numbers)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## 4. 分散分析 (ANOVA) — 因子効果解析
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from scipy.stats import f_oneway
|
|
173
|
+
|
|
174
|
+
def anova_factor_effects(design_df, response_col, factor_cols):
    """Evaluate the main effect of each factor with a one-way ANOVA.

    Every factor is tested on its own: the response values are grouped by the
    factor's levels and passed to ``scipy.stats.f_oneway``. The factor sum of
    squares is the between-group sum of squares about the grand mean, and the
    contribution is that sum of squares as a percentage of the total.

    Parameters:
        design_df: DataFrame holding the factor columns and the response.
        response_col: Name of the response column.
        factor_cols: Iterable of factor column names to evaluate.

    Returns:
        DataFrame with the columns ``Factor``, ``SS``, ``DF``, ``MS``,
        ``F_value``, ``p_value`` and ``Contribution_pct``, sorted by
        ``Contribution_pct`` in descending order. Factors with fewer than two
        levels are skipped; if no factor remains, an empty DataFrame with the
        same columns is returned.
    """
    columns = ["Factor", "SS", "DF", "MS", "F_value", "p_value",
               "Contribution_pct"]

    # Both quantities are the same for every factor, so compute them once.
    response = design_df[response_col]
    grand_mean = response.mean()
    ss_total = np.sum((response - grand_mean) ** 2)

    results = []
    for factor in factor_cols:
        groups = [group[response_col].values
                  for _, group in design_df.groupby(factor)]
        if len(groups) < 2:
            # A single level carries no between-group variation to test.
            continue
        f_val, p_val = f_oneway(*groups)

        ss_factor = sum(len(g) * (np.mean(g) - grand_mean) ** 2 for g in groups)
        df_factor = len(groups) - 1
        ms_factor = ss_factor / df_factor

        results.append({
            "Factor": factor,
            "SS": ss_factor,
            "DF": df_factor,
            "MS": ms_factor,
            "F_value": f_val,
            "p_value": p_val,
            "Contribution_pct": ss_factor / ss_total * 100 if ss_total > 0 else 0,
        })

    if not results:
        # Sorting a column-less frame by "Contribution_pct" would raise
        # KeyError, so hand back an empty table with the expected schema.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(results, columns=columns).sort_values(
        "Contribution_pct", ascending=False)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## 5. 主効果プロット
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import matplotlib.pyplot as plt
|
|
214
|
+
|
|
215
|
+
def main_effects_plot(design_df, response_col, factor_cols, figsize=None):
    """Draw one main-effects panel per factor and save the figure as a PNG.

    Each panel plots the mean of ``response_col`` at every level of one
    factor (levels in ``groupby`` order) with a dashed horizontal line at the
    grand mean. All panels share the y-axis.

    Parameters:
        design_df: DataFrame holding the factor columns and the response.
        response_col: Name of the response column.
        factor_cols: Sequence of factor column names, one panel each.
        figsize: Figure size in inches; defaults to ``(4 * n_factors, 4)``.

    Raises:
        ValueError: If ``factor_cols`` is empty.

    Side effects:
        Writes ``figures/main_effects_plot.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    n = len(factor_cols)
    if n == 0:
        raise ValueError("factor_cols must contain at least one factor")
    if figsize is None:
        figsize = (4 * n, 4)

    # squeeze=False always yields a 2-D array of axes, so a single factor
    # needs no special-casing.
    fig, axes_grid = plt.subplots(1, n, figsize=figsize, sharey=True,
                                  squeeze=False)
    axes = axes_grid[0]

    grand_mean = design_df[response_col].mean()

    for ax, factor in zip(axes, factor_cols):
        means = design_df.groupby(factor)[response_col].mean()
        positions = range(len(means))
        ax.plot(positions, means.values, "bo-", linewidth=2, markersize=8)
        ax.axhline(grand_mean, color="gray", linestyle="--", alpha=0.5)
        ax.set_xticks(positions)
        ax.set_xticklabels(means.index, rotation=45)
        ax.set_xlabel(factor)
        ax.grid(alpha=0.3)

    axes[0].set_ylabel(response_col)
    fig.suptitle("Main Effects Plot", fontweight="bold", y=1.02)
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/main_effects_plot.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## 6. ベイズ最適化(Gaussian Process)
|
|
244
|
+
|
|
245
|
+
```python
|
|
246
|
+
from sklearn.gaussian_process import GaussianProcessRegressor
|
|
247
|
+
from sklearn.gaussian_process.kernels import Matern
|
|
248
|
+
|
|
249
|
+
def bayesian_optimization(objective_func, bounds, n_initial=5,
                          n_iterations=20, kappa=2.576, random_state=None):
    """Maximise an objective with a Gaussian-process surrogate and UCB.

    The search starts from ``n_initial`` uniformly random points. In every
    iteration a Gaussian process (Matern nu=2.5 kernel) is fitted to all
    observations, the upper confidence bound ``mu + kappa * sigma`` is
    maximised with L-BFGS-B from 10 random starts, and the objective is
    evaluated at the best candidate.

    Parameters:
        objective_func: Callable taking a ``{param_name: value}`` dict and
            returning the value to maximise.
        bounds: Dict ``{"param": (low, high)}``.
        n_initial: Number of initial random samples (at least 1).
        n_iterations: Number of optimisation steps.
        kappa: Exploration/exploitation trade-off of the UCB acquisition.
        random_state: ``None`` to draw from the global ``np.random`` state,
            an int seed, or an object with a ``uniform`` method (for example
            a ``RandomState``) for reproducible sampling.

    Returns:
        Dict with ``best_params``, ``best_value``, ``X_history``,
        ``y_history`` and ``gp_model``. The returned model is fitted on the
        complete history, including the last evaluated point.

    Raises:
        ValueError: If ``n_initial`` is smaller than 1.
    """
    from scipy.optimize import minimize as scipy_minimize

    if n_initial < 1:
        raise ValueError("n_initial must be at least 1")

    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "uniform"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    param_names = list(bounds.keys())
    lows = np.array([bounds[p][0] for p in param_names])
    highs = np.array([bounds[p][1] for p in param_names])
    box = list(zip(lows, highs))

    # Initial random sampling
    X_init = rng.uniform(lows, highs, size=(n_initial, len(param_names)))
    y_init = np.array([objective_func(dict(zip(param_names, x))) for x in X_init])

    X_observed = X_init.tolist()
    y_observed = y_init.tolist()

    gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), n_restarts_optimizer=5,
                                  random_state=42)

    def neg_ucb(x):
        # Negated because scipy minimises; returned as a plain float so the
        # optimiser receives a scalar rather than a 1-element array.
        mu, sigma = gp.predict(x.reshape(1, -1), return_std=True)
        return -float(mu[0] + kappa * sigma[0])

    for _step in range(n_iterations):
        gp.fit(np.array(X_observed), np.array(y_observed))

        # Multi-start optimisation of the acquisition function
        best_x = None
        best_val = float("inf")
        for _start in range(10):
            x0 = rng.uniform(lows, highs)
            res = scipy_minimize(neg_ucb, x0, bounds=box, method="L-BFGS-B")
            if res.fun < best_val:
                best_val = res.fun
                best_x = res.x

        if best_x is None:
            # Every start produced a non-comparable (NaN) value; fall back to
            # a random candidate instead of failing on None.
            best_x = rng.uniform(lows, highs)

        # Evaluate the new candidate
        y_new = objective_func(dict(zip(param_names, best_x)))
        X_observed.append(best_x.tolist())
        y_observed.append(y_new)

    # Refit once more so the returned surrogate includes the final
    # observation (and is fitted at all when n_iterations == 0).
    gp.fit(np.array(X_observed), np.array(y_observed))

    # Best observed point
    best_idx = int(np.argmax(y_observed))
    best_params = dict(zip(param_names, X_observed[best_idx]))
    best_y = y_observed[best_idx]

    return {
        "best_params": best_params,
        "best_value": best_y,
        "X_history": np.array(X_observed),
        "y_history": np.array(y_observed),
        "gp_model": gp,
    }
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
## 7. 交互作用プロット
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
def interaction_plot(design_df, response_col, factor1, factor2, figsize=(8, 6)):
    """Draw a two-factor interaction plot and save it as a PNG.

    One line is drawn per level of ``factor2``; each line shows the mean of
    ``response_col`` at the levels of ``factor1``. X positions come from the
    levels of ``factor1`` found in the whole design, so a ``factor2`` group
    that lacks some ``factor1`` level is still plotted against the right tick.

    Parameters:
        design_df: DataFrame holding both factor columns and the response.
        response_col: Name of the response column.
        factor1: Factor shown on the x-axis.
        factor2: Factor that defines the individual lines.
        figsize: Figure size in inches.

    Raises:
        ValueError: If ``factor1`` has no levels to plot (e.g. empty data).

    Side effects:
        Writes ``figures/interaction_plot.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    # Shared x-axis: levels of factor1 over the whole design, in groupby order.
    x_levels = design_df.groupby(factor1)[response_col].mean().index
    if len(x_levels) == 0:
        raise ValueError(f"No levels of {factor1!r} available to plot")
    position = {level: i for i, level in enumerate(x_levels)}

    fig, ax = plt.subplots(figsize=figsize)

    for level2, group in design_df.groupby(factor2):
        means = group.groupby(factor1)[response_col].mean()
        xs = [position[level] for level in means.index]
        ax.plot(xs, means.values, "o-", linewidth=2,
                markersize=8, label=f"{factor2}={level2}")

    ax.set_xticks(range(len(x_levels)))
    ax.set_xticklabels(x_levels)
    ax.set_xlabel(factor1)
    ax.set_ylabel(response_col)
    ax.set_title(f"Interaction Plot: {factor1} × {factor2}", fontweight="bold")
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/interaction_plot.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
## References
|
|
343
|
+
|
|
344
|
+
### Output Files
|
|
345
|
+
|
|
346
|
+
| ファイル | 形式 |
|
|
347
|
+
|---|---|
|
|
348
|
+
| `results/experimental_design.csv` | CSV |
|
|
349
|
+
| `results/anova_factor_effects.csv` | CSV |
|
|
350
|
+
| `results/bayesian_optimization_history.csv` | CSV |
|
|
351
|
+
| `figures/main_effects_plot.png` | PNG |
|
|
352
|
+
| `figures/interaction_plot.png` | PNG |
|
|
353
|
+
| `figures/bayesian_convergence.png` | PNG |
|
|
354
|
+
|
|
355
|
+
#### 依存パッケージ
|
|
356
|
+
|
|
357
|
+
```
|
|
358
|
+
scipy>=1.10
|
|
359
|
+
scikit-learn>=1.3
|
|
360
|
+
```
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-eda-correlation
|
|
3
|
+
description: |
|
|
4
|
+
探索的データ解析(EDA)と相関分析のスキル。データの分布可視化、相関ヒートマップ、
|
|
5
|
+
散布図行列の作成を行う際に使用。Scientific Skills Exp-02, 12, 13 で確立したパターン。
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Scientific EDA & Correlation Analysis
|
|
9
|
+
|
|
10
|
+
探索的データ解析(Exploratory Data Analysis)のパイプラインスキル。
|
|
11
|
+
データ理解の初期段階で使用し、分布・外れ値・変数間相関を把握する。
|
|
12
|
+
|
|
13
|
+
## When to Use
|
|
14
|
+
|
|
15
|
+
- 新しいデータセットを受け取ったとき
|
|
16
|
+
- 変数間の関係性を把握したいとき
|
|
17
|
+
- 相関ヒートマップを作成したいとき
|
|
18
|
+
- 材料別・群別のボックスプロット比較が必要なとき
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
## 標準パイプライン
|
|
23
|
+
|
|
24
|
+
### 1. 記述統計量の算出
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import pandas as pd
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
def descriptive_statistics(df, numeric_cols, group_col=None):
    """Compute descriptive statistics and save them to a CSV file.

    Parameters:
        df: Input DataFrame.
        numeric_cols: Columns to summarise with ``describe()``.
        group_col: Optional column to group by; when falsy the statistics
            are computed over the whole frame.

    Returns:
        The ``describe()`` result (per group when ``group_col`` is given).

    Side effects:
        Writes ``results/descriptive_statistics.csv``, creating the
        ``results`` directory when it does not exist.
    """
    import os

    selection = df.groupby(group_col)[numeric_cols] if group_col else df[numeric_cols]
    stats = selection.describe()

    # to_csv refuses to write into a directory that does not exist.
    os.makedirs("results", exist_ok=True)
    stats.to_csv("results/descriptive_statistics.csv")
    return stats
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### 2. 分布可視化(群別ボックスプロット)
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import matplotlib.pyplot as plt
|
|
44
|
+
import seaborn as sns
|
|
45
|
+
|
|
46
|
+
def plot_distributions(df, variables, group_col, figsize=(20, 16), ncols=3):
    """Draw one grouped box plot per variable on a grid and save it as a PNG.

    Parameters:
        df: Input DataFrame.
        variables: Column names to plot, one panel each.
        group_col: Column used for the x-axis groups.
        figsize: Figure size in inches.
        ncols: Number of grid columns; rows are added as needed.

    Raises:
        ValueError: If ``variables`` is empty.

    Side effects:
        Writes ``figures/distribution_boxplots.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    variables = list(variables)
    if not variables:
        raise ValueError("variables must contain at least one column name")

    nrows = (len(variables) + ncols - 1) // ncols
    # squeeze=False keeps the result a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without .flatten().
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    for ax, var in zip(axes, variables):
        # NOTE(review): recent seaborn releases warn when `palette` is given
        # without `hue`; confirm against the seaborn version in use.
        sns.boxplot(data=df, x=group_col, y=var, ax=ax,
                    palette="Set2", showfliers=True)
        ax.set_title(var, fontsize=12, fontweight="bold")
        ax.tick_params(axis="x", rotation=45)

    # Hide the unused cells of the grid.
    for ax in axes[len(variables):]:
        ax.set_visible(False)

    fig.tight_layout()
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/distribution_boxplots.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 3. 相関ヒートマップ(Exp-02 / Exp-13 パターン)
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
def plot_correlation_heatmap(df, numeric_cols, block_boundaries=None,
                             figsize=(14, 12), method="pearson"):
    """Draw an annotated correlation heatmap and save it as a PNG.

    The strictly upper triangle of the matrix is masked, so the diagonal and
    the lower triangle are shown.

    Parameters:
        df: Input DataFrame.
        numeric_cols: Columns to correlate.
        block_boundaries: Optional positions at which black horizontal and
            vertical separator lines are drawn (e.g. boundaries between
            Process / Structure / Property column blocks).
        figsize: Figure size in inches.
        method: Correlation method passed to ``DataFrame.corr``.

    Returns:
        The correlation matrix as a DataFrame.

    Side effects:
        Writes ``figures/correlation_heatmap.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    corr = df[numeric_cols].corr(method=method)

    fig, ax = plt.subplots(figsize=figsize)
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool), k=1)
    sns.heatmap(corr, mask=upper_triangle, annot=True, fmt=".2f",
                cmap="RdBu_r", center=0, vmin=-1, vmax=1,
                square=True, linewidths=0.5, ax=ax,
                annot_kws={"size": 8})

    # Block separator lines. `is not None` plus len() avoids the ambiguous
    # truth value of array-like boundary inputs.
    if block_boundaries is not None and len(block_boundaries) > 0:
        for boundary in block_boundaries:
            ax.axhline(y=boundary, color="black", linewidth=2)
            ax.axvline(x=boundary, color="black", linewidth=2)

    ax.set_title("Correlation Heatmap", fontsize=14, fontweight="bold")
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/correlation_heatmap.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
    return corr
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 4. 散布図行列
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
def plot_scatter_matrix(df, variables, hue_col, figsize=(16, 14)):
    """Draw a scatter matrix of the given variables and save it as a PNG.

    Parameters:
        df: Input DataFrame.
        variables: Column names to include in the matrix.
        hue_col: Column used to colour the points.
        figsize: Target size of the facet grid in inches ``(width, height)``.
            It is converted to seaborn's per-facet ``height`` and ``aspect``;
            a legend drawn outside the grid may add to the final width.

    Raises:
        ValueError: If ``variables`` is empty.

    Side effects:
        Writes ``figures/scatter_matrix.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    variables = list(variables)
    if not variables:
        raise ValueError("variables must contain at least one column name")

    # pairplot has no figsize argument: it sizes the figure from the height
    # and aspect of a single facet, so derive those from the requested size.
    width, height = figsize
    g = sns.pairplot(df[variables + [hue_col]], hue=hue_col,
                     diag_kind="kde", palette="Set2",
                     plot_kws={"alpha": 0.6, "s": 30},
                     height=height / len(variables),
                     aspect=width / height)

    # Prefer the `figure` attribute; fall back to `fig` where it is absent.
    fig = getattr(g, "figure", None)
    if fig is None:
        fig = g.fig
    fig.suptitle("Scatter Matrix", y=1.02, fontsize=14, fontweight="bold")

    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/scatter_matrix.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### 5. PSP ブロック相関分析(Exp-13 独自)
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def psp_block_correlation(df, process_cols, structure_cols, property_cols):
    """Compute the three cross-block correlation matrices of a PSP chain.

    Parameters:
        df: Input DataFrame.
        process_cols: Column names of the Process block.
        structure_cols: Column names of the Structure block.
        property_cols: Column names of the Property block.

    Returns:
        Tuple ``(ps_corr, sp_corr, pp_corr)``: Process x Structure,
        Structure x Property and Process x Property correlation matrices
        (rows are the first block, columns the second).

    Side effects:
        Writes the three matrices to ``results/PSP_process_structure_corr.csv``,
        ``results/PSP_structure_property_corr.csv`` and
        ``results/PSP_process_property_corr.csv``, creating the ``results``
        directory when it does not exist.
    """
    import os

    # Accept any sequence of names (tuple, Index, ...), not only lists.
    process_cols = list(process_cols)
    structure_cols = list(structure_cols)
    property_cols = list(property_cols)

    def cross_block(row_cols, col_cols):
        # Correlate the two blocks together, then keep only the rectangle
        # with row_cols as rows and col_cols as columns.
        return df[row_cols + col_cols].corr().loc[row_cols, col_cols]

    ps_corr = cross_block(process_cols, structure_cols)
    sp_corr = cross_block(structure_cols, property_cols)
    pp_corr = cross_block(process_cols, property_cols)

    # to_csv refuses to write into a directory that does not exist.
    os.makedirs("results", exist_ok=True)
    ps_corr.to_csv("results/PSP_process_structure_corr.csv")
    sp_corr.to_csv("results/PSP_structure_property_corr.csv")
    pp_corr.to_csv("results/PSP_process_property_corr.csv")

    return ps_corr, sp_corr, pp_corr
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## References
|
|
127
|
+
|
|
128
|
+
### Output Files
|
|
129
|
+
|
|
130
|
+
| ファイル | 形式 |
|
|
131
|
+
|---|---|
|
|
132
|
+
| `results/descriptive_statistics.csv` | CSV |
|
|
133
|
+
| `figures/distribution_boxplots.png` | PNG (300 DPI) |
|
|
134
|
+
| `figures/correlation_heatmap.png` | PNG (300 DPI) |
|
|
135
|
+
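| `figures/scatter_matrix.png` | PNG (300 DPI) |
| `results/PSP_process_structure_corr.csv` | CSV |
| `results/PSP_structure_property_corr.csv` | CSV |
| `results/PSP_process_property_corr.csv` | CSV |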
|
|
136
|
+
|
|
137
|
+
#### 参照実験
|
|
138
|
+
|
|
139
|
+
- **Exp-02**: `sns.heatmap` 相関ヒートマップの基本パターン
|
|
140
|
+
- **Exp-12**: 8 プロセスパラメータの EDA
|
|
141
|
+
- **Exp-13**: PSP 3 ブロック相関行列
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-feature-importance
|
|
3
|
+
description: |
|
|
4
|
+
特徴量重要度分析のスキル。Tree-based Feature Importance と Permutation Importance を
|
|
5
|
+
用いて予測モデルの説明可能性を向上させる際に使用。
|
|
6
|
+
Scientific Skills Exp-05, 12, 13 で確立したパターン。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Feature Importance Analysis
|
|
10
|
+
|
|
11
|
+
機械学習モデルの「どの特徴量が予測に最も寄与しているか」を定量化するスキル。
|
|
12
|
+
Tree-based Importance(MDI)と Permutation Importance の 2 手法を併用して
|
|
13
|
+
ロバストな解釈を提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- 機械学習モデルの予測結果を解釈したいとき
|
|
18
|
+
- どのプロセスパラメータが最も影響力を持つか知りたいとき
|
|
19
|
+
- 特徴量選択の根拠が必要なとき
|
|
20
|
+
- 複数ターゲット変数に対する重要度の比較
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
## 標準パイプライン
|
|
25
|
+
|
|
26
|
+
### 1. Tree-based Feature Importance(MDI)
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import matplotlib.pyplot as plt
|
|
32
|
+
|
|
33
|
+
def tree_feature_importance(model, feature_names, target_name,
                            top_n=10, figsize=(10, 6)):
    """Plot the ``feature_importances_`` of a fitted model as a bar chart.

    Works with any estimator exposing ``feature_importances_`` (for example
    RandomForest, GradientBoosting or ExtraTrees models).

    Parameters:
        model: Fitted estimator with a ``feature_importances_`` attribute.
        feature_names: Feature names, aligned with ``feature_importances_``.
        target_name: Label used in the plot title and the output file name.
        top_n: Number of top features shown in the chart.
        figsize: Figure size in inches.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns for all
        features, sorted by importance in descending order.

    Side effects:
        Writes ``figures/feature_importance_<target_name>.png`` at 300 dpi,
        creating the ``figures`` directory when it does not exist, then
        closes the figure.
    """
    import os

    fi_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": model.feature_importances_,
    }).sort_values("Importance", ascending=False)

    top = fi_df.head(top_n)
    positions = range(len(top))

    fig, ax = plt.subplots(figsize=figsize)
    # Reversed so the most important feature ends up at the top of the chart.
    ax.barh(positions, top["Importance"].values[::-1],
            color="steelblue", edgecolor="black")
    ax.set_yticks(positions)
    ax.set_yticklabels(top["Feature"].values[::-1])
    ax.set_xlabel("Feature Importance (MDI)")
    ax.set_title(f"Feature Importance: {target_name}", fontweight="bold")
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig(f"figures/feature_importance_{target_name}.png",
                dpi=300, bbox_inches="tight")
    plt.close(fig)

    return fi_df
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 2. Permutation Importance
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from sklearn.inspection import permutation_importance
|
|
65
|
+
|
|
66
|
+
def permutation_feature_importance(model, X_test, y_test, feature_names,
                                   target_name, n_repeats=10,
                                   top_n=10, figsize=(10, 6)):
    """Compute permutation importance and plot it with error bars.

    Uses ``sklearn.inspection.permutation_importance`` with
    ``random_state=42``, so it applies to any fitted estimator regardless of
    model family.

    Parameters:
        model: Fitted estimator.
        X_test: Feature matrix used for the permutation test.
        y_test: Target values matching ``X_test``.
        feature_names: Feature names, aligned with the columns of ``X_test``.
        target_name: Label used in the plot title and the output file name.
        n_repeats: Number of permutations per feature.
        top_n: Number of top features shown in the chart.
        figsize: Figure size in inches.

    Returns:
        DataFrame with ``Feature``, ``Importance_mean`` and
        ``Importance_std`` columns, sorted by mean importance (descending).

    Side effects:
        Writes ``figures/permutation_importance_<target_name>.png`` at
        300 dpi, creating the ``figures`` directory when it does not exist,
        then closes the figure.
    """
    import os

    result = permutation_importance(model, X_test, y_test,
                                    n_repeats=n_repeats, random_state=42)
    pi_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance_mean": result.importances_mean,
        "Importance_std": result.importances_std,
    }).sort_values("Importance_mean", ascending=False)

    top = pi_df.head(top_n)
    positions = range(len(top))

    fig, ax = plt.subplots(figsize=figsize)
    # Reversed so the most important feature ends up at the top of the chart.
    ax.barh(positions, top["Importance_mean"].values[::-1],
            xerr=top["Importance_std"].values[::-1],
            color="coral", edgecolor="black", capsize=3)
    ax.set_yticks(positions)
    ax.set_yticklabels(top["Feature"].values[::-1])
    ax.set_xlabel("Permutation Importance")
    ax.set_title(f"Permutation Importance: {target_name}", fontweight="bold")
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig(f"figures/permutation_importance_{target_name}.png",
                dpi=300, bbox_inches="tight")
    plt.close(fig)

    return pi_df
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 3. マルチターゲット重要度パネル(Exp-13 パターン)
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
def multi_target_importance_panel(models_dict, feature_names,
                                  top_n=10, ncols=3, figsize=(20, 16)):
    """Plot the feature importances of several targets in one figure.

    Parameters:
        models_dict: Mapping ``{target_name: fitted_model}``. Models without
            a ``feature_importances_`` attribute get a placeholder panel and
            are left out of the returned table.
        feature_names: Feature names, aligned with ``feature_importances_``.
        top_n: Number of top features shown per panel.
        ncols: Number of grid columns; rows are added as needed.
        figsize: Figure size in inches.

    Returns:
        DataFrame with ``Feature``, ``Importance`` and ``Target`` columns for
        all models that expose importances, or an empty DataFrame when none
        of them does.

    Raises:
        ValueError: If ``models_dict`` is empty.

    Side effects:
        Writes ``figures/feature_importance_panel.png`` at 300 dpi and, when
        importances are available, ``results/feature_importance.csv``. Both
        directories are created when they do not exist.
    """
    import os

    targets = list(models_dict.keys())
    if not targets:
        raise ValueError("models_dict must contain at least one fitted model")

    nrows = (len(targets) + ncols - 1) // ncols
    # squeeze=False keeps the result a 2-D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without .flatten().
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()

    all_importances = []

    for ax, target in zip(axes, targets):
        model = models_dict[target]
        if not hasattr(model, "feature_importances_"):
            ax.text(0.5, 0.5, f"{target}\n(No FI available)",
                    ha="center", va="center", transform=ax.transAxes)
            continue

        fi_df = pd.DataFrame({
            "Feature": feature_names,
            "Importance": model.feature_importances_,
            "Target": target,
        }).sort_values("Importance", ascending=False)
        all_importances.append(fi_df)

        top = fi_df.head(top_n)
        positions = range(len(top))
        # Reversed so the most important feature ends up at the top.
        ax.barh(positions, top["Importance"].values[::-1],
                color="steelblue", edgecolor="black")
        ax.set_yticks(positions)
        ax.set_yticklabels(top["Feature"].values[::-1], fontsize=8)
        ax.set_xlabel("Importance", fontsize=9)
        ax.set_title(target, fontweight="bold", fontsize=10)

    # Hide the unused cells of the grid.
    for ax in axes[len(targets):]:
        ax.set_visible(False)

    fig.suptitle("Feature Importance by Target", fontsize=14, fontweight="bold")
    fig.tight_layout()
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/feature_importance_panel.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

    if not all_importances:
        return pd.DataFrame()

    # Save the importances of all targets to a single CSV file.
    combined = pd.concat(all_importances, ignore_index=True)
    os.makedirs("results", exist_ok=True)
    combined.to_csv("results/feature_importance.csv", index=False)
    return combined
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### 4. 部分依存プロット(PDP)
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from sklearn.inspection import PartialDependenceDisplay
|
|
157
|
+
|
|
158
|
+
def partial_dependence_plots(model, X_train, feature_names,
                             top_features, target_name, figsize=(16, 10)):
    """Draw partial dependence plots for selected features and save a PNG.

    Parameters:
        model: Fitted estimator.
        X_train: Training feature matrix passed to scikit-learn.
        feature_names: Feature names, aligned with the columns of ``X_train``.
        top_features: Names of the features to plot; names that are not in
            ``feature_names`` are ignored.
        target_name: Label used in the plot title and the output file name.
        figsize: Figure size in inches.

    Raises:
        ValueError: If none of ``top_features`` is found in ``feature_names``.

    Side effects:
        Writes ``figures/pdp_<target_name>.png`` at 300 dpi, creating the
        ``figures`` directory when it does not exist, then closes the figure.
    """
    import os

    # Map each name to its first position once, instead of converting the
    # name list and scanning it again for every requested feature.
    name_to_index = {}
    for position, name in enumerate(feature_names):
        name_to_index.setdefault(name, position)

    feature_indices = [name_to_index[f] for f in top_features
                       if f in name_to_index]
    if not feature_indices:
        raise ValueError("None of top_features are present in feature_names")

    fig, ax = plt.subplots(figsize=figsize)
    PartialDependenceDisplay.from_estimator(
        model, X_train, feature_indices,
        feature_names=feature_names, ax=ax
    )
    fig.suptitle(f"Partial Dependence: {target_name}", fontweight="bold")
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig(f"figures/pdp_{target_name}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## パラメータ–物性マッピング表の自動生成
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
def generate_importance_mapping_table(all_fi_df, top_n=3):
    """Build a table of the top-N features for every target.

    Parameters:
        all_fi_df: Long-format DataFrame with ``Target``, ``Feature`` and
            ``Importance`` columns.
        top_n: Number of top-ranked features to keep per target.

    Returns:
        DataFrame with one row per target (in order of first appearance) and
        the columns ``Target``, ``Rank_1``, ``Importance_1``, ...,
        ``Rank_N``, ``Importance_N``. Importances are formatted as strings
        with four decimals. An empty input yields an empty DataFrame.
    """
    if all_fi_df.empty:
        # Covers the column-less empty frame produced when no model exposed
        # importances; indexing "Target" on it would raise KeyError.
        return pd.DataFrame()

    mapping = []
    for target in all_fi_df["Target"].unique():
        subset = all_fi_df[all_fi_df["Target"] == target].nlargest(top_n, "Importance")

        # One row per target: all ranks go into the same dict so the result
        # is a compact wide table rather than one sparse row per rank.
        row = {"Target": target}
        ranked = zip(subset["Feature"], subset["Importance"])
        for rank, (feature, importance) in enumerate(ranked, 1):
            row[f"Rank_{rank}"] = feature
            row[f"Importance_{rank}"] = f"{importance:.4f}"
        mapping.append(row)

    return pd.DataFrame(mapping)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## References
|
|
193
|
+
|
|
194
|
+
### Output Files
|
|
195
|
+
|
|
196
|
+
| ファイル | 形式 |
|
|
197
|
+
|---|---|
|
|
198
|
+
| `results/feature_importance.csv` | CSV |
|
|
199
|
+
| `figures/feature_importance_*.png` | PNG |
|
|
200
|
+
| `figures/permutation_importance_*.png` | PNG |
|
|
201
|
+
| `figures/feature_importance_panel.png` | PNG |
|
|
202
|
+
| `figures/pdp_*.png` | PNG |
|
|
203
|
+
|
|
204
|
+
#### 参照実験
|
|
205
|
+
|
|
206
|
+
- **Exp-05**: Tree-based + Permutation Importance(毒性予測)
|
|
207
|
+
- **Exp-12**: 6 モデルの特徴量重要度比較(エッチング)
|
|
208
|
+
- **Exp-13**: マルチターゲットパネル + パラメータ–物性マッピング表
|