@nahisaho/satori 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -17
- package/package.json +1 -1
- package/src/.github/skills/scientific-active-learning/SKILL.md +289 -0
- package/src/.github/skills/scientific-advanced-visualization/SKILL.md +310 -0
- package/src/.github/skills/scientific-automl/SKILL.md +264 -0
- package/src/.github/skills/scientific-ensemble-methods/SKILL.md +263 -0
- package/src/.github/skills/scientific-interactive-dashboard/SKILL.md +346 -0
- package/src/.github/skills/scientific-missing-data-analysis/SKILL.md +312 -0
- package/src/.github/skills/scientific-transfer-learning/SKILL.md +298 -0
- package/src/.github/skills/scientific-uncertainty-quantification/SKILL.md +286 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-advanced-visualization
|
|
3
|
+
description: |
|
|
4
|
+
科学データ高度可視化スキル。Plotly インタラクティブ 3D ・
|
|
5
|
+
Altair 宣言的可視化・Seaborn 統計プロット・
|
|
6
|
+
アニメーション・Parallel Coordinates・出版品質図。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Advanced Visualization
|
|
10
|
+
|
|
11
|
+
科学データのインタラクティブ可視化・3D レンダリング・
|
|
12
|
+
出版品質図・アニメーションを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- インタラクティブな 3D 散布図・サーフェスプロットを描くとき
|
|
17
|
+
- Plotly / Altair で動的可視化を作成するとき
|
|
18
|
+
- 多変量データを Parallel Coordinates / Radar で可視化するとき
|
|
19
|
+
- 論文投稿用の出版品質 (Nature/Science style) 図を作成するとき
|
|
20
|
+
- 時系列・シミュレーション結果のアニメーションを作成するとき
|
|
21
|
+
- 複数パネルの複合図を作成するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. Plotly インタラクティブ 3D
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def plotly_3d_scatter(df, x, y, z, color=None, size=None,
                      title="3D Scatter Plot", output="3d_scatter.html"):
    """
    Build an interactive Plotly 3D scatter plot and save it as HTML.

    Parameters:
        df: pd.DataFrame — data to plot
        x, y, z: str — column names for the three axes
        color: str | None — column used to color the markers
        size: str | None — column used to scale the markers
        title: str — figure title
        output: str — path of the HTML file to write

    Returns:
        The Plotly figure that was written to ``output``.
    """
    import plotly.express as px

    fig = px.scatter_3d(df, x=x, y=y, z=z, color=color, size=size,
                        title=title, opacity=0.7)
    fig.update_layout(
        scene=dict(xaxis_title=x, yaxis_title=y, zaxis_title=z),
        width=900, height=700)

    # The destination is a parameter so repeated calls do not overwrite
    # each other's output.
    fig.write_html(output)
    print(f"3D Scatter: {len(df)} points → {output}")
    return fig
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def plotly_surface(X_grid, Y_grid, Z_grid, title="Surface Plot",
                   output="surface_plot.html"):
    """
    Build a Plotly 3D surface plot and save it as HTML.

    Parameters:
        X_grid, Y_grid, Z_grid: np.ndarray — mesh-grid coordinates and heights
        title: str — figure title
        output: str — path of the HTML file to write

    Returns:
        The Plotly figure that was written to ``output``.
    """
    import plotly.graph_objects as go

    surface = go.Surface(x=X_grid, y=Y_grid, z=Z_grid, colorscale="Viridis")
    fig = go.Figure(data=[surface])
    fig.update_layout(
        title=title,
        scene=dict(xaxis_title="X", yaxis_title="Y", zaxis_title="Z"),
        width=900, height=700)

    fig.write_html(output)
    # np.shape also handles nested lists, so the summary line cannot fail
    # after the file has already been written.
    print(f"Surface: {np.shape(Z_grid)} grid → {output}")
    return fig
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## 2. Altair 宣言的可視化
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
def altair_faceted_chart(df, x, y, color, facet_col=None,
                         chart_type="scatter", output="altair_chart.html"):
    """
    Build a declarative Altair chart, optionally faceted, and save it as HTML.

    Parameters:
        df: pd.DataFrame — data to plot
        x, y: str — axis columns
        color: str — column used for color encoding
        facet_col: str | None — column to facet by (wrapped at 3 columns)
        chart_type: str — "scatter" / "line" / "bar" / "box";
            any other value falls back to a plain circle mark
        output: str — path of the file to write

    Returns:
        The Altair chart that was saved to ``output``.
    """
    import altair as alt

    base = alt.Chart(df).encode(
        x=alt.X(x, scale=alt.Scale(zero=False)),
        y=alt.Y(y, scale=alt.Scale(zero=False)),
        color=color)

    if chart_type == "scatter":
        chart = base.mark_circle(size=60, opacity=0.7)
    elif chart_type == "line":
        chart = base.mark_line()
    elif chart_type == "bar":
        chart = base.mark_bar()
    elif chart_type == "box":
        chart = base.mark_boxplot()
    else:
        chart = base.mark_circle()

    # Size must be set on the unit chart BEFORE faceting: a faceted chart
    # has no top-level width/height, so setting them afterwards makes the
    # spec invalid and save() fails. Set here, the size applies per panel.
    chart = chart.properties(width=300, height=250)

    if facet_col:
        chart = chart.facet(facet_col, columns=3)

    chart = chart.interactive()

    chart.save(output)
    print(f"Altair {chart_type}: {len(df)} rows → {output}")
    return chart
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## 3. 多変量可視化
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
def parallel_coordinates_plot(df, class_col, features=None,
                              title="Parallel Coordinates",
                              output="parallel_coordinates.html"):
    """
    Build a Plotly parallel-coordinates plot and save it as HTML.

    Parameters:
        df: pd.DataFrame — data to plot
        class_col: str — column used to color the lines; a non-numeric
            column is encoded to integer codes (sorted unique values)
        features: list[str] | None — dimensions to show
            (None selects every numeric column except ``class_col``)
        title: str — figure title
        output: str — path of the HTML file to write

    Returns:
        The Plotly figure that was written to ``output``.
    """
    import pandas as pd
    import plotly.express as px

    if features is None:
        numeric_cols = df.select_dtypes(include="number").columns
        dims = [col for col in numeric_cols if col != class_col]
    else:
        # Work on a copy so the caller's list is never mutated.
        dims = list(features)

    plot_df = df
    class_labels = None
    if not pd.api.types.is_numeric_dtype(df[class_col]):
        # Parallel-coordinate lines need a numeric color, so map each
        # class label to an integer code on a copy of the frame.
        codes, uniques = pd.factorize(df[class_col], sort=True)
        plot_df = df.copy()
        plot_df[class_col] = codes
        class_labels = [str(label) for label in uniques]

    fig = px.parallel_coordinates(
        plot_df, color=class_col, dimensions=dims,
        title=title, color_continuous_scale=px.colors.diverging.Tealrose)

    fig.update_layout(width=1000, height=500)
    if class_labels is not None:
        # Show the original class names on the color bar instead of codes.
        fig.update_layout(coloraxis_colorbar=dict(
            tickvals=list(range(len(class_labels))),
            ticktext=class_labels))

    fig.write_html(output)
    print(f"Parallel Coordinates: {len(dims)} dims → {output}")
    return fig
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def radar_chart(categories, values_dict, title="Radar Chart",
                output="radar_chart.html"):
    """
    Build a radar (spider) chart comparing several groups and save it as HTML.

    Parameters:
        categories: list[str] — axis labels (any sequence is accepted)
        values_dict: dict[str, list[float]] — {group name: values},
            one value per category (any sequence is accepted)
        title: str — figure title
        output: str — path of the HTML file to write

    Returns:
        The Plotly figure that was written to ``output``.

    Raises:
        ValueError: if a group's number of values differs from the number
            of categories.
    """
    # Materialise as lists first: `seq + [seq[0]]` is an element-wise add
    # on NumPy arrays and a TypeError on tuples.
    axis_labels = list(categories)
    groups = {name: list(vals) for name, vals in values_dict.items()}
    for name, vals in groups.items():
        if len(vals) != len(axis_labels):
            raise ValueError(
                f"Group {name!r} has {len(vals)} values but there are "
                f"{len(axis_labels)} categories")

    import plotly.graph_objects as go

    # Repeat the first point at the end so each polygon closes;
    # slicing keeps this safe for empty input.
    closed_labels = axis_labels + axis_labels[:1]

    fig = go.Figure()
    for name, vals in groups.items():
        fig.add_trace(go.Scatterpolar(
            r=vals + vals[:1],
            theta=closed_labels,
            fill="toself", name=name, opacity=0.6))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True)),
        title=title, width=600, height=500)

    fig.write_html(output)
    print(f"Radar: {len(groups)} groups × {len(axis_labels)} axes → {output}")
    return fig
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## 4. 出版品質図 (Nature/Science style)
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
def publication_figure(plot_func, figsize=(3.5, 2.8),
                       dpi=300, style="nature",
                       output="publication_fig.pdf"):
    """
    Render a publication-quality (Nature/Science style) matplotlib figure.

    Parameters:
        plot_func: callable — drawing function that takes the Axes as its
            only argument
        figsize: tuple — figure size in inches (Nature single column = 3.5 in)
        dpi: int — resolution passed to ``savefig``
        style: str — "nature" / "science" / "acs"; an unknown name emits a
            warning and falls back to "nature"
        output: str — output path (.pdf / .svg / .png)

    Returns:
        The ``output`` path.
    """
    import warnings

    import matplotlib as mpl
    import matplotlib.pyplot as plt

    # rcParams per journal style.
    style_params = {
        "nature": {
            "font.family": "Arial",
            "font.size": 7,
            "axes.linewidth": 0.5,
            "xtick.major.width": 0.5,
            "ytick.major.width": 0.5,
            "lines.linewidth": 1.0,
            "lines.markersize": 3,
        },
        "science": {
            "font.family": "Helvetica",
            "font.size": 8,
            "axes.linewidth": 0.6,
            "xtick.major.width": 0.6,
            "ytick.major.width": 0.6,
            "lines.linewidth": 1.2,
            "lines.markersize": 4,
        },
        "acs": {
            "font.family": "Arial",
            "font.size": 9,
            "axes.linewidth": 0.5,
            "xtick.major.width": 0.5,
            "ytick.major.width": 0.5,
            "lines.linewidth": 1.0,
            "lines.markersize": 4,
        },
    }

    if style not in style_params:
        warnings.warn(
            f"Unknown style {style!r}; falling back to 'nature'",
            stacklevel=2)
    rc = style_params.get(style, style_params["nature"])

    with mpl.rc_context(rc):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            plot_func(ax)
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            # Act on `fig` explicitly: plot_func may have changed
            # pyplot's notion of the current figure.
            fig.tight_layout()
            fig.savefig(output, dpi=dpi, bbox_inches="tight")
        finally:
            # Always release this figure, even if drawing or saving failed.
            plt.close(fig)

    print(f"Publication figure ({style}): {figsize} @ {dpi}dpi → {output}")
    return output
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## 5. アニメーション
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
def _padded_range(values, pad=0.1):
    """Return [low, high] widened by ``pad`` of the span, or None if unusable."""
    import math

    import pandas as pd

    if not pd.api.types.is_numeric_dtype(values):
        return None
    low, high = float(values.min()), float(values.max())
    if not (math.isfinite(low) and math.isfinite(high)):
        return None
    span = high - low
    # Constant data has no span; fall back to a magnitude-based margin.
    margin = span * pad if span > 0 else (abs(low) * pad or 1.0)
    return [low - margin, high + margin]


def create_animation(data_frames, x_col, y_col, time_col,
                     title="Animation", fps=10, output="animation.html"):
    """
    Build a Plotly animated scatter plot and save it as HTML.

    Parameters:
        data_frames: pd.DataFrame — data including the time/frame column
        x_col, y_col: str — axis columns
        time_col: str — time / frame column
        title: str — figure title
        fps: int — frame rate of the Play button (must be positive)
        output: str — path of the HTML file to write

    Returns:
        The Plotly figure that was written to ``output``.

    Raises:
        ValueError: if ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")
    # At least 1 ms per frame so a very high fps cannot produce a 0 ms frame.
    frame_ms = max(1, int(1000 // fps))

    import plotly.express as px

    # Fixed axis ranges keep the view stable across frames. Padding is a
    # fraction of the data span, so negative and zero-valued data are not
    # clipped the way `min * 0.9` / `max * 1.1` clips them. Non-numeric
    # axes get None, which leaves Plotly's default ranging in place.
    fig = px.scatter(data_frames, x=x_col, y=y_col,
                     animation_frame=time_col,
                     title=title, opacity=0.7,
                     range_x=_padded_range(data_frames[x_col]),
                     range_y=_padded_range(data_frames[y_col]))

    fig.update_layout(
        width=800, height=600,
        updatemenus=[dict(type="buttons",
                          buttons=[dict(label="▶ Play",
                                        method="animate",
                                        args=[None, {"frame": {"duration": frame_ms}}])])])

    fig.write_html(output)
    print(f"Animation: {data_frames[time_col].nunique()} frames @ {fps}fps → {output}")
    return fig
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## パイプライン統合
|
|
294
|
+
|
|
295
|
+
```
|
|
296
|
+
eda-correlation → advanced-visualization → presentation-design
|
|
297
|
+
(探索的解析) (高度可視化) (プレゼンテーション)
|
|
298
|
+
│ │ ↓
|
|
299
|
+
pca-tsne ───────────────┘ interactive-dashboard
|
|
300
|
+
(次元削減) (ダッシュボード)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## パイプライン出力
|
|
304
|
+
|
|
305
|
+
| ファイル | 説明 | 次スキル |
|
|
306
|
+
|---------|------|---------|
|
|
307
|
+
| `3d_scatter.html` | インタラクティブ 3D 散布図 | → dashboard |
|
|
308
|
+
| `publication_fig.pdf` | 出版品質図 | → presentation |
|
|
309
|
+
| `parallel_coordinates.html` | 多変量可視化 | → reporting |
|
|
310
|
+
| `animation.html` | アニメーション | → presentation |
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-automl
|
|
3
|
+
description: |
|
|
4
|
+
AutoML パイプラインスキル。Optuna ハイパーパラメータ最適化・
|
|
5
|
+
FLAML 高速 AutoML・Auto-sklearn モデル選択・
|
|
6
|
+
NAS (Neural Architecture Search)・
|
|
7
|
+
特徴量エンジニアリング自動化・モデル比較パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific AutoML
|
|
11
|
+
|
|
12
|
+
ハイパーパラメータ最適化・モデル選択・特徴量エンジニアリングを
|
|
13
|
+
自動化する AutoML パイプラインを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- Optuna/Hyperopt でハイパーパラメータを最適化するとき
|
|
18
|
+
- 複数モデルの自動比較・選択を行うとき
|
|
19
|
+
- FLAML/Auto-sklearn で高速な AutoML を実行するとき
|
|
20
|
+
- 特徴量エンジニアリングを自動化するとき
|
|
21
|
+
- Neural Architecture Search (NAS) を設計するとき
|
|
22
|
+
- モデル選択根拠のレポートを生成するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. Optuna ハイパーパラメータ最適化
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import optuna
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
from sklearn.model_selection import cross_val_score
|
|
35
|
+
from sklearn.ensemble import (
|
|
36
|
+
RandomForestClassifier, GradientBoostingClassifier)
|
|
37
|
+
from sklearn.svm import SVC
|
|
38
|
+
from sklearn.metrics import make_scorer, f1_score
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def optuna_optimize(X, y, model_type="rf", n_trials=100,
                    cv=5, scoring="f1_macro", direction="maximize",
                    seed=None):
    """
    Tune a classifier's hyperparameters with Optuna and cross-validation.

    Parameters:
        X: np.ndarray — features
        y: np.ndarray — labels
        model_type: str — "rf" / "gbm" / "svm"
        n_trials: int — number of trials
        cv: int — number of CV folds
        scoring: str — scoring name passed to ``cross_val_score``
        direction: str — "maximize" / "minimize"
        seed: int | None — seed for the TPE sampler; None keeps the search
            unseeded

    Returns:
        The optimized ``optuna.Study``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    supported = ("rf", "gbm", "svm")
    # Fail fast: an unknown model type would otherwise leave `model`
    # unbound inside the objective and surface as an obscure error.
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}")

    def objective(trial):
        if model_type == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "max_features": trial.suggest_categorical(
                    "max_features", ["sqrt", "log2", None]),
            }
            model = RandomForestClassifier(**params, random_state=42)

        elif model_type == "gbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            }
            model = GradientBoostingClassifier(**params, random_state=42)

        else:  # "svm" — guaranteed by the validation above
            params = {
                "C": trial.suggest_float("C", 0.01, 100, log=True),
                "kernel": trial.suggest_categorical(
                    "kernel", ["rbf", "poly", "sigmoid"]),
                "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
            }
            # probability=True is kept so probability-based scorers still work.
            model = SVC(**params, probability=True, random_state=42)

        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        return scores.mean()

    optuna.logging.set_verbosity(optuna.logging.WARNING)
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction=direction, sampler=sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"Optuna ({model_type}): best {scoring} = {study.best_value:.4f}")
    print(f"  Best params: {study.best_params}")
    return study
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## 2. マルチモデル AutoML パイプライン
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from sklearn.linear_model import LogisticRegression
|
|
102
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
103
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
104
|
+
from sklearn.neural_network import MLPClassifier
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def automl_model_selection(X, y, cv=5, scoring="f1_macro",
                           n_trials_per_model=50, output_dir="results"):
    """
    Compare Optuna-tuned models against untuned baselines.

    Parameters:
        X: np.ndarray — features
        y: np.ndarray — labels
        cv: int — number of CV folds
        scoring: str — scoring name passed to ``cross_val_score``
        n_trials_per_model: int — Optuna trials per tuned model
        output_dir: str | None — directory that receives
            ``model_comparison.csv``; None skips writing the file

    Returns:
        pd.DataFrame with one row per model (model_type, best_score,
        best_params, n_trials), sorted by best_score descending.
    """
    from pathlib import Path

    results = []

    # Tuned models: one Optuna study per model type.
    for mt in ("rf", "gbm", "svm"):
        study = optuna_optimize(
            X, y, model_type=mt,
            n_trials=n_trials_per_model, cv=cv, scoring=scoring)
        results.append({
            "model_type": mt,
            "best_score": round(study.best_value, 4),
            "best_params": study.best_params,
            "n_trials": len(study.trials),
        })

    # Simple baselines, evaluated once with default hyperparameters.
    baselines = [
        ("logistic", LogisticRegression(max_iter=1000, random_state=42)),
        ("knn", KNeighborsClassifier()),
        ("dt", DecisionTreeClassifier(random_state=42)),
    ]
    for name, model in baselines:
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results.append({
            "model_type": name,
            "best_score": round(scores.mean(), 4),
            "best_params": {},
            "n_trials": 1,
        })

    df = pd.DataFrame(results).sort_values("best_score", ascending=False)

    if output_dir is not None:
        # Write the comparison table under its documented pipeline name.
        out = Path(output_dir)
        out.mkdir(parents=True, exist_ok=True)
        df.to_csv(out / "model_comparison.csv", index=False)

    best = df.iloc[0]
    print(f"AutoML: best = {best['model_type']} "
          f"({scoring} = {best['best_score']})")
    return df
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## 3. 自動特徴量エンジニアリング
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
|
|
159
|
+
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def auto_feature_engineering(X, y, max_poly_degree=2,
                             top_k=None, interactions_only=False,
                             random_state=42, return_scaler=False):
    """
    Standardize, expand to polynomial features, and keep the top-k features.

    Parameters:
        X: np.ndarray — original features
        y: np.ndarray — labels
        max_poly_degree: int — polynomial degree
        top_k: int | None — number of features to keep
            (None keeps up to 3x the original feature count)
        interactions_only: bool — generate interaction terms only
        random_state: int | None — seed for the mutual-information
            estimate, so the selection is reproducible
        return_scaler: bool — also return the fitted StandardScaler

    Returns:
        (X_selected, poly, selector), or
        (X_selected, poly, selector, scaler) when ``return_scaler`` is True.

    Note:
        ``poly`` was fitted on standardized data, so new data must pass
        through the scaler first: scaler -> poly -> selector. Request the
        scaler with ``return_scaler=True`` when transforming unseen data.
    """
    from functools import partial

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    n_original = X_scaled.shape[1]

    # Polynomial / interaction expansion.
    poly = PolynomialFeatures(
        degree=max_poly_degree,
        interaction_only=interactions_only,
        include_bias=False)
    X_poly = poly.fit_transform(X_scaled)
    n_expanded = X_poly.shape[1]

    # Feature selection by mutual information with the labels.
    if top_k is None:
        top_k = min(n_expanded, n_original * 3)

    score_func = partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func, k=min(top_k, n_expanded))
    X_selected = selector.fit_transform(X_poly, y)

    print(f"Feature engineering: {n_original} → {n_expanded} "
          f"→ {X_selected.shape[1]} features")
    if return_scaler:
        return X_selected, poly, selector, scaler
    return X_selected, poly, selector
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## 4. Optuna 可視化レポート
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
def automl_report(study, output_dir="results"):
    """
    Write a report for an Optuna study: importance plot and trial history.

    Parameters:
        study: optuna.Study — optimization result
        output_dir: str — output directory (created if missing)

    Returns:
        dict with ``best_value``, ``best_params`` and ``n_trials``.

    Files written to ``output_dir``:
        param_importance.png — hyperparameter importances (best effort:
            skipped with a warning if they cannot be computed or plotted)
        optuna_trials.csv — full trial history
    """
    import warnings
    from pathlib import Path

    import matplotlib.pyplot as plt

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Hyperparameter importance is optional, so a failure here must not
    # abort the report. It is reported as a warning, not swallowed.
    fig = None
    try:
        importances = optuna.importance.get_param_importances(study)
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.barh(list(importances.keys()), list(importances.values()))
        ax.set_xlabel("Importance")
        ax.set_title("Hyperparameter Importance")
        fig.tight_layout()
        fig.savefig(out / "param_importance.png", dpi=150)
    except Exception as exc:
        warnings.warn(
            f"Skipping hyperparameter importance plot: {exc}",
            RuntimeWarning, stacklevel=2)
    finally:
        # Release the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)

    # Optimization history.
    trials_df = study.trials_dataframe()
    trials_df.to_csv(out / "optuna_trials.csv", index=False)

    # Best result summary.
    best = {
        "best_value": study.best_value,
        "best_params": study.best_params,
        "n_trials": len(study.trials),
    }

    print(f"AutoML report → {out}")
    return best
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## パイプライン統合
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
eda-correlation → automl → ensemble-methods
|
|
249
|
+
(データ探索) (モデル選択) (アンサンブル)
|
|
250
|
+
│ │ ↓
|
|
251
|
+
feature-importance ──┘ uncertainty-quantification
|
|
252
|
+
(特徴量解釈) (不確実性定量化)
|
|
253
|
+
│
|
|
254
|
+
active-learning
|
|
255
|
+
(能動学習)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
## パイプライン出力
|
|
259
|
+
|
|
260
|
+
| ファイル | 説明 | 次スキル |
|
|
261
|
+
|---------|------|---------|
|
|
262
|
+
| `optuna_trials.csv` | 試行履歴 | → 可視化 |
|
|
263
|
+
| `param_importance.png` | パラメータ重要度 | → レポート |
|
|
264
|
+
| `model_comparison.csv` | モデル比較 | → ensemble-methods |
|