@nahisaho/satori 0.22.0 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -17
- package/package.json +1 -1
- package/src/.github/skills/scientific-active-learning/SKILL.md +289 -0
- package/src/.github/skills/scientific-advanced-visualization/SKILL.md +310 -0
- package/src/.github/skills/scientific-automl/SKILL.md +264 -0
- package/src/.github/skills/scientific-ensemble-methods/SKILL.md +263 -0
- package/src/.github/skills/scientific-interactive-dashboard/SKILL.md +346 -0
- package/src/.github/skills/scientific-missing-data-analysis/SKILL.md +312 -0
- package/src/.github/skills/scientific-transfer-learning/SKILL.md +298 -0
- package/src/.github/skills/scientific-uncertainty-quantification/SKILL.md +286 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-missing-data-analysis
|
|
3
|
+
description: |
|
|
4
|
+
欠損データ解析スキル。欠損パターン診断 (MCAR/MAR/MNAR) ・
|
|
5
|
+
Little's MCAR テスト・多重代入法 (MICE) ・KNN 補完・
|
|
6
|
+
MissForest・VAE/GAIN 補完・欠損パターン可視化・Rubin's Rules。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Missing Data Analysis
|
|
10
|
+
|
|
11
|
+
欠損データの診断・補完・感度分析パイプラインを提供し、
|
|
12
|
+
バイアスのない統計推論を実現する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- データセットの欠損パターンを診断するとき
|
|
17
|
+
- MCAR / MAR / MNAR のメカニズムを判定するとき
|
|
18
|
+
- 多重代入法 (MICE) で欠損値を補完するとき
|
|
19
|
+
- KNN / MissForest / 深層学習ベースの補完をするとき
|
|
20
|
+
- 複数の補完結果を Rubin's Rules で統合するとき
|
|
21
|
+
- 欠損パターンを可視化するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. 欠損パターン診断
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
import matplotlib.pyplot as plt
|
|
33
|
+
import seaborn as sns
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def diagnose_missing_patterns(df, output_prefix="missing"):
    """
    Summarize and plot the missing-value structure of a DataFrame.

    Builds a per-column summary table and saves a 2x2 diagnostic figure
    (pattern matrix, per-column missing rate, correlation between
    missingness indicators, most frequent row-wise patterns) meant to
    support an MCAR / MAR / MNAR assessment.

    Parameters:
        df: pd.DataFrame — input data
        output_prefix: str — prefix of the PNG file written to disk

    Returns:
        dict — "summary": per-column DataFrame sorted by missing
        percentage (descending); "fig": path of the saved figure
    """
    n_rows, n_cols = df.shape

    # The boolean null mask drives every panel, so compute it once.
    null_mask = df.isnull()
    missing_counts = null_mask.sum()
    missing_pct = (missing_counts / n_rows * 100).round(2)

    summary = pd.DataFrame({
        "column": df.columns,
        "n_missing": missing_counts.values,
        "pct_missing": missing_pct.values,
        "dtype": df.dtypes.values
    })
    summary = summary.sort_values("pct_missing", ascending=False)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    matrix_ax, rate_ax = axes[0]
    corr_ax, pattern_ax = axes[1]

    # Panel 1: raw missingness matrix, truncated to keep the image readable.
    matrix_ax.imshow(null_mask.astype(int).values[:200], aspect="auto",
                     cmap="Greys", interpolation="none")
    matrix_ax.set_xlabel("Features")
    matrix_ax.set_ylabel("Samples")
    matrix_ax.set_title("Missing Pattern Matrix (first 200 rows)")

    # Panel 2: missing percentage, only for columns that have any gaps.
    incomplete = summary[summary["pct_missing"] > 0]
    rate_ax.barh(incomplete["column"], incomplete["pct_missing"])
    rate_ax.set_xlabel("Missing %")
    rate_ax.set_title("Missing Rate per Column")

    # Panel 3: correlation between the per-column missingness indicators.
    sns.heatmap(null_mask.corr(), ax=corr_ax, cmap="RdBu_r", center=0,
                square=True, cbar_kws={"shrink": 0.8})
    corr_ax.set_title("Missing Correlation")

    # Panel 4: the ten most frequent row-wise missingness patterns.
    top_patterns = null_mask.apply(tuple, axis=1).value_counts().head(10)
    bar_positions = range(len(top_patterns))
    pattern_ax.barh(bar_positions, top_patterns.values)
    pattern_ax.set_yticks(bar_positions)
    pattern_ax.set_yticklabels([str(p)[:40] for p in top_patterns.index],
                               fontsize=7)
    pattern_ax.set_xlabel("Count")
    pattern_ax.set_title("Top 10 Missing Patterns")

    plt.tight_layout()
    path = f"{output_prefix}_diagnosis.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

    total_missing = missing_counts.sum()
    overall_pct = total_missing / (n_rows * n_cols) * 100
    print(f"Missing Diagnosis: {n_cols} cols, "
          f"{total_missing} total missing ({overall_pct:.1f}%)")
    return {"summary": summary, "fig": path}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## 2. Little's MCAR テスト
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
def littles_mcar_test(df):
    """
    Approximate Little's test for data missing completely at random.

    Rows are grouped by missingness pattern. For each pattern the mean of
    its observed columns is compared with the overall mean, weighted by
    the inverse of the overall covariance restricted to those columns.

    NOTE: the overall mean and covariance come from ``DataFrame.mean`` and
    ``DataFrame.cov`` rather than from the EM maximum-likelihood estimates
    used in Little's original procedure, and patterns with fewer than two
    rows are skipped, so the statistic is an approximation.

    Parameters:
        df: pd.DataFrame — input data; non-numeric columns are ignored

    Returns:
        dict — always contains "chi2", "df", "p_value" and "conclusion".
        "chi2" and "p_value" are NaN when the degrees of freedom are not
        positive (including input with no rows or no numeric columns).
    """
    from scipy import stats

    undecidable = "判定不能 (自由度不足)"

    numeric_df = df.select_dtypes(include=[np.number])
    n_rows, n_cols = numeric_df.shape

    # Nothing to test: avoid grouping an empty frame.
    if n_rows == 0 or n_cols == 0:
        return {"chi2": np.nan, "df": 0, "p_value": np.nan,
                "conclusion": undecidable}

    # Group row positions by missingness pattern (tuple of per-column flags).
    null_mask = numeric_df.isnull().to_numpy()
    rows_by_pattern = {}
    for position, row_flags in enumerate(null_mask):
        rows_by_pattern.setdefault(tuple(row_flags), []).append(position)

    # Overall mean and covariance used as the reference for every pattern.
    global_mean = numeric_df.mean()
    global_cov = numeric_df.cov()

    chi2_stat = 0.0
    df_stat = 0

    for pattern, positions in rows_by_pattern.items():
        n_j = len(positions)
        if n_j < 2:
            continue

        # Columns that are observed for every row of this pattern.
        obs_cols = [i for i, is_missing in enumerate(pattern)
                    if not is_missing]
        if not obs_cols:
            continue

        obs_mean = numeric_df.iloc[positions, obs_cols].mean().to_numpy(dtype=float)
        exp_mean = global_mean.iloc[obs_cols].to_numpy(dtype=float)
        diff = obs_mean - exp_mean

        obs_cov = global_cov.iloc[obs_cols, obs_cols].to_numpy(dtype=float)
        # A covariance block with NaN/inf cannot be inverted meaningfully;
        # skip it explicitly instead of relying on pinv raising.
        if not np.isfinite(obs_cov).all():
            continue
        try:
            # Covariance of the pattern mean is (overall covariance) / n_j.
            cov_inv = np.linalg.pinv(obs_cov / n_j)
        except np.linalg.LinAlgError:
            continue

        chi2_stat += float(diff @ cov_inv @ diff)
        df_stat += len(obs_cols)

    df_stat -= n_cols  # degrees-of-freedom correction

    if df_stat <= 0:
        return {"chi2": np.nan, "df": df_stat, "p_value": np.nan,
                "conclusion": undecidable}

    # The survival function is more accurate than 1 - cdf for small p-values.
    p_value = float(stats.chi2.sf(chi2_stat, df_stat))
    conclusion = "MCAR (p > 0.05)" if p_value > 0.05 else "Not MCAR (p ≤ 0.05)"

    print(f"Little's MCAR test: χ²={chi2_stat:.2f}, df={df_stat}, "
          f"p={p_value:.4f} → {conclusion}")
    return {"chi2": chi2_stat, "df": df_stat,
            "p_value": p_value, "conclusion": conclusion}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## 3. 多重代入法 (MICE)
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
def mice_imputation(df, n_imputations=5, max_iter=10, random_state=42):
    """
    Multiple imputation by chained equations (MICE).

    Numeric columns are imputed ``n_imputations`` times with scikit-learn's
    ``IterativeImputer`` (posterior sampling, a different seed per round).
    Non-numeric columns are filled with their most frequent value, or with
    "UNKNOWN" when a column has no observed value; that fill is identical
    in every returned dataset.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_imputations: int — number of imputed datasets to generate
        max_iter: int — imputer iterations per dataset
        random_state: int — base seed; round ``i`` uses ``random_state + i``

    Returns:
        list[pd.DataFrame] — one frame per imputation, numeric columns
        first (as float), followed by the non-numeric columns. Numeric
        columns with no observed value stay NaN.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(exclude=[np.number]).columns

    # scikit-learn discards features that are entirely missing at fit time,
    # which would break the column assignment below; impute only columns
    # that have at least one observed value.
    fit_cols = [col for col in numeric_cols if df[col].notna().any()]

    # The mode fill does not depend on the imputation round: compute once.
    cat_filled = {}
    for col in cat_cols:
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else "UNKNOWN"
        cat_filled[col] = df[col].fillna(fill_value)

    if fit_cols:
        # Imported lazily so frames without numeric data need no scikit-learn.
        from sklearn.experimental import enable_iterative_imputer  # noqa
        from sklearn.impute import IterativeImputer

    imputed_datasets = []

    for i in range(n_imputations):
        imputed_df = pd.DataFrame(index=df.index, columns=numeric_cols,
                                  dtype=float)
        if fit_cols:
            imputer = IterativeImputer(
                max_iter=max_iter,
                random_state=random_state + i,
                sample_posterior=True)
            imputed_df[fit_cols] = imputer.fit_transform(df[fit_cols])

        for col, filled in cat_filled.items():
            # Copy so the returned datasets do not share column data.
            imputed_df[col] = filled.copy()

        imputed_datasets.append(imputed_df)

    print(f"MICE: {n_imputations} datasets × {max_iter} iterations, "
          f"{len(numeric_cols)} numeric cols")
    return imputed_datasets
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def rubins_rules(estimates, variances):
    """
    Pool estimates from multiply imputed datasets with Rubin's rules.

    Parameters:
        estimates: list[float] — point estimate from each imputed dataset
        variances: list[float] — sampling variance of each estimate

    Returns:
        dict — "pooled_estimate", "total_variance", "within_variance",
        "between_variance" and "df" (degrees of freedom)

    Raises:
        ValueError — if the two sequences differ in length, or if fewer
        than two imputations are given (the between-imputation variance
        is undefined in that case)
    """
    m = len(estimates)
    if m != len(variances):
        raise ValueError(
            f"estimates and variances must have the same length "
            f"(got {m} and {len(variances)})")
    if m < 2:
        raise ValueError(
            "Rubin's rules need at least two imputations "
            f"(got {m})")

    Q_bar = np.mean(estimates)
    U_bar = np.mean(variances)          # within-imputation variance
    B = np.var(estimates, ddof=1)       # between-imputation variance
    T = U_bar + (1 + 1 / m) * B         # total variance

    # Classical Rubin (1987) degrees of freedom: (m - 1) * (1 + 1/r)^2,
    # with r the relative increase in variance due to missingness. The
    # Barnard-Rubin small-sample adjustment additionally needs the
    # complete-data degrees of freedom, which this function does not take.
    r = (1 + 1 / m) * B / U_bar if U_bar > 0 else np.inf
    df_old = (m - 1) * (1 + 1 / r) ** 2 if r > 0 else np.inf

    print(f"Rubin's Rules: Q̄={Q_bar:.4f}, T={T:.4f}, "
          f"within={U_bar:.4f}, between={B:.4f}")
    return {"pooled_estimate": Q_bar, "total_variance": T,
            "within_variance": U_bar, "between_variance": B,
            "df": df_old}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## 4. KNN / MissForest 補完
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
def knn_imputation(df, n_neighbors=5):
    """
    Impute missing numeric values with k-nearest-neighbours.

    Only numeric columns are processed and returned; non-numeric columns
    are dropped from the result.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_neighbors: int — number of neighbours used by ``KNNImputer``

    Returns:
        pd.DataFrame — numeric columns of ``df`` (as float) with missing
        values filled, same index as ``df``. Columns with no observed
        value stay NaN; a frame without numeric columns yields an empty
        frame.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # scikit-learn discards features that are entirely missing at fit time,
    # which would break the column assignment below; impute only columns
    # that have at least one observed value.
    fit_cols = [col for col in numeric_cols if df[col].notna().any()]

    imputed = pd.DataFrame(index=df.index, columns=numeric_cols, dtype=float)
    if fit_cols:
        # Imported lazily so frames without numeric data need no scikit-learn.
        from sklearn.impute import KNNImputer

        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed[fit_cols] = imputer.fit_transform(df[fit_cols])

    # Count only values that were actually filled.
    n_imputed = int(df[fit_cols].isnull().sum().sum())
    print(f"KNN Imputation (k={n_neighbors}): {n_imputed} values imputed")
    return imputed
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def missforest_imputation(df, n_estimators=100, max_iter=10, random_state=42):
    """
    MissForest-style iterative imputation with random forests.

    Implemented as scikit-learn's ``IterativeImputer`` with a
    ``RandomForestRegressor`` estimator. Only numeric columns are
    processed and returned; non-numeric columns are dropped.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_estimators: int — number of trees per random forest
        max_iter: int — number of imputation iterations
        random_state: int — seed for both the forest and the imputer

    Returns:
        pd.DataFrame — numeric columns of ``df`` (as float) with missing
        values filled, same index as ``df``. Columns with no observed
        value stay NaN; a frame without numeric columns yields an empty
        frame.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # scikit-learn discards features that are entirely missing at fit time,
    # which would break the column assignment below; impute only columns
    # that have at least one observed value.
    fit_cols = [col for col in numeric_cols if df[col].notna().any()]

    imputed = pd.DataFrame(index=df.index, columns=numeric_cols, dtype=float)
    if fit_cols:
        # Imported lazily so frames without numeric data need no scikit-learn.
        from sklearn.experimental import enable_iterative_imputer  # noqa
        from sklearn.impute import IterativeImputer
        from sklearn.ensemble import RandomForestRegressor

        imputer = IterativeImputer(
            estimator=RandomForestRegressor(n_estimators=n_estimators,
                                            random_state=random_state,
                                            n_jobs=-1),
            max_iter=max_iter, random_state=random_state)
        imputed[fit_cols] = imputer.fit_transform(df[fit_cols])

    # Count only values that were actually filled.
    n_imputed = int(df[fit_cols].isnull().sum().sum())
    print(f"MissForest (n_trees={n_estimators}, iter={max_iter}): "
          f"{n_imputed} values imputed")
    return imputed
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## パイプライン統合
|
|
296
|
+
|
|
297
|
+
```
|
|
298
|
+
eda-correlation → missing-data-analysis → ml-classification
|
|
299
|
+
(探索的解析) (欠損診断・補完) (モデリング)
|
|
300
|
+
│ │ ↓
|
|
301
|
+
statistical-testing ────┘ advanced-visualization
|
|
302
|
+
(統計検定) (結果可視化)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
## パイプライン出力
|
|
306
|
+
|
|
307
|
+
| ファイル | 説明 | 次スキル |
|
|
308
|
+
|---------|------|---------|
|
|
309
|
+
| `missing_diagnosis.png` | 欠損パターン可視化 | → reporting |
|
|
310
|
+
| `mcar_test_result.json` | Little's MCAR テスト | → 補完戦略選択 |
|
|
311
|
+
| `imputed_datasets/` | MICE 多重代入データ | → ml-classification |
|
|
312
|
+
| `imputation_comparison.csv` | 補完手法比較 | → 最終選択 |
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-transfer-learning
|
|
3
|
+
description: |
|
|
4
|
+
転移学習・ドメイン適応スキル。事前学習モデルファインチューニング・
|
|
5
|
+
Few-shot / Zero-shot 学習・ドメイン適応 (DA)・
|
|
6
|
+
知識蒸留・マルチタスク学習・科学ドメイン特化モデル転移。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Transfer Learning
|
|
10
|
+
|
|
11
|
+
事前学習モデルの科学データへの転移・ドメイン適応・
|
|
12
|
+
Few-shot 学習パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 事前学習済みモデル (ImageNet/BERT) をファインチューニングするとき
|
|
17
|
+
- 小規模科学データセットで高精度を実現したいとき
|
|
18
|
+
- ドメイン適応で異なるデータ分布間のギャップを埋めるとき
|
|
19
|
+
- Few-shot 学習で数例から分類するとき
|
|
20
|
+
- 知識蒸留で大規模モデルを軽量化するとき
|
|
21
|
+
- マルチタスク学習で複数タスクを共同学習するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. Vision モデルファインチューニング
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import torch
|
|
31
|
+
import torch.nn as nn
|
|
32
|
+
from torch.utils.data import DataLoader
|
|
33
|
+
import numpy as np
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def finetune_vision_model(train_loader, val_loader,
                          model_name="resnet50",
                          num_classes=10, epochs=20,
                          lr=1e-4, freeze_backbone=True):
    """
    Fine-tune a pretrained torchvision classification model.

    The classification head is replaced by a fresh ``nn.Linear`` with
    ``num_classes`` outputs; optionally everything except that head is
    frozen. Training uses AdamW, cross-entropy and cosine annealing of the
    learning rate over ``epochs``.

    Parameters:
        train_loader: DataLoader — training batches of (inputs, labels)
        val_loader: DataLoader — validation batches of (inputs, labels)
        model_name: str — torchvision builder name, e.g. "resnet50" /
            "vit_b_16" / "efficientnet_b0"
        num_classes: int — number of target classes
        epochs: int — number of training epochs
        lr: float — learning rate
        freeze_backbone: bool — train only the classification head

    Returns:
        tuple — (model, history). ``model`` holds the weights after the
        final epoch, which is not necessarily the best-scoring epoch.
        ``history`` is a list of per-epoch dicts with "epoch",
        "train_loss" and "val_acc".

    Raises:
        ValueError — if the model has no ``fc``, ``classifier`` or
        ``heads`` attribute to replace
    """
    import torchvision.models as models

    # The string "DEFAULT" makes torchvision pick the builder's default
    # pretrained weights, so no weights-enum lookup is needed.
    model = getattr(models, model_name)(weights="DEFAULT")

    # Replace the final layer; remember which attribute is the head.
    if hasattr(model, "fc"):
        head_name = "fc"
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif hasattr(model, "classifier"):
        head_name = "classifier"
        if isinstance(model.classifier, nn.Sequential):
            in_features = model.classifier[-1].in_features
            model.classifier[-1] = nn.Linear(in_features, num_classes)
        else:
            in_features = model.classifier.in_features
            model.classifier = nn.Linear(in_features, num_classes)
    elif hasattr(model, "heads"):
        head_name = "heads"
        in_features = model.heads.head.in_features
        model.heads.head = nn.Linear(in_features, num_classes)
    else:
        raise ValueError(
            f"Cannot locate the classification head of {model_name!r}: "
            "expected an 'fc', 'classifier' or 'heads' attribute")

    # Freeze everything outside the head. A prefix match is required: a
    # substring test such as `"fc" in name` would also match backbone
    # layers named fc1/fc2 and leave them trainable.
    if freeze_backbone:
        head_prefix = head_name + "."
        for name, param in model.named_parameters():
            if not name.startswith(head_prefix):
                param.requires_grad = False

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)

    best_acc = 0.0
    history = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # One scheduler step per epoch, matching T_max=epochs above.
        scheduler.step()

        # Validation
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                _, predicted = outputs.max(1)
                total += y_batch.size(0)
                correct += predicted.eq(y_batch).sum().item()

        # Guard against empty loaders instead of dividing by zero.
        val_acc = correct / total if total else 0.0
        n_batches = max(len(train_loader), 1)
        history.append({"epoch": epoch, "train_loss": train_loss / n_batches,
                        "val_acc": val_acc})
        if val_acc > best_acc:
            best_acc = val_acc

    print(f"Finetune {model_name}: best val acc = {best_acc:.4f}")
    return model, history
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## 2. NLP モデルファインチューニング
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
def finetune_text_classifier(train_texts, train_labels,
                             val_texts, val_labels,
                             model_name="dmis-lab/biobert-base-cased-v1.2",
                             num_labels=2, epochs=5, lr=2e-5):
    """
    Fine-tune a BERT/BioBERT-style model for text classification.

    Uses the Hugging Face ``Trainer``; checkpoints are written under
    ``./ft_output`` once per epoch and the checkpoint with the best
    validation accuracy is loaded at the end of training.

    Parameters:
        train_texts: list[str] — training texts
        train_labels: list[int] — training labels
        val_texts: list[str] — validation texts
        val_labels: list[int] — validation labels
        model_name: str — Hugging Face model identifier
        num_labels: int — number of labels
        epochs: int — number of training epochs
        lr: float — learning rate

    Returns:
        tuple — (model, tokenizer, metrics), where ``metrics`` is the
        dict returned by ``trainer.evaluate()`` on the validation set
    """
    import inspect

    from transformers import (
        AutoTokenizer, AutoModelForSequenceClassification,
        TrainingArguments, Trainer)
    from datasets import Dataset

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels)

    def tokenize(examples):
        # Pad/truncate every example to a fixed 512-token length.
        return tokenizer(examples["text"], truncation=True,
                         padding="max_length", max_length=512)

    train_ds = Dataset.from_dict({"text": train_texts, "label": train_labels})
    val_ds = Dataset.from_dict({"text": val_texts, "label": val_labels})
    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever keyword the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_kw = ("eval_strategy" if "eval_strategy" in accepted
                   else "evaluation_strategy")

    args = TrainingArguments(
        output_dir="./ft_output", num_train_epochs=epochs,
        per_device_train_batch_size=16, learning_rate=lr,
        save_strategy="epoch",
        load_best_model_at_end=True, metric_for_best_model="accuracy",
        **{strategy_kw: "epoch"})

    def compute_metrics(eval_pred):
        # Accuracy of the arg-max class against the reference labels.
        preds = np.argmax(eval_pred.predictions, axis=-1)
        acc = (preds == eval_pred.label_ids).mean()
        return {"accuracy": acc}

    trainer = Trainer(model=model, args=args, train_dataset=train_ds,
                      eval_dataset=val_ds, compute_metrics=compute_metrics)
    trainer.train()

    metrics = trainer.evaluate()
    print(f"Finetune {model_name}: val acc = {metrics['eval_accuracy']:.4f}")
    return model, tokenizer, metrics
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## 3. Few-shot 学習
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
def prototypical_network(support_X, support_y, query_X,
                         feature_extractor=None):
    """
    Few-shot classification with class prototypes.

    Each class prototype is the mean embedding of its support examples;
    every query is assigned to the class of the nearest prototype
    (Euclidean distance).

    Parameters:
        support_X: array-like — support-set features, one row per example
        support_y: array-like — support-set labels
        query_X: array-like — query-set features, one row per example
        feature_extractor: callable | None — maps raw inputs to
            embeddings; when None the inputs are used as embeddings

    Returns:
        tuple — (predictions, confidences): the predicted label per
        query, and ``exp(-distance)`` to the nearest prototype

    Raises:
        ValueError — if the support set is empty or the number of support
        examples and labels differ
    """
    if feature_extractor is not None:
        support_emb = feature_extractor(support_X)
        query_emb = feature_extractor(query_X)
    else:
        support_emb = support_X
        query_emb = query_X

    # Accept lists as well as arrays: boolean-mask indexing and
    # broadcasting below require ndarrays.
    support_emb = np.asarray(support_emb)
    query_emb = np.asarray(query_emb)
    support_y = np.asarray(support_y)

    if support_y.size == 0:
        raise ValueError("support set must contain at least one example")
    if support_emb.shape[0] != support_y.shape[0]:
        raise ValueError(
            f"support_X has {support_emb.shape[0]} examples but "
            f"support_y has {support_y.shape[0]} labels")

    classes = np.unique(support_y)
    prototypes = np.array([
        support_emb[support_y == c].mean(axis=0) for c in classes])

    # Euclidean distance from every query to every prototype:
    # rows are queries, columns are classes.
    dists = np.array([
        np.linalg.norm(query_emb - p, axis=1) for p in prototypes]).T

    predictions = classes[np.argmin(dists, axis=1)]
    confidences = np.exp(-dists.min(axis=1))

    print(f"Few-shot: {len(classes)} classes, "
          f"{len(support_y)} support → {len(query_X)} query")
    return predictions, confidences
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## 4. 知識蒸留
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
def knowledge_distillation(teacher, student, train_loader,
                           epochs=20, temperature=4.0, alpha=0.7,
                           lr=1e-3):
    """
    Train a student network to mimic a teacher (knowledge distillation).

    The loss blends a temperature-softened KL term between student and
    teacher outputs with the ordinary cross-entropy on the true labels.

    Parameters:
        teacher: nn.Module — teacher model; put in eval mode and only
            run under ``torch.no_grad()``
        student: nn.Module — student model to be trained
        train_loader: DataLoader — training batches of (inputs, labels)
        epochs: int — number of training epochs
        temperature: float — distillation temperature
        alpha: float — weight of the soft (distillation) loss; the hard
            loss gets ``1 - alpha``
        lr: float — learning rate

    Returns:
        nn.Module — the trained student
    """
    functional = nn.functional

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher = teacher.to(device).eval()
    student = student.to(device)

    hard_criterion = nn.CrossEntropyLoss()
    soft_criterion = nn.KLDivLoss(reduction="batchmean")
    optimizer = torch.optim.AdamW(student.parameters(), lr=lr)

    # The soft loss is rescaled by T^2, as in the expression below.
    soft_scale = temperature ** 2

    for epoch in range(epochs):
        student.train()
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Teacher predictions serve as fixed targets: no gradients.
            with torch.no_grad():
                teacher_probs = functional.softmax(
                    teacher(inputs) / temperature, dim=1)

            student_logits = student(inputs)
            student_log_probs = functional.log_softmax(
                student_logits / temperature, dim=1)

            soft_term = soft_criterion(student_log_probs, teacher_probs) * soft_scale
            hard_term = hard_criterion(student_logits, targets)
            loss = alpha * soft_term + (1 - alpha) * hard_term

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f" Epoch {epoch}: loss = {running_loss / len(train_loader):.4f}")

    print(f"Distillation: T={temperature}, α={alpha}, {epochs} epochs")
    return student
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## パイプライン統合
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
deep-learning → transfer-learning → active-learning
|
|
286
|
+
(モデル設計) (転移・適応) (効率的ラベル付け)
|
|
287
|
+
│ │ ↓
|
|
288
|
+
healthcare-ai ───────┘ ensemble-methods
|
|
289
|
+
(臨床 AI) (アンサンブル)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## パイプライン出力
|
|
293
|
+
|
|
294
|
+
| ファイル | 説明 | 次スキル |
|
|
295
|
+
|---------|------|---------|
|
|
296
|
+
| `ft_model.pt` | ファインチューニング済みモデル | → 推論 |
|
|
297
|
+
| `ft_history.csv` | 学習履歴 | → visualization |
|
|
298
|
+
| `few_shot_predictions.csv` | Few-shot 予測 | → 評価 |
|