@nahisaho/satori 0.22.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -20
- package/package.json +1 -1
- package/src/.github/skills/scientific-active-learning/SKILL.md +289 -0
- package/src/.github/skills/scientific-advanced-visualization/SKILL.md +310 -0
- package/src/.github/skills/scientific-anomaly-detection/SKILL.md +296 -0
- package/src/.github/skills/scientific-automl/SKILL.md +264 -0
- package/src/.github/skills/scientific-causal-ml/SKILL.md +240 -0
- package/src/.github/skills/scientific-data-profiling/SKILL.md +247 -0
- package/src/.github/skills/scientific-ensemble-methods/SKILL.md +263 -0
- package/src/.github/skills/scientific-geospatial-analysis/SKILL.md +274 -0
- package/src/.github/skills/scientific-interactive-dashboard/SKILL.md +346 -0
- package/src/.github/skills/scientific-missing-data-analysis/SKILL.md +312 -0
- package/src/.github/skills/scientific-model-monitoring/SKILL.md +247 -0
- package/src/.github/skills/scientific-network-visualization/SKILL.md +278 -0
- package/src/.github/skills/scientific-reproducible-reporting/SKILL.md +330 -0
- package/src/.github/skills/scientific-time-series-forecasting/SKILL.md +246 -0
- package/src/.github/skills/scientific-transfer-learning/SKILL.md +298 -0
- package/src/.github/skills/scientific-uncertainty-quantification/SKILL.md +286 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-data-profiling
|
|
3
|
+
description: |
|
|
4
|
+
データプロファイリング・品質スキル。ydata-profiling 自動 EDA ・
|
|
5
|
+
Great Expectations データバリデーション・データ品質スコア・
|
|
6
|
+
型推論・相関検出・外れ値フラグ・データカタログ生成。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Data Profiling
|
|
10
|
+
|
|
11
|
+
データセットの包括的プロファイリング・品質評価・
|
|
12
|
+
自動 EDA レポートパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 新しいデータセットの全体像を素早く把握するとき
|
|
17
|
+
- データ品質スコアを算出して品質基準をチェックするとき
|
|
18
|
+
- ydata-profiling で自動 EDA レポートを生成するとき
|
|
19
|
+
- Great Expectations でデータバリデーションルールを定義するとき
|
|
20
|
+
- データカタログ (辞書) を自動生成するとき
|
|
21
|
+
- 相関・外れ値・欠損を一括診断するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. ydata-profiling 自動 EDA
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def auto_profile_report(df, title="Data Profile Report",
                        minimal=False, output="profile_report.html"):
    """
    Generate an automated EDA report with ydata-profiling.

    Builds a ``ProfileReport`` with Pearson / Spearman / Kendall correlations
    and the bar / matrix / heatmap missing-value diagrams switched on, writes
    it to ``output`` and prints a short summary computed directly from ``df``.

    Parameters:
        df: pd.DataFrame — input data
        title: str — report title
        minimal: bool — forwarded to ``ProfileReport(minimal=...)``
        output: str — path of the HTML report to write

    Returns:
        dict — ``{"report_path": output, "summary": {...}}`` where the summary
        holds n_rows, n_cols, n_numeric, n_categorical, total_missing,
        missing_pct and n_duplicates.
    """
    # Imported lazily so the module stays importable without ydata-profiling.
    from ydata_profiling import ProfileReport

    profile = ProfileReport(
        df, title=title, minimal=minimal,
        correlations={"pearson": {"calculate": True},
                      "spearman": {"calculate": True},
                      "kendall": {"calculate": True}},
        missing_diagrams={"bar": True, "matrix": True, "heatmap": True})

    profile.to_file(output)

    # Summary is derived from the frame itself, not from the report object.
    n_rows, n_cols = len(df), len(df.columns)
    n_cells = n_rows * n_cols
    total_missing = int(df.isnull().sum().sum())
    summary = {
        "n_rows": n_rows,
        "n_cols": n_cols,
        "n_numeric": len(df.select_dtypes(include=[np.number]).columns),
        "n_categorical": len(df.select_dtypes(include=["object", "category"]).columns),
        "total_missing": total_missing,
        # An empty frame has no cells; report 0% instead of dividing by zero.
        "missing_pct": float(total_missing / n_cells * 100) if n_cells else 0.0,
        "n_duplicates": int(df.duplicated().sum()),
    }

    print(f"Profile Report → {output}")
    print(f" {summary['n_rows']} rows × {summary['n_cols']} cols, "
          f"{summary['missing_pct']:.1f}% missing, "
          f"{summary['n_duplicates']} duplicates")
    return {"report_path": output, "summary": summary}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 2. データ品質スコア
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
def data_quality_score(df, rules=None):
    """
    Compute a 0-100 data quality score from five weighted dimensions.

    Dimensions (each in [0, 1]) and weights:
        completeness (0.3) — share of non-null cells
        uniqueness   (0.2) — share of non-duplicated rows
        consistency  (0.2) — share of columns whose inferred dtype is not
                             "mixed" / "mixed-integer"
        timeliness   (0.1) — placeholder, always 1.0
        validity     (0.2) — mean share of finite values over numeric columns

    Parameters:
        df: pd.DataFrame — input data
        rules: dict | None — custom rules, ``{name: callable(df) -> passed}``

    Returns:
        dict — ``{"total_score": float, "dimension_scores": dict,
        "rule_results": list[dict]}``. A rule that raises is recorded with
        ``"passed": False`` and the exception text under ``"error"``.
    """
    n_rows, n_cols = len(df), len(df.columns)
    n_cells = n_rows * n_cols
    scores = {}

    # 1. Completeness — an empty frame has nothing missing, so it scores 1.0
    #    rather than dividing by zero.
    if n_cells:
        scores["completeness"] = float(1.0 - df.isnull().sum().sum() / n_cells)
    else:
        scores["completeness"] = 1.0

    # 2. Uniqueness — share of rows that are not duplicates of an earlier row.
    if n_rows:
        scores["uniqueness"] = float(1.0 - df.duplicated().sum() / n_rows)
    else:
        scores["uniqueness"] = 1.0

    # 3. Consistency — a column counts as consistent when its non-null values
    #    do not infer to a mixed dtype; all-null columns count as consistent.
    consistent_cols = 0
    for _, column in df.items():
        values = column.dropna()
        if values.empty:
            consistent_cols += 1
            continue
        try:
            inferred = pd.api.types.infer_dtype(values, skipna=True)
        except Exception:
            # Best effort: a column whose dtype cannot be inferred is simply
            # not counted as consistent.
            continue
        if inferred not in ("mixed", "mixed-integer"):
            consistent_cols += 1
    scores["consistency"] = consistent_cols / n_cols if n_cols else 1.0

    # 4. Timeliness — no freshness metric is implemented yet; the dimension is
    #    a fixed placeholder regardless of any datetime columns.
    scores["timeliness"] = 1.0

    # 5. Validity — per numeric column, the share of non-null values that are
    #    finite. Columns with no non-null values are skipped; if nothing is
    #    left to judge, the dimension defaults to 1.0 instead of NaN.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    validity = 1.0
    if len(numeric_cols) > 0:
        finite_rates = df[numeric_cols].apply(
            lambda col: np.isfinite(col.dropna()).mean()).dropna()
        if len(finite_rates) > 0:
            validity = float(finite_rates.mean())
    scores["validity"] = validity

    # Weighted total on a 0-100 scale.
    weights = {"completeness": 0.3, "uniqueness": 0.2,
               "consistency": 0.2, "timeliness": 0.1, "validity": 0.2}
    total_score = float(sum(scores[k] * weights[k] for k in weights) * 100)

    # Custom rules are reported alongside the score but do not alter it.
    rule_results = []
    if rules:
        for rule_name, rule_fn in rules.items():
            try:
                passed = rule_fn(df)
                rule_results.append({"rule": rule_name, "passed": passed})
            except Exception as e:
                rule_results.append({"rule": rule_name, "passed": False,
                                     "error": str(e)})

    print(f"Data Quality Score: {total_score:.1f}/100")
    for k, v in scores.items():
        print(f" {k}: {v:.3f}")

    return {"total_score": total_score, "dimension_scores": scores,
            "rule_results": rule_results}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## 3. Great Expectations バリデーション
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
def great_expectations_validate(df, expectations=None):
    """
    Validate a DataFrame against Great Expectations-style rules.

    This is a lightweight re-implementation in pandas; it does not call the
    Great Expectations library. Supported ``type`` values:
        expect_column_to_exist
        expect_column_values_to_not_be_null      (kwargs: mostly, default 1.0)
        expect_column_values_to_be_between       (kwargs: min_value, max_value)
        expect_column_values_to_be_unique
        expect_column_values_to_be_in_set        (kwargs: value_set)
        expect_table_row_count_to_be_between     (kwargs: min_value, max_value)
    For the two "between" checks a missing or ``None`` bound means unbounded
    on that side. Any other type is recorded with ``success=None``.

    Parameters:
        df: pd.DataFrame — input data
        expectations: list[dict] | None — rules shaped like
            ``{"type": str, "column": str, "kwargs": dict}``; when None the
            rules come from ``_auto_generate_expectations(df)``

    Returns:
        pd.DataFrame — one row per expectation with columns ``expectation``,
        ``column``, ``success`` (plus ``error`` when a check raised).
    """
    if expectations is None:
        expectations = _auto_generate_expectations(df)

    def _within(values_ok_low, values_ok_high, low, high):
        # Apply only the bounds that were actually supplied.
        ok = True
        if low is not None:
            ok = ok and bool(values_ok_low(low))
        if high is not None:
            ok = ok and bool(values_ok_high(high))
        return ok

    results = []
    for exp in expectations:
        exp_type = exp["type"]
        col = exp.get("column")
        kwargs = exp.get("kwargs", {})

        try:
            if exp_type == "expect_column_to_exist":
                success = bool(col in df.columns)
            elif exp_type == "expect_column_values_to_not_be_null":
                # "mostly" is the minimum required share of non-null values.
                min_non_null = kwargs.get("mostly", 1.0)
                success = bool(df[col].notnull().mean() >= min_non_null)
            elif exp_type == "expect_column_values_to_be_between":
                vals = df[col].dropna()
                success = _within(lambda lo: (vals >= lo).all(),
                                  lambda hi: (vals <= hi).all(),
                                  kwargs.get("min_value"),
                                  kwargs.get("max_value"))
            elif exp_type == "expect_column_values_to_be_unique":
                # Nulls are not dropped here, so repeated nulls count as
                # duplicates (same as the underlying Series.duplicated()).
                success = bool(not df[col].duplicated().any())
            elif exp_type == "expect_column_values_to_be_in_set":
                valid_set = set(kwargs["value_set"])
                success = bool(df[col].dropna().isin(valid_set).all())
            elif exp_type == "expect_table_row_count_to_be_between":
                n_rows = len(df)
                success = _within(lambda lo: lo <= n_rows,
                                  lambda hi: n_rows <= hi,
                                  kwargs.get("min_value"),
                                  kwargs.get("max_value"))
            else:
                # Unknown expectation: neither passed nor failed.
                success = None

            results.append({"expectation": exp_type, "column": col,
                            "success": success})
        except Exception as e:
            # A check that cannot be evaluated (missing column, bad kwargs,
            # incomparable types) is reported as a failure with the reason.
            results.append({"expectation": exp_type, "column": col,
                            "success": False, "error": str(e)})

    # Keep the expected columns even when there were no expectations, so
    # callers can always read results_df["success"].
    if results:
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.DataFrame(columns=["expectation", "column", "success"])

    n_pass = sum(1 for r in results if r["success"] is True)
    n_total = len(results)
    pass_pct = n_pass / n_total * 100 if n_total else 0.0
    print(f"Validation: {n_pass}/{n_total} expectations passed "
          f"({pass_pct:.0f}%)")
    return results_df
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _auto_generate_expectations(df):
|
|
209
|
+
"""自動でバリデーションルールを推論。"""
|
|
210
|
+
expectations = []
|
|
211
|
+
for col in df.columns:
|
|
212
|
+
expectations.append({"type": "expect_column_to_exist", "column": col})
|
|
213
|
+
expectations.append({
|
|
214
|
+
"type": "expect_column_values_to_not_be_null",
|
|
215
|
+
"column": col,
|
|
216
|
+
"kwargs": {"mostly": 0.9}})
|
|
217
|
+
|
|
218
|
+
if df[col].dtype in [np.float64, np.int64]:
|
|
219
|
+
q1, q3 = df[col].quantile([0.01, 0.99])
|
|
220
|
+
iqr = q3 - q1
|
|
221
|
+
expectations.append({
|
|
222
|
+
"type": "expect_column_values_to_be_between",
|
|
223
|
+
"column": col,
|
|
224
|
+
"kwargs": {"min_value": float(q1 - 3 * iqr),
|
|
225
|
+
"max_value": float(q3 + 3 * iqr)}})
|
|
226
|
+
return expectations
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## パイプライン統合
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
[データ取得] → data-profiling → eda-correlation
|
|
235
|
+
(品質診断) (探索的解析)
|
|
236
|
+
│ ↓
|
|
237
|
+
missing-data-analysis anomaly-detection
|
|
238
|
+
(欠損補完) (異常検知)
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## パイプライン出力
|
|
242
|
+
|
|
243
|
+
| ファイル | 説明 | 次スキル |
|
|
244
|
+
|---------|------|---------|
|
|
245
|
+
| `profile_report.html` | ydata-profiling レポート | → EDA |
|
|
246
|
+
| `quality_score.json` | データ品質スコア | → 品質管理 |
|
|
247
|
+
| `validation_results.csv` | バリデーション結果 | → データ修正 |
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-ensemble-methods
|
|
3
|
+
description: |
|
|
4
|
+
アンサンブル学習スキル。Stacking/Blending 多段積層・
|
|
5
|
+
Boosting (XGBoost/LightGBM/CatBoost) 勾配ブースティング・
|
|
6
|
+
Bagging/Random Subspace・Voting 分類器/回帰器・
|
|
7
|
+
アンサンブル多様性評価・モデル統合パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific Ensemble Methods
|
|
11
|
+
|
|
12
|
+
複数モデルの組み合わせによる予測精度向上・安定化を実現する
|
|
13
|
+
アンサンブル学習手法の設計・評価パイプラインを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- XGBoost/LightGBM/CatBoost で勾配ブースティングを実行するとき
|
|
18
|
+
- Stacking/Blending で多段アンサンブルを構築するとき
|
|
19
|
+
- 複数モデルの Voting/Averaging で安定予測を得るとき
|
|
20
|
+
- アンサンブルの多様性を評価するとき
|
|
21
|
+
- Out-of-Fold 予測でリーク防止を行うとき
|
|
22
|
+
- モデルの寄与度を分析するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. 勾配ブースティング比較
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import numpy as np
|
|
32
|
+
import pandas as pd
|
|
33
|
+
from sklearn.model_selection import cross_val_score
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def compare_boosting(X, y, cv=5, scoring="f1_macro",
                     task="classification"):
    """
    Cross-validate XGBoost / LightGBM / CatBoost and rank them.

    Each library is optional: one that cannot be imported is skipped. All
    models use 200 trees/iterations, depth 6, learning rate 0.1 and seed 42.

    Parameters:
        X: np.ndarray — features
        y: np.ndarray — labels / targets
        cv: int — number of CV folds
        scoring: str — scikit-learn scoring name. The default "f1_macro" is a
            classification metric; when ``task`` is not "classification" and
            scoring is still "f1_macro", "r2" is used instead.
        task: str — "classification"; any other value selects the regressors

    Returns:
        pd.DataFrame — columns ``model``, ``mean``, ``std`` sorted by ``mean``
        descending; empty (with those columns) when no library is installed.
    """
    import importlib

    is_classification = task == "classification"
    if not is_classification and scoring == "f1_macro":
        # f1_macro cannot score continuous targets; fall back to a metric
        # that is valid for regressors.
        scoring = "r2"

    # (label, module, classifier class, regressor class,
    #  shared params, classifier-only params)
    candidates = [
        ("XGBoost", "xgboost", "XGBClassifier", "XGBRegressor",
         {"n_estimators": 200, "max_depth": 6,
          "learning_rate": 0.1, "random_state": 42},
         # use_label_encoder is intentionally not passed: it is deprecated
         # and ignored by current XGBoost releases.
         {"eval_metric": "logloss"}),
        ("LightGBM", "lightgbm", "LGBMClassifier", "LGBMRegressor",
         {"n_estimators": 200, "max_depth": 6,
          "learning_rate": 0.1, "random_state": 42, "verbose": -1},
         {}),
        ("CatBoost", "catboost", "CatBoostClassifier", "CatBoostRegressor",
         {"iterations": 200, "depth": 6,
          "learning_rate": 0.1, "random_seed": 42, "verbose": 0},
         {}),
    ]

    results = []
    for label, module_name, clf_name, reg_name, params, clf_only in candidates:
        # Only the import is guarded, so an ImportError raised while fitting
        # is not silently mistaken for "library not installed".
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            continue

        if is_classification:
            model = getattr(module, clf_name)(**params, **clf_only)
        else:
            model = getattr(module, reg_name)(**params)

        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results.append({"model": label, "mean": scores.mean(),
                        "std": scores.std()})

    # Explicit columns keep sort_values("mean") valid when results is empty.
    df = pd.DataFrame(results, columns=["model", "mean", "std"])
    df = df.sort_values("mean", ascending=False)
    if not df.empty:
        print(f"Boosting: best = {df.iloc[0]['model']} "
              f"({scoring} = {df.iloc[0]['mean']:.4f})")
    return df
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## 2. Stacking アンサンブル
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from sklearn.model_selection import StratifiedKFold
|
|
101
|
+
from sklearn.linear_model import LogisticRegression
|
|
102
|
+
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
|
|
103
|
+
from sklearn.svm import SVC
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def stacking_ensemble(X_train, y_train, X_test,
                      base_models=None, meta_model=None,
                      n_folds=5):
    """
    Stacking ensemble built on out-of-fold (OOF) class probabilities.

    Every base model is refit on each stratified fold; its ``predict_proba``
    on the held-out part fills the OOF feature matrix, and its
    ``predict_proba`` on ``X_test`` is averaged over folds. The meta-model is
    then fit on the OOF features and applied to the averaged test features.

    Parameters:
        X_train: np.ndarray — training features (array-like is converted)
        y_train: np.ndarray — training labels
        X_test: np.ndarray — test features
        base_models: list | None — ``(name, estimator)`` pairs, each with
            ``predict_proba``; defaults to random forest, gradient boosting
            and a probability-enabled SVC
        meta_model: classifier | None — defaults to LogisticRegression;
            it is fit in place and returned
        n_folds: int — number of stratified CV folds

    Returns:
        tuple — ``(final_pred, final_proba, meta_model)``.
    """
    from sklearn.base import clone

    if base_models is None:
        base_models = [
            ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
            ("gbm", GradientBoostingClassifier(n_estimators=200, random_state=42)),
            ("svm", SVC(probability=True, random_state=42)),
        ]
    if meta_model is None:
        meta_model = LogisticRegression(max_iter=1000, random_state=42)

    # Positional indexing below needs arrays; this also accepts DataFrames.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # The split is deterministic (fixed seed), so compute it once and reuse
    # the same folds for every base model.
    folds = list(kf.split(X_train, y_train))
    n_classes = len(np.unique(y_train))

    # One block of n_classes probability columns per base model.
    oof_preds = np.zeros((len(y_train), len(base_models) * n_classes))
    test_preds = np.zeros((len(X_test), len(base_models) * n_classes))

    for i, (name, model) in enumerate(base_models):
        cols = slice(i * n_classes, (i + 1) * n_classes)
        test_proba_sum = np.zeros((len(X_test), n_classes))

        for train_idx, val_idx in folds:
            # clone() yields an unfitted copy with the same hyper-parameters
            # and, unlike re-calling the constructor with get_params(), also
            # handles nested estimators.
            fitted = clone(model).fit(X_train[train_idx], y_train[train_idx])
            oof_preds[val_idx, cols] = fitted.predict_proba(X_train[val_idx])
            test_proba_sum += fitted.predict_proba(X_test)

        test_preds[:, cols] = test_proba_sum / len(folds)
        print(f" Stacking base: {name} done")

    # Meta-model learns from OOF features only, so it never sees a base
    # model's prediction on a row that model was trained on.
    meta_model.fit(oof_preds, y_train)
    final_pred = meta_model.predict(test_preds)
    final_proba = meta_model.predict_proba(test_preds)

    print(f"Stacking: {len(base_models)} base models → meta-model")
    return final_pred, final_proba, meta_model
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## 3. Voting アンサンブル
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from sklearn.ensemble import VotingClassifier, VotingRegressor
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def voting_ensemble(X, y, models=None, voting="soft",
                    cv=5, scoring="f1_macro"):
    """
    Cross-validate each model and a VotingClassifier built from all of them.

    Parameters:
        X: np.ndarray — features
        y: np.ndarray — labels
        models: list | None — ``(name, model)`` pairs; defaults to random
            forest, gradient boosting and logistic regression
        voting: str — "soft" / "hard", forwarded to VotingClassifier
        cv: int — number of CV folds
        scoring: str — scikit-learn scoring name

    Returns:
        pd.DataFrame — one row per model plus a ``Voting(<voting>)`` row, with
        columns ``model``, ``mean``, ``std`` sorted by ``mean`` descending.
    """
    if models is None:
        models = [
            ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
            ("gbm", GradientBoostingClassifier(n_estimators=200, random_state=42)),
            ("lr", LogisticRegression(max_iter=1000, random_state=42)),
        ]

    def _row(label, fold_scores):
        # Summary record for one estimator's cross-validation scores.
        return {"model": label, "mean": fold_scores.mean(), "std": fold_scores.std()}

    # Score every member on its own first.
    rows = [
        _row(label, cross_val_score(estimator, X, y, cv=cv, scoring=scoring))
        for label, estimator in models
    ]

    # Then score the combined voting classifier.
    ensemble = VotingClassifier(estimators=models, voting=voting)
    scores = cross_val_score(ensemble, X, y, cv=cv, scoring=scoring)
    rows.append(_row(f"Voting({voting})", scores))

    ranked = pd.DataFrame(rows).sort_values("mean", ascending=False)
    print(f"Voting ensemble: {scoring} = {scores.mean():.4f} ± {scores.std():.4f}")
    return ranked
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## 4. アンサンブル多様性評価
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
def ensemble_diversity(models, X, y):
    """
    Pairwise ensemble diversity: Yule's Q-statistic and disagreement rate.

    For each pair of models, with counts over the samples of
    n11 (both correct), n00 (both wrong), n10 / n01 (exactly one correct):
        Q            = (n11*n00 - n10*n01) / (n11*n00 + n10*n01), 0 if undefined
        disagreement = (n10 + n01) / n_samples
    The result is the mean over all pairs. With fewer than two models or no
    samples there are no pairs to average, and both means are NaN.

    Parameters:
        models: list — fitted models exposing ``predict``
        X: np.ndarray — evaluation features
        y: np.ndarray — true labels

    Returns:
        dict — ``{"mean_q_statistic": float, "mean_disagreement": float,
        "n_models": int}`` with the means rounded to 4 decimals.
    """
    from itertools import combinations

    y_true = np.asarray(y)
    n_models = len(models)
    n_samples = len(y_true)

    q_stats = []
    disagree_rates = []
    if n_models >= 2 and n_samples > 0:
        # Row i flags which samples model i classified correctly.
        correct = np.array([np.asarray(m.predict(X)) == y_true for m in models])

        for hit_a, hit_b in combinations(correct, 2):
            n11 = int(np.sum(hit_a & hit_b))
            n00 = int(np.sum(~hit_a & ~hit_b))
            n10 = int(np.sum(hit_a & ~hit_b))
            n01 = int(np.sum(~hit_a & hit_b))

            numer = n11 * n00 - n10 * n01
            denom = n11 * n00 + n10 * n01
            # Q is undefined when the denominator vanishes; report 0.
            q_stats.append(numer / denom if denom != 0 else 0)
            disagree_rates.append((n10 + n01) / n_samples)

    nan = float("nan")
    result = {
        "mean_q_statistic": round(float(np.mean(q_stats)), 4) if q_stats else nan,
        "mean_disagreement": round(float(np.mean(disagree_rates)), 4) if disagree_rates else nan,
        "n_models": n_models,
    }
    print(f"Diversity: Q={result['mean_q_statistic']:.3f}, "
          f"Disagree={result['mean_disagreement']:.3f}")
    return result
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## パイプライン統合
|
|
248
|
+
|
|
249
|
+
```
|
|
250
|
+
automl → ensemble-methods → uncertainty-quantification
|
|
251
|
+
(モデル選択) (アンサンブル) (不確実性定量化)
|
|
252
|
+
│ │ ↓
|
|
253
|
+
feature-importance ┘ explainable-ai
|
|
254
|
+
(特徴量重要度) (説明可能 AI)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## パイプライン出力
|
|
258
|
+
|
|
259
|
+
| ファイル | 説明 | 次スキル |
|
|
260
|
+
|---------|------|---------|
|
|
261
|
+
| `stacking_meta.pkl` | Stacking メタモデル | → 予測 |
|
|
262
|
+
| `boosting_comparison.csv` | ブースティング比較 | → レポート |
|
|
263
|
+
| `ensemble_diversity.json` | 多様性指標 | → モデル改善 |
|