@nahisaho/satori 0.22.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -20
- package/package.json +1 -1
- package/src/.github/skills/scientific-active-learning/SKILL.md +289 -0
- package/src/.github/skills/scientific-advanced-visualization/SKILL.md +310 -0
- package/src/.github/skills/scientific-anomaly-detection/SKILL.md +296 -0
- package/src/.github/skills/scientific-automl/SKILL.md +264 -0
- package/src/.github/skills/scientific-causal-ml/SKILL.md +240 -0
- package/src/.github/skills/scientific-data-profiling/SKILL.md +247 -0
- package/src/.github/skills/scientific-ensemble-methods/SKILL.md +263 -0
- package/src/.github/skills/scientific-geospatial-analysis/SKILL.md +274 -0
- package/src/.github/skills/scientific-interactive-dashboard/SKILL.md +346 -0
- package/src/.github/skills/scientific-missing-data-analysis/SKILL.md +312 -0
- package/src/.github/skills/scientific-model-monitoring/SKILL.md +247 -0
- package/src/.github/skills/scientific-network-visualization/SKILL.md +278 -0
- package/src/.github/skills/scientific-reproducible-reporting/SKILL.md +330 -0
- package/src/.github/skills/scientific-time-series-forecasting/SKILL.md +246 -0
- package/src/.github/skills/scientific-transfer-learning/SKILL.md +298 -0
- package/src/.github/skills/scientific-uncertainty-quantification/SKILL.md +286 -0
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-automl
|
|
3
|
+
description: |
|
|
4
|
+
AutoML パイプラインスキル。Optuna ハイパーパラメータ最適化・
|
|
5
|
+
FLAML 高速 AutoML・Auto-sklearn モデル選択・
|
|
6
|
+
NAS (Neural Architecture Search)・
|
|
7
|
+
特徴量エンジニアリング自動化・モデル比較パイプライン。
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Scientific AutoML
|
|
11
|
+
|
|
12
|
+
ハイパーパラメータ最適化・モデル選択・特徴量エンジニアリングを
|
|
13
|
+
自動化する AutoML パイプラインを提供する。
|
|
14
|
+
|
|
15
|
+
## When to Use
|
|
16
|
+
|
|
17
|
+
- Optuna/Hyperopt でハイパーパラメータを最適化するとき
|
|
18
|
+
- 複数モデルの自動比較・選択を行うとき
|
|
19
|
+
- FLAML/Auto-sklearn で高速な AutoML を実行するとき
|
|
20
|
+
- 特徴量エンジニアリングを自動化するとき
|
|
21
|
+
- Neural Architecture Search (NAS) を設計するとき
|
|
22
|
+
- モデル選択根拠のレポートを生成するとき
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
## 1. Optuna ハイパーパラメータ最適化
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import optuna
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
from sklearn.model_selection import cross_val_score
|
|
35
|
+
from sklearn.ensemble import (
|
|
36
|
+
RandomForestClassifier, GradientBoostingClassifier)
|
|
37
|
+
from sklearn.svm import SVC
|
|
38
|
+
from sklearn.metrics import make_scorer, f1_score
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def optuna_optimize(X, y, model_type="rf", n_trials=100,
                    cv=5, scoring="f1_macro", direction="maximize"):
    """
    Tune one scikit-learn classifier family with Optuna.

    Each trial samples a hyperparameter set for the requested model family,
    scores the resulting classifier with ``cross_val_score`` and reports the
    mean fold score as the objective value.

    Parameters:
        X: feature matrix, passed unchanged to ``cross_val_score``.
        y: target labels, passed unchanged to ``cross_val_score``.
        model_type: "rf" (RandomForestClassifier), "gbm"
            (GradientBoostingClassifier) or "svm" (SVC).
        n_trials: number of Optuna trials to run.
        cv: ``cv`` argument forwarded to ``cross_val_score``.
        scoring: ``scoring`` argument forwarded to ``cross_val_score``.
        direction: "maximize" or "minimize"; forwarded to
            ``optuna.create_study``.

    Returns:
        The ``optuna.Study`` after optimization has finished.

    Raises:
        ValueError: if ``model_type`` is not one of the supported families.
    """
    def _random_forest(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical(
                "max_features", ["sqrt", "log2", None]),
        }
        return RandomForestClassifier(**params, random_state=42)

    def _gradient_boosting(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        }
        return GradientBoostingClassifier(**params, random_state=42)

    def _svm(trial):
        params = {
            "C": trial.suggest_float("C", 0.01, 100, log=True),
            "kernel": trial.suggest_categorical(
                "kernel", ["rbf", "poly", "sigmoid"]),
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        }
        # probability=True is kept so that probability-based scorers still work.
        return SVC(**params, probability=True, random_state=42)

    model_builders = {
        "rf": _random_forest,
        "gbm": _gradient_boosting,
        "svm": _svm,
    }

    # Fail fast with a clear message; previously an unknown model_type only
    # surfaced as an UnboundLocalError from inside the first trial.
    if model_type not in model_builders:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {sorted(model_builders)}")

    build_model = model_builders[model_type]

    def objective(trial):
        model = build_model(trial)
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        return scores.mean()

    # Silence per-trial INFO logs; the summary below is printed instead.
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction=direction)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"Optuna ({model_type}): best {scoring} = {study.best_value:.4f}")
    print(f"  Best params: {study.best_params}")
    return study
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## 2. マルチモデル AutoML パイプライン
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from sklearn.linear_model import LogisticRegression
|
|
102
|
+
from sklearn.neighbors import KNeighborsClassifier
|
|
103
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
104
|
+
from sklearn.neural_network import MLPClassifier
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def automl_model_selection(X, y, cv=5, scoring="f1_macro",
                           n_trials_per_model=50):
    """
    Compare Optuna-tuned models against untuned baselines.

    Runs ``optuna_optimize`` once for each of "rf", "gbm" and "svm", then
    cross-validates three default-configured baselines (logistic regression,
    k-nearest neighbours, decision tree) and ranks every candidate by score.

    Parameters:
        X: feature matrix.
        y: target labels.
        cv: ``cv`` argument used for every cross-validation.
        scoring: scoring name used for every candidate.
        n_trials_per_model: Optuna trials per tuned model family.

    Returns:
        pandas DataFrame with columns ``model_type``, ``best_score``,
        ``best_params`` and ``n_trials``, sorted by ``best_score`` descending.
    """
    rows = []

    # Tuned candidates: one Optuna study per model family.
    for family in ("rf", "gbm", "svm"):
        tuned = optuna_optimize(
            X, y,
            model_type=family,
            n_trials=n_trials_per_model,
            cv=cv,
            scoring=scoring,
        )
        rows.append(dict(
            model_type=family,
            best_score=round(tuned.best_value, 4),
            best_params=tuned.best_params,
            n_trials=len(tuned.trials),
        ))

    # Untuned baselines: a single cross-validation run each.
    baseline_estimators = {
        "logistic": LogisticRegression(max_iter=1000, random_state=42),
        "knn": KNeighborsClassifier(),
        "dt": DecisionTreeClassifier(random_state=42),
    }
    for label, estimator in baseline_estimators.items():
        fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
        rows.append(dict(
            model_type=label,
            best_score=round(fold_scores.mean(), 4),
            best_params={},
            n_trials=1,
        ))

    ranking = pd.DataFrame(rows).sort_values("best_score", ascending=False)
    winner = ranking.iloc[0]
    print(f"AutoML: best = {winner['model_type']} "
          f"({scoring} = {winner['best_score']})")
    return ranking
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## 3. 自動特徴量エンジニアリング
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
|
|
159
|
+
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def auto_feature_engineering(X, y, max_poly_degree=2,
                             top_k=None, interactions_only=False):
    """
    Standardize, polynomially expand, then keep the most informative features.

    The input is scaled with ``StandardScaler``, expanded with
    ``PolynomialFeatures`` (no bias column) and filtered with ``SelectKBest``
    scored by ``mutual_info_classif``.

    Parameters:
        X: original feature matrix (``X.shape[1]`` is read).
        y: class labels used for the mutual-information scores.
        max_poly_degree: degree handed to ``PolynomialFeatures``.
        top_k: number of features to keep; when None, three times the input
            column count is used. Always capped at the expanded column count.
        interactions_only: forwarded as ``interaction_only``.

    Returns:
        Tuple ``(X_selected, poly, selector)``: the reduced matrix plus the
        fitted ``PolynomialFeatures`` and ``SelectKBest`` objects. The fitted
        scaler is not part of the return value.
    """
    n_original = X.shape[1]

    # Polynomial expansion of the standardized inputs.
    expander = PolynomialFeatures(
        degree=max_poly_degree,
        interaction_only=interactions_only,
        include_bias=False,
    )
    expanded = expander.fit_transform(StandardScaler().fit_transform(X))
    n_expanded = expanded.shape[1]

    # Feature selection: never ask for more columns than exist.
    requested = n_original * 3 if top_k is None else top_k
    picker = SelectKBest(mutual_info_classif, k=min(requested, n_expanded))
    reduced = picker.fit_transform(expanded, y)

    print(f"Feature engineering: {n_original} → {n_expanded} "
          f"→ {reduced.shape[1]} features")
    return reduced, expander, picker
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## 4. Optuna 可視化レポート
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
def automl_report(study, output_dir="results"):
    """
    Write a small report for a finished Optuna study.

    Produces ``param_importance.png`` (best effort) and ``optuna_trials.csv``
    inside ``output_dir``, creating the directory if needed.

    Parameters:
        study: the ``optuna.Study`` to report on.
        output_dir: directory that receives the report files.

    Returns:
        dict with ``best_value``, ``best_params`` and ``n_trials``.
    """
    from pathlib import Path
    import matplotlib.pyplot as plt
    import optuna  # local import keeps this snippet usable on its own

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Hyperparameter importance is best effort: a failure here must not
    # block the rest of the report, so it is reported instead of being
    # silently dropped.
    fig = None
    try:
        importances = optuna.importance.get_param_importances(study)
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.barh(list(importances.keys()), list(importances.values()))
        ax.set_xlabel("Importance")
        ax.set_title("Hyperparameter Importance")
        fig.tight_layout()
        fig.savefig(out / "param_importance.png", dpi=150)
    except Exception as exc:
        print(f"AutoML report: skipped param_importance.png "
              f"({type(exc).__name__}: {exc})")
    finally:
        # Close the figure even when plotting or saving failed.
        if fig is not None:
            plt.close(fig)

    # Optimization history, one row per trial.
    study.trials_dataframe().to_csv(out / "optuna_trials.csv", index=False)

    # Best-trial summary handed back to the caller.
    best = {
        "best_value": study.best_value,
        "best_params": study.best_params,
        "n_trials": len(study.trials),
    }

    print(f"AutoML report → {out}")
    return best
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## パイプライン統合
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
eda-correlation → automl → ensemble-methods
|
|
249
|
+
(データ探索) (モデル選択) (アンサンブル)
|
|
250
|
+
│ │ ↓
|
|
251
|
+
feature-importance ──┘ uncertainty-quantification
|
|
252
|
+
(特徴量解釈) (不確実性定量化)
|
|
253
|
+
│
|
|
254
|
+
active-learning
|
|
255
|
+
(能動学習)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
## パイプライン出力
|
|
259
|
+
|
|
260
|
+
| ファイル | 説明 | 次スキル |
|
|
261
|
+
|---------|------|---------|
|
|
262
|
+
| `optuna_trials.csv` | 試行履歴 | → 可視化 |
|
|
263
|
+
| `param_importance.png` | パラメータ重要度 | → レポート |
|
|
264
|
+
| `model_comparison.csv` | モデル比較 (`automl_model_selection` の戻り値 DataFrame を `to_csv` で保存) | → ensemble-methods |
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-causal-ml
|
|
3
|
+
description: |
|
|
4
|
+
因果機械学習スキル。DoWhy 因果モデル・EconML CATE 推定・
|
|
5
|
+
Double/Debiased ML・Causal Forest・メタラーナー (S/T/X)・
|
|
6
|
+
異質的処置効果 (HTE)・因果特徴量選択。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Causal ML
|
|
10
|
+
|
|
11
|
+
機械学習ベースの因果推論パイプラインを提供し、
|
|
12
|
+
異質的処置効果 (HTE) の推定と因果特徴量発見を実現する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 処置効果が個人/サブグループで異なるとき (HTE 推定)
|
|
17
|
+
- Causal Forest で非パラメトリック因果効果を推定するとき
|
|
18
|
+
- Double ML で高次元データの処置効果を推定するとき
|
|
19
|
+
- メタラーナー (S/T/X-learner) で CATE を推定するとき
|
|
20
|
+
- DoWhy で因果モデルの同定・推定・反論をするとき
|
|
21
|
+
- 因果特徴量選択で重要な効果修飾因子を発見するとき
|
|
22
|
+
|
|
23
|
+
> **Note**: 統計的因果推論 (PSM/IPW/DID/RDD) は `scientific-causal-inference` を参照。
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. DoWhy 因果モデル
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def dowhy_causal_model(df, treatment, outcome, common_causes,
                       effect_modifiers=None, method="backdoor.linear_regression"):
    """
    Run a DoWhy pipeline: identify, estimate, then refute.

    Parameters:
        df: pandas DataFrame with the observational data.
        treatment: name of the treatment column.
        outcome: name of the outcome column.
        common_causes: list of confounder column names.
        effect_modifiers: optional list of effect-modifier column names.
        method: estimator name forwarded to ``estimate_effect``.

    Returns:
        dict with ``model``, ``estimand``, ``estimate`` and ``refutations``.
        ``refutations`` maps each refuter that ran to completion to its
        ``new_effect`` and ``p_value`` (None when not reported).
    """
    import dowhy

    model = dowhy.CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        common_causes=common_causes,
        effect_modifiers=effect_modifiers)

    # Identification
    estimand = model.identify_effect(proceed_when_unidentifiable=True)
    # Only the front-door variables are printed here, so label them as such.
    print(f"Identified estimand (frontdoor variables): "
          f"{estimand.get_frontdoor_variables()}")

    # Estimation
    estimate = model.estimate_effect(estimand, method_name=method)
    # Compute the interval once: a second call repeats the work and, if the
    # interval is resampled, could pair a lower and upper bound from
    # different runs. Flattening tolerates a nested (1, 2) result.
    interval = np.asarray(
        estimate.get_confidence_intervals(), dtype=float).ravel()
    print(f"ATE = {estimate.value:.4f} (95% CI: [{interval[0]:.4f}, "
          f"{interval[1]:.4f}])")

    # Refutation tests (best effort: a failing refuter is reported, not fatal)
    refuter_names = ("random_common_cause", "placebo_treatment_refuter",
                     "data_subset_refuter")
    refutations = {}
    for refuter_name in refuter_names:
        try:
            refutation = model.refute_estimate(
                estimand, estimate, method_name=refuter_name)
            # refutation_result may be absent or None; treat both as empty.
            details = getattr(refutation, "refutation_result", None) or {}
            refutations[refuter_name] = {
                "new_effect": float(refutation.new_effect),
                "p_value": details.get("p_value", None)
            }
        except Exception as exc:
            print(f"Refuter {refuter_name} failed: "
                  f"{type(exc).__name__}: {exc}")

    # A refuter that ran has not necessarily passed; report completions.
    print(f"Refutation tests: {len(refutations)}/{len(refuter_names)} completed")
    return {"model": model, "estimand": estimand,
            "estimate": estimate, "refutations": refutations}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 2. EconML Double ML / Causal Forest
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def double_ml_estimate(df, treatment, outcome, features,
                       n_splits=5, model_type="linear"):
    """
    Estimate treatment effects with Double/Debiased ML (EconML).

    Parameters:
        df: pandas DataFrame holding outcome, treatment and covariates.
        treatment: name of the treatment column.
        outcome: name of the outcome column.
        features: list of covariate column names.
        n_splits: number of cross-fitting folds (``cv``).
        model_type: "linear" selects ``LinearDML``; any other value selects
            ``CausalForestDML`` with 200 trees.

    Returns:
        dict with ``ate``, ``ate_ci`` (95% interval), ``cate_df`` (covariates
        plus per-row ``cate``, ``cate_lower``, ``cate_upper``) and ``model``.
    """
    from econml.dml import LinearDML, CausalForestDML

    outcome_values = df[outcome].values
    treatment_values = df[treatment].values
    covariates = df[features].values

    estimator = (
        LinearDML(cv=n_splits, random_state=42)
        if model_type == "linear"
        else CausalForestDML(n_estimators=200, cv=n_splits, random_state=42)
    )
    estimator.fit(outcome_values, treatment_values, X=covariates)

    # Average effect with its 95% interval.
    ate = estimator.ate(covariates)
    ate_ci = estimator.ate_interval(covariates, alpha=0.05)

    # Per-row conditional effects with their 95% interval.
    cate = estimator.effect(covariates)
    cate_bounds = estimator.effect_interval(covariates, alpha=0.05)

    cate_table = pd.DataFrame(covariates, columns=features)
    for column, values in (("cate", cate),
                           ("cate_lower", cate_bounds[0]),
                           ("cate_upper", cate_bounds[1])):
        cate_table[column] = values

    print(f"Double ML ({model_type}): ATE={ate:.4f} "
          f"[{ate_ci[0]:.4f}, {ate_ci[1]:.4f}]")
    print(f"  CATE range: [{cate.min():.4f}, {cate.max():.4f}]")
    return {"ate": ate, "ate_ci": ate_ci,
            "cate_df": cate_table, "model": estimator}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def causal_forest(df, treatment, outcome, features,
                  n_estimators=500):
    """
    Estimate heterogeneous treatment effects with a causal forest.

    Fits EconML's ``CausalForestDML`` (``min_samples_leaf=10``) and ranks the
    covariates by the forest's feature importances.

    Parameters:
        df: pandas DataFrame holding outcome, treatment and covariates.
        treatment: name of the treatment column (documented as binary).
        outcome: name of the outcome column.
        features: list of covariate column names.
        n_estimators: number of trees.

    Returns:
        dict with ``cate``, ``cate_ci`` (95% interval),
        ``feature_importance`` (DataFrame sorted descending) and ``model``.
    """
    from econml.dml import CausalForestDML

    outcome_values = df[outcome].values
    treatment_values = df[treatment].values
    covariates = df[features].values

    forest = CausalForestDML(
        n_estimators=n_estimators,
        random_state=42,
        min_samples_leaf=10,
    )
    forest.fit(outcome_values, treatment_values, X=covariates)

    cate = forest.effect(covariates)
    cate_ci = forest.effect_interval(covariates, alpha=0.05)

    # Importance of each covariate as reported by the fitted forest.
    ranking = pd.DataFrame({
        "feature": features,
        "causal_importance": forest.feature_importances_,
    })
    ranking = ranking.sort_values("causal_importance", ascending=False)

    print(f"Causal Forest: {n_estimators} trees, "
          f"CATE median={np.median(cate):.4f}")
    print(f"  Top causal features: {ranking.head(5).to_dict('records')}")
    return {"cate": cate, "cate_ci": cate_ci,
            "feature_importance": ranking, "model": forest}
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## 3. メタラーナー (S/T/X-Learner)
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
def meta_learner(df, treatment, outcome, features,
                 learner_type="t", base_model=None):
    """
    Estimate CATE with an EconML S-, T- or X-learner.

    Parameters:
        df: pandas DataFrame holding outcome, treatment and covariates.
        treatment: name of the treatment column (documented as binary 0/1).
        outcome: name of the outcome column.
        features: list of covariate column names.
        learner_type: "s", "t" or "x"; any other value raises KeyError.
        base_model: regressor used by the learner; defaults to a
            ``GradientBoostingRegressor`` with 200 trees of depth 5.

    Returns:
        dict with ``cate``, ``cate_df`` (covariates plus a ``cate`` column)
        and ``model``.
    """
    from econml.metalearners import SLearner, TLearner, XLearner
    from sklearn.ensemble import GradientBoostingRegressor

    if base_model is None:
        base_model = GradientBoostingRegressor(
            n_estimators=200, max_depth=5, random_state=42)

    outcome_values = df[outcome].values
    treatment_values = df[treatment].values
    covariates = df[features].values

    learner_cls = {"s": SLearner, "t": TLearner, "x": XLearner}[learner_type]

    # The S-learner takes one overall model; T/X take per-arm outcome models.
    model_kwarg = "overall_model" if learner_type == "s" else "models"
    learner = learner_cls(**{model_kwarg: base_model})

    learner.fit(outcome_values, treatment_values, X=covariates)
    cate = learner.effect(covariates)

    cate_table = pd.DataFrame(covariates, columns=features)
    cate_table["cate"] = cate

    print(f"{learner_type.upper()}-Learner: "
          f"CATE mean={cate.mean():.4f}, std={cate.std():.4f}")
    return {"cate": cate, "cate_df": cate_table, "model": learner}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## パイプライン統合
|
|
225
|
+
|
|
226
|
+
```
|
|
227
|
+
causal-inference → causal-ml → feature-importance
|
|
228
|
+
(統計的因果) (因果 ML) (特徴量解釈)
|
|
229
|
+
│ │ ↓
|
|
230
|
+
clinical-trial ───────┘ explainable-ai
|
|
231
|
+
(臨床試験) (説明可能 AI)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## パイプライン出力
|
|
235
|
+
|
|
236
|
+
| ファイル | 説明 | 次スキル |
|
|
237
|
+
|---------|------|---------|
|
|
238
|
+
| `dowhy_causal_model.json` | DoWhy 因果モデル | → reporting |
|
|
239
|
+
| `cate_estimates.csv` | CATE 推定値 (戻り値の `cate_df` を `to_csv` で保存) | → precision-medicine |
|
|
240
|
+
| `causal_feature_importance.csv` | 因果特徴量重要度 (`causal_forest` の戻り値 `feature_importance` を `to_csv` で保存) | → explainable-ai |
|