@nahisaho/satori 0.23.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -29
- package/package.json +1 -1
- package/src/.github/skills/scientific-adaptive-experiments/SKILL.md +287 -0
- package/src/.github/skills/scientific-anomaly-detection/SKILL.md +296 -0
- package/src/.github/skills/scientific-causal-ml/SKILL.md +240 -0
- package/src/.github/skills/scientific-data-profiling/SKILL.md +247 -0
- package/src/.github/skills/scientific-federated-learning/SKILL.md +241 -0
- package/src/.github/skills/scientific-geospatial-analysis/SKILL.md +274 -0
- package/src/.github/skills/scientific-model-monitoring/SKILL.md +247 -0
- package/src/.github/skills/scientific-multi-task-learning/SKILL.md +238 -0
- package/src/.github/skills/scientific-network-visualization/SKILL.md +278 -0
- package/src/.github/skills/scientific-neural-architecture-search/SKILL.md +206 -0
- package/src/.github/skills/scientific-radiology-ai/SKILL.md +285 -0
- package/src/.github/skills/scientific-reproducible-reporting/SKILL.md +330 -0
- package/src/.github/skills/scientific-semi-supervised-learning/SKILL.md +210 -0
- package/src/.github/skills/scientific-statistical-simulation/SKILL.md +227 -0
- package/src/.github/skills/scientific-streaming-analytics/SKILL.md +221 -0
- package/src/.github/skills/scientific-time-series-forecasting/SKILL.md +246 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-anomaly-detection
|
|
3
|
+
description: |
|
|
4
|
+
異常検知・外れ値検出スキル。Isolation Forest・LOF・
|
|
5
|
+
One-Class SVM・Autoencoder 異常検知・統計的工程管理 (SPC)・
|
|
6
|
+
多変量異常検知・異常スコアリング・閾値最適化。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Anomaly Detection
|
|
10
|
+
|
|
11
|
+
科学データにおける異常値・外れ値・異常パターンの検出と
|
|
12
|
+
統計的工程管理 (SPC) パイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 実験データの外れ値を統計的に検出するとき
|
|
17
|
+
- 製造プロセスの異常監視 (SPC) をするとき
|
|
18
|
+
- 多変量データで異常パターンを発見するとき
|
|
19
|
+
- Autoencoder で複雑な異常を検出するとき
|
|
20
|
+
- 異常スコアの閾値を最適化するとき
|
|
21
|
+
- 複数手法のアンサンブル異常検知をするとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. 統計的異常検知アンサンブル
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from sklearn.ensemble import IsolationForest
|
|
33
|
+
from sklearn.neighbors import LocalOutlierFactor
|
|
34
|
+
from sklearn.svm import OneClassSVM
|
|
35
|
+
from sklearn.preprocessing import StandardScaler
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def anomaly_detection_ensemble(X, contamination=0.05,
                               methods=None, threshold_vote=2):
    """
    Flag anomalies by voting across several unsupervised detectors.

    The data is standardized, every requested detector is fitted on the
    full data set, and a row is reported as anomalous when at least
    ``threshold_vote`` detectors label it as an outlier.

    Parameters:
        X: np.ndarray | pd.DataFrame — input data, (n_samples, n_features)
        contamination: float — expected anomaly fraction (also passed as
            ``nu`` to the One-Class SVM)
        methods: list[str] | None — detectors to run
            ("iforest", "lof", "ocsvm"); None runs all three
        threshold_vote: int — minimum number of detector votes required
            to flag a row

    Returns:
        tuple[pd.DataFrame, dict] — (1) the input features plus one
        ``anomaly_<method>`` column per detector, ``ensemble_votes`` and
        ``is_anomaly``; (2) a dict mapping each executed detector to its
        anomaly count and per-sample scores (sign-flipped so that larger
        means more anomalous).

    Raises:
        ValueError: if ``methods`` contains no supported detector.
    """
    import warnings

    supported = ("iforest", "lof", "ocsvm")
    if methods is None:
        methods = list(supported)

    # Validate up front: a typo must not silently shrink the ensemble.
    # dict.fromkeys de-duplicates while keeping the requested order.
    active = list(dict.fromkeys(m for m in methods if m in supported))
    unknown = [m for m in methods if m not in supported]
    if not active:
        raise ValueError(
            f"No supported method in {list(methods)!r}; "
            f"expected any of {list(supported)!r}")
    if unknown:
        warnings.warn(
            f"Ignoring unsupported methods {unknown!r}; "
            f"supported: {list(supported)!r}")
    if threshold_vote > len(active):
        warnings.warn(
            f"threshold_vote={threshold_vote} exceeds the number of "
            f"detectors ({len(active)}); no row can be flagged")

    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
        X_arr = X.values
    else:
        # Accept nested lists as well as ndarrays.
        X_arr = np.asarray(X)
        feature_names = [f"f{i}" for i in range(X_arr.shape[1])]

    X_scaled = StandardScaler().fit_transform(X_arr)

    results = {}
    predictions = {}

    for method in active:
        if method == "iforest":
            model = IsolationForest(
                contamination=contamination, random_state=42, n_jobs=-1)
            preds = model.fit_predict(X_scaled)
            scores = -model.score_samples(X_scaled)
        elif method == "lof":
            model = LocalOutlierFactor(
                n_neighbors=20, contamination=contamination)
            preds = model.fit_predict(X_scaled)
            scores = -model.negative_outlier_factor_
        else:  # "ocsvm"
            model = OneClassSVM(kernel="rbf", nu=contamination)
            preds = model.fit_predict(X_scaled)
            scores = -model.decision_function(X_scaled)

        # The detectors label outliers as -1; convert to a 0/1 flag.
        is_anomaly = (preds == -1).astype(int)
        predictions[method] = is_anomaly
        results[method] = {
            "n_anomalies": int(is_anomaly.sum()),
            "scores": scores
        }

    # Ensemble vote: count how many detectors flagged each row.
    vote_matrix = np.column_stack(list(predictions.values()))
    ensemble_votes = vote_matrix.sum(axis=1)
    ensemble_anomaly = (ensemble_votes >= threshold_vote).astype(int)

    result_df = pd.DataFrame(X_arr, columns=feature_names)
    for method, preds in predictions.items():
        result_df[f"anomaly_{method}"] = preds
    result_df["ensemble_votes"] = ensemble_votes
    result_df["is_anomaly"] = ensemble_anomaly

    n_ens = ensemble_anomaly.sum()
    # Report the number of detectors that actually ran.
    print(f"Anomaly Ensemble ({len(active)} methods, vote≥{threshold_vote}): "
          f"{n_ens}/{len(X_arr)} anomalies ({n_ens/len(X_arr)*100:.1f}%)")

    for m, r in results.items():
        print(f" {m}: {r['n_anomalies']} anomalies")

    return result_df, results
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## 2. Autoencoder 異常検知
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def autoencoder_anomaly(X, encoding_dim=8, epochs=100,
                        threshold_percentile=95):
    """
    Detect anomalies from the reconstruction error of a dense autoencoder.

    The input is standardized, a symmetric fully connected autoencoder
    (n_features → 64 → 32 → encoding_dim → 32 → 64 → n_features) is
    trained on it with Adam and MSE loss, and rows whose mean squared
    reconstruction error exceeds the requested percentile of all errors
    are flagged.

    Parameters:
        X: np.ndarray — input data (the original documentation says the
            model should be trained on normal data)
        encoding_dim: int — size of the latent layer
        epochs: int — number of training epochs
        threshold_percentile: float — percentile of the reconstruction
            error used as the anomaly threshold

    Returns:
        dict — ``reconstruction_error`` (per-row MSE), ``threshold``,
        ``is_anomaly`` (0/1 array) and the trained ``model``.

    Note:
        The threshold is a percentile of the errors on ``X`` itself, so
        roughly ``100 - threshold_percentile`` percent of the rows are
        flagged regardless of how anomalous they are.
    """
    # Imported locally so the snippet runs on its own.
    import numpy as np
    import torch
    import torch.nn as nn
    from sklearn.preprocessing import StandardScaler
    from torch.utils.data import DataLoader, TensorDataset

    X_scaled = StandardScaler().fit_transform(X)
    n_features = X_scaled.shape[1]

    # Autoencoder definition (closes over n_features / encoding_dim).
    class AE(nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = nn.Sequential(
                nn.Linear(n_features, 64), nn.ReLU(),
                nn.Linear(64, 32), nn.ReLU(),
                nn.Linear(32, encoding_dim))
            self.decoder = nn.Sequential(
                nn.Linear(encoding_dim, 32), nn.ReLU(),
                nn.Linear(32, 64), nn.ReLU(),
                nn.Linear(64, n_features))

        def forward(self, x):
            z = self.encoder(x)
            return self.decoder(z)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AE().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    # float32 matches the default dtype of the nn.Linear parameters.
    X_tensor = torch.as_tensor(X_scaled, dtype=torch.float32).to(device)
    dataset = TensorDataset(X_tensor, X_tensor)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    model.train()
    for _ in range(epochs):
        for batch_x, _ in loader:
            optimizer.zero_grad()
            recon = model(batch_x)
            loss = criterion(recon, batch_x)
            loss.backward()
            optimizer.step()

    # Reconstruction error on the full data set.
    model.eval()
    with torch.no_grad():
        recon = model(X_tensor).cpu().numpy()

    recon_errors = np.mean((X_scaled - recon) ** 2, axis=1)
    threshold = np.percentile(recon_errors, threshold_percentile)
    is_anomaly = (recon_errors > threshold).astype(int)

    print(f"Autoencoder Anomaly: threshold={threshold:.4f} (P{threshold_percentile}), "
          f"{is_anomaly.sum()} anomalies")
    return {"reconstruction_error": recon_errors, "threshold": threshold,
            "is_anomaly": is_anomaly, "model": model}
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## 3. 統計的工程管理 (SPC)
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
def spc_control_chart(data, column, subgroup_size=1,
                      chart_type="individuals"):
    """
    Build an SPC control chart and save it as a PNG in the working directory.

    Parameters:
        data: pd.DataFrame | pd.Series — time-ordered measurements
        column: str — column to chart (only read when ``data`` is a
            DataFrame)
        subgroup_size: int — rational subgroup size; only used by
            "xbar_r", where it must be between 2 and 10
        chart_type: str — "individuals" (I-MR) / "xbar_r" / "cusum"

    Returns:
        dict —
        "individuals": ``cl``, ``ucl``, ``lcl``, ``ooc_indices``
        (observation positions outside the limits), ``fig``.
        "xbar_r": the same keys for the subgroup means (``ooc_indices``
        are subgroup positions) plus ``r_cl``, ``r_ucl``, ``r_lcl`` and
        ``r_ooc_indices`` for the range chart.
        "cusum": ``target``, ``k``, ``h``, ``cusum_pos``, ``cusum_neg``,
        ``ooc_indices`` (positions where either sum crosses ±h), ``fig``.
        ``fig`` is always the path of the saved image.

    Raises:
        ValueError: for an unknown ``chart_type``, too few observations,
            or an unsupported ``subgroup_size`` for "xbar_r".
    """
    # Shewhart control-chart constants (A2, D3, D4) by subgroup size.
    xbar_r_constants = {
        2: (1.880, 0.0, 3.267),
        3: (1.023, 0.0, 2.574),
        4: (0.729, 0.0, 2.282),
        5: (0.577, 0.0, 2.114),
        6: (0.483, 0.0, 2.004),
        7: (0.419, 0.076, 1.924),
        8: (0.373, 0.136, 1.864),
        9: (0.337, 0.184, 1.816),
        10: (0.308, 0.223, 1.777),
    }

    # Validate before the plotting backend is imported, so bad arguments
    # fail fast instead of silently returning None.
    if chart_type not in ("individuals", "xbar_r", "cusum"):
        raise ValueError(
            f"Unknown chart_type {chart_type!r}; expected "
            "'individuals', 'xbar_r' or 'cusum'")

    series = data[column] if isinstance(data, pd.DataFrame) else data
    values = np.asarray(series, dtype=float)

    if chart_type == "individuals" and len(values) < 2:
        raise ValueError(
            "An individuals chart needs at least 2 observations")
    if chart_type == "cusum" and len(values) == 0:
        raise ValueError("A CUSUM chart needs at least 1 observation")
    if chart_type == "xbar_r":
        if subgroup_size not in xbar_r_constants:
            raise ValueError(
                "xbar_r requires an integer subgroup_size between 2 and 10, "
                f"got {subgroup_size!r}")
        n_groups = len(values) // subgroup_size
        if n_groups < 2:
            raise ValueError(
                "xbar_r needs at least 2 complete subgroups, got "
                f"{n_groups}")

    import matplotlib.pyplot as plt

    if chart_type == "individuals":
        x_bar = np.mean(values)
        mr = np.abs(np.diff(values))
        mr_bar = np.mean(mr)
        d2 = 1.128  # d2 for n=2 (moving range of two consecutive points)

        ucl = x_bar + 3 * (mr_bar / d2)
        lcl = x_bar - 3 * (mr_bar / d2)

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        # Individuals chart
        ax1.plot(values, "b-o", markersize=3)
        ax1.axhline(x_bar, color="g", linestyle="-", label=f"CL={x_bar:.3f}")
        ax1.axhline(ucl, color="r", linestyle="--", label=f"UCL={ucl:.3f}")
        ax1.axhline(lcl, color="r", linestyle="--", label=f"LCL={lcl:.3f}")

        # Out-of-control (OOC) points
        ooc = np.where((values > ucl) | (values < lcl))[0]
        if len(ooc) > 0:
            ax1.scatter(ooc, values[ooc], c="red", s=50, zorder=5,
                        label=f"OOC ({len(ooc)})")
        ax1.set_title("Individuals Chart")
        ax1.legend(fontsize=8)

        # Moving Range chart (3.267 is D4 for n=2)
        mr_ucl = 3.267 * mr_bar
        ax2.plot(mr, "b-o", markersize=3)
        ax2.axhline(mr_bar, color="g", linestyle="-")
        ax2.axhline(mr_ucl, color="r", linestyle="--")
        ax2.set_title("Moving Range Chart")

        plt.tight_layout()
        path = "spc_control_chart.png"
        plt.savefig(path, dpi=150, bbox_inches="tight")
        plt.close()

        print(f"SPC Individuals: CL={x_bar:.3f}, UCL={ucl:.3f}, "
              f"LCL={lcl:.3f}, OOC={len(ooc)}")
        return {"cl": x_bar, "ucl": ucl, "lcl": lcl,
                "ooc_indices": ooc, "fig": path}

    if chart_type == "xbar_r":
        a2, d3, d4 = xbar_r_constants[subgroup_size]
        # Drop an incomplete trailing subgroup, then one row per subgroup.
        groups = values[:n_groups * subgroup_size].reshape(
            n_groups, subgroup_size)
        means = groups.mean(axis=1)
        ranges = groups.max(axis=1) - groups.min(axis=1)

        x_dbar = np.mean(means)
        r_bar = np.mean(ranges)
        ucl = x_dbar + a2 * r_bar
        lcl = x_dbar - a2 * r_bar
        r_ucl = d4 * r_bar
        r_lcl = d3 * r_bar

        ooc = np.where((means > ucl) | (means < lcl))[0]
        r_ooc = np.where((ranges > r_ucl) | (ranges < r_lcl))[0]

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

        # X-bar chart
        ax1.plot(means, "b-o", markersize=3)
        ax1.axhline(x_dbar, color="g", linestyle="-",
                    label=f"CL={x_dbar:.3f}")
        ax1.axhline(ucl, color="r", linestyle="--", label=f"UCL={ucl:.3f}")
        ax1.axhline(lcl, color="r", linestyle="--", label=f"LCL={lcl:.3f}")
        if len(ooc) > 0:
            ax1.scatter(ooc, means[ooc], c="red", s=50, zorder=5,
                        label=f"OOC ({len(ooc)})")
        ax1.set_title("X-bar Chart")
        ax1.legend(fontsize=8)

        # R chart
        ax2.plot(ranges, "b-o", markersize=3)
        ax2.axhline(r_bar, color="g", linestyle="-")
        ax2.axhline(r_ucl, color="r", linestyle="--")
        ax2.axhline(r_lcl, color="r", linestyle="--")
        if len(r_ooc) > 0:
            ax2.scatter(r_ooc, ranges[r_ooc], c="red", s=50, zorder=5)
        ax2.set_title("R Chart")

        plt.tight_layout()
        path = "spc_xbar_r_chart.png"
        plt.savefig(path, dpi=150, bbox_inches="tight")
        plt.close()

        print(f"SPC X-bar/R (n={subgroup_size}): CL={x_dbar:.3f}, "
              f"UCL={ucl:.3f}, LCL={lcl:.3f}, OOC={len(ooc)}")
        return {"cl": x_dbar, "ucl": ucl, "lcl": lcl,
                "r_cl": r_bar, "r_ucl": r_ucl, "r_lcl": r_lcl,
                "ooc_indices": ooc, "r_ooc_indices": r_ooc, "fig": path}

    # chart_type == "cusum": tabular CUSUM around the sample mean.
    target = np.mean(values)
    sigma = np.std(values)
    k = 0.5 * sigma  # allowance (slack) per observation
    h = 5 * sigma    # decision interval

    cusum_pos = np.zeros(len(values))
    cusum_neg = np.zeros(len(values))

    # Start both sums at zero BEFORE the first observation so that
    # values[0] contributes like every other point.
    run_pos = 0.0
    run_neg = 0.0
    for i, x in enumerate(values):
        deviation = x - target
        run_pos = max(0.0, run_pos + deviation - k)
        run_neg = min(0.0, run_neg + deviation + k)
        cusum_pos[i] = run_pos
        cusum_neg[i] = run_neg

    ooc = np.where((cusum_pos > h) | (cusum_neg < -h))[0]

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(cusum_pos, "b-", label="CUSUM+")
    ax.plot(cusum_neg, "r-", label="CUSUM-")
    ax.axhline(h, color="b", linestyle="--", alpha=0.5)
    ax.axhline(-h, color="r", linestyle="--", alpha=0.5)
    ax.set_title("CUSUM Control Chart")
    ax.legend()

    path = "cusum_chart.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

    print(f"CUSUM: target={target:.3f}, k={k:.3f}, h={h:.3f}")
    return {"target": target, "k": k, "h": h,
            "cusum_pos": cusum_pos, "cusum_neg": cusum_neg,
            "ooc_indices": ooc, "fig": path}
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
---
|
|
279
|
+
|
|
280
|
+
## パイプライン統合
|
|
281
|
+
|
|
282
|
+
```
|
|
283
|
+
eda-correlation → anomaly-detection → ml-classification
|
|
284
|
+
(探索的解析) (外れ値検出) (モデリング)
|
|
285
|
+
│ │ ↓
|
|
286
|
+
data-profiling ────────┘ model-monitoring
|
|
287
|
+
(データ品質) (モデル監視)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## パイプライン出力
|
|
291
|
+
|
|
292
|
+
| ファイル | 説明 | 次スキル |
|
|
293
|
+
|---------|------|---------|
|
|
294
|
+
| `anomaly_ensemble.csv` | アンサンブル異常検知結果 | → EDA |
|
|
295
|
+
| `autoencoder_anomaly.json` | AE 異常スコア | → reporting |
|
|
296
|
+
| `spc_control_chart.png` | SPC 管理図 | → process-optimization |
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-causal-ml
|
|
3
|
+
description: |
|
|
4
|
+
因果機械学習スキル。DoWhy 因果モデル・EconML CATE 推定・
|
|
5
|
+
Double/Debiased ML・Causal Forest・メタラーナー (S/T/X)・
|
|
6
|
+
異質的処置効果 (HTE)・因果特徴量選択。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Causal ML
|
|
10
|
+
|
|
11
|
+
機械学習ベースの因果推論パイプラインを提供し、
|
|
12
|
+
異質的処置効果 (HTE) の推定と因果特徴量発見を実現する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 処置効果が個人/サブグループで異なるとき (HTE 推定)
|
|
17
|
+
- Causal Forest で非パラメトリック因果効果を推定するとき
|
|
18
|
+
- Double ML で高次元データの処置効果を推定するとき
|
|
19
|
+
- メタラーナー (S/T/X-learner) で CATE を推定するとき
|
|
20
|
+
- DoWhy で因果モデルの同定・推定・反論をするとき
|
|
21
|
+
- 因果特徴量選択で重要な効果修飾因子を発見するとき
|
|
22
|
+
|
|
23
|
+
> **Note**: 統計的因果推論 (PSM/IPW/DID/RDD) は `scientific-causal-inference` を参照。
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. DoWhy 因果モデル
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def dowhy_causal_model(df, treatment, outcome, common_causes,
                       effect_modifiers=None, method="backdoor.linear_regression"):
    """
    Run a DoWhy causal-inference pipeline (identify → estimate → refute).

    Parameters:
        df: pd.DataFrame — observational data
        treatment: str — treatment variable
        outcome: str — outcome variable
        common_causes: list[str] — covariates (common causes)
        effect_modifiers: list[str] | None — effect modifiers
        method: str — DoWhy estimation method name

    Returns:
        dict — ``model`` (the CausalModel), ``estimand``, ``estimate`` and
        ``refutations``: a dict keyed by refuter name holding
        ``new_effect`` and ``p_value`` (None when the refuter reports
        none) for every refuter that ran to completion.
    """
    import warnings

    import dowhy

    model = dowhy.CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        common_causes=common_causes,
        effect_modifiers=effect_modifiers)

    # Identification. Print the estimand itself: the front-door variable
    # list says nothing about a back-door analysis.
    estimand = model.identify_effect(proceed_when_unidentifiable=True)
    print(f"Identified estimand: {estimand}")

    # Estimation
    estimate = model.estimate_effect(
        estimand, method_name=method)

    # Query the interval once and flatten it, so a nested (1, 2) result
    # formats as well as a flat pair. Not every estimator provides
    # intervals, so degrade to "unavailable" rather than abort.
    try:
        bounds = np.ravel(estimate.get_confidence_intervals())
        ci_text = f"[{float(bounds[0]):.4f}, {float(bounds[1]):.4f}]"
    except Exception as exc:
        warnings.warn(f"Confidence interval unavailable: {exc}")
        ci_text = "unavailable"
    print(f"ATE = {estimate.value:.4f} (95% CI: {ci_text})")

    # Refutation tests (best effort: one failing refuter must not abort
    # the others, but the failure is reported instead of hidden).
    refuter_names = ("random_common_cause", "placebo_treatment_refuter",
                     "data_subset_refuter")
    refutations = {}
    for refuter_name in refuter_names:
        try:
            refutation = model.refute_estimate(
                estimand, estimate, method_name=refuter_name)
            # refutation_result may be missing or None.
            details = getattr(refutation, "refutation_result", None) or {}
            refutations[refuter_name] = {
                "new_effect": float(refutation.new_effect),
                "p_value": details.get("p_value", None)
            }
        except Exception as exc:
            warnings.warn(f"Refuter '{refuter_name}' failed: {exc}")

    # "completed" rather than "passed": running without an error does not
    # mean the estimate survived the refutation.
    print(f"Refutation tests: {len(refutations)}/{len(refuter_names)} completed")
    return {"model": model, "estimand": estimand,
            "estimate": estimate, "refutations": refutations}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 2. EconML Double ML / Causal Forest
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def double_ml_estimate(df, treatment, outcome, features,
                       n_splits=5, model_type="linear"):
    """
    Estimate treatment effects with Double/Debiased ML (EconML).

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable
        outcome: str — outcome variable
        features: list[str] — covariates (passed to the estimator as X)
        n_splits: int — number of cross-fitting folds
        model_type: str — "linear" (LinearDML) / "forest" (CausalForestDML)

    Returns:
        dict — ``ate``, ``ate_ci`` (95% interval), ``cate_df`` (the
        features plus ``cate``, ``cate_lower``, ``cate_upper`` per row)
        and the fitted ``model``.

    Raises:
        ValueError: if ``model_type`` is neither "linear" nor "forest".
    """
    # Reject unknown values instead of silently fitting a causal forest
    # (e.g. for a typo such as "Linear").
    if model_type not in ("linear", "forest"):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'linear' or 'forest'")

    from econml.dml import LinearDML, CausalForestDML

    Y = df[outcome].values
    T = df[treatment].values
    X = df[features].values

    if model_type == "linear":
        est = LinearDML(cv=n_splits, random_state=42)
    else:
        est = CausalForestDML(
            n_estimators=200, cv=n_splits, random_state=42)

    est.fit(Y, T, X=X)

    ate = est.ate(X)
    ate_ci = est.ate_interval(X, alpha=0.05)

    # CATE (per-row effect) with its 95% interval
    cate = est.effect(X)
    cate_ci = est.effect_interval(X, alpha=0.05)

    result_df = pd.DataFrame(X, columns=features)
    result_df["cate"] = cate
    result_df["cate_lower"] = cate_ci[0]
    result_df["cate_upper"] = cate_ci[1]

    print(f"Double ML ({model_type}): ATE={ate:.4f} "
          f"[{ate_ci[0]:.4f}, {ate_ci[1]:.4f}]")
    print(f" CATE range: [{cate.min():.4f}, {cate.max():.4f}]")
    return {"ate": ate, "ate_ci": ate_ci,
            "cate_df": result_df, "model": est}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def causal_forest(df, treatment, outcome, features,
                  n_estimators=500):
    """
    Estimate heterogeneous treatment effects with a Causal Forest.

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable (documented as binary)
        outcome: str — outcome variable
        features: list[str] — covariates
        n_estimators: int — number of trees

    Returns:
        dict — ``cate`` (per-row effect), ``cate_ci`` (95% interval),
        ``feature_importance`` (DataFrame sorted by descending
        ``causal_importance``) and the fitted ``model``.
    """
    from econml.dml import CausalForestDML

    outcome_values = df[outcome].values
    treatment_values = df[treatment].values
    covariates = df[features].values

    # NOTE(review): the treatment is documented as binary but the
    # estimator is built without ``discrete_treatment`` — confirm that
    # treating it as continuous is intended.
    forest = CausalForestDML(
        n_estimators=n_estimators, random_state=42,
        min_samples_leaf=10)
    forest.fit(outcome_values, treatment_values, X=covariates)

    effects = forest.effect(covariates)
    effect_bounds = forest.effect_interval(covariates, alpha=0.05)

    # Feature importances reported by the fitted forest, most important
    # first.
    ranking = pd.DataFrame({
        "feature": features,
        "causal_importance": forest.feature_importances_
    })
    ranking = ranking.sort_values("causal_importance", ascending=False)

    median_effect = np.median(effects)
    top_records = ranking.head(5).to_dict('records')
    print(f"Causal Forest: {n_estimators} trees, "
          f"CATE median={median_effect:.4f}")
    print(f" Top causal features: {top_records}")

    return {"cate": effects, "cate_ci": effect_bounds,
            "feature_importance": ranking, "model": forest}
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## 3. メタラーナー (S/T/X-Learner)
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
def meta_learner(df, treatment, outcome, features,
                 learner_type="t", base_model=None):
    """
    Estimate CATE with an EconML meta-learner (S-, T- or X-learner).

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable (binary 0/1)
        outcome: str — outcome variable
        features: list[str] — covariates
        learner_type: str — "s" / "t" / "x" (case-insensitive)
        base_model: BaseEstimator | None — base regressor; defaults to a
            GradientBoostingRegressor (200 trees, depth 5, seed 42)

    Returns:
        dict — ``cate`` (per-row effect), ``cate_df`` (the features plus
        a ``cate`` column) and the fitted ``model``.

    Raises:
        KeyError: if ``learner_type`` is not one of "s", "t", "x".
    """
    # Accept "T" as well as "t" (the summary line below prints the
    # upper-case form) and fail with a readable message before any heavy
    # import. KeyError is kept so existing handlers continue to work.
    kind = str(learner_type).lower()
    if kind not in ("s", "t", "x"):
        raise KeyError(
            f"Unknown learner_type {learner_type!r}; "
            "expected 's', 't' or 'x'")

    from econml.metalearners import SLearner, TLearner, XLearner
    from sklearn.ensemble import GradientBoostingRegressor

    if base_model is None:
        base_model = GradientBoostingRegressor(
            n_estimators=200, max_depth=5, random_state=42)

    Y = df[outcome].values
    T = df[treatment].values
    X = df[features].values

    learners = {"s": SLearner, "t": TLearner, "x": XLearner}
    LearnerClass = learners[kind]

    # The S-learner fits one model on (X, T); the T- and X-learners take
    # the outcome model through ``models``.
    if kind == "s":
        est = LearnerClass(overall_model=base_model)
    else:
        est = LearnerClass(models=base_model)

    est.fit(Y, T, X=X)
    cate = est.effect(X)

    result_df = pd.DataFrame(X, columns=features)
    result_df["cate"] = cate

    print(f"{kind.upper()}-Learner: "
          f"CATE mean={cate.mean():.4f}, std={cate.std():.4f}")
    return {"cate": cate, "cate_df": result_df, "model": est}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## パイプライン統合
|
|
225
|
+
|
|
226
|
+
```
|
|
227
|
+
causal-inference → causal-ml → feature-importance
|
|
228
|
+
(統計的因果) (因果 ML) (特徴量解釈)
|
|
229
|
+
│ │ ↓
|
|
230
|
+
clinical-trial ───────┘ explainable-ai
|
|
231
|
+
(臨床試験) (説明可能 AI)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## パイプライン出力
|
|
235
|
+
|
|
236
|
+
| ファイル | 説明 | 次スキル |
|
|
237
|
+
|---------|------|---------|
|
|
238
|
+
| `dowhy_causal_model.json` | DoWhy 因果モデル | → reporting |
|
|
239
|
+
| `cate_estimates.csv` | CATE 推定値 | → precision-medicine |
|
|
240
|
+
| `causal_feature_importance.csv` | 因果特徴量重要度 | → explainable-ai |
|