@nahisaho/satori 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,312 @@
1
+ ---
2
+ name: scientific-missing-data-analysis
3
+ description: |
4
+ 欠損データ解析スキル。欠損パターン診断 (MCAR/MAR/MNAR) ・
5
+ Little's MCAR テスト・多重代入法 (MICE) ・KNN 補完・
6
+ MissForest・VAE/GAIN 補完・欠損パターン可視化・Rubin's Rules。
7
+ ---
8
+
9
+ # Scientific Missing Data Analysis
10
+
11
+ 欠損データの診断・補完・感度分析パイプラインを提供し、
12
+ バイアスのない統計推論を実現する。
13
+
14
+ ## When to Use
15
+
16
+ - データセットの欠損パターンを診断するとき
17
+ - MCAR / MAR / MNAR のメカニズムを判定するとき
18
+ - 多重代入法 (MICE) で欠損値を補完するとき
19
+ - KNN / MissForest / 深層学習ベースの補完をするとき
20
+ - 複数の補完結果を Rubin's Rules で統合するとき
21
+ - 欠損パターンを可視化するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. 欠損パターン診断
28
+
29
+ ```python
30
+ import numpy as np
31
+ import pandas as pd
32
+ import matplotlib.pyplot as plt
33
+ import seaborn as sns
34
+
35
+
36
def diagnose_missing_patterns(df, output_prefix="missing"):
    """
    Summarise missingness per column and save a 2x2 diagnostic figure.

    The figure is a visual aid for judging the missingness mechanism
    (MCAR / MAR / MNAR); no statistical test is performed here.

    Panels:
        (1) missing-value matrix of the first 200 rows,
        (2) missing rate per column (only columns with missing values),
        (3) correlation between the missingness indicators of the columns,
        (4) the ten most frequent row-wise missingness patterns.

    Parameters:
        df: pd.DataFrame — input data
        output_prefix: str — output file prefix; the figure is written to
            f"{output_prefix}_diagnosis.png"
    Returns:
        dict — "summary": per-column DataFrame (column, n_missing,
        pct_missing, dtype) sorted by pct_missing descending;
        "fig": path of the saved figure
    """
    n_rows, n_cols = df.shape
    # Compute the missingness mask once; every panel below reuses it.
    is_missing = df.isnull()
    missing_counts = is_missing.sum()
    missing_pct = (missing_counts / n_rows * 100).round(2)

    summary = pd.DataFrame({
        "column": df.columns,
        "n_missing": missing_counts.values,
        "pct_missing": missing_pct.values,
        "dtype": df.dtypes.values,
    }).sort_values("pct_missing", ascending=False)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # (1) Missing-value matrix (msno style), capped at 200 rows for legibility.
    ax = axes[0, 0]
    ax.imshow(is_missing.astype(int).values[:200], aspect="auto",
              cmap="Greys", interpolation="none")
    ax.set_xlabel("Features")
    ax.set_ylabel("Samples")
    ax.set_title("Missing Pattern Matrix (first 200 rows)")

    # (2) Missing rate per column.
    ax = axes[0, 1]
    cols_with_missing = summary[summary["pct_missing"] > 0]
    # Labels are cast to str so that numeric column names are drawn as
    # categories instead of being interpreted as bar positions.
    ax.barh(cols_with_missing["column"].astype(str),
            cols_with_missing["pct_missing"])
    ax.set_xlabel("Missing %")
    ax.set_title("Missing Rate per Column")

    # (3) Correlation of missingness indicators between columns.
    ax = axes[1, 0]
    miss_corr = is_missing.corr()
    sns.heatmap(miss_corr, ax=ax, cmap="RdBu_r", center=0,
                square=True, cbar_kws={"shrink": 0.8})
    ax.set_title("Missing Correlation")

    # (4) Most frequent row-wise missingness patterns.
    ax = axes[1, 1]
    patterns = is_missing.apply(tuple, axis=1)
    pattern_counts = patterns.value_counts().head(10)
    positions = range(len(pattern_counts))
    ax.barh(positions, pattern_counts.values)
    ax.set_yticks(positions)
    # Pattern tuples can be very long; truncate the label text.
    ax.set_yticklabels([str(p)[:40] for p in pattern_counts.index],
                       fontsize=7)
    ax.set_xlabel("Count")
    ax.set_title("Top 10 Missing Patterns")

    fig.tight_layout()
    path = f"{output_prefix}_diagnosis.png"
    fig.savefig(path, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whatever is "current".
    plt.close(fig)

    total_missing = missing_counts.sum()
    total_cells = n_rows * n_cols
    # Guard the overall rate against an empty frame (0 cells).
    overall_pct = total_missing / total_cells * 100 if total_cells else 0.0
    print(f"Missing Diagnosis: {n_cols} cols, "
          f"{total_missing} total missing ({overall_pct:.1f}%)")
    return {"summary": summary, "fig": path}
101
+ ```
102
+
103
+ ## 2. Little's MCAR テスト
104
+
105
+ ```python
106
def littles_mcar_test(df):
    """
    Little's MCAR test — test of "missing completely at random".

    Rows are grouped by their missingness pattern. For every pattern with at
    least two rows and at least one observed column, the squared Mahalanobis
    distance between the pattern's observed-column means and the overall
    means is accumulated, using the overall covariance scaled by 1 / n_j.

    NOTE: the overall mean and covariance are the available-case estimates
    from ``DataFrame.mean()`` / ``DataFrame.cov()``, not EM maximum-likelihood
    estimates, so the statistic is an approximation of Little's d².

    Parameters:
        df: pd.DataFrame — data; only numeric columns are used
    Returns:
        dict — "chi2" (statistic), "df" (degrees of freedom), "p_value",
        "conclusion". When the degrees of freedom are not positive, chi2 and
        p_value are NaN and the conclusion reports that no decision is
        possible.
    """
    from scipy import stats

    numeric_df = df.select_dtypes(include=[np.number])
    n_cols = numeric_df.shape[1]

    missing = numeric_df.isnull().to_numpy()
    values = numeric_df.to_numpy(dtype=float, na_value=np.nan)

    # Overall mean and (pairwise-complete) covariance.
    global_mean = numeric_df.mean().to_numpy(dtype=float)
    global_cov = numeric_df.cov().to_numpy(dtype=float)

    chi2_stat = 0.0
    df_stat = 0

    if missing.size > 0:
        # Group rows by missingness pattern. Comparing a Series of tuples
        # with `== pattern` is NOT a per-row match in pandas (the tuple is
        # treated as list-like), so rows are labelled with np.unique instead.
        unique_patterns, inverse = np.unique(
            missing, axis=0, return_inverse=True)
        inverse = np.ravel(inverse)  # shape differs across NumPy versions

        for k, pattern in enumerate(unique_patterns):
            rows = inverse == k
            n_j = int(rows.sum())
            if n_j < 2:
                continue

            # Columns observed under this pattern.
            observed = ~pattern
            if not observed.any():
                continue

            diff = values[rows][:, observed].mean(axis=0) - global_mean[observed]
            scaled_cov = global_cov[np.ix_(observed, observed)] / n_j
            # A pairwise covariance can be undefined (NaN) when two columns
            # are never observed together; such a pattern cannot contribute.
            if not np.all(np.isfinite(scaled_cov)):
                continue
            try:
                cov_inv = np.linalg.pinv(scaled_cov)
            except np.linalg.LinAlgError:
                continue

            chi2_stat += float(diff @ cov_inv @ diff)
            df_stat += int(observed.sum())

    df_stat -= n_cols  # degrees-of-freedom correction

    if df_stat <= 0:
        return {"chi2": np.nan, "df": df_stat, "p_value": np.nan,
                "conclusion": "判定不能 (自由度不足)"}

    # Survival function keeps precision for very small p-values.
    p_value = float(stats.chi2.sf(chi2_stat, df_stat))
    conclusion = "MCAR (p > 0.05)" if p_value > 0.05 else "Not MCAR (p ≤ 0.05)"

    print(f"Little's MCAR test: χ²={chi2_stat:.2f}, df={df_stat}, "
          f"p={p_value:.4f} → {conclusion}")
    return {"chi2": chi2_stat, "df": df_stat,
            "p_value": p_value, "conclusion": conclusion}
169
+ ```
170
+
171
+ ## 3. 多重代入法 (MICE)
172
+
173
+ ```python
174
def mice_imputation(df, n_imputations=5, max_iter=10, random_state=42):
    """
    MICE (Multiple Imputation by Chained Equations).

    Numeric columns are imputed ``n_imputations`` times with scikit-learn's
    ``IterativeImputer`` (``sample_posterior=True``), using seed
    ``random_state + i`` for the i-th dataset. Non-numeric columns are filled
    with their most frequent value ("UNKNOWN" when a column has no observed
    value); that fill is identical in every returned dataset.

    Numeric columns without any observed value cannot be modelled and are
    returned as all-NaN. In each returned frame the numeric columns come
    first, followed by the non-numeric columns.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_imputations: int — number of imputed datasets
        max_iter: int — number of chained-equation iterations
        random_state: int — base random seed
    Returns:
        list[pd.DataFrame] — one imputed frame per imputation
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cat_cols = df.select_dtypes(exclude=[np.number]).columns

    # IterativeImputer drops features that are entirely missing, which would
    # break the column alignment below, so only fit columns with data.
    fit_cols = [c for c in numeric_cols if df[c].notna().any()]

    # The categorical fill does not depend on the imputation index:
    # compute it once instead of once (twice, for mode()) per iteration.
    cat_filled = {}
    for col in cat_cols:
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else "UNKNOWN"
        cat_filled[col] = df[col].fillna(fill_value)

    if fit_cols:
        # Imported only when a model is actually fitted.
        from sklearn.experimental import enable_iterative_imputer  # noqa
        from sklearn.impute import IterativeImputer

    imputed_datasets = []

    for i in range(n_imputations):
        imputed_df = pd.DataFrame(index=df.index, columns=numeric_cols,
                                  dtype=float)
        if fit_cols:
            imputer = IterativeImputer(
                max_iter=max_iter,
                random_state=random_state + i,
                sample_posterior=True)
            imputed_df[fit_cols] = imputer.fit_transform(df[fit_cols])

        for col, filled in cat_filled.items():
            imputed_df[col] = filled

        imputed_datasets.append(imputed_df)

    print(f"MICE: {n_imputations} datasets × {max_iter} iterations, "
          f"{len(numeric_cols)} numeric cols")
    return imputed_datasets
212
+
213
+
214
def rubins_rules(estimates, variances):
    """
    Rubin's Rules — pool the results of multiple imputation.

    With m estimates Q_i and their variances U_i:
        pooled estimate   Q̄ = mean(Q_i)
        within variance   Ū = mean(U_i)
        between variance  B = sample variance of Q_i (ddof=1)
        total variance    T = Ū + (1 + 1/m) · B
        degrees of freedom (Rubin 1987, large-sample form)
                          ν = (m − 1) · (1 + 1/r)²,  r = (1 + 1/m) · B / Ū

    With a single imputation (m = 1) the between-imputation variance cannot
    be estimated; it is taken as 0 so the total variance equals Ū.

    Parameters:
        estimates: list[float] — estimate from each imputed dataset
        variances: list[float] — variance of that estimate in each dataset
    Returns:
        dict — "pooled_estimate", "total_variance", "within_variance",
        "between_variance", "df"
    Raises:
        ValueError — if ``estimates`` is empty or the two inputs differ in
        length
    """
    estimates = np.asarray(estimates, dtype=float)
    variances = np.asarray(variances, dtype=float)
    m = estimates.size
    if m == 0:
        raise ValueError("rubins_rules requires at least one estimate")
    if variances.size != m:
        raise ValueError(
            "estimates and variances must have the same length "
            f"(got {m} and {variances.size})")

    Q_bar = np.mean(estimates)
    U_bar = np.mean(variances)  # within-imputation variance
    # ddof=1 is undefined for a single value (NaN); use 0 instead.
    B = np.var(estimates, ddof=1) if m > 1 else 0.0  # between-imputation
    T = U_bar + (1 + 1 / m) * B  # total variance

    # Relative increase in variance due to missingness, and Rubin's (1987)
    # degrees of freedom. This is the classical form, not the small-sample
    # Barnard-Rubin adjustment (which needs the complete-data df).
    r = (1 + 1 / m) * B / U_bar if U_bar > 0 else np.inf
    df_old = (m - 1) * (1 + 1 / r) ** 2 if r > 0 else np.inf

    print(f"Rubin's Rules: Q̄={Q_bar:.4f}, T={T:.4f}, "
          f"within={U_bar:.4f}, between={B:.4f}")
    return {"pooled_estimate": Q_bar, "total_variance": T,
            "within_variance": U_bar, "between_variance": B,
            "df": df_old}
237
+ ```
238
+
239
+ ## 4. KNN / MissForest 補完
240
+
241
+ ```python
242
def knn_imputation(df, n_neighbors=5):
    """
    KNN missing-value imputation for the numeric columns of ``df``.

    Only numeric columns are returned; non-numeric columns are not part of
    the result. Numeric columns without any observed value cannot be imputed
    from neighbours and are returned as all-NaN. When nothing needs imputing
    the numeric data is returned as floats without fitting a model.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_neighbors: int — number of neighbours
    Returns:
        pd.DataFrame — float frame with the numeric columns and the index
        of ``df``
    """
    numeric = df.select_dtypes(include=[np.number])

    # KNNImputer drops features that are entirely missing, which would break
    # the column alignment, so only columns with data are passed to it.
    fit_cols = [c for c in numeric.columns if numeric[c].notna().any()]
    n_imputed = int(numeric[fit_cols].isnull().sum().sum())

    imputed = numeric.astype(float)
    if n_imputed > 0:
        # Imported only when a model is actually fitted.
        from sklearn.impute import KNNImputer

        imputer = KNNImputer(n_neighbors=n_neighbors)
        imputed[fit_cols] = imputer.fit_transform(numeric[fit_cols])

    print(f"KNN Imputation (k={n_neighbors}): {n_imputed} values imputed")
    return imputed
261
+
262
+
263
def missforest_imputation(df, n_estimators=100, max_iter=10, random_state=42):
    """
    MissForest-style imputation (iterative Random Forest regression).

    Implemented with scikit-learn's ``IterativeImputer`` using a
    ``RandomForestRegressor`` as the per-column estimator. Only numeric
    columns are imputed and returned. Numeric columns without any observed
    value cannot be modelled and are returned as all-NaN. When nothing needs
    imputing the numeric data is returned as floats without fitting a model.

    Parameters:
        df: pd.DataFrame — data containing missing values
        n_estimators: int — number of trees in the Random Forest
        max_iter: int — number of imputation rounds
        random_state: int — seed shared by the forest and the imputer
    Returns:
        pd.DataFrame — float frame with the numeric columns and the index
        of ``df``
    """
    numeric = df.select_dtypes(include=[np.number])

    # IterativeImputer drops features that are entirely missing, which would
    # break the column alignment, so only columns with data are passed to it.
    fit_cols = [c for c in numeric.columns if numeric[c].notna().any()]
    n_imputed = int(numeric[fit_cols].isnull().sum().sum())

    imputed = numeric.astype(float)
    if n_imputed > 0:
        # Imported only when a model is actually fitted.
        from sklearn.experimental import enable_iterative_imputer  # noqa
        from sklearn.impute import IterativeImputer
        from sklearn.ensemble import RandomForestRegressor

        imputer = IterativeImputer(
            estimator=RandomForestRegressor(n_estimators=n_estimators,
                                            random_state=random_state,
                                            n_jobs=-1),
            max_iter=max_iter, random_state=random_state)
        imputed[fit_cols] = imputer.fit_transform(numeric[fit_cols])

    print(f"MissForest (n_trees={n_estimators}, iter={max_iter}): "
          f"{n_imputed} values imputed")
    return imputed
291
+ ```
292
+
293
+ ---
294
+
295
+ ## パイプライン統合
296
+
297
+ ```
298
+ eda-correlation → missing-data-analysis → ml-classification
299
+ (探索的解析) (欠損診断・補完) (モデリング)
300
+ │ │ ↓
301
+ statistical-testing ────┘ advanced-visualization
302
+ (統計検定) (結果可視化)
303
+ ```
304
+
305
+ ## パイプライン出力
306
+
307
+ | ファイル | 説明 | 次スキル |
308
+ |---------|------|---------|
309
+ | `missing_diagnosis.png` | 欠損パターン可視化 | → reporting |
310
+ | `mcar_test_result.json` | Little's MCAR テスト | → 補完戦略選択 |
311
+ | `imputed_datasets/` | MICE 多重代入データ | → ml-classification |
312
+ | `imputation_comparison.csv` | 補完手法比較 | → 最終選択 |
@@ -0,0 +1,247 @@
1
+ ---
2
+ name: scientific-model-monitoring
3
+ description: |
4
+ MLOps モデル監視スキル。データドリフト検出 (Evidently/NannyML)・
5
+ モデル性能劣化検出・特徴量ドリフト・コンセプトドリフト・
6
+ A/B テスト統計・モデルレジストリ・再学習トリガー。
7
+ ---
8
+
9
+ # Scientific Model Monitoring
10
+
11
+ 本番環境の ML モデル監視パイプラインを提供し、
12
+ データドリフト・性能劣化を検出して再学習トリガーを実現する。
13
+
14
+ ## When to Use
15
+
16
+ - デプロイ済みモデルの予測品質を継続監視するとき
17
+ - データドリフト (共変量シフト) を検出するとき
18
+ - コンセプトドリフト (P(Y|X) の変化) を検出するとき
19
+ - A/B テストで新旧モデルを比較するとき
20
+ - 特徴量分布の変化を追跡するとき
21
+ - 再学習トリガーの自動化ルールを設定するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. データドリフト検出
28
+
29
+ ```python
30
+ import numpy as np
31
+ import pandas as pd
32
+ from scipy import stats
33
+
34
+
35
def detect_data_drift(reference_df, current_df,
                      method="ks", threshold=None):
    """
    Data-drift detection — reference data vs. current data.

    Every numeric column of ``reference_df`` that also exists in
    ``current_df`` is compared after dropping missing values.

    Parameters:
        reference_df: pd.DataFrame — training-time data (reference)
        current_df: pd.DataFrame — inference-time data (current)
        method: str — "ks" (two-sample KS test) / "psi" (Population
            Stability Index) / "wasserstein" (distance divided by the
            reference standard deviation when that is positive)
        threshold: float | None — decision cut-off. For "ks" drift is
            flagged when p_value < threshold; for "psi" and "wasserstein"
            when the value exceeds threshold. ``None`` selects the
            per-method default: 0.05 (ks), 0.2 (psi), 0.1 (wasserstein).
    Returns:
        pd.DataFrame — one row per compared feature with an "is_drift"
        column plus the method-specific statistics. A feature with no
        non-missing values on either side gets NaN statistics and
        is_drift=False.
    Raises:
        ValueError — if ``method`` is not one of the supported names
    """
    default_thresholds = {"ks": 0.05, "psi": 0.2, "wasserstein": 0.1}
    result_columns = {
        "ks": ["feature", "statistic", "p_value", "is_drift"],
        "psi": ["feature", "psi", "is_drift", "severity"],
        "wasserstein": ["feature", "wasserstein", "normalized", "is_drift"],
    }
    if method not in default_thresholds:
        raise ValueError(
            f"Unknown drift method {method!r}; "
            f"expected one of {sorted(default_thresholds)}")
    if threshold is None:
        threshold = default_thresholds[method]

    numeric_cols = reference_df.select_dtypes(include=[np.number]).columns
    common_cols = [c for c in numeric_cols if c in current_df.columns]

    drift_results = []

    for col in common_cols:
        ref_vals = reference_df[col].dropna().values
        cur_vals = current_df[col].dropna().values
        # The statistics below are undefined for an empty sample.
        has_data = len(ref_vals) > 0 and len(cur_vals) > 0

        if method == "ks":
            if has_data:
                stat, p_value = stats.ks_2samp(ref_vals, cur_vals)
            else:
                stat = p_value = np.nan
            drift_results.append({
                "feature": col, "statistic": stat,
                "p_value": p_value,
                "is_drift": bool(p_value < threshold)})

        elif method == "psi":
            # Population Stability Index
            psi_val = _compute_psi(ref_vals, cur_vals) if has_data else np.nan
            if np.isnan(psi_val):
                severity = "unknown"
            elif psi_val > 0.25:
                severity = "high"
            elif psi_val > 0.1:
                severity = "medium"
            else:
                severity = "low"
            drift_results.append({
                "feature": col, "psi": psi_val,
                "is_drift": bool(psi_val > threshold),
                "severity": severity})

        else:  # "wasserstein"
            if has_data:
                w_dist = stats.wasserstein_distance(ref_vals, cur_vals)
                ref_std = np.std(ref_vals)
                # Scale by the reference spread so the cut-off is unit-free;
                # a constant reference column keeps the raw distance.
                normalized = w_dist / ref_std if ref_std > 0 else w_dist
            else:
                w_dist = normalized = np.nan
            drift_results.append({
                "feature": col, "wasserstein": w_dist,
                "normalized": normalized,
                "is_drift": bool(normalized > threshold)})

    # Explicit columns keep "is_drift" available even with zero rows.
    result_df = pd.DataFrame(drift_results, columns=result_columns[method])
    n_drift = int(result_df["is_drift"].sum())
    print(f"Data Drift ({method}): {n_drift}/{len(common_cols)} features drifted")
    return result_df
85
+
86
+
87
+ def _compute_psi(expected, actual, n_bins=10):
88
+ """PSI (Population Stability Index) 計算。"""
89
+ breakpoints = np.quantile(expected, np.linspace(0, 1, n_bins + 1))
90
+ breakpoints[0] = -np.inf
91
+ breakpoints[-1] = np.inf
92
+
93
+ expected_pct = np.histogram(expected, bins=breakpoints)[0] / len(expected)
94
+ actual_pct = np.histogram(actual, bins=breakpoints)[0] / len(actual)
95
+
96
+ expected_pct = np.clip(expected_pct, 1e-4, None)
97
+ actual_pct = np.clip(actual_pct, 1e-4, None)
98
+
99
+ psi = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
100
+ return psi
101
+ ```
102
+
103
+ ## 2. モデル性能劣化検出
104
+
105
+ ```python
106
def detect_performance_degradation(y_true_batches, y_pred_batches,
                                   metric="accuracy",
                                   window_size=10, alert_threshold=0.05,
                                   output_path="performance_monitoring.png"):
    """
    Sliding-window detection of model performance degradation.

    One score is computed per batch. The baseline is the mean score of the
    first ``window_size`` batches and the current level is the mean of the
    last ``window_size`` batches; degradation = baseline − current. When
    fewer than 2 × ``window_size`` batches are supplied the two windows
    overlap, which pulls the measured degradation towards zero.

    All metrics are oriented so that higher is better: "rmse" is reported
    as the negative RMSE.

    Parameters:
        y_true_batches: list[np.ndarray] — ground truth per batch
        y_pred_batches: list[np.ndarray] — predictions per batch (passed
            directly to ``roc_auc_score`` for "auc")
        metric: str — "accuracy" / "f1" (macro) / "rmse" / "auc"
        window_size: int — moving-average window (>= 1)
        alert_threshold: float — degradation above which an alert is raised
        output_path: str — where the monitoring plot is saved
    Returns:
        dict — "baseline", "current", "degradation", "is_degraded",
        "scores" (per-batch list), "fig" (path of the saved plot)
    Raises:
        ValueError — if ``window_size`` < 1 or the two batch lists differ
        in length
        KeyError — if ``metric`` is not one of the supported names
    """
    from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
    from sklearn.metrics import roc_auc_score
    import matplotlib.pyplot as plt

    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    y_true_batches = list(y_true_batches)
    y_pred_batches = list(y_pred_batches)
    # zip() would silently drop the unmatched tail of the longer list.
    if len(y_true_batches) != len(y_pred_batches):
        raise ValueError(
            "y_true_batches and y_pred_batches must have the same length "
            f"(got {len(y_true_batches)} and {len(y_pred_batches)})")

    metric_funcs = {
        "accuracy": accuracy_score,
        "f1": lambda y, p: f1_score(y, p, average="macro"),
        # Negated so that, like the other metrics, higher means better.
        "rmse": lambda y, p: -np.sqrt(mean_squared_error(y, p)),
        "auc": lambda y, p: roc_auc_score(y, p),
    }

    func = metric_funcs[metric]
    scores = [func(yt, yp) for yt, yp in zip(y_true_batches, y_pred_batches)]

    # Moving average. With too few batches the raw scores are used, and the
    # curve then starts at batch 0 rather than at window_size - 1.
    scores_arr = np.array(scores)
    if len(scores_arr) >= window_size:
        ma = np.convolve(scores_arr, np.ones(window_size) / window_size,
                         mode="valid")
        ma_start = window_size - 1
    else:
        ma = scores_arr
        ma_start = 0

    # Baseline: first window_size batches; current: last window_size batches.
    baseline = np.mean(scores_arr[:window_size])
    current = np.mean(scores_arr[-window_size:])
    degradation = baseline - current

    is_degraded = degradation > alert_threshold

    # Visualisation
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(scores, "b-o", markersize=3, alpha=0.5, label="Batch score")
    if len(ma) > 0:
        ax.plot(range(ma_start, ma_start + len(ma)),
                ma, "r-", linewidth=2, label=f"MA({window_size})")
    ax.axhline(baseline, color="g", linestyle="--",
               label=f"Baseline={baseline:.4f}")
    ax.axhline(baseline - alert_threshold, color="orange", linestyle="--",
               label=f"Alert={baseline - alert_threshold:.4f}")
    ax.set_xlabel("Batch")
    ax.set_ylabel(metric)
    ax.set_title(f"Model Performance Monitoring ({metric})")
    ax.legend()

    path = output_path
    fig.savefig(path, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whatever is "current".
    plt.close(fig)

    status = "DEGRADED ⚠️" if is_degraded else "OK ✓"
    print(f"Performance ({metric}): baseline={baseline:.4f}, "
          f"current={current:.4f}, Δ={degradation:.4f} → {status}")
    return {"baseline": baseline, "current": current,
            "degradation": degradation, "is_degraded": is_degraded,
            "scores": scores, "fig": path}
172
+ ```
173
+
174
+ ## 3. A/B テスト統計
175
+
176
+ ```python
177
def ab_test_models(y_true, preds_a, preds_b, metric="accuracy",
                   n_bootstrap=10000, alpha=0.05,
                   random_state=42, higher_is_better=None):
    """
    A/B test — statistical comparison of two models on the same samples.

    The difference (score_B − score_A) is bootstrapped by resampling the
    samples with replacement, giving a percentile confidence interval. The
    winner is the model favoured by an interval that excludes zero, taking
    the metric's direction into account (for "rmse" lower is better).

    Parameters:
        y_true: array-like — ground truth
        preds_a: array-like — predictions of model A
        preds_b: array-like — predictions of model B
        metric: str | callable — "accuracy" / "f1" (macro) / "rmse", or a
            function ``metric(y_true, y_pred) -> float``
        n_bootstrap: int — number of bootstrap resamples
        alpha: float — significance level; the CI covers 1 − alpha
        random_state: int — seed of the bootstrap resampling
        higher_is_better: bool | None — direction of the metric. ``None``
            uses the built-in direction for named metrics and assumes
            higher-is-better for a callable.
    Returns:
        dict — "score_a", "score_b", "diff" (B − A), "ci" (lower, upper),
        "p_value" (bootstrap share of resamples in which B is not better
        than A), "winner" ("A" / "B" / "Tie")
    Raises:
        ValueError — if the inputs are empty, differ in length, or
        ``n_bootstrap`` < 1
        KeyError — if ``metric`` is an unknown name
    """
    if callable(metric):
        func = metric
        metric_name = getattr(metric, "__name__", "custom")
        default_direction = True
    else:
        from sklearn.metrics import accuracy_score, f1_score, mean_squared_error

        # name -> (scoring function, higher is better?)
        metric_funcs = {
            "accuracy": (accuracy_score, True),
            "f1": (lambda y, p: f1_score(y, p, average="macro"), True),
            "rmse": (lambda y, p: np.sqrt(mean_squared_error(y, p)), False),
        }
        func, default_direction = metric_funcs[metric]
        metric_name = metric
    maximize = default_direction if higher_is_better is None \
        else bool(higher_is_better)

    # Positional fancy indexing below needs arrays: lists do not support it
    # and a pandas Series would index by label instead of by position.
    y_true = np.asarray(y_true)
    preds_a = np.asarray(preds_a)
    preds_b = np.asarray(preds_b)

    n = len(y_true)
    if n == 0 or len(preds_a) != n or len(preds_b) != n:
        raise ValueError(
            "y_true, preds_a and preds_b must be non-empty and of equal "
            f"length (got {n}, {len(preds_a)}, {len(preds_b)})")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    score_a = func(y_true, preds_a)
    score_b = func(y_true, preds_b)

    # Bootstrap distribution of the difference (B − A).
    rng = np.random.RandomState(random_state)
    diffs = np.empty(n_bootstrap, dtype=float)
    for b in range(n_bootstrap):
        idx = rng.choice(n, n, replace=True)
        sa = func(y_true[idx], preds_a[idx])
        sb = func(y_true[idx], preds_b[idx])
        diffs[b] = sb - sa

    ci_lower = np.percentile(diffs, 100 * alpha / 2)
    ci_upper = np.percentile(diffs, 100 * (1 - alpha / 2))

    if maximize:
        p_value = np.mean(diffs <= 0)  # P(B is not better than A)
        b_better, a_better = ci_lower > 0, ci_upper < 0
    else:
        # Lower is better: B wins when its score is reliably smaller.
        p_value = np.mean(diffs >= 0)  # P(B is not better than A)
        b_better, a_better = ci_upper < 0, ci_lower > 0

    winner = "B" if b_better else ("A" if a_better else "Tie")

    print(f"A/B Test ({metric_name}): A={score_a:.4f}, B={score_b:.4f}")
    print(f"  Δ(B-A)={score_b - score_a:.4f}, "
          f"{100 * (1 - alpha):g}% CI=[{ci_lower:.4f}, {ci_upper:.4f}], "
          f"p={p_value:.4f} → Winner: {winner}")
    return {"score_a": score_a, "score_b": score_b,
            "diff": score_b - score_a, "ci": (ci_lower, ci_upper),
            "p_value": p_value, "winner": winner}
227
+ ```
228
+
229
+ ---
230
+
231
+ ## パイプライン統合
232
+
233
+ ```
234
+ ensemble-methods → model-monitoring → anomaly-detection
235
+ (モデル構築) (監視) (異常検知)
236
+ │ │ ↓
237
+ automl ──────────────┘ active-learning
238
+ (AutoML) (再学習)
239
+ ```
240
+
241
+ ## パイプライン出力
242
+
243
+ | ファイル | 説明 | 次スキル |
244
+ |---------|------|---------|
245
+ | `drift_report.csv` | ドリフト検出結果 | → 再学習判断 |
246
+ | `performance_monitoring.png` | 性能推移 | → reporting |
247
+ | `ab_test_result.json` | A/B テスト結果 | → デプロイ判断 |