@nahisaho/satori 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
+ ---
2
+ name: scientific-statistical-simulation
3
+ description: |
4
+ 統計シミュレーションスキル。Monte Carlo 法・Bootstrap 推論・
5
+ Permutation Test・統計的検出力分析・確率的リスク評価。
6
+ ---
7
+
8
+ # Scientific Statistical Simulation
9
+
10
+ コンピュータベースの統計シミュレーションにより、
11
+ 推論の不確実性定量化・検出力設計・リスク評価を行う。
12
+
13
+ ## When to Use
14
+
15
+ - Monte Carlo シミュレーションで確率分布を推定するとき
16
+ - Bootstrap で信頼区間を算出するとき
17
+ - Permutation Test でノンパラメトリック検定を行うとき
18
+ - 実験前に必要なサンプルサイズを検出力分析で決めるとき
19
+ - リスクシナリオの確率的評価を行うとき
20
+
21
+ ---
22
+
23
+ ## Quick Start
24
+
25
+ ## 1. Monte Carlo シミュレーション
26
+
27
+ ```python
28
+ import numpy as np
29
+ import pandas as pd
30
+ from typing import Callable, Dict, Any
31
+
32
+
33
def monte_carlo_simulation(func, param_distributions,
                           n_simulations=10000, seed=42,
                           summary_quantiles=None):
    """
    Run a Monte Carlo simulation of ``func`` over randomly drawn parameters.

    Parameters:
        func: Callable — function under simulation; receives one dict
            ``{param_name: sampled_value}`` per run and returns a number
        param_distributions: dict — ``{param_name: distribution}`` where each
            distribution exposes ``rvs(size=..., random_state=...)``
            (e.g. a frozen ``scipy.stats`` distribution)
        n_simulations: int — number of simulation runs (must be >= 1)
        seed: int — seed for the shared random generator
        summary_quantiles: list[float] | None — quantiles stored in the
            summary; defaults to ``[0.025, 0.25, 0.5, 0.75, 0.975]``

    Returns:
        dict with keys ``"results"`` (array of ``func`` outputs),
        ``"summary"`` (mean/std/min/max plus one ``"q<quantile>"`` entry per
        requested quantile) and ``"param_samples"`` (the drawn parameters).

    Raises:
        ValueError: if ``n_simulations`` is smaller than 1.
    """
    if n_simulations < 1:
        raise ValueError("n_simulations must be a positive integer")

    rng = np.random.default_rng(seed)
    if summary_quantiles is None:
        summary_quantiles = [0.025, 0.25, 0.5, 0.75, 0.975]

    # Draw every parameter up front from the same generator so a given seed
    # always reproduces the same parameter sets.
    param_samples = {
        name: dist.rvs(size=n_simulations, random_state=rng)
        for name, dist in param_distributions.items()
    }

    # func takes a dict per run, so it is evaluated one draw at a time.
    results = np.array([
        func({name: samples[i] for name, samples in param_samples.items()})
        for i in range(n_simulations)
    ])

    summary = {
        "mean": np.mean(results),
        "std": np.std(results),
        "min": np.min(results),
        "max": np.max(results),
    }
    for q in summary_quantiles:
        summary[f"q{q:.3f}"] = np.quantile(results, q)

    # The printed 95% interval is computed directly from the results rather
    # than looked up in ``summary``: with custom ``summary_quantiles`` the
    # 0.025 / 0.975 entries may be absent, and formatting a placeholder
    # string with ``:.4f`` would raise.
    ci_low, ci_high = np.quantile(results, [0.025, 0.975])

    print(f"Monte Carlo ({n_simulations} runs):")
    print(f" Mean={summary['mean']:.4f} ± {summary['std']:.4f}")
    print(f" 95% CI: [{ci_low:.4f}, "
          f"{ci_high:.4f}]")

    return {"results": results, "summary": summary,
            "param_samples": param_samples}
83
+ ```
84
+
85
+ ## 2. Bootstrap 推論
86
+
87
+ ```python
88
def bootstrap_inference(data, statistic_fn, n_bootstrap=10000,
                        confidence_level=0.95, method="bca",
                        seed=42):
    """
    Estimate a bootstrap confidence interval for ``statistic_fn(data)``.

    Parameters:
        data: array-like — observations; resampled along the first axis
        statistic_fn: Callable — maps an ndarray of observations to a float
        n_bootstrap: int — number of bootstrap resamples (must be >= 1)
        confidence_level: float — confidence level of the interval
        method: str — "percentile" / "bca" / "basic"; any other value is
            treated as "basic"
        seed: int — seed for the random generator

    Returns:
        tuple ``(result, boot_stats)`` where ``result`` is a dict with the
        observed statistic, bootstrap mean/std, interval bounds, method and
        confidence level, and ``boot_stats`` is the array of resampled
        statistics.

    Raises:
        ValueError: if ``data`` is empty, ``n_bootstrap`` < 1, or the BCa
            method is requested with fewer than two observations.
    """
    # Convert once so lists work and pandas objects are indexed by position
    # instead of by label.
    data = np.asarray(data)
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one observation")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be a positive integer")

    rng = np.random.default_rng(seed)
    observed = statistic_fn(data)

    # Bootstrap replicates: resample n observations with replacement.
    boot_stats = np.array([
        statistic_fn(data[rng.integers(0, n, size=n)])
        for _ in range(n_bootstrap)])

    alpha = 1 - confidence_level
    if method == "percentile":
        ci_low = np.quantile(boot_stats, alpha / 2)
        ci_high = np.quantile(boot_stats, 1 - alpha / 2)
    elif method == "bca":
        from scipy import stats as sp_stats
        if n < 2:
            raise ValueError("BCa requires at least two observations")
        # Bias correction. The proportion is kept strictly inside (0, 1):
        # when no (or every) replicate lies below the observed value the
        # normal quantile would be infinite and the adjusted levels NaN.
        # Proportions that are already interior are multiples of
        # 1 / n_bootstrap and are therefore left untouched by the clip.
        eps = 0.5 / n_bootstrap
        prop_below = np.clip(np.mean(boot_stats < observed), eps, 1 - eps)
        z0 = sp_stats.norm.ppf(prop_below)
        # Acceleration (jackknife): drop one observation at a time along the
        # same axis that the bootstrap resamples.
        jack_stats = np.array([
            statistic_fn(np.delete(data, i, axis=0)) for i in range(n)])
        jack_mean = jack_stats.mean()
        # The 1e-10 term avoids 0/0 when all jackknife values coincide.
        a = (np.sum((jack_mean - jack_stats) ** 3) /
             (6 * np.sum((jack_mean - jack_stats) ** 2) ** 1.5 + 1e-10))
        z_alpha = sp_stats.norm.ppf([alpha / 2, 1 - alpha / 2])
        adjusted = sp_stats.norm.cdf(
            z0 + (z0 + z_alpha) / (1 - a * (z0 + z_alpha)))
        ci_low = np.quantile(boot_stats, adjusted[0])
        ci_high = np.quantile(boot_stats, adjusted[1])
    else:  # basic
        ci_low = 2 * observed - np.quantile(boot_stats, 1 - alpha / 2)
        ci_high = 2 * observed - np.quantile(boot_stats, alpha / 2)

    result = {
        "observed": observed,
        "boot_mean": np.mean(boot_stats),
        "boot_std": np.std(boot_stats),
        "ci_low": ci_low, "ci_high": ci_high,
        "method": method,
        "confidence_level": confidence_level,
    }
    print(f"Bootstrap ({method}): {observed:.4f} "
          f"[{ci_low:.4f}, {ci_high:.4f}] ({confidence_level:.0%} CI)")
    return result, boot_stats
145
+ ```
146
+
147
+ ## 3. 統計的検出力分析
148
+
149
+ ```python
150
def power_analysis(effect_size_range=None, n_range=None,
                   alpha=0.05, test_type="two-sample-t",
                   n_simulations=5000, seed=42, target_power=0.8):
    """
    Simulation-based statistical power analysis.

    For every (effect size, sample size) pair, normal data with unit
    standard deviation are simulated ``n_simulations`` times and the
    fraction of runs with ``p < alpha`` is reported as the power.

    Parameters:
        effect_size_range: list[float] | None — effect sizes (mean shift in
            standard-deviation units); defaults to ``[0.2, 0.5, 0.8]``
        n_range: list[int] | None — sample sizes (per group for the
            two-sample test); defaults to ``[10, 20, 30, 50, 100, 200]``
        alpha: float — significance level
        test_type: str — "two-sample-t" / "paired-t" (no other test is
            implemented)
        n_simulations: int — simulations per condition (must be >= 1)
        seed: int — seed for the random generator
        target_power: float — power level used for the printed
            minimum-sample-size summary

    Returns:
        pd.DataFrame with columns ``effect_size``, ``n``, ``power``,
        ``alpha`` — one row per simulated condition.

    Raises:
        ValueError: if ``test_type`` is not supported or
            ``n_simulations`` < 1.
    """
    from scipy import stats as sp_stats

    rng = np.random.default_rng(seed)

    def _two_sample_p(es, n):
        x = rng.normal(0, 1, n)
        y = rng.normal(es, 1, n)
        _, p = sp_stats.ttest_ind(x, y)
        return p

    def _paired_p(es, n):
        # A paired test is a one-sample test on the within-pair differences.
        diff = rng.normal(es, 1, n)
        _, p = sp_stats.ttest_1samp(diff, 0)
        return p

    simulators = {"two-sample-t": _two_sample_p, "paired-t": _paired_p}
    # Validate once, before any simulation work is done.
    if test_type not in simulators:
        raise ValueError(f"Unknown test: {test_type}")
    if n_simulations < 1:
        raise ValueError("n_simulations must be a positive integer")
    simulate_p = simulators[test_type]

    if effect_size_range is None:
        effect_size_range = [0.2, 0.5, 0.8]
    if n_range is None:
        n_range = [10, 20, 30, 50, 100, 200]

    records = []
    for es in effect_size_range:
        for n in n_range:
            rejections = 0
            for _ in range(n_simulations):
                if simulate_p(es, n) < alpha:
                    rejections += 1
            records.append({
                "effect_size": es, "n": n,
                "power": rejections / n_simulations, "alpha": alpha,
            })

    # Fixed columns keep the summary below working when no condition was
    # simulated (e.g. an empty n_range).
    df = pd.DataFrame(records,
                      columns=["effect_size", "n", "power", "alpha"])
    for es in effect_size_range:
        sub = df[df["effect_size"] == es]
        adequate = sub[sub["power"] >= target_power]
        if len(adequate) > 0:
            min_n = adequate["n"].min()
            print(f"d={es}: min N={min_n} for power≥{target_power:.2f}")
        else:
            print(f"d={es}: power<{target_power:.2f} for all tested N")

    return df
207
+ ```
208
+
209
+ ---
210
+
211
+ ## パイプライン統合
212
+
213
+ ```
214
+ [仮説設計] → statistical-simulation → statistical-testing
215
+ (シミュレーション) (本解析)
216
+
217
+ doe ← adaptive-experiments
218
+ (実験計画) (適応実験)
219
+ ```
220
+
221
+ ## パイプライン出力
222
+
223
+ | ファイル | 説明 | 次スキル |
224
+ |---------|------|---------|
225
+ | `mc_results.npz` | Monte Carlo 結果 | → リスク評価 |
226
+ | `bootstrap_ci.csv` | 信頼区間 | → 統計レポート |
227
+ | `power_analysis.csv` | 検出力カーブ | → DOE サンプルサイズ |
@@ -0,0 +1,221 @@
1
+ ---
2
+ name: scientific-streaming-analytics
3
+ description: |
4
+ ストリーミング解析スキル。River オンライン学習・
5
+ リアルタイム異常検知・ストリーミング統計・
6
+ 増分データ可視化・概念ドリフト検出。
7
+ ---
8
+
9
+ # Scientific Streaming Analytics
10
+
11
+ データストリームに対するリアルタイム学習・異常検知・
12
+ 統計モニタリングパイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - データが逐次的に到着するストリーミング環境のとき
17
+ - オンライン学習で逐次モデル更新が必要なとき
18
+ - リアルタイム異常検知を実装するとき
19
+ - 概念ドリフトを検出・対応するとき
20
+ - メモリ制約下で増分的に統計量を計算するとき
21
+
22
+ ---
23
+
24
+ ## Quick Start
25
+
26
+ ## 1. River オンライン学習
27
+
28
+ ```python
29
+ from river import (
30
+ compose, preprocessing, linear_model, metrics,
31
+ tree, ensemble, drift)
32
+ import pandas as pd
33
+ import numpy as np
34
+
35
+
36
def online_learning_pipeline(stream_data, model_type="ht",
                             target_col="y",
                             feature_cols=None):
    """
    Train a River classifier online with test-then-train evaluation.

    Each sample is first predicted, the running accuracy is updated with
    that prediction, and only then is the sample used for learning.

    Parameters:
        stream_data: iterable — stream of ``(x_dict, y)`` pairs, or a
            DataFrame whose rows are converted to such pairs
        model_type: str — "ht" (Hoeffding Tree) / "lr" (standard scaler +
            logistic regression) / "arf" (adaptive random forest); any other
            value falls back to "ht"
        target_col: str — target column name (DataFrame input only)
        feature_cols: list[str] | None — feature columns (DataFrame input
            only); defaults to every column except ``target_col``

    Returns:
        tuple ``(model, history)`` where ``history`` is a DataFrame with the
        running accuracy recorded every 100 samples.
    """
    def _make_arf():
        # NOTE(review): newer River releases expose the adaptive random
        # forest as ``forest.ARFClassifier``; older ones as
        # ``ensemble.AdaptiveRandomForestClassifier`` — confirm against the
        # River version this skill pins.
        try:
            from river import forest
        except ImportError:
            return ensemble.AdaptiveRandomForestClassifier(
                n_models=10, seed=42)
        return forest.ARFClassifier(n_models=10, seed=42)

    # Factories rather than instances: only the requested model is built, so
    # a model class missing from the installed River version cannot break
    # the other model types.
    factories = {
        "ht": lambda: tree.HoeffdingTreeClassifier(),
        "lr": lambda: compose.Pipeline(
            preprocessing.StandardScaler(),
            linear_model.LogisticRegression()),
        "arf": _make_arf,
    }
    model = factories.get(model_type, factories["ht"])()
    metric = metrics.Accuracy()
    history = []

    if isinstance(stream_data, pd.DataFrame):
        if feature_cols is None:
            feature_cols = [c for c in stream_data.columns
                            if c != target_col]
        iterator = (
            (row[feature_cols].to_dict(), row[target_col])
            for _, row in stream_data.iterrows())
    else:
        iterator = stream_data

    for step, (x, y) in enumerate(iterator, start=1):
        y_pred = model.predict_one(x)
        # No prediction is scored while the model returns None.
        if y_pred is not None:
            metric.update(y, y_pred)
        model.learn_one(x, y)

        if step % 100 == 0:
            history.append({
                "step": step,
                "accuracy": metric.get(),
            })

    print(f"Online {model_type}: {metric}")
    return model, pd.DataFrame(history)
84
+ ```
85
+
86
+ ## 2. ストリーミング異常検知
87
+
88
+ ```python
89
def streaming_anomaly_detection(stream_data, window_size=100,
                                threshold_sigma=3.0,
                                method="zscore"):
    """
    Flag anomalies in a numeric stream, one value at a time.

    Every value is scored against statistics built from the values seen
    before it; it is added to those statistics only afterwards.

    Parameters:
        stream_data: iterable — stream of numeric values
        window_size: int — sliding-window size (also sets the EWMA smoothing
            factor ``2 / (window_size + 1)``)
        threshold_sigma: float — anomaly threshold in standard deviations
            (used by "zscore" and "ewma")
        method: str — "zscore" / "iqr" / "ewma"

    Returns:
        pd.DataFrame with columns ``step``, ``value``, ``is_anomaly`` — one
        row per input value (empty, with the same columns, for an empty
        stream).

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    from collections import deque

    if method not in ("zscore", "iqr", "ewma"):
        raise ValueError(f"Unknown method: {method}")

    window = deque(maxlen=window_size)
    # Number of past values required before scoring starts. Capped by the
    # window size so that small windows can still detect anything.
    warmup = max(1, min(10, window_size))
    results = []
    ewma_mean = None
    ewma_var = None
    alpha = 2.0 / (window_size + 1)

    for i, value in enumerate(stream_data):
        is_anomaly = False
        warmed_up = len(window) >= warmup

        if method == "zscore":
            if warmed_up:
                mean = np.mean(window)
                std = np.std(window) + 1e-10  # avoid division by zero
                is_anomaly = abs(value - mean) / std > threshold_sigma

        elif method == "iqr":
            if warmed_up:
                q1, q3 = np.percentile(window, [25, 75])
                iqr = q3 - q1
                is_anomaly = (value < q1 - 1.5 * iqr
                              or value > q3 + 1.5 * iqr)

        elif ewma_mean is None:  # first EWMA sample only seeds the state
            ewma_mean = value
            ewma_var = 0
        else:
            # Score against the statistics from BEFORE this value. Scoring
            # after the update caps the score at 1/sqrt(alpha) and always
            # flags the second sample, whose variance estimate is built from
            # that sample alone.
            if warmed_up:
                ewma_std = np.sqrt(ewma_var) + 1e-10
                is_anomaly = (abs(value - ewma_mean) / ewma_std
                              > threshold_sigma)
            ewma_mean = alpha * value + (1 - alpha) * ewma_mean
            ewma_var = (alpha * (value - ewma_mean) ** 2
                        + (1 - alpha) * ewma_var)

        window.append(value)
        results.append({
            "step": i, "value": value,
            "is_anomaly": bool(is_anomaly),
        })

    # Fixed columns and a guarded ratio keep an empty stream from crashing.
    df = pd.DataFrame(results, columns=["step", "value", "is_anomaly"])
    n_total = len(df)
    n_anomalies = sum(r["is_anomaly"] for r in results)
    rate = n_anomalies / n_total if n_total else 0.0
    print(f"Streaming anomaly ({method}): "
          f"{n_anomalies}/{n_total} anomalies detected "
          f"({rate:.1%})")
    return df
148
+ ```
149
+
150
+ ## 3. 概念ドリフト検出
151
+
152
+ ```python
153
def concept_drift_detection(stream_data, target_col="y",
                            feature_cols=None,
                            detector_type="adwin"):
    """
    Detect concept drift from the prediction stream of an online classifier.

    A Hoeffding Tree is trained test-then-train on the rows of
    ``stream_data``; the outcome of each prediction is fed to a drift
    detector and every step at which the detector reports drift is
    recorded.

    Parameters:
        stream_data: pd.DataFrame — stream data, processed row by row
        target_col: str — target column name
        feature_cols: list[str] | None — feature columns; defaults to every
            column except ``target_col``
        detector_type: str — "adwin" / "ddm" / "eddm"; any other value
            falls back to "adwin"

    Returns:
        pd.DataFrame with one row per detected drift (``step`` and
        ``accuracy_at_drift``).
    """
    # NOTE(review): newer River releases keep DDM/EDDM under
    # ``drift.binary`` and name DDM's warm-up argument ``warm_start``;
    # older ones expose them on ``drift`` with ``min_num_instances`` —
    # confirm against the River version this skill pins.
    binary = getattr(drift, "binary", drift)

    def _make_ddm():
        try:
            return binary.DDM(min_num_instances=30)
        except TypeError:
            return binary.DDM(warm_start=30)

    # Factories rather than instances: only the requested detector is built,
    # so a detector missing from the installed River version cannot break
    # the other detector types.
    factories = {
        "adwin": lambda: drift.ADWIN(delta=0.002),
        "ddm": _make_ddm,
        "eddm": lambda: binary.EDDM(),
    }
    detector = factories.get(detector_type, factories["adwin"])()
    # DDM and EDDM monitor the error rate and expect 1 for a
    # misclassification. ADWIN keeps the correctness indicator it received
    # before this change.
    feeds_errors = detector_type in ("ddm", "eddm")

    model = tree.HoeffdingTreeClassifier()
    metric = metrics.Accuracy()
    drift_points = []

    if feature_cols is None:
        feature_cols = [c for c in stream_data.columns
                        if c != target_col]

    for i, (_, row) in enumerate(stream_data.iterrows()):
        x = row[feature_cols].to_dict()
        y = row[target_col]
        y_pred = model.predict_one(x)

        if y_pred is not None:
            is_correct = int(y_pred == y)
            metric.update(y, y_pred)
            detector.update(1 - is_correct if feeds_errors else is_correct)

            # Checked only right after an update, so a flag left over from
            # an earlier step can never be recorded a second time.
            if detector.drift_detected:
                drift_points.append({
                    "step": i,
                    "accuracy_at_drift": metric.get(),
                })
                print(f"⚠ Drift at step {i}, acc={metric.get():.3f}")

        model.learn_one(x, y)

    print(f"Total drifts: {len(drift_points)}")
    return pd.DataFrame(drift_points)
201
+ ```
202
+
203
+ ---
204
+
205
+ ## パイプライン統合
206
+
207
+ ```
208
+ [データストリーム] → streaming-analytics → model-monitoring
209
+ (オンライン学習) (性能監視)
210
+
211
+ anomaly-detection ← data-profiling
212
+ (バッチ異常検知) (データ品質)
213
+ ```
214
+
215
+ ## パイプライン出力
216
+
217
+ | ファイル | 説明 | 次スキル |
218
+ |---------|------|---------|
219
+ | `online_model.pkl` | オンラインモデル | → 推論 |
220
+ | `stream_anomalies.csv` | 異常検知結果 | → alerting |
221
+ | `drift_report.csv` | ドリフト検出点 | → model-monitoring |