@nahisaho/satori 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ ---
2
+ name: scientific-geospatial-analysis
3
+ description: |
4
+ 地理空間データ解析スキル。GeoPandas ベクターデータ処理・
5
+ Rasterio ラスター解析・Folium/Kepler.gl インタラクティブ地図・
6
+ 空間自己相関 (Moran's I)・クリギング補間・CRS 変換。
7
+ ---
8
+
9
+ # Scientific Geospatial Analysis
10
+
11
+ 地理空間データの前処理・空間統計・インタラクティブ地図可視化
12
+ パイプラインを提供する。
13
+
14
+ ## When to Use
15
+
16
+ - GeoPandas でベクターデータ (Shapefile/GeoJSON) を処理するとき
17
+ - ラスターデータ (GeoTIFF) を読み込み解析するとき
18
+ - 空間自己相関 (Moran's I / LISA) を検定するとき
19
+ - クリギング (Kriging) で空間補間するとき
20
+ - Folium/Kepler.gl でインタラクティブ地図を作成するとき
21
+ - CRS (座標参照系) 変換・空間結合をするとき
22
+
23
+ > **Note**: 環境特化 GIS (SoilGrids/WorldClim) は `scientific-environmental-geodata` を参照。
24
+
25
+ ---
26
+
27
+ ## Quick Start
28
+
29
+ ## 1. GeoPandas ベクターデータ処理
30
+
31
+ ```python
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+
36
def load_and_process_geodata(filepath, target_crs="EPSG:4326"):
    """
    Read a vector dataset with GeoPandas and reproject it when needed.

    Parameters:
        filepath: str — path passed to ``geopandas.read_file``
            (Shapefile / GeoJSON / GPKG)
        target_crs: str — coordinate reference system of the returned frame

    Returns:
        GeoDataFrame — the loaded features; reprojected to ``target_crs``
        whenever the CRS stored in the file compares unequal to it.
    """
    import geopandas as gpd

    frame = gpd.read_file(filepath)
    source_crs = frame.crs

    # Reproject only when the file's CRS differs from the requested one.
    if source_crs != target_crs:
        frame = frame.to_crs(target_crs)

    # Summary statistics: overall extent and a count per geometry type.
    minx, miny, maxx, maxy = frame.total_bounds
    type_counts = frame.geometry.geom_type.value_counts().to_dict()

    print(f"GeoData: {len(frame)} features, CRS: {source_crs} → {target_crs}")
    print(f" Bounds: [{minx:.4f}, {miny:.4f}] "
          f"to [{maxx:.4f}, {maxy:.4f}]")
    print(f" Geometry types: {type_counts}")
    return frame
61
+
62
+
63
def spatial_join(gdf_left, gdf_right, how="inner", predicate="intersects"):
    """
    Spatially join two GeoDataFrames with ``geopandas.sjoin``.

    Parameters:
        gdf_left: GeoDataFrame — left table
        gdf_right: GeoDataFrame — right table; reprojected to the left
            table's CRS first when the two CRS compare unequal
        how: str — "inner" / "left" / "right"
        predicate: str — "intersects" / "within" / "contains"

    Returns:
        GeoDataFrame — the result of the join.
    """
    import geopandas as gpd

    # Both inputs must share a CRS for the geometric predicate to be valid.
    right = gdf_right
    if gdf_left.crs != right.crs:
        right = right.to_crs(gdf_left.crs)

    result = gpd.sjoin(gdf_left, right, how=how, predicate=predicate)

    n_left, n_right, n_out = len(gdf_left), len(right), len(result)
    print(f"Spatial Join ({predicate}, {how}): "
          f"{n_left} × {n_right} → {n_out}")
    return result
83
+ ```
84
+
85
+ ## 2. 空間自己相関 (Moran's I / LISA)
86
+
87
+ ```python
88
def spatial_autocorrelation(gdf, value_col, weight_type="queen", k=5):
    """
    Test for spatial autocorrelation with Global Moran's I and LISA.

    Builds a row-standardised spatial weights matrix from ``gdf``, computes
    the global and local Moran statistics for ``value_col``, and saves a
    two-panel figure to ``spatial_autocorrelation.png``.

    Parameters:
        gdf: GeoDataFrame — geometries plus attribute data
        value_col: str — column to analyse
        weight_type: str — "queen" / "rook" / "knn"
        k: int — number of neighbours; used only when weight_type="knn"

    Returns:
        dict — "moran_i" and "p_value" (``I`` and ``p_sim`` of the global
        statistic), "gdf" (a copy of the input with "lisa_cluster",
        "lisa_significant" and "local_moran_i" columns added) and "fig"
        (path of the saved figure).

    Raises:
        ValueError — if ``weight_type`` is not one of the supported names.
    """
    # Validate before the heavy imports so a typo fails fast with a clear
    # message instead of an UnboundLocalError further down.
    supported = ("queen", "rook", "knn")
    if weight_type not in supported:
        raise ValueError(
            f"Unknown weight_type {weight_type!r}; expected one of {supported}")

    from libpysal.weights import Queen, Rook, KNN
    from esda.moran import Moran, Moran_Local
    import matplotlib.pyplot as plt

    # Spatial weights matrix
    if weight_type == "queen":
        w = Queen.from_dataframe(gdf)
    elif weight_type == "rook":
        w = Rook.from_dataframe(gdf)
    else:
        w = KNN.from_dataframe(gdf, k=k)

    w.transform = "r"  # row-standardise the weights
    y = gdf[value_col].values

    # Global Moran's I
    moran_global = Moran(y, w)

    # LISA (Local Indicators of Spatial Association)
    moran_local = Moran_Local(y, w)

    gdf = gdf.copy()  # never mutate the caller's frame
    gdf["lisa_cluster"] = moran_local.q  # 1=HH, 2=LH, 3=LL, 4=HL
    gdf["lisa_significant"] = moran_local.p_sim < 0.05
    gdf["local_moran_i"] = moran_local.Is

    # Visualisation
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    gdf.plot(column=value_col, ax=ax1, legend=True,
             cmap="RdYlBu_r", edgecolor="gray", linewidth=0.3)
    ax1.set_title(f"{value_col} (Moran's I={moran_global.I:.4f}, "
                  f"p={moran_global.p_sim:.4f})")

    # Show readable quadrant names in the legend rather than bare codes.
    # The label column lives only on the plotted copy, so the returned
    # frame keeps exactly the three columns added above.
    cluster_labels = {1: "High-High", 2: "Low-High",
                      3: "Low-Low", 4: "High-Low"}
    sig_gdf = gdf[gdf["lisa_significant"]]
    if len(sig_gdf) > 0:
        labelled = sig_gdf.assign(
            lisa_label=sig_gdf["lisa_cluster"].map(cluster_labels))
        labelled.plot(column="lisa_label", ax=ax2,
                      categorical=True, legend=True,
                      edgecolor="gray", linewidth=0.3)
    ax2.set_title("LISA Clusters (p < 0.05)")

    plt.tight_layout()
    path = "spatial_autocorrelation.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

    print(f"Moran's I = {moran_global.I:.4f}, p = {moran_global.p_sim:.4f}")
    print(f"LISA: {gdf['lisa_significant'].sum()} significant clusters")
    return {"moran_i": moran_global.I, "p_value": moran_global.p_sim,
            "gdf": gdf, "fig": path}
149
+ ```
150
+
151
+ ## 3. クリギング空間補間
152
+
153
+ ```python
154
def kriging_interpolation(points_df, x_col, y_col, value_col,
                          grid_resolution=100,
                          variogram_model="spherical"):
    """
    Interpolate scattered observations onto a regular grid (Ordinary Kriging).

    Rows with a missing coordinate or value are dropped before fitting. The
    prediction and kriging-variance surfaces are plotted side by side and
    saved to ``kriging_result.png``.

    Parameters:
        points_df: pd.DataFrame — observation points
        x_col, y_col: str — coordinate columns
        value_col: str — column to interpolate
        grid_resolution: int — number of grid nodes along each axis
        variogram_model: str — "spherical" / "exponential" / "gaussian"

    Returns:
        dict — "z_pred" and "variance" (the two arrays returned by
        ``OrdinaryKriging.execute``), "grid_x" / "grid_y" (grid node
        coordinates) and "fig" (path of the saved figure).

    Raises:
        ValueError — if no row has all three of x, y and value present.
    """
    # Missing coordinates/values cannot be kriged; remove them up front and
    # fail with a clear message when nothing usable is left.
    complete = points_df[[x_col, y_col, value_col]].dropna()
    if complete.empty:
        raise ValueError(
            "kriging_interpolation needs at least one row with non-missing "
            f"{x_col!r}, {y_col!r} and {value_col!r}")

    from pykrige.ok import OrdinaryKriging
    import matplotlib.pyplot as plt

    x = complete[x_col].to_numpy(dtype=float)
    y = complete[y_col].to_numpy(dtype=float)
    z = complete[value_col].to_numpy(dtype=float)

    ok = OrdinaryKriging(
        x, y, z,
        variogram_model=variogram_model,
        verbose=False, enable_plotting=False)

    # The grid spans the bounding box of the observations.
    grid_x = np.linspace(x.min(), x.max(), grid_resolution)
    grid_y = np.linspace(y.min(), y.max(), grid_resolution)
    z_pred, ss_pred = ok.execute("grid", grid_x, grid_y)

    extent = [x.min(), x.max(), y.min(), y.max()]
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    im1 = ax1.imshow(z_pred, origin="lower", extent=extent, cmap="viridis")
    # Overlay the observation points on the prediction surface.
    ax1.scatter(x, y, c="red", s=10, edgecolors="black", linewidths=0.5)
    ax1.set_title(f"Kriging Prediction ({variogram_model})")
    plt.colorbar(im1, ax=ax1)

    im2 = ax2.imshow(ss_pred, origin="lower", extent=extent, cmap="Reds")
    ax2.set_title("Kriging Variance (Uncertainty)")
    plt.colorbar(im2, ax=ax2)

    plt.tight_layout()
    path = "kriging_result.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

    print(f"Kriging ({variogram_model}): {grid_resolution}×{grid_resolution} grid, "
          f"{len(x)} observation points")
    return {"z_pred": z_pred, "variance": ss_pred,
            "grid_x": grid_x, "grid_y": grid_y, "fig": path}
207
+ ```
208
+
209
+ ## 4. Folium インタラクティブ地図
210
+
211
+ ```python
212
def interactive_map(gdf, value_col=None, popup_cols=None,
                    tiles="CartoDB positron",
                    output="interactive_map.html"):
    """
    Render a GeoDataFrame as an interactive Folium map and save it as HTML.

    When ``value_col`` is given and the first geometry is a (Multi)Polygon a
    choropleth is drawn; otherwise every feature is drawn as a circle marker
    at its centroid, with an optional popup.

    Parameters:
        gdf: GeoDataFrame — features to draw
        value_col: str | None — column used to colour the choropleth
        popup_cols: list[str] | None — columns shown in marker popups
        tiles: str — tile layer name
        output: str — path of the HTML file to write

    Returns:
        str — ``output``, the path of the saved HTML file.

    Raises:
        ValueError — if ``gdf`` has no features.
    """
    if len(gdf) == 0:
        # An empty frame has no centroid to centre the map on.
        raise ValueError("interactive_map requires at least one feature")

    import folium
    import pandas as pd

    # Folium positions everything by latitude/longitude, so projected data
    # must be converted; frames without a CRS are used as-is.
    if gdf.crs is not None and gdf.crs != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    centroids = gdf.geometry.centroid  # computed once, reused for markers
    center = [centroids.y.mean(), centroids.x.mean()]
    m = folium.Map(location=center, zoom_start=8, tiles=tiles)

    if value_col and gdf.geometry.geom_type.iloc[0] in ["Polygon", "MultiPolygon"]:
        geo = gdf.__geo_interface__
        # Take the join keys from the GeoJSON itself so that they match
        # "feature.id" exactly, whatever the index type or name is.
        key_col = "__feature_id__"
        data = pd.DataFrame({
            key_col: [feature["id"] for feature in geo["features"]],
            value_col: gdf[value_col].to_numpy(),
        })
        folium.Choropleth(
            geo_data=geo,
            data=data, columns=[key_col, value_col],
            key_on="feature.id",
            fill_color="YlOrRd",
            legend_name=value_col
        ).add_to(m)
    else:
        for (_, row), point in zip(gdf.iterrows(), centroids):
            popup_text = ""
            if popup_cols:
                popup_text = "<br>".join(
                    [f"<b>{c}</b>: {row[c]}" for c in popup_cols])
            folium.CircleMarker(
                location=[point.y, point.x],
                # No popup at all (rather than an empty bubble) when there
                # is nothing to show.
                radius=5, popup=popup_text or None,
                color="blue", fill=True
            ).add_to(m)

    m.save(output)
    print(f"Interactive map → {output} ({len(gdf)} features)")
    return output
254
+ ```
255
+
256
+ ---
257
+
258
+ ## パイプライン統合
259
+
260
+ ```
261
+ environmental-geodata → geospatial-analysis → advanced-visualization
262
+ (環境データ取得) (空間解析) (高度可視化)
263
+ │ │ ↓
264
+ epidemiology ───────────────┘ interactive-dashboard
265
+ (空間疫学) (ダッシュボード)
266
+ ```
267
+
268
+ ## パイプライン出力
269
+
270
+ | ファイル | 説明 | 次スキル |
271
+ |---------|------|---------|
272
+ | `spatial_autocorrelation.png` | Moran's I + LISA | → reporting |
273
+ | `kriging_result.png` | クリギング補間 | → visualization |
274
+ | `interactive_map.html` | Folium 地図 | → dashboard |
@@ -0,0 +1,247 @@
1
+ ---
2
+ name: scientific-model-monitoring
3
+ description: |
4
+ MLOps モデル監視スキル。データドリフト検出 (Evidently/NannyML)・
5
+ モデル性能劣化検出・特徴量ドリフト・コンセプトドリフト・
6
+ A/B テスト統計・モデルレジストリ・再学習トリガー。
7
+ ---
8
+
9
+ # Scientific Model Monitoring
10
+
11
+ 本番環境の ML モデル監視パイプラインを提供し、
12
+ データドリフト・性能劣化を検出して再学習トリガーを実現する。
13
+
14
+ ## When to Use
15
+
16
+ - デプロイ済みモデルの予測品質を継続監視するとき
17
+ - データドリフト (共変量シフト) を検出するとき
18
+ - コンセプトドリフト (P(Y|X) の変化) を検出するとき
19
+ - A/B テストで新旧モデルを比較するとき
20
+ - 特徴量分布の変化を追跡するとき
21
+ - 再学習トリガーの自動化ルールを設定するとき
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ## 1. データドリフト検出
28
+
29
+ ```python
30
+ import numpy as np
31
+ import pandas as pd
32
+ from scipy import stats
33
+
34
+
35
def detect_data_drift(reference_df, current_df,
                      method="ks", threshold=None):
    """
    Detect per-feature data drift between reference and current data.

    Only numeric columns of ``reference_df`` that also exist in
    ``current_df`` are compared; missing values are dropped per column, and
    a column with no remaining values on either side is skipped.

    Parameters:
        reference_df: pd.DataFrame — training-time (reference) data
        current_df: pd.DataFrame — inference-time (current) data
        method: str — "ks" (two-sample KS test) / "psi" (Population
            Stability Index) / "wasserstein" (distance divided by the
            reference standard deviation)
        threshold: float | None — decision threshold. For "ks" drift is
            flagged when p-value < threshold; for "psi" and "wasserstein"
            when the statistic > threshold. ``None`` selects the per-method
            default: 0.05 (ks), 0.2 (psi), 0.1 (wasserstein).

    Returns:
        pd.DataFrame — one row per compared feature with a "feature" column,
        the method-specific statistics and a boolean "is_drift" column.
        Empty (but with "feature" and "is_drift" columns) when nothing could
        be compared.

    Raises:
        ValueError — if ``method`` is not one of the supported names.
    """
    default_thresholds = {"ks": 0.05, "psi": 0.2, "wasserstein": 0.1}
    if method not in default_thresholds:
        raise ValueError(
            f"Unknown method {method!r}; expected one of "
            f"{sorted(default_thresholds)}")
    if threshold is None:
        threshold = default_thresholds[method]

    numeric_cols = reference_df.select_dtypes(include=[np.number]).columns
    common_cols = [c for c in numeric_cols if c in current_df.columns]

    drift_results = []

    for col in common_cols:
        ref_vals = reference_df[col].dropna().values
        cur_vals = current_df[col].dropna().values

        # Nothing to compare when one side is entirely missing.
        if len(ref_vals) == 0 or len(cur_vals) == 0:
            continue

        if method == "ks":
            stat, p_value = stats.ks_2samp(ref_vals, cur_vals)
            drift_results.append({
                "feature": col, "statistic": stat,
                "p_value": p_value, "is_drift": p_value < threshold})

        elif method == "psi":
            psi_val = _compute_psi(ref_vals, cur_vals)
            drift_results.append({
                "feature": col, "psi": psi_val,
                "is_drift": psi_val > threshold,
                # Severity bands are fixed and independent of the threshold.
                "severity": "high" if psi_val > 0.25 else
                            "medium" if psi_val > 0.1 else "low"})

        else:  # "wasserstein"
            w_dist = stats.wasserstein_distance(ref_vals, cur_vals)
            ref_std = np.std(ref_vals)
            # Scale by the reference spread so the threshold is unit-free;
            # a constant reference column falls back to the raw distance.
            normalized = w_dist / ref_std if ref_std > 0 else w_dist
            drift_results.append({
                "feature": col, "wasserstein": w_dist,
                "normalized": normalized, "is_drift": normalized > threshold})

    result_df = pd.DataFrame(drift_results)
    if result_df.empty:
        # Keep the columns callers rely on even when no feature was compared.
        result_df = pd.DataFrame({
            "feature": pd.Series(dtype=object),
            "is_drift": pd.Series(dtype=bool)})

    n_drift = int(result_df["is_drift"].sum())
    print(f"Data Drift ({method}): {n_drift}/{len(common_cols)} features drifted")
    return result_df
85
+
86
+
87
+ def _compute_psi(expected, actual, n_bins=10):
88
+ """PSI (Population Stability Index) 計算。"""
89
+ breakpoints = np.quantile(expected, np.linspace(0, 1, n_bins + 1))
90
+ breakpoints[0] = -np.inf
91
+ breakpoints[-1] = np.inf
92
+
93
+ expected_pct = np.histogram(expected, bins=breakpoints)[0] / len(expected)
94
+ actual_pct = np.histogram(actual, bins=breakpoints)[0] / len(actual)
95
+
96
+ expected_pct = np.clip(expected_pct, 1e-4, None)
97
+ actual_pct = np.clip(actual_pct, 1e-4, None)
98
+
99
+ psi = np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct))
100
+ return psi
101
+ ```
102
+
103
+ ## 2. モデル性能劣化検出
104
+
105
+ ```python
106
def detect_performance_degradation(y_true_batches, y_pred_batches,
                                   metric="accuracy",
                                   window_size=10, alert_threshold=0.05):
    """
    Detect model performance degradation over a sequence of batches.

    Each batch is scored with ``metric``; the mean of the first
    ``window_size`` scores (baseline) is compared with the mean of the last
    ``window_size`` scores (current). The score history is plotted and saved
    to ``performance_monitoring.png``.

    Parameters:
        y_true_batches: list[np.ndarray] — ground truth per batch
        y_pred_batches: list[np.ndarray] — predictions per batch
        metric: str — "accuracy" / "f1" / "rmse" / "auc". "rmse" is scored
            as negative RMSE so that, like the other metrics, higher is
            better.
        window_size: int — moving-average / comparison window, in batches
        alert_threshold: float — drop from baseline that raises the alert

    Returns:
        dict — "baseline", "current", "degradation" (baseline - current),
        "is_degraded" (degradation > alert_threshold), "scores" (per-batch
        scores) and "fig" (path of the saved figure).

    Raises:
        ValueError — if no batches are given, the two batch lists differ in
            length, or ``window_size`` is smaller than 1.
        KeyError — if ``metric`` is not a supported name.
    """
    y_true_batches = list(y_true_batches)
    y_pred_batches = list(y_pred_batches)
    if not y_true_batches:
        raise ValueError("At least one batch is required")
    if len(y_true_batches) != len(y_pred_batches):
        # zip() would silently drop the unmatched batches.
        raise ValueError(
            f"Batch count mismatch: {len(y_true_batches)} true vs "
            f"{len(y_pred_batches)} predicted")
    if window_size < 1:
        raise ValueError("window_size must be >= 1")

    from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
    from sklearn.metrics import roc_auc_score
    import matplotlib.pyplot as plt

    metric_funcs = {
        "accuracy": accuracy_score,
        "f1": lambda y, p: f1_score(y, p, average="macro"),
        # Negated so that a larger value always means a better model.
        "rmse": lambda y, p: -np.sqrt(mean_squared_error(y, p)),
        "auc": lambda y, p: roc_auc_score(y, p)
    }

    func = metric_funcs[metric]
    scores = [func(yt, yp) for yt, yp in zip(y_true_batches, y_pred_batches)]

    # Moving average; only defined once a full window of batches exists.
    scores_arr = np.array(scores)
    if len(scores_arr) >= window_size:
        ma = np.convolve(scores_arr, np.ones(window_size) / window_size,
                         mode="valid")
    else:
        ma = np.array([])

    # Baseline = first window, current = last window.
    baseline = np.mean(scores_arr[:window_size])
    current = np.mean(scores_arr[-window_size:])
    degradation = baseline - current

    is_degraded = degradation > alert_threshold

    # Visualisation
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(scores, "b-o", markersize=3, alpha=0.5, label="Batch score")
    if len(ma) > 0:
        # ma[i] averages batches i .. i+window_size-1, so it is drawn at the
        # last batch of its window.
        ax.plot(range(window_size - 1, window_size - 1 + len(ma)),
                ma, "r-", linewidth=2, label=f"MA({window_size})")
    ax.axhline(baseline, color="g", linestyle="--",
               label=f"Baseline={baseline:.4f}")
    ax.axhline(baseline - alert_threshold, color="orange", linestyle="--",
               label=f"Alert={baseline - alert_threshold:.4f}")
    ax.set_xlabel("Batch")
    ax.set_ylabel(metric)
    ax.set_title(f"Model Performance Monitoring ({metric})")
    ax.legend()

    path = "performance_monitoring.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

    status = "DEGRADED ⚠️" if is_degraded else "OK ✓"
    print(f"Performance ({metric}): baseline={baseline:.4f}, "
          f"current={current:.4f}, Δ={degradation:.4f} → {status}")
    return {"baseline": baseline, "current": current,
            "degradation": degradation, "is_degraded": is_degraded,
            "scores": scores, "fig": path}
172
+ ```
173
+
174
+ ## 3. A/B テスト統計
175
+
176
+ ```python
177
def ab_test_models(y_true, preds_a, preds_b, metric="accuracy",
                   n_bootstrap=10000, alpha=0.05):
    """
    Compare two models on the same labelled data with a paired bootstrap.

    The data is resampled with replacement ``n_bootstrap`` times (fixed seed
    42); for every resample the score difference B - A is recorded and a
    percentile confidence interval is derived from those differences.

    Parameters:
        y_true: array-like — ground truth
        preds_a: array-like — predictions of model A
        preds_b: array-like — predictions of model B
        metric: str — "accuracy" / "f1" (higher is better) or "rmse"
            (lower is better)
        n_bootstrap: int — number of bootstrap resamples
        alpha: float — significance level; the interval covers 1 - alpha

    Returns:
        dict — "score_a", "score_b", "diff" (score_b - score_a), "ci"
        ((lower, upper) bounds of the B - A difference), "p_value" (share
        of resamples in which B was not better than A) and "winner"
        ("A", "B" or "Tie"; a model wins only when the whole interval lies
        on its side of zero).

    Raises:
        ValueError — if the inputs are empty, differ in length, or
            ``n_bootstrap`` is smaller than 1.
        KeyError — if ``metric`` is not a supported name.
    """
    # Positional indexing with an index array needs real ndarrays; lists
    # and pandas Series would otherwise fail or index by label.
    y_true = np.asarray(y_true)
    preds_a = np.asarray(preds_a)
    preds_b = np.asarray(preds_b)

    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must not be empty")
    if len(preds_a) != n or len(preds_b) != n:
        raise ValueError("y_true, preds_a and preds_b must have equal length")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be >= 1")

    # accuracy and rmse are computed with NumPy: they run 2 * n_bootstrap
    # times, and this avoids per-call input validation in that loop.
    def _accuracy(y, p):
        hits = y == p
        if hits.ndim > 1:
            # Multi-column labels: a row counts only if every label matches.
            hits = hits.reshape(len(hits), -1).all(axis=1)
        return float(np.mean(hits))

    def _rmse(y, p):
        return float(np.sqrt(np.mean((y - p) ** 2)))

    def _macro_f1(y, p):
        from sklearn.metrics import f1_score  # needed for this metric only
        return f1_score(y, p, average="macro")

    metric_funcs = {"accuracy": _accuracy, "f1": _macro_f1, "rmse": _rmse}
    func = metric_funcs[metric]
    lower_is_better = metric == "rmse"

    score_a = func(y_true, preds_a)
    score_b = func(y_true, preds_b)

    # Bootstrap distribution of the difference B - A
    rng = np.random.RandomState(42)
    diffs = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        idx = rng.choice(n, n, replace=True)
        diffs[i] = func(y_true[idx], preds_b[idx]) - func(y_true[idx], preds_a[idx])

    ci_lower = np.percentile(diffs, 100 * alpha / 2)
    ci_upper = np.percentile(diffs, 100 * (1 - alpha / 2))

    # The sign that means "B is better" depends on the metric's direction.
    if lower_is_better:
        p_value = np.mean(diffs >= 0)  # P(B is not better than A)
        b_wins, a_wins = ci_upper < 0, ci_lower > 0
    else:
        p_value = np.mean(diffs <= 0)  # P(B is not better than A)
        b_wins, a_wins = ci_lower > 0, ci_upper < 0

    winner = "B" if b_wins else ("A" if a_wins else "Tie")

    print(f"A/B Test ({metric}): A={score_a:.4f}, B={score_b:.4f}")
    print(f" Δ(B-A)={score_b - score_a:.4f}, "
          f"{100 * (1 - alpha):g}% CI=[{ci_lower:.4f}, {ci_upper:.4f}], "
          f"p={p_value:.4f} → Winner: {winner}")
    return {"score_a": score_a, "score_b": score_b,
            "diff": score_b - score_a, "ci": (ci_lower, ci_upper),
            "p_value": p_value, "winner": winner}
227
+ ```
228
+
229
+ ---
230
+
231
+ ## パイプライン統合
232
+
233
+ ```
234
+ ensemble-methods → model-monitoring → anomaly-detection
235
+ (モデル構築) (監視) (異常検知)
236
+ │ │ ↓
237
+ automl ──────────────┘ active-learning
238
+ (AutoML) (再学習)
239
+ ```
240
+
241
+ ## パイプライン出力
242
+
243
+ | ファイル | 説明 | 次スキル |
244
+ |---------|------|---------|
245
+ | `drift_report.csv` | ドリフト検出結果 | → 再学習判断 |
246
+ | `performance_monitoring.png` | 性能推移 | → reporting |
247
+ | `ab_test_result.json` | A/B テスト結果 | → デプロイ判断 |