@nahisaho/satori 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -16
- package/package.json +1 -1
- package/src/.github/skills/scientific-anomaly-detection/SKILL.md +296 -0
- package/src/.github/skills/scientific-causal-ml/SKILL.md +240 -0
- package/src/.github/skills/scientific-data-profiling/SKILL.md +247 -0
- package/src/.github/skills/scientific-geospatial-analysis/SKILL.md +274 -0
- package/src/.github/skills/scientific-model-monitoring/SKILL.md +247 -0
- package/src/.github/skills/scientific-network-visualization/SKILL.md +278 -0
- package/src/.github/skills/scientific-reproducible-reporting/SKILL.md +330 -0
- package/src/.github/skills/scientific-time-series-forecasting/SKILL.md +246 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-causal-ml
|
|
3
|
+
description: |
|
|
4
|
+
因果機械学習スキル。DoWhy 因果モデル・EconML CATE 推定・
|
|
5
|
+
Double/Debiased ML・Causal Forest・メタラーナー (S/T/X)・
|
|
6
|
+
異質的処置効果 (HTE)・因果特徴量選択。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Causal ML
|
|
10
|
+
|
|
11
|
+
機械学習ベースの因果推論パイプラインを提供し、
|
|
12
|
+
異質的処置効果 (HTE) の推定と因果特徴量発見を実現する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 処置効果が個人/サブグループで異なるとき (HTE 推定)
|
|
17
|
+
- Causal Forest で非パラメトリック因果効果を推定するとき
|
|
18
|
+
- Double ML で高次元データの処置効果を推定するとき
|
|
19
|
+
- メタラーナー (S/T/X-learner) で CATE を推定するとき
|
|
20
|
+
- DoWhy で因果モデルの同定・推定・反論をするとき
|
|
21
|
+
- 因果特徴量選択で重要な効果修飾因子を発見するとき
|
|
22
|
+
|
|
23
|
+
> **Note**: 統計的因果推論 (PSM/IPW/DID/RDD) は `scientific-causal-inference` を参照。
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
## 1. DoWhy 因果モデル
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def dowhy_causal_model(df, treatment, outcome, common_causes,
                       effect_modifiers=None, method="backdoor.linear_regression"):
    """
    Run a DoWhy causal-inference pipeline: identify, estimate, then refute.

    Parameters:
        df: pd.DataFrame — observational data
        treatment: str — treatment variable
        outcome: str — outcome variable
        common_causes: list[str] — covariates passed as common causes
        effect_modifiers: list[str] | None — effect modifiers
        method: str — estimation method forwarded to ``estimate_effect``

    Returns:
        dict with keys ``model``, ``estimand``, ``estimate``,
        ``refutations`` (refuter name -> {"new_effect", "p_value"} for every
        refuter that ran) and ``refutation_errors`` (refuter name -> error
        text for every refuter that raised).
    """
    import dowhy

    model = dowhy.CausalModel(
        data=df,
        treatment=treatment,
        outcome=outcome,
        common_causes=common_causes,
        effect_modifiers=effect_modifiers)

    # Identification
    estimand = model.identify_effect(proceed_when_unidentifiable=True)
    # NOTE(review): this reports front-door variables even though the default
    # method is a back-door one — confirm which variable set should be shown.
    print(f"Identified estimand: {estimand.get_frontdoor_variables()}")

    # Estimation
    estimate = model.estimate_effect(estimand, method_name=method)

    # Fetch the interval once and flatten it: the call may be costly, and the
    # bounds may come back nested (e.g. [[lo, hi]]) rather than as (lo, hi).
    ci_bounds = None
    try:
        ci_values = np.ravel(
            np.asarray(estimate.get_confidence_intervals(), dtype=float))
        if ci_values.size >= 2:
            ci_bounds = (float(ci_values[0]), float(ci_values[1]))
    except Exception as exc:
        # Best effort: a missing interval must not abort the pipeline.
        print(f"Confidence interval unavailable: {type(exc).__name__}: {exc}")

    if ci_bounds is None:
        print(f"ATE = {estimate.value:.4f}")
    else:
        print(f"ATE = {estimate.value:.4f} (95% CI: [{ci_bounds[0]:.4f}, "
              f"{ci_bounds[1]:.4f}])")

    # Refutation tests
    refuter_names = ("random_common_cause", "placebo_treatment_refuter",
                     "data_subset_refuter")
    refutations = {}
    refutation_errors = {}
    for refuter_name in refuter_names:
        try:
            refutation = model.refute_estimate(
                estimand, estimate, method_name=refuter_name)
            # ``refutation_result`` may be absent or None; treat both as
            # "no significance information".
            significance = getattr(refutation, "refutation_result", None) or {}
            refutations[refuter_name] = {
                "new_effect": float(refutation.new_effect),
                "p_value": significance.get("p_value"),
            }
        except Exception as exc:
            # Keep going with the remaining refuters, but record the failure
            # instead of discarding it.
            refutation_errors[refuter_name] = f"{type(exc).__name__}: {exc}"

    # The count is the number of refuters that ran, not a pass/fail verdict.
    print(f"Refutation tests: {len(refutations)}/{len(refuter_names)} completed")
    for failed_name, message in refutation_errors.items():
        print(f"  {failed_name} failed: {message}")

    return {"model": model, "estimand": estimand,
            "estimate": estimate, "refutations": refutations,
            "refutation_errors": refutation_errors}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 2. EconML Double ML / Causal Forest
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def double_ml_estimate(df, treatment, outcome, features,
                       n_splits=5, model_type="linear"):
    """
    Estimate treatment effects with Double/Debiased ML.

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable
        outcome: str — outcome variable
        features: list[str] — covariates
        n_splits: int — number of cross-fitting folds (passed as ``cv``)
        model_type: str — "linear" (LinearDML) or "forest" (CausalForestDML)

    Returns:
        dict with keys ``ate``, ``ate_ci``, ``cate_df`` (features plus
        ``cate``, ``cate_lower``, ``cate_upper`` columns) and ``model``.

    Raises:
        ValueError: if ``model_type`` is neither "linear" nor "forest".
    """
    # Validate first so a typo does not silently select the causal forest.
    if model_type not in ("linear", "forest"):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'linear' or 'forest'")

    from econml.dml import LinearDML, CausalForestDML

    Y = df[outcome].values
    T = df[treatment].values
    X = df[features].values

    if model_type == "linear":
        est = LinearDML(cv=n_splits, random_state=42)
    else:
        est = CausalForestDML(
            n_estimators=200, cv=n_splits, random_state=42)

    est.fit(Y, T, X=X)

    ate = est.ate(X)
    ate_ci = est.ate_interval(X, alpha=0.05)

    # CATE (per-row effects)
    cate = est.effect(X)
    cate_ci = est.effect_interval(X, alpha=0.05)

    result_df = pd.DataFrame(X, columns=features)
    result_df["cate"] = cate
    result_df["cate_lower"] = cate_ci[0]
    result_df["cate_upper"] = cate_ci[1]

    print(f"Double ML ({model_type}): ATE={ate:.4f} "
          f"[{ate_ci[0]:.4f}, {ate_ci[1]:.4f}]")
    print(f" CATE range: [{cate.min():.4f}, {cate.max():.4f}]")
    return {"ate": ate, "ate_ci": ate_ci,
            "cate_df": result_df, "model": est}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def causal_forest(df, treatment, outcome, features,
                  n_estimators=500):
    """
    Fit a Causal Forest and return per-row effect estimates.

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable (documented as binary)
        outcome: str — outcome variable
        features: list[str] — covariates
        n_estimators: int — number of trees

    Returns:
        dict with keys ``cate``, ``cate_ci``, ``feature_importance``
        (DataFrame sorted by ``causal_importance``, descending) and ``model``.
    """
    from econml.dml import CausalForestDML

    outcome_values = df[outcome].values
    treatment_values = df[treatment].values
    covariates = df[features].values

    forest = CausalForestDML(
        n_estimators=n_estimators,
        random_state=42,
        min_samples_leaf=10,
    )
    forest.fit(outcome_values, treatment_values, X=covariates)

    # Effects and their intervals are evaluated on the training covariates.
    cate = forest.effect(covariates)
    cate_ci = forest.effect_interval(covariates, alpha=0.05)

    # Rank the covariates by the forest's importance scores.
    ranking = pd.DataFrame(
        {"feature": features,
         "causal_importance": forest.feature_importances_})
    feat_imp = ranking.sort_values("causal_importance", ascending=False)

    median_effect = np.median(cate)
    top_features = feat_imp.head(5).to_dict('records')
    print(f"Causal Forest: {n_estimators} trees, "
          f"CATE median={median_effect:.4f}")
    print(f" Top causal features: {top_features}")

    return {
        "cate": cate,
        "cate_ci": cate_ci,
        "feature_importance": feat_imp,
        "model": forest,
    }
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## 3. メタラーナー (S/T/X-Learner)
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
def meta_learner(df, treatment, outcome, features,
                 learner_type="t", base_model=None):
    """
    Estimate CATE with an S-, T- or X-learner.

    Parameters:
        df: pd.DataFrame — data
        treatment: str — treatment variable (documented as binary 0/1)
        outcome: str — outcome variable
        features: list[str] — covariates
        learner_type: str — "s" / "t" / "x" (case-insensitive)
        base_model: BaseEstimator | None — base model; when None a
            GradientBoostingRegressor(n_estimators=200, max_depth=5,
            random_state=42) is used

    Returns:
        dict with keys ``cate``, ``cate_df`` (features plus a ``cate``
        column) and ``model``.

    Raises:
        KeyError: if ``learner_type`` is not one of "s", "t", "x".
    """
    # Normalise and validate before the heavy imports. KeyError is kept as
    # the exception type because the original dict lookup raised it.
    learner_key = str(learner_type).lower()
    if learner_key not in ("s", "t", "x"):
        raise KeyError(
            f"Unknown learner_type {learner_type!r}; expected 's', 't' or 'x'")

    from econml.metalearners import SLearner, TLearner, XLearner
    from sklearn.ensemble import GradientBoostingRegressor

    if base_model is None:
        base_model = GradientBoostingRegressor(
            n_estimators=200, max_depth=5, random_state=42)

    Y = df[outcome].values
    T = df[treatment].values
    X = df[features].values

    # The S-learner takes a single overall model; T and X take ``models``.
    if learner_key == "s":
        est = SLearner(overall_model=base_model)
    elif learner_key == "t":
        est = TLearner(models=base_model)
    else:
        est = XLearner(models=base_model)

    est.fit(Y, T, X=X)
    cate = est.effect(X)

    result_df = pd.DataFrame(X, columns=features)
    result_df["cate"] = cate

    print(f"{learner_key.upper()}-Learner: "
          f"CATE mean={cate.mean():.4f}, std={cate.std():.4f}")
    return {"cate": cate, "cate_df": result_df, "model": est}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## パイプライン統合
|
|
225
|
+
|
|
226
|
+
```
|
|
227
|
+
causal-inference → causal-ml → feature-importance
|
|
228
|
+
(統計的因果) (因果 ML) (特徴量解釈)
|
|
229
|
+
│ │ ↓
|
|
230
|
+
clinical-trial ───────┘ explainable-ai
|
|
231
|
+
(臨床試験) (説明可能 AI)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## パイプライン出力
|
|
235
|
+
|
|
236
|
+
| ファイル | 説明 | 次スキル |
|
|
237
|
+
|---------|------|---------|
|
|
238
|
+
| `dowhy_causal_model.json` | DoWhy 因果モデル | → reporting |
|
|
239
|
+
| `cate_estimates.csv` | CATE 推定値 | → precision-medicine |
|
|
240
|
+
| `causal_feature_importance.csv` | 因果特徴量重要度 | → explainable-ai |
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: scientific-data-profiling
|
|
3
|
+
description: |
|
|
4
|
+
データプロファイリング・品質スキル。ydata-profiling 自動 EDA ・
|
|
5
|
+
Great Expectations データバリデーション・データ品質スコア・
|
|
6
|
+
型推論・相関検出・外れ値フラグ・データカタログ生成。
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Scientific Data Profiling
|
|
10
|
+
|
|
11
|
+
データセットの包括的プロファイリング・品質評価・
|
|
12
|
+
自動 EDA レポートパイプラインを提供する。
|
|
13
|
+
|
|
14
|
+
## When to Use
|
|
15
|
+
|
|
16
|
+
- 新しいデータセットの全体像を素早く把握するとき
|
|
17
|
+
- データ品質スコアを算出して品質基準をチェックするとき
|
|
18
|
+
- ydata-profiling で自動 EDA レポートを生成するとき
|
|
19
|
+
- Great Expectations スタイルのルール (pandas による簡易実装) でデータバリデーションを行うとき
|
|
20
|
+
- データカタログ (辞書) を自動生成するとき
|
|
21
|
+
- 相関・外れ値・欠損を一括診断するとき
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
## 1. ydata-profiling 自動 EDA
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import numpy as np
|
|
31
|
+
import pandas as pd
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def auto_profile_report(df, title="Data Profile Report",
                        minimal=False, output="profile_report.html"):
    """
    Write a ydata-profiling EDA report and return a small summary.

    Parameters:
        df: pd.DataFrame — input data
        title: str — report title
        minimal: bool — forwarded to ``ProfileReport(minimal=...)``
        output: str — output HTML path

    Returns:
        dict with keys ``report_path`` and ``summary`` (row/column counts,
        numeric/categorical column counts, missing-value totals and
        percentage, duplicate-row count).
    """
    from ydata_profiling import ProfileReport

    profile = ProfileReport(
        df, title=title, minimal=minimal,
        correlations={"pearson": {"calculate": True},
                      "spearman": {"calculate": True},
                      "kendall": {"calculate": True}},
        missing_diagrams={"bar": True, "matrix": True, "heatmap": True})

    profile.to_file(output)

    # The summary is computed from the DataFrame itself; count the missing
    # cells once and guard the percentage against a zero cell count.
    n_rows = len(df)
    n_cols = len(df.columns)
    n_cells = n_rows * n_cols
    total_missing = int(df.isnull().sum().sum())
    summary = {
        "n_rows": n_rows,
        "n_cols": n_cols,
        "n_numeric": len(df.select_dtypes(include=[np.number]).columns),
        "n_categorical": len(df.select_dtypes(include=["object", "category"]).columns),
        "total_missing": total_missing,
        "missing_pct": float(total_missing / n_cells * 100) if n_cells else 0.0,
        "n_duplicates": int(df.duplicated().sum()),
    }

    print(f"Profile Report → {output}")
    print(f" {summary['n_rows']} rows × {summary['n_cols']} cols, "
          f"{summary['missing_pct']:.1f}% missing, "
          f"{summary['n_duplicates']} duplicates")
    return {"report_path": output, "summary": summary}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 2. データ品質スコア
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
def data_quality_score(df, rules=None):
    """
    Compute a weighted data-quality score on a 0-100 scale.

    Five dimension scores in [0, 1] are combined with fixed weights:
    completeness 0.3, uniqueness 0.2, consistency 0.2, timeliness 0.1,
    validity 0.2.

    Parameters:
        df: pd.DataFrame — input data
        rules: dict | None — custom rules mapping a rule name to a callable
            that receives ``df``; its return value is stored as ``passed``

    Returns:
        dict with keys ``total_score``, ``dimension_scores`` and
        ``rule_results`` (one entry per custom rule; a rule that raises is
        recorded with ``passed`` False and an ``error`` message).
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    n_cells = n_rows * n_cols
    scores = {}

    # 1. Completeness — share of non-missing cells. An empty frame has
    #    nothing missing, so it scores 1.0 instead of dividing by zero.
    if n_cells:
        scores["completeness"] = float(1.0 - df.isnull().sum().sum() / n_cells)
    else:
        scores["completeness"] = 1.0

    # 2. Uniqueness — share of non-duplicated rows.
    if n_rows:
        scores["uniqueness"] = float(1.0 - df.duplicated().sum() / n_rows)
    else:
        scores["uniqueness"] = 1.0

    # 3. Consistency — share of columns whose inferred dtype is not mixed.
    consistent_cols = 0
    for col in df.columns:
        non_null = df[col].dropna()
        if len(non_null) == 0:
            # A column with no values cannot be inconsistent.
            consistent_cols += 1
            continue
        try:
            inferred = pd.api.types.infer_dtype(non_null, skipna=True)
        except Exception:
            # Best effort: a column whose type cannot be inferred is simply
            # not counted as consistent.
            continue
        if inferred not in ("mixed", "mixed-integer"):
            consistent_cols += 1
    scores["consistency"] = consistent_cols / n_cols if n_cols else 1.0

    # 4. Timeliness — placeholder: no freshness metric is computed yet, so
    #    this dimension is always 1.0 whether or not date columns exist.
    scores["timeliness"] = 1.0

    # 5. Validity — mean share of finite values across numeric columns.
    #    Columns with no non-null values are ignored; if none remain the
    #    dimension scores 1.0 rather than NaN.
    validity = 1.0
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0 and n_rows > 0:
        finite_rates = df[numeric_cols].apply(
            lambda values: np.isfinite(values.dropna()).mean()).dropna()
        if len(finite_rates) > 0:
            validity = float(finite_rates.mean())
    scores["validity"] = validity

    # Overall score
    weights = {"completeness": 0.3, "uniqueness": 0.2,
               "consistency": 0.2, "timeliness": 0.1, "validity": 0.2}
    total_score = float(
        sum(scores[name] * weight for name, weight in weights.items()) * 100)

    # Custom rules
    rule_results = []
    if rules:
        for rule_name, rule_fn in rules.items():
            try:
                passed = rule_fn(df)
                rule_results.append({"rule": rule_name, "passed": passed})
            except Exception as e:
                rule_results.append({"rule": rule_name, "passed": False,
                                     "error": str(e)})

    print(f"Data Quality Score: {total_score:.1f}/100")
    for k, v in scores.items():
        print(f" {k}: {v:.3f}")

    return {"total_score": total_score, "dimension_scores": scores,
            "rule_results": rule_results}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## 3. Great Expectations バリデーション
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
def great_expectations_validate(df, expectations=None):
    """
    Validate a DataFrame against Great Expectations-style rules using pandas.

    Supported ``type`` values: ``expect_column_to_exist``,
    ``expect_column_values_to_not_be_null`` (kwarg ``mostly``, default 1.0),
    ``expect_column_values_to_be_between`` (kwargs ``min_value``,
    ``max_value``), ``expect_column_values_to_be_unique``,
    ``expect_column_values_to_be_in_set`` (kwarg ``value_set``) and
    ``expect_table_row_count_to_be_between`` (kwargs ``min_value``,
    ``max_value``). Any other type is recorded with ``success`` None.

    Parameters:
        df: pd.DataFrame — input data
        expectations: list[dict] | None — rules with keys ``type``,
            optional ``column`` and optional ``kwargs``; when None, rules
            are generated by ``_auto_generate_expectations``

    Returns:
        pd.DataFrame with one row per expectation and the columns
        ``expectation``, ``column``, ``success`` (plus ``error`` for rules
        that raised). An empty rule list yields an empty frame with the
        three base columns.
    """
    if expectations is None:
        expectations = _auto_generate_expectations(df)

    results = []
    for exp in expectations:
        exp_type = exp["type"]
        col = exp.get("column")
        kwargs = exp.get("kwargs", {})

        try:
            if exp_type == "expect_column_to_exist":
                success = col in df.columns
            elif exp_type == "expect_column_values_to_not_be_null":
                # ``mostly`` is the minimum fraction of non-null values.
                min_fraction = kwargs.get("mostly", 1.0)
                success = bool(df[col].notnull().mean() >= min_fraction)
            elif exp_type == "expect_column_values_to_be_between":
                lower, upper = kwargs["min_value"], kwargs["max_value"]
                present = df[col].dropna()
                success = bool(((present >= lower) & (present <= upper)).all())
            elif exp_type == "expect_column_values_to_be_unique":
                success = not df[col].duplicated().any()
            elif exp_type == "expect_column_values_to_be_in_set":
                allowed = set(kwargs["value_set"])
                success = bool(df[col].dropna().isin(allowed).all())
            elif exp_type == "expect_table_row_count_to_be_between":
                success = kwargs["min_value"] <= len(df) <= kwargs["max_value"]
            else:
                # Unknown expectation types are reported, not evaluated.
                success = None

            results.append({"expectation": exp_type, "column": col,
                            "success": success})
        except Exception as e:
            # A rule that cannot be evaluated (missing column, missing
            # kwarg, incomparable values) counts as failed, with the reason.
            results.append({"expectation": exp_type, "column": col,
                            "success": False, "error": str(e)})

    if results:
        results_df = pd.DataFrame(results)
    else:
        # Keep the schema stable when there is nothing to validate.
        results_df = pd.DataFrame(columns=["expectation", "column", "success"])

    n_total = len(results)
    n_pass = sum(1 for row in results if row["success"] is True)
    pass_pct = n_pass / n_total * 100 if n_total else 0.0
    print(f"Validation: {n_pass}/{n_total} expectations passed "
          f"({pass_pct:.0f}%)")
    return results_df
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _auto_generate_expectations(df):
|
|
209
|
+
"""自動でバリデーションルールを推論。"""
|
|
210
|
+
expectations = []
|
|
211
|
+
for col in df.columns:
|
|
212
|
+
expectations.append({"type": "expect_column_to_exist", "column": col})
|
|
213
|
+
expectations.append({
|
|
214
|
+
"type": "expect_column_values_to_not_be_null",
|
|
215
|
+
"column": col,
|
|
216
|
+
"kwargs": {"mostly": 0.9}})
|
|
217
|
+
|
|
218
|
+
if df[col].dtype in [np.float64, np.int64]:
|
|
219
|
+
q1, q3 = df[col].quantile([0.01, 0.99])
|
|
220
|
+
iqr = q3 - q1
|
|
221
|
+
expectations.append({
|
|
222
|
+
"type": "expect_column_values_to_be_between",
|
|
223
|
+
"column": col,
|
|
224
|
+
"kwargs": {"min_value": float(q1 - 3 * iqr),
|
|
225
|
+
"max_value": float(q3 + 3 * iqr)}})
|
|
226
|
+
return expectations
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## パイプライン統合
|
|
232
|
+
|
|
233
|
+
```
|
|
234
|
+
[データ取得] → data-profiling → eda-correlation
|
|
235
|
+
(品質診断) (探索的解析)
|
|
236
|
+
│ ↓
|
|
237
|
+
missing-data-analysis anomaly-detection
|
|
238
|
+
(欠損補完) (異常検知)
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## パイプライン出力
|
|
242
|
+
|
|
243
|
+
| ファイル | 説明 | 次スキル |
|
|
244
|
+
|---------|------|---------|
|
|
245
|
+
| `profile_report.html` | ydata-profiling レポート | → EDA |
|
|
246
|
+
| `quality_score.json` | データ品質スコア | → 品質管理 |
|
|
247
|
+
| `validation_results.csv` | バリデーション結果 | → データ修正 |
|