openstat_cli-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0

openstat/commands/advanced_ml_cmds.py
@@ -0,0 +1,576 @@
"""Advanced ML: SHAP, hyperopt, learning curve, cross-validation, PLS/PCR."""

from __future__ import annotations

import numpy as np

from openstat.commands.base import command, CommandArgs, friendly_error
from openstat.session import Session

# ── PLS / PCR ────────────────────────────────────────────────────────────────

@command("pls", usage="pls <y> <x1> [x2 ...] [--components=2]")
def cmd_pls(session: Session, args: str) -> str:
    """Partial Least Squares regression (PLS1/PLS2).

    Handles multicollinearity by projecting predictors into latent components.
    Useful when n_features >> n_samples or predictors are highly correlated.

    Options:
      --components=N   number of latent components (default: 2)
      --cv=K           k-fold cross-validation (default: 5)

    Examples:
      pls y x1 x2 x3 x4 x5 --components=3
      pls outcome pred1 pred2 pred3 --cv=10
    """
    try:
        from sklearn.cross_decomposition import PLSRegression
        from sklearn.model_selection import cross_val_score
        from sklearn.preprocessing import StandardScaler
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    ca = CommandArgs(args)
    preds = [p for p in ca.positional if not p.startswith("-")]
    if len(preds) < 2:
        return "Usage: pls <y> <x1> [x2 ...] [--components=N]"

    y_col = preds[0]
    x_cols = preds[1:]
    n_components = int(ca.options.get("components", 2))
    cv_k = int(ca.options.get("cv", 5))

    try:
        df = session.require_data()
        sub = df.select([y_col] + x_cols).drop_nulls()
        y = sub[y_col].to_numpy().astype(float).reshape(-1, 1)
        X = sub.select(x_cols).to_numpy().astype(float)

        scaler = StandardScaler()
        X_sc = scaler.fit_transform(X)

        n_comp = min(n_components, X.shape[1], X.shape[0] - 1)
        pls = PLSRegression(n_components=n_comp)
        pls.fit(X_sc, y)

        y_pred = pls.predict(X_sc).ravel()
        y_flat = y.ravel()
        ss_res = np.sum((y_flat - y_pred) ** 2)
        ss_tot = np.sum((y_flat - y_flat.mean()) ** 2)
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0

        # CV score
        cv_scores = cross_val_score(pls, X_sc, y_flat, cv=min(cv_k, len(y_flat)), scoring="r2")

        lines = [
            f"Partial Least Squares Regression: {y_col} ~ {' + '.join(x_cols)}",
            f"N={sub.height} Components={n_comp}",
            "=" * 55,
            f" R² : {r2:.4f}",
            f" CV R² (k={min(cv_k, len(y_flat))}): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}",
            "",
            " X Loadings (first component):",
        ]
        x_load = pls.x_loadings_[:, 0]
        for name, load in sorted(zip(x_cols, x_load), key=lambda t: -abs(t[1])):
            lines.append(f" {name:<20} {load:9.4f}")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "pls")

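Reviewer note: cmd_pls is a thin wrapper over scikit-learn's PLSRegression. A minimal
standalone sketch of the same pipeline, on synthetic data (illustrative only, not part
of the package):

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import cross_val_score
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 5))
    y = X @ np.array([1.0, 0.5, 0.0, 0.0, -0.3]) + rng.normal(scale=0.1, size=100)

    X_sc = StandardScaler().fit_transform(X)           # standardize predictors, as cmd_pls does
    pls = PLSRegression(n_components=2).fit(X_sc, y)
    print(cross_val_score(pls, X_sc, y, cv=5, scoring="r2").mean())
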
@command("pcr", usage="pcr <y> <x1> [x2 ...] [--components=2]")
def cmd_pcr(session: Session, args: str) -> str:
    """Principal Component Regression (PCR).

    First runs PCA on predictors, then regresses outcome on components.

    Options:
      --components=N   number of PC components to keep (default: 2)
      --cv=K           k-fold cross-validation (default: 5)

    Examples:
      pcr y x1 x2 x3 x4 x5 --components=3
    """
    try:
        from sklearn.decomposition import PCA
        from sklearn.linear_model import LinearRegression
        from sklearn.model_selection import cross_val_score
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    ca = CommandArgs(args)
    preds = [p for p in ca.positional if not p.startswith("-")]
    if len(preds) < 2:
        return "Usage: pcr <y> <x1> [x2 ...] [--components=N]"

    y_col = preds[0]
    x_cols = preds[1:]
    n_components = int(ca.options.get("components", 2))
    cv_k = int(ca.options.get("cv", 5))

    try:
        df = session.require_data()
        sub = df.select([y_col] + x_cols).drop_nulls()
        y = sub[y_col].to_numpy().astype(float)
        X = sub.select(x_cols).to_numpy().astype(float)

        n_comp = min(n_components, X.shape[1], X.shape[0] - 1)
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("pca", PCA(n_components=n_comp)),
            ("reg", LinearRegression()),
        ])
        pipe.fit(X, y)
        y_pred = pipe.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - y.mean()) ** 2)
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0

        pca = pipe.named_steps["pca"]
        var_exp = pca.explained_variance_ratio_

        cv_scores = cross_val_score(pipe, X, y, cv=min(cv_k, len(y)), scoring="r2")

        lines = [
            f"Principal Component Regression: {y_col} ~ {' + '.join(x_cols)}",
            f"N={sub.height} Components={n_comp}",
            "=" * 55,
            f" R² : {r2:.4f}",
            f" CV R² (k={min(cv_k, len(y))}): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}",
            "",
            " PCA Components — Variance Explained:",
        ]
        cum = 0.0
        for i, ve in enumerate(var_exp):
            cum += ve
            lines.append(f" PC{i+1}: {ve*100:.1f}% (cumulative: {cum*100:.1f}%)")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "pcr")

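Reviewer note: cmd_pcr is the textbook scale, then PCA, then OLS pipeline. An equivalent
standalone sketch on synthetic data (illustrative only, not part of the package):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    rng = np.random.default_rng(0)
    X = rng.normal(size=(80, 6))
    y = X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.2, size=80)

    pipe = Pipeline([("scaler", StandardScaler()),
                     ("pca", PCA(n_components=3)),
                     ("reg", LinearRegression())]).fit(X, y)
    # Same per-component quantity that cmd_pcr reports:
    print(pipe.named_steps["pca"].explained_variance_ratio_)
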
# ── Cross-validation ─────────────────────────────────────────────────────────

@command("crossval", usage="crossval [--folds=5] [--metric=r2|rmse|mae|accuracy|auc]")
def cmd_crossval(session: Session, args: str) -> str:
    """K-fold cross-validation on the last fitted model.

    Evaluates model generalization using the dataset in the current session.

    Options:
      --folds=K        number of folds (default: 5)
      --metric=<name>  scoring metric: r2, rmse, mae, accuracy, auc (default: r2)
      --seed=N         random seed

    Examples:
      ols income educ age
      crossval --folds=10 --metric=rmse

      logit employed educ age female
      crossval --metric=auc
    """
    try:
        from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    ca = CommandArgs(args)
    k = int(ca.options.get("folds", 5))
    metric = ca.options.get("metric", "r2").lower()
    seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))

    if session._last_model is None or session._last_model_vars is None:
        return "No model fitted. Run ols/logit/etc. first."

    dep, indeps = session._last_model_vars
    try:
        df = session.require_data()
        sub = df.select([dep] + indeps).drop_nulls()
        y = sub[dep].to_numpy().astype(float)
        X = sub.select(indeps).to_numpy().astype(float)

        # Rebuild an equivalent sklearn model from the statsmodels fit
        model_name = type(session._last_model.model).__name__.lower()

        if "logit" in model_name or "probit" in model_name or "mnlogit" in model_name:
            from sklearn.linear_model import LogisticRegression
            sk_model = LogisticRegression(max_iter=500, random_state=seed)
            cv = StratifiedKFold(n_splits=min(k, len(y)), shuffle=True, random_state=seed)
            metric_map = {
                "accuracy": "accuracy", "auc": "roc_auc",
                "r2": "accuracy",  # fallback for classification
            }
            sk_metric = metric_map.get(metric, "accuracy")
        else:
            from sklearn.linear_model import LinearRegression
            sk_model = LinearRegression()
            cv = KFold(n_splits=min(k, len(y)), shuffle=True, random_state=seed)
            metric_map = {
                "r2": "r2", "rmse": "neg_root_mean_squared_error",
                "mae": "neg_mean_absolute_error",
            }
            sk_metric = metric_map.get(metric, "r2")

        scores = cross_val_score(sk_model, X, y, cv=cv, scoring=sk_metric)
        # sklearn exposes error metrics negated ("neg_*"); flip the sign back for display
        if sk_metric.startswith("neg_"):
            scores = -scores
        display_metric = metric.upper()

        lines = [
            f"Cross-Validation: {dep} ~ {' + '.join(indeps)}",
            f"Folds={min(k, len(y))} Metric={display_metric} N={len(y)}",
            "=" * 45,
            f" Mean: {scores.mean():.4f}",
            f" Std: {scores.std():.4f}",
            f" Min: {scores.min():.4f}",
            f" Max: {scores.max():.4f}",
            "",
            " Per-fold scores:",
        ]
        for i, s in enumerate(scores):
            lines.append(f" Fold {i+1}: {s:.4f}")
        return "\n".join(lines)

    except Exception as e:
        return friendly_error(e, "crossval")

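Reviewer note: the sign flip above follows scikit-learn's convention that error metrics
are exposed negated ("neg_*") so that higher is always better. A minimal illustration of
the same handling on synthetic data (not part of the package):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, cross_val_score

    rng = np.random.default_rng(1)
    X = rng.normal(size=(60, 3))
    y = X @ np.array([2.0, -1.0, 0.5]) + rng.normal(scale=0.3, size=60)

    cv = KFold(n_splits=5, shuffle=True, random_state=1)
    scores = cross_val_score(LinearRegression(), X, y, cv=cv,
                             scoring="neg_root_mean_squared_error")
    print((-scores).mean())   # negate once to report RMSE, as cmd_crossval does
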
# ── Hyperparameter Optimization ──────────────────────────────────────────────

@command("hyperopt", usage="hyperopt <y> <x1> [x2 ...] --model=rf|gb|svm|logit [--cv=5]")
def cmd_hyperopt(session: Session, args: str) -> str:
    """Hyperparameter optimization via randomized search (RandomizedSearchCV).

    Finds optimal hyperparameters for ML models using cross-validation.

    Models: rf (Random Forest), gb (Gradient Boosting), svm, logit, ridge, lasso

    Options:
      --model=<name>   model to optimize (required)
      --cv=K           cross-validation folds (default: 5)
      --n_iter=N       number of random search iterations (default: 20)
      --metric=<name>  scoring metric (default: r2 or accuracy)
      --task=reg|class regression or classification (auto-detected)
      --seed=N         random seed

    Examples:
      hyperopt income educ age --model=rf
      hyperopt employed educ age female --model=gb --task=class --cv=10
    """
    try:
        from sklearn.model_selection import RandomizedSearchCV
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    ca = CommandArgs(args)
    preds = [p for p in ca.positional if not p.startswith("-")]
    if len(preds) < 2:
        return "Usage: hyperopt <y> <x1> [x2 ...] --model=rf|gb|svm|logit"

    y_col = preds[0]
    x_cols = preds[1:]
    model_name = ca.options.get("model", "rf").lower()
    cv_k = int(ca.options.get("cv", 5))
    n_iter = int(ca.options.get("n_iter", 20))
    seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))

    try:
        df = session.require_data()
        sub = df.select([y_col] + x_cols).drop_nulls()
        y = sub[y_col].to_numpy().astype(float)
        X = sub.select(x_cols).to_numpy().astype(float)

        # Auto-detect task: few unique integer values => classification
        task = ca.options.get("task", "")
        if not task:
            n_uniq = len(set(y))
            task = "class" if n_uniq <= 10 and (y == y.astype(int)).all() else "reg"

        is_clf = task.startswith("class")
        metric = ca.options.get("metric", "accuracy" if is_clf else "r2")

        # Model + parameter distributions
        from scipy.stats import randint, uniform
        if model_name == "rf":
            from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
            mdl = RandomForestClassifier(random_state=seed) if is_clf else RandomForestRegressor(random_state=seed)
            param_dist = {"n_estimators": randint(50, 300), "max_depth": [None, 3, 5, 10, 20],
                          "min_samples_split": randint(2, 10), "max_features": ["sqrt", "log2", None]}
        elif model_name == "gb":
            from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
            mdl = GradientBoostingClassifier(random_state=seed) if is_clf else GradientBoostingRegressor(random_state=seed)
            param_dist = {"n_estimators": randint(50, 300), "learning_rate": uniform(0.01, 0.3),
                          "max_depth": randint(2, 8), "subsample": uniform(0.6, 0.4)}
        elif model_name == "svm":
            from sklearn.svm import SVC, SVR
            from sklearn.preprocessing import StandardScaler
            from sklearn.pipeline import Pipeline
            base = SVC(random_state=seed, probability=True) if is_clf else SVR()
            mdl = Pipeline([("sc", StandardScaler()), ("svm", base)])
            param_dist = {"svm__C": uniform(0.01, 100), "svm__kernel": ["rbf", "linear"],
                          "svm__gamma": ["scale", "auto"]}
        elif model_name in ("logit", "ridge"):
            from sklearn.linear_model import LogisticRegression, Ridge
            if is_clf:
                mdl = LogisticRegression(max_iter=500, random_state=seed)
                param_dist = {"C": uniform(0.001, 10), "penalty": ["l2"], "solver": ["lbfgs", "liblinear"]}
            else:
                mdl = Ridge(random_state=seed)
                param_dist = {"alpha": uniform(0.001, 10)}
        elif model_name == "lasso":
            from sklearn.linear_model import Lasso
            mdl = Lasso(random_state=seed)
            param_dist = {"alpha": uniform(0.001, 10)}
        else:
            return f"Unknown model: {model_name}. Use rf, gb, svm, logit, ridge, lasso."

        search = RandomizedSearchCV(
            mdl, param_distributions=param_dist,
            n_iter=n_iter, cv=min(cv_k, len(y)),
            scoring=metric, random_state=seed, n_jobs=-1,
        )
        search.fit(X, y)

        best_params = search.best_params_
        lines = [
            f"Hyperparameter Optimization: {y_col} ~ {' + '.join(x_cols)}",
            f"Model: {model_name.upper()} Task: {'Classification' if is_clf else 'Regression'}",
            f"Search: RandomSearch({n_iter} iterations) CV={min(cv_k, len(y))} Metric={metric}",
            "=" * 60,
            f" Best score: {search.best_score_:.4f}",
            "",
            " Best parameters:",
        ]
        for k_p, v in sorted(best_params.items()):
            lines.append(f" {k_p:<30} {v}")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "hyperopt")

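Reviewer note: the parameter grids mix plain lists with scipy.stats distributions, which
RandomizedSearchCV samples from on each iteration. A standalone sketch of the rf branch
on synthetic data (illustrative only, not part of the package):

    import numpy as np
    from scipy.stats import randint
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV

    rng = np.random.default_rng(2)
    X = rng.normal(size=(120, 4))
    y = X[:, 0] ** 2 + X[:, 1] + rng.normal(scale=0.2, size=120)

    search = RandomizedSearchCV(
        RandomForestRegressor(random_state=0),
        {"n_estimators": randint(50, 300), "max_depth": [None, 3, 5, 10]},
        n_iter=10, cv=5, scoring="r2", random_state=0, n_jobs=-1,
    )
    search.fit(X, y)
    print(search.best_score_, search.best_params_)
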
# ── SHAP values ──────────────────────────────────────────────────────────────

@command("shap", usage="shap <y> <x1> [x2 ...] [--model=rf|gb|linear] [--plot]")
def cmd_shap(session: Session, args: str) -> str:
    """SHAP (SHapley Additive exPlanations) feature importance.

    Computes SHAP values to explain model predictions.
    Works with tree models (RF, GB) and linear models.

    Options:
      --model=rf|gb|linear  model type (default: rf)
      --plot                save SHAP summary plot
      --n_samples=N         max samples for SHAP (default: 500)
      --seed=N              random seed

    Examples:
      shap income educ age female --model=rf --plot
      shap y x1 x2 x3 --model=gb
    """
    try:
        import shap as _shap
    except ImportError:
        return "shap required. Install: pip install shap"
    try:
        from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
        from sklearn.linear_model import LinearRegression
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    ca = CommandArgs(args)
    preds = [p for p in ca.positional if not p.startswith("-")]
    if len(preds) < 2:
        return "Usage: shap <y> <x1> [x2 ...] [--model=rf]"

    y_col = preds[0]
    x_cols = preds[1:]
    model_name = ca.options.get("model", "rf").lower()
    make_plot = "--plot" in args
    n_samples = int(ca.options.get("n_samples", 500))
    seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))

    try:
        df = session.require_data()
        sub = df.select([y_col] + x_cols).drop_nulls()
        y = sub[y_col].to_numpy().astype(float)
        X = sub.select(x_cols).to_numpy().astype(float)

        # Explain a random subset if the dataset is large
        if len(X) > n_samples:
            rng = np.random.default_rng(seed)
            idx = rng.choice(len(X), n_samples, replace=False)
            X_shap, y_shap = X[idx], y[idx]
        else:
            X_shap, y_shap = X, y

        if model_name == "rf":
            mdl = RandomForestRegressor(n_estimators=100, random_state=seed)
            mdl.fit(X, y)
            explainer = _shap.TreeExplainer(mdl)
        elif model_name == "gb":
            mdl = GradientBoostingRegressor(n_estimators=100, random_state=seed)
            mdl.fit(X, y)
            explainer = _shap.TreeExplainer(mdl)
        else:  # linear
            mdl = LinearRegression()
            mdl.fit(X, y)
            explainer = _shap.LinearExplainer(mdl, X_shap)

        shap_values = explainer.shap_values(X_shap)
        mean_abs = np.abs(shap_values).mean(axis=0)

        lines = [
            f"SHAP Feature Importance: {y_col} ~ {' + '.join(x_cols)}",
            f"Model: {model_name.upper()} N={len(X_shap)} samples",
            "=" * 50,
            f" {'Feature':<25} {'Mean |SHAP|':>12}",
            "-" * 50,
        ]
        for name, val in sorted(zip(x_cols, mean_abs), key=lambda t: -t[1]):
            bar = "█" * int(val / mean_abs.max() * 20)
            lines.append(f" {name:<25} {val:12.4f} {bar}")

        if make_plot:
            import matplotlib
            matplotlib.use("Agg")
            import matplotlib.pyplot as plt

            fig, ax = plt.subplots(figsize=(8, max(4, len(x_cols) * 0.5 + 1)))
            order = np.argsort(mean_abs)
            ax.barh([x_cols[i] for i in order], mean_abs[order], color="#4C72B0")
            ax.set_xlabel("Mean |SHAP value|")
            ax.set_title(f"SHAP Feature Importance ({model_name.upper()})")
            fig.tight_layout()
            session.output_dir.mkdir(parents=True, exist_ok=True)
            path = session.output_dir / "shap_importance.png"
            fig.savefig(path, dpi=150)
            plt.close(fig)
            session.plot_paths.append(str(path))
            lines.append(f"\nSHAP plot saved: {path}")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "shap")

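Reviewer note: the importance table is the mean absolute SHAP value per feature, the usual
global-importance summary for tree models. A minimal sketch on synthetic data (requires the
shap package; illustrative only, not part of the package):

    import numpy as np
    import shap
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(3)
    X = rng.normal(size=(200, 3))
    y = 3 * X[:, 0] + rng.normal(scale=0.1, size=200)

    mdl = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
    sv = shap.TreeExplainer(mdl).shap_values(X)
    print(np.abs(sv).mean(axis=0))   # feature 0 should dominate
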
# ── Learning curve ───────────────────────────────────────────────────────────

@command("learncurve", usage="learncurve <y> <x1> [x2 ...] [--model=ols|rf|logit]")
def cmd_learncurve(session: Session, args: str) -> str:
    """Plot learning curve: training and CV score vs. training set size.

    Helps diagnose the bias/variance tradeoff.

    Options:
      --model=ols|rf|logit|gb  model (default: ols for continuous y, logit for binary)
      --cv=K                   cross-validation folds (default: 5)
      --steps=N                number of training size steps (default: 10)
      --seed=N                 random seed

    Examples:
      learncurve income educ age
      learncurve employed educ age female --model=logit
    """
    try:
        from sklearn.model_selection import learning_curve
        from sklearn.linear_model import LinearRegression, LogisticRegression
        from sklearn.ensemble import (
            RandomForestRegressor, RandomForestClassifier,
            GradientBoostingRegressor, GradientBoostingClassifier,
        )
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    ca = CommandArgs(args)
    preds = [p for p in ca.positional if not p.startswith("-")]
    if len(preds) < 2:
        return "Usage: learncurve <y> <x1> [x2 ...]"

    y_col = preds[0]
    x_cols = preds[1:]
    cv_k = int(ca.options.get("cv", 5))
    steps = int(ca.options.get("steps", 10))
    seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))

    try:
        df = session.require_data()
        sub = df.select([y_col] + x_cols).drop_nulls()
        y = sub[y_col].to_numpy().astype(float)
        X = sub.select(x_cols).to_numpy().astype(float)

        model_opt = ca.options.get("model", "")
        n_uniq = len(set(y))
        is_clf = (n_uniq <= 10 and (y == y.astype(int)).all()) if not model_opt else ("logit" in model_opt or "class" in model_opt)

        mdl_map = {
            "ols": LinearRegression(), "linear": LinearRegression(),
            "logit": LogisticRegression(max_iter=500, random_state=seed),
            "rf": RandomForestClassifier(random_state=seed) if is_clf else RandomForestRegressor(random_state=seed),
            # Match the estimator to the detected task (a regressor would fail under accuracy scoring)
            "gb": GradientBoostingClassifier(random_state=seed) if is_clf else GradientBoostingRegressor(random_state=seed),
        }
        mdl = mdl_map.get(model_opt, LogisticRegression(max_iter=500, random_state=seed) if is_clf else LinearRegression())
        metric = "accuracy" if is_clf else "r2"

        train_sizes = np.linspace(0.1, 1.0, steps)
        ts, train_scores, cv_scores = learning_curve(
            mdl, X, y, cv=min(cv_k, len(y)), scoring=metric,
            train_sizes=train_sizes, shuffle=True, random_state=seed,
        )

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.fill_between(ts, train_scores.mean(1) - train_scores.std(1),
                        train_scores.mean(1) + train_scores.std(1), alpha=0.15, color="#4C72B0")
        ax.plot(ts, train_scores.mean(1), "o-", color="#4C72B0", label="Train")
        ax.fill_between(ts, cv_scores.mean(1) - cv_scores.std(1),
                        cv_scores.mean(1) + cv_scores.std(1), alpha=0.15, color="#DD8452")
        ax.plot(ts, cv_scores.mean(1), "s-", color="#DD8452", label="CV")
        ax.set_xlabel("Training set size")
        ax.set_ylabel(metric.upper())
        ax.set_title(f"Learning Curve: {y_col}")
        ax.legend()
        fig.tight_layout()

        session.output_dir.mkdir(parents=True, exist_ok=True)
        path = session.output_dir / "learning_curve.png"
        fig.savefig(path, dpi=150)
        plt.close(fig)
        session.plot_paths.append(str(path))

        final_cv = cv_scores.mean(1)[-1]
        final_train = train_scores.mean(1)[-1]
        gap = final_train - final_cv
        diagnosis = (
            "Possible overfitting (high variance)" if gap > 0.1 else
            "Possible underfitting (high bias)" if final_cv < 0.5 else
            "Good fit"
        )

        return (
            f"Learning Curve: {y_col} ~ {' + '.join(x_cols)}\n"
            f"Final train {metric}: {final_train:.4f} | "
            f"CV {metric}: {final_cv:.4f} | Gap: {gap:.4f}\n"
            f"Diagnosis: {diagnosis}\n"
            f"Plot saved: {path}"
        )
    except Exception as e:
        return friendly_error(e, "learncurve")
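
Reviewer note: the diagnosis heuristic compares the final train and CV scores; a gap above
0.1 is read as variance (overfitting) and a low CV score as bias (underfitting). The
underlying call is sklearn.model_selection.learning_curve; a minimal sketch on synthetic
data (illustrative only, not part of the package):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import learning_curve

    rng = np.random.default_rng(4)
    X = rng.normal(size=(150, 3))
    y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(scale=0.5, size=150)

    sizes, tr, cv = learning_curve(
        LinearRegression(), X, y, cv=5, scoring="r2",
        train_sizes=np.linspace(0.1, 1.0, 5), shuffle=True, random_state=4,
    )
    gap = tr.mean(axis=1)[-1] - cv.mean(axis=1)[-1]   # > 0.1 would flag overfitting
    print(gap)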