openstat_cli-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/commands/advanced_ml_cmds.py
@@ -0,0 +1,576 @@
+ """Advanced ML: SHAP, hyperopt, learning curve, cross-validation, PLS/PCR."""
+
+ from __future__ import annotations
+
+ import numpy as np
+
+ from openstat.commands.base import command, CommandArgs, friendly_error
+ from openstat.session import Session
+
+
+ # ── PLS / PCR ────────────────────────────────────────────────────────────────
+
+ @command("pls", usage="pls <y> <x1> [x2 ...] [--components=2]")
+ def cmd_pls(session: Session, args: str) -> str:
+     """Partial Least Squares regression (PLS1/PLS2).
+
+     Handles multicollinearity by projecting predictors into latent components.
+     Useful when n_features >> n_samples or predictors are highly correlated.
+
+     Options:
+         --components=N   number of latent components (default: 2)
+         --cv=K           k-fold cross-validation (default: 5)
+
+     Examples:
+         pls y x1 x2 x3 x4 x5 --components=3
+         pls outcome pred1 pred2 pred3 --cv=10
+     """
+     try:
+         from sklearn.cross_decomposition import PLSRegression
+         from sklearn.model_selection import cross_val_score
+         from sklearn.preprocessing import StandardScaler
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     ca = CommandArgs(args)
+     preds = [p for p in ca.positional if not p.startswith("-")]
+     if len(preds) < 2:
+         return "Usage: pls <y> <x1> [x2 ...] [--components=N]"
+
+     y_col = preds[0]
+     x_cols = preds[1:]
+     n_components = int(ca.options.get("components", 2))
+     cv_k = int(ca.options.get("cv", 5))
+
+     try:
+         df = session.require_data()
+         sub = df.select([y_col] + x_cols).drop_nulls()
+         y = sub[y_col].to_numpy().astype(float).reshape(-1, 1)
+         X = sub.select(x_cols).to_numpy().astype(float)
+
+         scaler = StandardScaler()
+         X_sc = scaler.fit_transform(X)
+
+         # Cap components at what the data can support
+         n_comp = min(n_components, X.shape[1], X.shape[0] - 1)
+         pls = PLSRegression(n_components=n_comp)
+         pls.fit(X_sc, y)
+
+         y_pred = pls.predict(X_sc).ravel()
+         y_flat = y.ravel()
+         ss_res = np.sum((y_flat - y_pred) ** 2)
+         ss_tot = np.sum((y_flat - y_flat.mean()) ** 2)
+         r2 = (1 - ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+         # CV score
+         cv_scores = cross_val_score(pls, X_sc, y_flat, cv=min(cv_k, len(y_flat)), scoring="r2")
+
+         lines = [
+             f"Partial Least Squares Regression: {y_col} ~ {' + '.join(x_cols)}",
+             f"N={sub.height} Components={n_comp}",
+             "=" * 55,
+             f" R² : {r2:.4f}",
+             f" CV R² (k={min(cv_k, len(y_flat))}): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}",
+             "",
+             " X Loadings (first component):",
+         ]
+         x_load = pls.x_loadings_[:, 0]
+         for name, load in sorted(zip(x_cols, x_load), key=lambda t: -abs(t[1])):
+             lines.append(f" {name:<20} {load:9.4f}")
+
+         return "\n".join(lines)
+     except Exception as e:
+         return friendly_error(e, "pls")
+
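For reference, the core of what `pls` computes can be reproduced directly with scikit-learn. A minimal standalone sketch on synthetic data (illustrative only, not part of the packaged file):

    import numpy as np
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.preprocessing import StandardScaler

    # Illustrative sketch, not from the openstat package: correlated predictors
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 5))
    X[:, 1] = X[:, 0] + 0.1 * rng.normal(size=100)   # near-collinear pair
    y = X @ np.array([1.0, 0.5, 0.0, 0.0, -0.3]) + rng.normal(size=100)

    X_sc = StandardScaler().fit_transform(X)
    pls = PLSRegression(n_components=2).fit(X_sc, y)
    print(pls.score(X_sc, y))        # in-sample R², as the command reports
    print(pls.x_loadings_[:, 0])     # loadings on the first latent component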
+
+ @command("pcr", usage="pcr <y> <x1> [x2 ...] [--components=2]")
+ def cmd_pcr(session: Session, args: str) -> str:
+     """Principal Component Regression (PCR).
+
+     First runs PCA on the predictors, then regresses the outcome on the components.
+
+     Options:
+         --components=N   number of PC components to keep (default: 2)
+         --cv=K           k-fold cross-validation (default: 5)
+
+     Examples:
+         pcr y x1 x2 x3 x4 x5 --components=3
+     """
+     try:
+         from sklearn.decomposition import PCA
+         from sklearn.linear_model import LinearRegression
+         from sklearn.model_selection import cross_val_score
+         from sklearn.pipeline import Pipeline
+         from sklearn.preprocessing import StandardScaler
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     ca = CommandArgs(args)
+     preds = [p for p in ca.positional if not p.startswith("-")]
+     if len(preds) < 2:
+         return "Usage: pcr <y> <x1> [x2 ...] [--components=N]"
+
+     y_col = preds[0]
+     x_cols = preds[1:]
+     n_components = int(ca.options.get("components", 2))
+     cv_k = int(ca.options.get("cv", 5))
+
+     try:
+         df = session.require_data()
+         sub = df.select([y_col] + x_cols).drop_nulls()
+         y = sub[y_col].to_numpy().astype(float)
+         X = sub.select(x_cols).to_numpy().astype(float)
+
+         n_comp = min(n_components, X.shape[1], X.shape[0] - 1)
+         pipe = Pipeline([
+             ("scaler", StandardScaler()),
+             ("pca", PCA(n_components=n_comp)),
+             ("reg", LinearRegression()),
+         ])
+         pipe.fit(X, y)
+         y_pred = pipe.predict(X)
+         ss_res = np.sum((y - y_pred) ** 2)
+         ss_tot = np.sum((y - y.mean()) ** 2)
+         r2 = (1 - ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+         pca = pipe.named_steps["pca"]
+         var_exp = pca.explained_variance_ratio_
+
+         cv_scores = cross_val_score(pipe, X, y, cv=min(cv_k, len(y)), scoring="r2")
+
+         lines = [
+             f"Principal Component Regression: {y_col} ~ {' + '.join(x_cols)}",
+             f"N={sub.height} Components={n_comp}",
+             "=" * 55,
+             f" R² : {r2:.4f}",
+             f" CV R² (k={min(cv_k, len(y))}): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}",
+             "",
+             " PCA Components — Variance Explained:",
+         ]
+         cum = 0.0
+         for i, ve in enumerate(var_exp):
+             cum += ve
+             lines.append(f" PC{i+1}: {ve*100:.1f}% (cumulative: {cum*100:.1f}%)")
+
+         return "\n".join(lines)
+     except Exception as e:
+         return friendly_error(e, "pcr")
+
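The Pipeline above is equivalent to running PCA by hand and regressing the outcome on the component scores. A small sketch of that two-step view (synthetic data, not from the package):

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler

    # Illustrative sketch, not from the openstat package
    rng = np.random.default_rng(0)
    X = rng.normal(size=(80, 4))
    y = X[:, 0] - 2 * X[:, 1] + rng.normal(size=80)

    # PCR step by step: scale -> project onto principal components -> OLS
    X_sc = StandardScaler().fit_transform(X)
    pca = PCA(n_components=2).fit(X_sc)
    Z = pca.transform(X_sc)                   # component scores
    reg = LinearRegression().fit(Z, y)
    print(reg.score(Z, y))                    # in-sample R²
    print(pca.explained_variance_ratio_)      # variance explained per PC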
+
+ # ── Cross-validation ─────────────────────────────────────────────────────────
+
+ @command("crossval", usage="crossval [--folds=5] [--metric=r2|rmse|mae|accuracy|auc]")
+ def cmd_crossval(session: Session, args: str) -> str:
+     """K-fold cross-validation on the last fitted model.
+
+     Evaluates model generalization using the dataset in the current session.
+
+     Options:
+         --folds=K        number of folds (default: 5)
+         --metric=<name>  scoring metric: r2, rmse, mae, accuracy, auc (default: r2)
+         --seed=N         random seed
+
+     Examples:
+         ols income educ age
+         crossval --folds=10 --metric=rmse
+
+         logit employed educ age female
+         crossval --metric=auc
+     """
+     try:
+         from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     ca = CommandArgs(args)
+     k = int(ca.options.get("folds", 5))
+     metric = ca.options.get("metric", "r2").lower()
+     seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))
+
+     if session._last_model is None or session._last_model_vars is None:
+         return "No model fitted. Run ols/logit/etc. first."
+
+     dep, indeps = session._last_model_vars
+     try:
+         df = session.require_data()
+         sub = df.select([dep] + indeps).drop_nulls()
+         y = sub[dep].to_numpy().astype(float)
+         X = sub.select(indeps).to_numpy().astype(float)
+
+         # Rebuild an equivalent sklearn model from the statsmodels fit
+         model_name = type(session._last_model.model).__name__.lower()
+
+         if "logit" in model_name or "probit" in model_name or "mnlogit" in model_name:
+             from sklearn.linear_model import LogisticRegression
+             sk_model = LogisticRegression(max_iter=500, random_state=seed)
+             cv = StratifiedKFold(n_splits=min(k, len(y)), shuffle=True, random_state=seed)
+             metric_map = {
+                 "accuracy": "accuracy", "auc": "roc_auc",
+                 "r2": "accuracy",  # fallback: r2 is not meaningful for classifiers
+             }
+             sk_metric = metric_map.get(metric, "accuracy")
+         else:
+             from sklearn.linear_model import LinearRegression
+             sk_model = LinearRegression()
+             cv = KFold(n_splits=min(k, len(y)), shuffle=True, random_state=seed)
+             metric_map = {
+                 "r2": "r2", "rmse": "neg_root_mean_squared_error",
+                 "mae": "neg_mean_absolute_error",
+             }
+             sk_metric = metric_map.get(metric, "r2")
+
+         scores = cross_val_score(sk_model, X, y, cv=cv, scoring=sk_metric)
+         # sklearn's neg_* scorers return negated errors; flip the sign back
+         if sk_metric.startswith("neg_"):
+             scores = -scores
+         display_metric = metric.upper()
+
+         lines = [
+             f"Cross-Validation: {dep} ~ {' + '.join(indeps)}",
+             f"Folds={min(k, len(y))} Metric={display_metric} N={len(y)}",
+             "=" * 45,
+             f" Mean: {scores.mean():.4f}",
+             f" Std: {scores.std():.4f}",
+             f" Min: {scores.min():.4f}",
+             f" Max: {scores.max():.4f}",
+             "",
+             " Per-fold scores:",
+         ]
+         for i, s in enumerate(scores):
+             lines.append(f" Fold {i+1}: {s:.4f}")
+         return "\n".join(lines)
+
+     except Exception as e:
+         return friendly_error(e, "crossval")
+
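One detail worth knowing about the sign flip above: scikit-learn reports error metrics as negated scores so that "greater is better" holds for every scorer. A standalone sketch (synthetic data, not from the package):

    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_score

    # Illustrative sketch, not from the openstat package
    X, y = make_regression(n_samples=60, n_features=3, noise=5.0, random_state=0)
    neg = cross_val_score(LinearRegression(), X, y, cv=5,
                          scoring="neg_root_mean_squared_error")
    print((-neg).mean())   # mean RMSE across folds, as `crossval --metric=rmse` prints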
+
+ # ── Hyperparameter Optimization ──────────────────────────────────────────────
+
+ @command("hyperopt", usage="hyperopt <y> <x1> [x2 ...] --model=rf|gb|svm|logit|ridge|lasso [--cv=5]")
+ def cmd_hyperopt(session: Session, args: str) -> str:
+     """Hyperparameter optimization via randomized search (RandomizedSearchCV).
+
+     Finds good hyperparameters for ML models using cross-validated scoring.
+
+     Models: rf (Random Forest), gb (Gradient Boosting), svm, logit, ridge, lasso
+
+     Options:
+         --model=<name>   model to optimize (required)
+         --cv=K           cross-validation folds (default: 5)
+         --n_iter=N       number of random search iterations (default: 20)
+         --metric=<name>  scoring metric (default: r2 or accuracy)
+         --task=reg|class regression or classification (auto-detected)
+
+     Examples:
+         hyperopt income educ age --model=rf
+         hyperopt employed educ age female --model=gb --task=class --cv=10
+     """
+     try:
+         from sklearn.model_selection import RandomizedSearchCV
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     ca = CommandArgs(args)
+     preds = [p for p in ca.positional if not p.startswith("-")]
+     if len(preds) < 2:
+         return "Usage: hyperopt <y> <x1> [x2 ...] --model=rf|gb|svm|logit"
+
+     y_col = preds[0]
+     x_cols = preds[1:]
+     model_name = ca.options.get("model", "rf").lower()
+     cv_k = int(ca.options.get("cv", 5))
+     n_iter = int(ca.options.get("n_iter", 20))
+     seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))
+
+     try:
+         df = session.require_data()
+         sub = df.select([y_col] + x_cols).drop_nulls()
+         y = sub[y_col].to_numpy().astype(float)
+         X = sub.select(x_cols).to_numpy().astype(float)
+
+         # Auto-detect task: few unique, integer-valued outcomes => classification
+         task = ca.options.get("task", "")
+         if not task:
+             n_uniq = len(set(y))
+             task = "class" if n_uniq <= 10 and (y == y.astype(int)).all() else "reg"
+
+         is_clf = task.startswith("class")
+         metric = ca.options.get("metric", "accuracy" if is_clf else "r2")
+
+         # Model + param grid
+         from scipy.stats import randint, uniform
+         if model_name == "rf":
+             from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+             mdl = RandomForestClassifier(random_state=seed) if is_clf else RandomForestRegressor(random_state=seed)
+             param_dist = {"n_estimators": randint(50, 300), "max_depth": [None, 3, 5, 10, 20],
+                           "min_samples_split": randint(2, 10), "max_features": ["sqrt", "log2", None]}
+         elif model_name == "gb":
+             from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+             mdl = GradientBoostingClassifier(random_state=seed) if is_clf else GradientBoostingRegressor(random_state=seed)
+             param_dist = {"n_estimators": randint(50, 300), "learning_rate": uniform(0.01, 0.3),
+                           "max_depth": randint(2, 8), "subsample": uniform(0.6, 0.4)}
+         elif model_name == "svm":
+             from sklearn.svm import SVC, SVR
+             from sklearn.preprocessing import StandardScaler
+             from sklearn.pipeline import Pipeline
+             scaler = StandardScaler()
+             base = SVC(random_state=seed, probability=True) if is_clf else SVR()
+             mdl = Pipeline([("sc", scaler), ("svm", base)])
+             param_dist = {"svm__C": uniform(0.01, 100), "svm__kernel": ["rbf", "linear"],
+                           "svm__gamma": ["scale", "auto"]}
+         elif model_name in ("logit", "ridge"):
+             from sklearn.linear_model import LogisticRegression, Ridge
+             if is_clf:
+                 mdl = LogisticRegression(max_iter=500, random_state=seed)
+                 param_dist = {"C": uniform(0.001, 10), "penalty": ["l2"], "solver": ["lbfgs", "liblinear"]}
+             else:
+                 mdl = Ridge(random_state=seed)
+                 param_dist = {"alpha": uniform(0.001, 10)}
+         elif model_name == "lasso":
+             from sklearn.linear_model import Lasso
+             mdl = Lasso(random_state=seed)
+             param_dist = {"alpha": uniform(0.001, 10)}
+         else:
+             return f"Unknown model: {model_name}. Use rf, gb, svm, logit, ridge, lasso."
+
+         search = RandomizedSearchCV(
+             mdl, param_distributions=param_dist,
+             n_iter=n_iter, cv=min(cv_k, len(y)),
+             scoring=metric, random_state=seed, n_jobs=-1,
+         )
+         search.fit(X, y)
+
+         best_params = search.best_params_
+         lines = [
+             f"Hyperparameter Optimization: {y_col} ~ {' + '.join(x_cols)}",
+             f"Model: {model_name.upper()} Task: {'Classification' if is_clf else 'Regression'}",
+             f"Search: RandomSearch({n_iter} iterations) CV={min(cv_k, len(y))} Metric={metric}",
+             "=" * 60,
+             f" Best score: {search.best_score_:.4f}",
+             "",
+             " Best parameters:",
+         ]
+         for k_p, v in sorted(best_params.items()):
+             lines.append(f" {k_p:<30} {v}")
+
+         return "\n".join(lines)
+     except Exception as e:
+         return friendly_error(e, "hyperopt")
+
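The scipy distributions used for the search spaces are easy to misread: `uniform(loc, scale)` samples from [loc, loc + scale], not [loc, scale], and `randint(low, high)` excludes `high`. A quick check (illustrative, not from the package):

    from scipy.stats import randint, uniform

    # Illustrative sketch: the gb search space above draws
    # learning rates in [0.01, 0.31] and tree counts in [50, 299]
    lr = uniform(0.01, 0.3)
    n_est = randint(50, 300)
    print(lr.rvs(size=3, random_state=0))
    print(n_est.rvs(size=3, random_state=0))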
+
+ # ── SHAP values ──────────────────────────────────────────────────────────────
+
+ @command("shap", usage="shap <y> <x1> [x2 ...] [--model=rf|gb|linear] [--plot]")
+ def cmd_shap(session: Session, args: str) -> str:
+     """SHAP (SHapley Additive exPlanations) feature importance.
+
+     Computes SHAP values to explain model predictions.
+     Works with tree models (RF, GB) and linear models.
+
+     Options:
+         --model=rf|gb|linear  model type (default: rf)
+         --plot                save a SHAP summary plot
+         --n_samples=N         max samples for SHAP (default: 500)
+
+     Examples:
+         shap income educ age female --model=rf --plot
+         shap y x1 x2 x3 --model=gb
+     """
+     try:
+         import shap as _shap
+     except ImportError:
+         return "shap required. Install: pip install shap"
+     try:
+         from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
+         from sklearn.linear_model import LinearRegression
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     ca = CommandArgs(args)
+     preds = [p for p in ca.positional if not p.startswith("-")]
+     if len(preds) < 2:
+         return "Usage: shap <y> <x1> [x2 ...] [--model=rf]"
+
+     y_col = preds[0]
+     x_cols = preds[1:]
+     model_name = ca.options.get("model", "rf").lower()
+     make_plot = "--plot" in args
+     n_samples = int(ca.options.get("n_samples", 500))
+     seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))
+
+     try:
+         df = session.require_data()
+         sub = df.select([y_col] + x_cols).drop_nulls()
+         y = sub[y_col].to_numpy().astype(float)
+         X = sub.select(x_cols).to_numpy().astype(float)
+
+         # Use a random subset for SHAP if the dataset is large
+         if len(X) > n_samples:
+             rng = np.random.default_rng(seed)
+             idx = rng.choice(len(X), n_samples, replace=False)
+             X_shap = X[idx]
+         else:
+             X_shap = X
+
+         if model_name == "rf":
+             mdl = RandomForestRegressor(n_estimators=100, random_state=seed)
+             mdl.fit(X, y)
+             explainer = _shap.TreeExplainer(mdl)
+         elif model_name == "gb":
+             mdl = GradientBoostingRegressor(n_estimators=100, random_state=seed)
+             mdl.fit(X, y)
+             explainer = _shap.TreeExplainer(mdl)
+         else:  # linear
+             mdl = LinearRegression()
+             mdl.fit(X, y)
+             explainer = _shap.LinearExplainer(mdl, X_shap)
+
+         shap_values = explainer.shap_values(X_shap)
+         mean_abs = np.abs(shap_values).mean(axis=0)
+         max_abs = mean_abs.max() or 1.0  # guard against all-zero attributions
+
+         lines = [
+             f"SHAP Feature Importance: {y_col} ~ {' + '.join(x_cols)}",
+             f"Model: {model_name.upper()} N={len(X_shap)} samples",
+             "=" * 50,
+             f" {'Feature':<25} {'Mean |SHAP|':>12}",
+             "-" * 50,
+         ]
+         for name, val in sorted(zip(x_cols, mean_abs), key=lambda t: -t[1]):
+             bar = "█" * int(val / max_abs * 20)
+             lines.append(f" {name:<25} {val:12.4f} {bar}")
+
+         if make_plot:
+             import matplotlib
+             matplotlib.use("Agg")
+             import matplotlib.pyplot as plt
+
+             fig, ax = plt.subplots(figsize=(8, max(4, len(x_cols) * 0.5 + 1)))
+             order = np.argsort(mean_abs)
+             ax.barh([x_cols[i] for i in order], mean_abs[order], color="#4C72B0")
+             ax.set_xlabel("Mean |SHAP value|")
+             ax.set_title(f"SHAP Feature Importance ({model_name.upper()})")
+             fig.tight_layout()
+             session.output_dir.mkdir(parents=True, exist_ok=True)
+             path = session.output_dir / "shap_importance.png"
+             fig.savefig(path, dpi=150)
+             plt.close(fig)
+             session.plot_paths.append(str(path))
+             lines.append(f"\nSHAP plot saved: {path}")
+
+         return "\n".join(lines)
+     except Exception as e:
+         return friendly_error(e, "shap")
+
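The "Mean |SHAP|" column is a global summary of per-row attributions; the defining local property is additivity: the explainer's base value plus a row's SHAP values reconstructs the model's prediction for that row. A small sketch of that check (synthetic data, not from the package):

    import numpy as np
    import shap
    from sklearn.ensemble import RandomForestRegressor

    # Illustrative sketch, not from the openstat package
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))
    y = 2 * X[:, 0] + X[:, 1] ** 2 + rng.normal(scale=0.1, size=200)

    mdl = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
    ex = shap.TreeExplainer(mdl)
    sv = ex.shap_values(X)                    # shape (n_rows, n_features)
    # Additivity: base value + sum of SHAP values ≈ prediction, per row
    print(np.allclose(ex.expected_value + sv.sum(axis=1), mdl.predict(X)))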
+
+ # ── Learning curve ───────────────────────────────────────────────────────────
+
+ @command("learncurve", usage="learncurve <y> <x1> [x2 ...] [--model=ols|rf|logit|gb]")
+ def cmd_learncurve(session: Session, args: str) -> str:
+     """Plot a learning curve: training and CV score vs. training set size.
+
+     Helps diagnose the bias/variance tradeoff.
+
+     Options:
+         --model=ols|rf|logit|gb  model (default: ols for continuous y, logit for binary)
+         --cv=K                   cross-validation folds (default: 5)
+         --steps=N                number of training size steps (default: 10)
+
+     Examples:
+         learncurve income educ age
+         learncurve employed educ age female --model=logit
+     """
+     try:
+         from sklearn.model_selection import learning_curve
+         from sklearn.linear_model import LinearRegression, LogisticRegression
+         from sklearn.ensemble import (
+             RandomForestRegressor, RandomForestClassifier,
+             GradientBoostingRegressor, GradientBoostingClassifier,
+         )
+     except ImportError:
+         return "scikit-learn required. Install: pip install scikit-learn"
+
+     import matplotlib
+     matplotlib.use("Agg")
+     import matplotlib.pyplot as plt
+
+     ca = CommandArgs(args)
+     preds = [p for p in ca.positional if not p.startswith("-")]
+     if len(preds) < 2:
+         return "Usage: learncurve <y> <x1> [x2 ...]"
+
+     y_col = preds[0]
+     x_cols = preds[1:]
+     cv_k = int(ca.options.get("cv", 5))
+     steps = int(ca.options.get("steps", 10))
+     seed = int(ca.options.get("seed", getattr(session, "_repro_seed", 42) or 42))
+
+     try:
+         df = session.require_data()
+         sub = df.select([y_col] + x_cols).drop_nulls()
+         y = sub[y_col].to_numpy().astype(float)
+         X = sub.select(x_cols).to_numpy().astype(float)
+
+         model_opt = ca.options.get("model", "")
+         n_uniq = len(set(y))
+         is_clf = (n_uniq <= 10 and (y == y.astype(int)).all()) if not model_opt else ("logit" in model_opt or "class" in model_opt)
+
+         mdl_map = {
+             "ols": LinearRegression(), "linear": LinearRegression(),
+             "logit": LogisticRegression(max_iter=500, random_state=seed),
+             "rf": RandomForestClassifier(random_state=seed) if is_clf else RandomForestRegressor(random_state=seed),
+             "gb": GradientBoostingClassifier(random_state=seed) if is_clf else GradientBoostingRegressor(random_state=seed),
+         }
+         mdl = mdl_map.get(model_opt, LogisticRegression(max_iter=500, random_state=seed) if is_clf else LinearRegression())
+         metric = "accuracy" if is_clf else "r2"
+
+         train_sizes = np.linspace(0.1, 1.0, steps)
+         # shuffle=True is required for random_state to have any effect
+         ts, train_scores, cv_scores = learning_curve(
+             mdl, X, y, cv=min(cv_k, len(y)), scoring=metric,
+             train_sizes=train_sizes, shuffle=True, random_state=seed,
+         )
+
+         fig, ax = plt.subplots(figsize=(8, 5))
+         ax.fill_between(ts, train_scores.mean(1) - train_scores.std(1),
+                         train_scores.mean(1) + train_scores.std(1), alpha=0.15, color="#4C72B0")
+         ax.plot(ts, train_scores.mean(1), "o-", color="#4C72B0", label="Train")
+         ax.fill_between(ts, cv_scores.mean(1) - cv_scores.std(1),
+                         cv_scores.mean(1) + cv_scores.std(1), alpha=0.15, color="#DD8452")
+         ax.plot(ts, cv_scores.mean(1), "s-", color="#DD8452", label="CV")
+         ax.set_xlabel("Training set size")
+         ax.set_ylabel(metric.upper())
+         ax.set_title(f"Learning Curve: {y_col}")
+         ax.legend()
+         fig.tight_layout()
+
+         session.output_dir.mkdir(parents=True, exist_ok=True)
+         path = session.output_dir / "learning_curve.png"
+         fig.savefig(path, dpi=150)
+         plt.close(fig)
+         session.plot_paths.append(str(path))
+
+         final_cv = cv_scores.mean(1)[-1]
+         final_train = train_scores.mean(1)[-1]
+         gap = final_train - final_cv
+         diagnosis = (
+             "Possible overfitting (high variance)" if gap > 0.1 else
+             "Possible underfitting (high bias)" if final_cv < 0.5 else
+             "Good fit"
+         )
+
+         return (
+             f"Learning Curve: {y_col} ~ {' + '.join(x_cols)}\n"
+             f"Final train {metric}: {final_train:.4f} | "
+             f"CV {metric}: {final_cv:.4f} | Gap: {gap:.4f}\n"
+             f"Diagnosis: {diagnosis}\n"
+             f"Plot saved: {path}"
+         )
+     except Exception as e:
+         return friendly_error(e, "learncurve")
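
A note on the `shuffle=True` argument in the `learning_curve` call: without it, scikit-learn draws the incremental training subsets from the start of each CV split in data order and ignores `random_state`, which can bias the curve when the rows are sorted. A minimal standalone call (synthetic data, not from the package):

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import learning_curve

    # Illustrative sketch, not from the openstat package
    rng = np.random.default_rng(0)
    X = rng.normal(size=(120, 2))
    y = X[:, 0] + rng.normal(scale=0.5, size=120)

    sizes, train_s, cv_s = learning_curve(
        LinearRegression(), X, y, cv=5, scoring="r2",
        train_sizes=np.linspace(0.1, 1.0, 5), shuffle=True, random_state=42,
    )
    print(sizes)                 # absolute training-set sizes used
    print(cv_s.mean(axis=1))     # mean CV R² at each size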