openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Mixture models and changepoint detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
5
|
+
from openstat.session import Session
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@command("mixture", usage="mixture <col> [--k=3] [--covariance=full]")
def cmd_mixture(session: Session, args: str) -> str:
    """Gaussian Mixture Model clustering / density estimation.

    Options:
      --k=<n> number of components (default: 3)
      --covariance=<type> full, tied, diag, spherical (default: full)
      --maxiter=<n> max EM iterations (default: 200)
      --assign add component assignment column to data

    Examples:
      mixture income --k=3
      mixture income --k=4 --covariance=diag --assign
    """
    try:
        from sklearn.mixture import GaussianMixture
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    import numpy as np
    import polars as pl

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: mixture <col> [--k=3]"

    col = ca.positional[0]
    k = int(ca.options.get("k", 3))
    cov_type = ca.options.get("covariance", "full")
    max_iter = int(ca.options.get("maxiter", 200))
    assign = "assign" in ca.flags

    try:
        df = session.require_data()
        if col not in df.columns:
            return f"Column not found: {col}"

        # Fit on non-null values only.
        X = df[col].drop_nulls().to_numpy().reshape(-1, 1)
        if len(X) < k * 2:
            return f"Too few observations ({len(X)}) for {k} components."

        gm = GaussianMixture(n_components=k, covariance_type=cov_type,
                             max_iter=max_iter, random_state=42)
        gm.fit(X)

        lines = [f"Gaussian Mixture Model — {col} (k={k}, cov={cov_type})", ""]
        lines.append(f" Log-likelihood : {gm.lower_bound_:.4f}")
        lines.append(f" BIC : {gm.bic(X):.4f}")
        lines.append(f" AIC : {gm.aic(X):.4f}")
        lines.append(f" Converged : {gm.converged_}")
        lines.append("")
        lines.append(f" {'Component':<12} {'Weight':>10} {'Mean':>12} {'Std':>12}")
        lines.append(" " + "-" * 50)
        for i in range(k):
            w = gm.weights_[i]
            mu = gm.means_[i, 0]
            # covariances_ has a different shape per covariance_type
            # (see sklearn GaussianMixture docs); extract the 1-D variance.
            if cov_type == "full":
                sigma = float(np.sqrt(gm.covariances_[i, 0, 0]))
            elif cov_type == "tied":
                sigma = float(np.sqrt(gm.covariances_[0, 0]))
            elif cov_type == "diag":
                sigma = float(np.sqrt(gm.covariances_[i, 0]))
            else:
                sigma = float(np.sqrt(gm.covariances_[i]))
            lines.append(f" {i+1:<12} {w:>10.4f} {mu:>12.4f} {sigma:>12.4f}")

        if assign:
            # BUGFIX: the model was fit on drop_nulls() data, but prediction
            # previously ran on the full column; GaussianMixture.predict
            # raises on NaN, so --assign failed whenever the column had
            # nulls. Predict only on non-null rows; null rows get -1.
            vals = df[col].to_numpy()
            valid = df[col].is_not_null().to_numpy()
            labels = np.full(len(vals), -1, dtype=int)
            if valid.any():
                labels[valid] = gm.predict(
                    vals[valid].astype(float).reshape(-1, 1)
                )
            session.df = df.with_columns(
                pl.Series(f"{col}_component", labels)
            )
            lines.append(f"\nComponent assignments added as '{col}_component'.")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "mixture")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@command("changepoint", usage="changepoint <col> [--model=rbf] [--n=5] [--pen=10]")
def cmd_changepoint(session: Session, args: str) -> str:
    """Change point detection in time series.

    Uses the ruptures library to detect structural breaks in a variable.

    Options:
      --model=<m> cost model: rbf, l1, l2, normal, ar (default: rbf)
      --n=<k> number of change points to find (default: 5)
      --pen=<p> penalty for automatic detection via PELT (overrides --n)
      --algo=<a> algorithm: pelt, binseg, dynp, window (default: pelt if --pen, else binseg)

    Examples:
      changepoint price --n=3
      changepoint gdp --pen=5 --model=l2
      changepoint returns --algo=binseg --n=4
    """
    try:
        import ruptures as rpt
    except ImportError:
        return "ruptures required. Install: pip install ruptures"

    # NOTE: the unused "import numpy as np" was removed; only ndarray
    # methods (.astype, .mean, .std) are used below.
    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: changepoint <col> [--model=rbf] [--n=5]"

    col = ca.positional[0]
    model = ca.options.get("model", "rbf")
    n_bkps = int(ca.options.get("n", 5))
    pen = ca.options.get("pen")
    # PELT requires a penalty; fall back to binseg when no --pen is given.
    algo_name = ca.options.get("algo", "pelt" if pen else "binseg")

    try:
        df = session.require_data()
        if col not in df.columns:
            return f"Column not found: {col}"

        signal = df[col].drop_nulls().to_numpy().astype(float)
        if len(signal) < 10:
            return f"Need at least 10 observations (got {len(signal)})."

        # Select algorithm
        algos = {"pelt": rpt.Pelt, "binseg": rpt.Binseg,
                 "dynp": rpt.Dynp, "window": rpt.Window}
        AlgoClass = algos.get(algo_name.lower(), rpt.Binseg)
        algo = AlgoClass(model=model).fit(signal)

        if pen is not None:
            # NOTE(review): Dynp takes n_bkps only — combining --pen with
            # --algo=dynp will raise and surface via friendly_error.
            breakpoints = algo.predict(pen=float(pen))
        else:
            breakpoints = algo.predict(n_bkps=n_bkps)

        # Remove the last element (= length of signal, not a real break)
        breaks = [b for b in breakpoints if b < len(signal)]

        lines = [f"Change Point Detection — {col}", ""]
        lines.append(f" Algorithm : {algo_name}")
        lines.append(f" Model : {model}")
        lines.append(f" Signal length: {len(signal)}")
        lines.append(f" Change points found: {len(breaks)}")
        lines.append("")

        if breaks:
            lines.append(f" {'#':<6} {'Index':>8} {'Segment mean':>14} {'Segment std':>12}")
            lines.append(" " + "-" * 45)
            prev = 0
            # Append len(signal) so the final segment is also reported.
            for i, bp in enumerate(breaks + [len(signal)], 1):
                seg = signal[prev:bp]
                lines.append(
                    f" {i:<6} {bp:>8} {seg.mean():>14.4f} {seg.std():>12.4f}"
                )
                prev = bp
        else:
            lines.append(" No change points detected.")

        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "changepoint")
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Advanced ML commands: randomforest, gbm, svm, tsne."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
12
|
+
opts: dict[str, str] = {}
|
|
13
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
14
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
15
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
16
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
17
|
+
return positional, opts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _fmt_imp(feat_imp: dict, top: int = 10) -> str:
|
|
21
|
+
sorted_imp = sorted(feat_imp.items(), key=lambda x: -x[1])[:top]
|
|
22
|
+
lines = ["\n Feature Importances:"]
|
|
23
|
+
for feat, imp in sorted_imp:
|
|
24
|
+
bar = "█" * int(imp * 40)
|
|
25
|
+
lines.append(f" {feat:<20} {imp:.4f} {bar}")
|
|
26
|
+
return "\n".join(lines)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@command("randomforest", usage="randomforest dep var1 var2 ... [n(100) depth(none) task(regression)]")
def cmd_randomforest(session: Session, args: str) -> str:
    """Random Forest regression or classification.

    Options (Stata style): n(<trees>), depth(<max depth>|none),
    task(regression|classification). Returns a formatted summary string.
    """
    try:
        import sklearn
    except ImportError:
        return "sklearn not installed. Run: pip install scikit-learn"
    from openstat.stats.ml_advanced import fit_random_forest
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: randomforest dep var1 [var2 ...] [n(100) depth(5) task(regression)]"
    dep = positional[0]
    indeps = [c for c in positional[1:] if c in df.columns]
    try:
        # Option parsing moved inside the try so malformed values
        # (e.g. n(abc)) return a friendly message instead of a traceback.
        n_est = int(opts.get("n", 100))
        # BUGFIX: the usage string advertises depth(none), but the old
        # int(opts["depth"]) crashed on "none"; treat it as unbounded.
        depth_raw = opts.get("depth", "").strip().lower()
        max_depth = int(depth_raw) if depth_raw and depth_raw != "none" else None
        task = opts.get("task", "regression")
        r = fit_random_forest(df, dep, indeps, n_estimators=n_est, max_depth=max_depth, task=task)
        session._last_model = r
        metric = "r_squared" if task == "regression" else "accuracy"
        lines = [f"\nRandom Forest ({task})", "=" * 50]
        lines.append(f" Dep: {dep}, N: {r['n_obs']}, Trees: {n_est}")
        lines.append(f" {metric.replace('_', ' ').title()}: {r.get(metric, 'N/A'):.4f}")
        lines.append(_fmt_imp(r["feature_importances"]))
        return "\n".join(lines)
    except Exception as exc:
        return f"randomforest error: {exc}"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@command("gbm", usage="gbm dep var1 var2 ... [n(100) lr(0.1) depth(3) task(regression)]")
def cmd_gbm(session: Session, args: str) -> str:
    """Gradient Boosting Machine."""
    try:
        import sklearn
    except ImportError:
        return "sklearn not installed. Run: pip install scikit-learn"
    from openstat.stats.ml_advanced import fit_gradient_boosting
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: gbm dep var1 [var2 ...] [n(100) lr(0.1) depth(3) task(regression)]"
    dep, candidates = positional[0], positional[1:]
    # Keep only predictors that exist in the loaded data.
    indeps = [name for name in candidates if name in df.columns]
    n_trees = int(opts.get("n", 100))
    rate = float(opts.get("lr", 0.1))
    tree_depth = int(opts.get("depth", 3))
    task = opts.get("task", "regression")
    try:
        fitted = fit_gradient_boosting(df, dep, indeps, n_estimators=n_trees,
                                       learning_rate=rate, max_depth=tree_depth, task=task)
        session._last_model = fitted
        metric = "accuracy" if task != "regression" else "r_squared"
        report = [f"\nGradient Boosting ({task})", "=" * 50,
                  f" Dep: {dep}, N: {fitted['n_obs']}, Trees: {n_trees}, LR: {rate}",
                  f" {metric.replace('_', ' ').title()}: {fitted.get(metric, 'N/A'):.4f}",
                  _fmt_imp(fitted["feature_importances"])]
        return "\n".join(report)
    except Exception as exc:
        return f"gbm error: {exc}"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@command("svm", usage="svm dep var1 var2 ... [kernel(rbf) C(1.0) task(regression)]")
def cmd_svm(session: Session, args: str) -> str:
    """Support Vector Machine."""
    try:
        import sklearn
    except ImportError:
        return "sklearn not installed. Run: pip install scikit-learn"
    from openstat.stats.ml_advanced import fit_svm
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: svm dep var1 [var2 ...] [kernel(rbf) C(1.0) task(regression)]"
    dep, rest = positional[0], positional[1:]
    indeps = [name for name in rest if name in df.columns]
    kernel = opts.get("kernel", "rbf")
    C = float(opts.get("c", 1.0))  # _stata_opts lowercases option names
    task = opts.get("task", "regression")
    try:
        fitted = fit_svm(df, dep, indeps, kernel=kernel, C=C, task=task)
        session._last_model = fitted
        metric = "accuracy" if task != "regression" else "r_squared"
        report = [f"\nSVM ({task}, kernel={kernel})", "=" * 50,
                  f" Dep: {dep}, N: {fitted['n_obs']}, C: {C}",
                  f" {metric.replace('_', ' ').title()}: {fitted.get(metric, 'N/A'):.4f}"]
        return "\n".join(report)
    except Exception as exc:
        return f"svm error: {exc}"
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@command("tsne", usage="tsne var1 var2 ... [components(2) perplexity(30) gen(tsne)]")
def cmd_tsne(session: Session, args: str) -> str:
    """t-SNE dimensionality reduction. Adds embedding columns to data."""
    try:
        from sklearn.manifold import TSNE
    except ImportError:
        return "sklearn not installed. Run: pip install scikit-learn"
    from openstat.stats.ml_advanced import fit_tsne
    import polars as pl
    df = session.require_data()
    positional, opts = _stata_opts(args)
    cols = [name for name in positional if name in df.columns]
    if len(cols) < 2:
        return "tsne requires at least 2 variables."
    n_components = int(opts.get("components", 2))
    perplexity = float(opts.get("perplexity", 30.0))
    prefix = opts.get("gen", "tsne")
    # Snapshot first so the added embedding columns can be undone.
    session.snapshot()
    try:
        result = fit_tsne(df, cols, n_components=n_components, perplexity=perplexity)
        embedding = result["embedding"]
        names = [f"{prefix}{axis + 1}" for axis in range(n_components)]
        augmented = df
        for axis, name in enumerate(names):
            augmented = augmented.with_columns(
                pl.Series(name, [point[axis] for point in embedding])
            )
        session.df = augmented
        return f"t-SNE complete. Added columns: {names}"
    except Exception as exc:
        return f"tsne error: {exc}"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Machine learning commands: lasso, ridge, elasticnet, cart, crossval."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
12
|
+
opts: dict[str, str] = {}
|
|
13
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
14
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
15
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
16
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
17
|
+
return positional, opts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _parse_varlist(args: str, df_cols: list[str]) -> tuple[str, list[str], dict]:
    """Split raw args into (dependent var, valid independent vars, options)."""
    positional, opts = _stata_opts(args)
    if not positional:
        return "", [], opts
    dep, rest = positional[0], positional[1:]
    known = set(df_cols)
    indeps = [name for name in rest if name in known]
    return dep, indeps, opts
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _ml_table(result: dict) -> str:
|
|
29
|
+
method = result.get("method", "")
|
|
30
|
+
dep = result.get("dep", "")
|
|
31
|
+
lines = [f"\n{method}: {dep}", "=" * 55]
|
|
32
|
+
coef = result.get("coefficients", {})
|
|
33
|
+
if coef:
|
|
34
|
+
lines.append(f" {'Variable':<25} {'Coefficient':>12}")
|
|
35
|
+
lines.append(" " + "-" * 40)
|
|
36
|
+
for var, val in coef.items():
|
|
37
|
+
lines.append(f" {var:<25} {val:>12.6f}")
|
|
38
|
+
lines.append("")
|
|
39
|
+
for key in ("intercept", "alpha", "l1_ratio", "r_squared", "mse", "rmse",
|
|
40
|
+
"n_obs", "n_nonzero", "n_zeroed", "accuracy"):
|
|
41
|
+
if key in result:
|
|
42
|
+
v = result[key]
|
|
43
|
+
lines.append(f" {key:<25} {v!s:>12}" if not isinstance(v, float)
|
|
44
|
+
else f" {key:<25} {v:>12.6f}")
|
|
45
|
+
lines.append("=" * 55)
|
|
46
|
+
return "\n".join(lines)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@command("lasso", usage="lasso depvar indepvars [, alpha(0.1) cv(5)]")
def cmd_lasso(session: Session, args: str) -> str:
    """Lasso regression with optional CV-tuned penalty."""
    df = session.require_data()
    dep, indeps, opts = _parse_varlist(args, df.columns)
    if not (dep and indeps):
        return "Usage: lasso depvar indepvar1 indepvar2 ... [, alpha(0.1) cv(5)]"
    raw_alpha = opts.get("alpha")
    penalty = float(raw_alpha) if raw_alpha else None  # None => CV-tuned
    folds = int(opts.get("cv", 5))
    try:
        from openstat.stats.ml import fit_lasso
        fitted = fit_lasso(df, dep, indeps, alpha=penalty, cv=folds)
        session._last_model = fitted
        return _ml_table(fitted)
    except ImportError as e:
        return str(e)
    except Exception as exc:
        return f"lasso error: {exc}"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@command("ridge", usage="ridge depvar indepvars [, alpha(1.0) cv(5)]")
def cmd_ridge(session: Session, args: str) -> str:
    """Ridge regression with optional CV-tuned penalty."""
    df = session.require_data()
    dep, indeps, opts = _parse_varlist(args, df.columns)
    if not (dep and indeps):
        return "Usage: ridge depvar indepvar1 ... [, alpha(1.0) cv(5)]"
    raw_alpha = opts.get("alpha")
    penalty = float(raw_alpha) if raw_alpha else None  # None => CV-tuned
    folds = int(opts.get("cv", 5))
    try:
        from openstat.stats.ml import fit_ridge
        fitted = fit_ridge(df, dep, indeps, alpha=penalty, cv=folds)
        session._last_model = fitted
        return _ml_table(fitted)
    except ImportError as e:
        return str(e)
    except Exception as exc:
        return f"ridge error: {exc}"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@command("elasticnet", usage="elasticnet depvar indepvars [, alpha(1.0) l1ratio(0.5)]")
def cmd_elasticnet(session: Session, args: str) -> str:
    """Elastic Net regression."""
    df = session.require_data()
    dep, indeps, opts = _parse_varlist(args, df.columns)
    if not (dep and indeps):
        return "Usage: elasticnet depvar indepvar1 ... [, alpha(1.0) l1ratio(0.5)]"
    raw_alpha = opts.get("alpha")
    penalty = float(raw_alpha) if raw_alpha else None  # None => CV-tuned
    mix = float(opts.get("l1ratio", 0.5))
    folds = int(opts.get("cv", 5))
    try:
        from openstat.stats.ml import fit_elasticnet
        fitted = fit_elasticnet(df, dep, indeps, alpha=penalty, l1_ratio=mix, cv=folds)
        session._last_model = fitted
        return _ml_table(fitted)
    except ImportError as e:
        return str(e)
    except Exception as exc:
        return f"elasticnet error: {exc}"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@command("cart", usage="cart depvar indepvars [, depth(5) task(regression|classification)]")
def cmd_cart(session: Session, args: str) -> str:
    """CART decision tree (regression or classification).

    Options: depth(<n>) max tree depth (default 5),
    task(regression|classification), minleaf(<n>) min samples per leaf
    (default 5). Returns a formatted summary; errors come back as strings.
    """
    df = session.require_data()
    dep, indeps, opts = _parse_varlist(args, df.columns)
    if not dep or not indeps:
        return "Usage: cart depvar indepvar1 ... [, depth(5) task(regression)]"
    depth_raw = opts.get("depth")
    max_depth = int(depth_raw) if depth_raw else 5
    task = opts.get("task", "regression")
    min_leaf = int(opts.get("minleaf", 5))
    try:
        from openstat.stats.ml import fit_cart
        result = fit_cart(df, dep, indeps, task=task, max_depth=max_depth,
                          min_samples_leaf=min_leaf)
        # Keep the fitted result available to follow-up commands.
        session._last_model = result
        lines = [f"\nCART ({task}): {dep}", "=" * 55]
        lines.append(f" {'max_depth':<25} {max_depth!s:>12}")
        lines.append(f" {'n_leaves':<25} {result['n_leaves']:>12}")
        lines.append(f" {'n_obs':<25} {result['n_obs']:>12}")
        # Regression reports R²; classification reports accuracy.
        metric = "r_squared" if task == "regression" else "accuracy"
        lines.append(f" {metric:<25} {result[metric]:>12.4f}")
        lines.append("\nFeature Importances:")
        # Sorted by importance, descending.
        for feat, imp in sorted(result["feature_importances"].items(),
                                key=lambda x: -x[1]):
            lines.append(f" {feat:<25} {imp:>12.4f}")
        lines.append("=" * 55)
        return "\n".join(lines)
    except ImportError as e:
        return str(e)
    except Exception as exc:
        return f"cart error: {exc}"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@command("crossval", usage="crossval depvar indepvars [, method(ols) k(5) scoring(r2)]")
def cmd_crossval(session: Session, args: str) -> str:
    """K-fold cross-validation for regression models.

    Options: method(<name>) estimator (default ols), k(<folds>) default 5,
    alpha(<penalty>) passed through to penalized methods,
    scoring(<metric>) default r2.
    """
    df = session.require_data()
    dep, indeps, opts = _parse_varlist(args, df.columns)
    if not dep or not indeps:
        return "Usage: crossval depvar indepvar1 ... [, method(ols) k(5) scoring(r2)]"
    method = opts.get("method", "ols")
    k = int(opts.get("k", 5))
    # alpha is only meaningful for penalized methods; harmless otherwise.
    alpha = float(opts.get("alpha", 1.0))
    scoring = opts.get("scoring", "r2")
    try:
        from openstat.stats.ml import cross_validate_model
        result = cross_validate_model(df, dep, indeps, method=method, k=k,
                                      alpha=alpha, scoring=scoring)
        lines = [f"\nCross-Validation ({k}-fold): {method}", "=" * 55]
        lines.append(f" {'Dependent':<25} {dep}")
        lines.append(f" {'Scoring':<25} {scoring}")
        lines.append(f" {'Mean score':<25} {result['mean_score']:>12.4f}")
        lines.append(f" {'Std score':<25} {result['std_score']:>12.4f}")
        lines.append(f" {'Min score':<25} {result['min_score']:>12.4f}")
        lines.append(f" {'Max score':<25} {result['max_score']:>12.4f}")
        lines.append(f" {'N obs':<25} {result['n_obs']:>12}")
        lines.append("\nFold scores:")
        # Per-fold scores, 1-indexed for display.
        for i, s in enumerate(result["scores"], 1):
            lines.append(f" Fold {i:<3} {s:.4f}")
        lines.append("=" * 55)
        return "\n".join(lines)
    except ImportError as e:
        return str(e)
    except Exception as exc:
        return f"crossval error: {exc}"
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Model evaluation commands: roc, confmatrix, calibration, shap."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
12
|
+
opts: dict[str, str] = {}
|
|
13
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
14
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
15
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
16
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
17
|
+
return positional, opts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@command("roc", usage="roc outcome score_col")
def cmd_roc(session: Session, args: str) -> str:
    """ROC curve and AUC for binary classification.

    Expects a binary outcome column and a continuous score column.
    Returns a formatted report; errors are returned as strings.
    """
    from openstat.stats.model_eval import roc_auc
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: roc outcome score_col"
    outcome, score = positional[0], positional[1]
    # Validate both columns before computing anything.
    for v in (outcome, score):
        if v not in df.columns:
            return f"Column '{v}' not found."
    try:
        r = roc_auc(df, outcome, score)
        # Keep the result available to follow-up commands.
        session._last_model = r
        lines = ["\nROC Analysis", "=" * 50]
        lines.append(f" {'Outcome':<30} {outcome}")
        lines.append(f" {'Score':<30} {score}")
        lines.append(f" {'AUC':<30} {r['auc']:.4f}")
        lines.append(f" {'Optimal threshold':<30} {r['optimal_threshold']:.4f}")
        lines.append(f" {'N observations':<30} {r['n_obs']}")
        lines.append(f" {'Prevalence':<30} {r['prevalence']:.4f}")
        lines.append(f"\n Interpretation:")
        # Conventional AUC rating bands.
        auc = r['auc']
        if auc >= 0.9:
            interp = "Excellent (≥0.90)"
        elif auc >= 0.8:
            interp = "Good (0.80–0.89)"
        elif auc >= 0.7:
            interp = "Fair (0.70–0.79)"
        elif auc >= 0.6:
            interp = "Poor (0.60–0.69)"
        else:
            interp = "Fail (<0.60)"
        lines.append(f" {'AUC interpretation':<30} {interp}")
        return "\n".join(lines)
    except Exception as exc:
        return f"roc error: {exc}"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@command("confmatrix", usage="confmatrix outcome predicted [threshold(0.5)]")
def cmd_confmatrix(session: Session, args: str) -> str:
    """Confusion matrix and classification metrics."""
    from openstat.stats.model_eval import confusion_matrix
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: confmatrix outcome predicted [threshold(0.5)]"
    outcome, predicted = positional[:2]
    for col in (outcome, predicted):
        if col not in df.columns:
            return f"Column '{col}' not found."
    cutoff = float(opts.get("threshold", 0.5))
    try:
        r = confusion_matrix(df, outcome, predicted, threshold=cutoff)
        metric_keys = ["accuracy", "precision", "recall_sensitivity",
                       "specificity", "f1_score", "npv", "mcc"]
        out = ["\nConfusion Matrix", "=" * 50,
               "\n Predicted → Positive Negative",
               f" Actual Positive {r['tp']:>8} {r['fn']:>8}",
               f" Actual Negative {r['fp']:>8} {r['tn']:>8}",
               "\n Metrics:"]
        out.extend(f" {name.replace('_', ' ').title():<30} {r[name]:.4f}"
                   for name in metric_keys)
        return "\n".join(out)
    except Exception as exc:
        return f"confmatrix error: {exc}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@command("calibration", usage="calibration outcome score [bins(10)]")
def cmd_calibration(session: Session, args: str) -> str:
    """Calibration curve (reliability diagram) and Brier score."""
    from openstat.stats.model_eval import calibration_curve
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: calibration outcome score [bins(10)]"
    outcome, score = positional[:2]
    missing = next((v for v in (outcome, score) if v not in df.columns), None)
    if missing is not None:
        return f"Column '{missing}' not found."
    n_bins = int(opts.get("bins", 10))
    try:
        r = calibration_curve(df, outcome, score, n_bins=n_bins)
        out = ["\nCalibration Analysis", "=" * 55,
               f" {'Brier score':<35} {r['brier_score']:.6f}",
               f" {'N observations':<35} {r['n_obs']}",
               f"\n {'Bin Center':>10} {'Mean Pred':>10} {'Frac Pos':>10}",
               " " + "-" * 35]
        rows = zip(r["bin_centers"], r["mean_predicted"], r["fraction_positive"])
        out.extend(f" {center:>10.3f} {pred:>10.3f} {frac:>10.3f}"
                   for center, pred, frac in rows)
        return "\n".join(out)
    except Exception as exc:
        return f"calibration error: {exc}"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@command("shap", usage="shap dep var1 var2 ...")
def cmd_shap(session: Session, args: str) -> str:
    """Compute SHAP values for linear regression (exact for OLS).

    Reports mean |SHAP| per feature as a ranked text bar chart.
    """
    from openstat.stats.model_eval import compute_shap_linear
    df = session.require_data()
    positional, opts = _stata_opts(args)
    if len(positional) < 2:
        return "Usage: shap dep var1 var2 ..."
    dep = positional[0]
    # Silently drop predictors that are not columns of the loaded data.
    indeps = [c for c in positional[1:] if c in df.columns]
    if not indeps:
        return "No valid predictor variables found."
    if dep not in df.columns:
        return f"Column '{dep}' not found."
    try:
        r = compute_shap_linear(df, dep, indeps)
        # Keep the result available to follow-up commands.
        session._last_model = r
        lines = ["\nSHAP Values (Linear)", "=" * 50]
        lines.append(f" Dep: {dep}, N: {r['n_obs']}")
        lines.append(f"\n {'Feature':<20} {'Mean |SHAP|':>12} Bar")
        lines.append(" " + "-" * 50)
        # Guard against division by zero when there are no SHAP values.
        max_shap = max(r["mean_abs_shap"].values()) if r["mean_abs_shap"] else 1
        for feat in r["feature_ranking"]:
            val = r["mean_abs_shap"][feat]
            # Bar length scaled to the largest mean |SHAP| (max 30 chars).
            bar = "█" * int(val / max_shap * 30)
            lines.append(f" {feat:<20} {val:>12.4f} {bar}")
        return "\n".join(lines)
    except Exception as exc:
        return f"shap error: {exc}"
|