openstat-cli 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/stats/manova.py
ADDED
@@ -0,0 +1,124 @@
"""MANOVA and two-way ANOVA."""

from __future__ import annotations

import polars as pl


# ── Two-way ANOVA ─────────────────────────────────────────────────────────

def twoway_anova(
    df: pl.DataFrame,
    dep: str,
    factor1: str,
    factor2: str,
    *,
    interaction: bool = True,
) -> dict:
    """Two-way ANOVA with optional interaction term.

    Uses an OLS approach (type III sums of squares via statsmodels).
    """
    import statsmodels.formula.api as smf

    pdf = df.select([dep, factor1, factor2]).drop_nulls().to_pandas()
    # Rename columns to safe names so arbitrary column names work in the formula.
    pdf.columns = ["dep", "f1", "f2"]
    formula = "dep ~ C(f1) + C(f2)"
    if interaction:
        formula += " + C(f1):C(f2)"

    model = smf.ols(formula, data=pdf).fit()

    from statsmodels.stats.anova import anova_lm
    # NOTE: type III SS depend on the contrast coding; the default treatment
    # coding here differs from the usual sum-to-zero convention (C(f1, Sum)).
    anova_table = anova_lm(model, typ=3)

    rows = []
    for source, row in anova_table.iterrows():
        ss = float(row.get("sum_sq", float("nan")))
        df_val = float(row.get("df", 0))
        rows.append({
            "source": str(source).replace("C(f1)", factor1).replace("C(f2)", factor2),
            "df": int(df_val),
            "SS": ss,
            # anova_lm returns no mean-square column; derive MS = SS / df.
            "MS": ss / df_val if df_val > 0 else float("nan"),
            "F": float(row.get("F", float("nan"))),
            "p_value": float(row.get("PR(>F)", float("nan"))),
        })

    return {
        "test": "Two-way ANOVA",
        "dep": dep,
        "factor1": factor1,
        "factor2": factor2,
        "interaction": interaction,
        "n_obs": int(pdf.shape[0]),
        "r_squared": float(model.rsquared),
        "table": rows,
    }

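A minimal usage sketch for twoway_anova (not part of the wheel; assumes polars, pandas, and statsmodels are installed, and uses a small made-up balanced 2x2 design):

import polars as pl
from openstat.stats.manova import twoway_anova

df = pl.DataFrame({
    "yield": [5.1, 5.9, 6.2, 7.0, 4.8, 5.5, 6.8, 7.4],
    "fert":  ["A", "A", "B", "B", "A", "A", "B", "B"],
    "soil":  ["x", "y", "x", "y", "x", "y", "x", "y"],
})
res = twoway_anova(df, "yield", "fert", "soil", interaction=True)
for row in res["table"]:
    # sources come back renamed: fert, soil, fert:soil, plus Intercept/Residual
    print(f"{row['source']:<12} df={row['df']} F={row['F']:.3f} p={row['p_value']:.4f}")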
# ── MANOVA ─────────────────────────────────────────────────────────────────

def fit_manova(
    df: pl.DataFrame,
    dep_vars: list[str],
    group: str,
) -> dict:
    """One-way MANOVA via statsmodels.

    Tests whether group means differ on a set of dependent variables.
    """
    try:
        from statsmodels.multivariate.manova import MANOVA

        pdf = df.select(dep_vars + [group]).drop_nulls().to_pandas()
        # Rename to safe column names so arbitrary names work in the formula.
        safe_deps = [f"y{i}" for i in range(len(dep_vars))]
        safe_group = "group_var"
        mapping = dict(zip(dep_vars + [group], safe_deps + [safe_group]))
        pdf.rename(columns=mapping, inplace=True)
        dep_formula = " + ".join(safe_deps)
        formula = f"{dep_formula} ~ C({safe_group})"

        mv = MANOVA.from_formula(formula, data=pdf)
        res = mv.mv_test()

        # Build a clean summary: all four multivariate tests for each effect.
        effects = []
        for effect_name, effect_res in res.results.items():
            stat_df = effect_res["stat"]
            for test_name in stat_df.index:
                effects.append({
                    "effect": str(effect_name),
                    "test": str(test_name),
                    "statistic": float(stat_df.loc[test_name, "Value"]),
                    "F": float(stat_df.loc[test_name, "F Value"]),
                    "num_df": float(stat_df.loc[test_name, "Num DF"]),
                    "den_df": float(stat_df.loc[test_name, "Den DF"]),
                    "p_value": float(stat_df.loc[test_name, "Pr > F"]),
                })

        return {
            "test": "MANOVA",
            "dep_vars": dep_vars,
            "group": group,
            "n_obs": len(pdf),
            "n_groups": int(pdf[safe_group].nunique()),
            "effects": effects,
        }

    except Exception as exc:
        raise RuntimeError(f"MANOVA failed: {exc}") from exc
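Similarly, a sketch of fit_manova on made-up data, filtering the summary down to Wilks' lambda (test names here are as statsmodels' mv_test emits them):

import polars as pl
from openstat.stats.manova import fit_manova

df = pl.DataFrame({
    "y1":  [5.1, 4.9, 6.3, 6.5, 5.8, 7.1, 5.0, 6.0, 5.6, 6.7],
    "y2":  [3.5, 3.0, 3.3, 2.8, 2.7, 3.0, 3.4, 2.2, 2.9, 3.1],
    "grp": ["a", "a", "b", "b", "c", "c", "a", "b", "c", "b"],
})
res = fit_manova(df, ["y1", "y2"], "grp")
for eff in res["effects"]:
    if eff["test"] == "Wilks' lambda":
        print(eff["effect"], round(eff["statistic"], 4), round(eff["p_value"], 4))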
openstat/stats/mixed.py
ADDED
@@ -0,0 +1,128 @@
"""Mixed / hierarchical linear models: random intercepts, random slopes, ICC."""

from __future__ import annotations

import polars as pl
import statsmodels.api as sm
from statsmodels.regression.mixed_linear_model import MixedLM
from scipy import stats as sp_stats

from openstat.stats.models import FitResult


def _mixed_to_fit_result(result, dep: str, fixed: list[str], group_var: str, re_vars: list[str]) -> FitResult:
    """Convert statsmodels MixedLMResults to FitResult."""
    fe_params = result.fe_params
    bse = result.bse_fe
    tvalues = result.tvalues
    pvalues = result.pvalues

    var_names = list(fe_params.index)
    params = {name: float(fe_params[name]) for name in var_names}
    std_errors = {name: float(bse[name]) for name in var_names}
    t_vals = {name: float(tvalues[name]) for name in var_names}
    p_vals = {name: float(pvalues[name]) for name in var_names}
    ci = result.conf_int()
    conf_low = {name: float(ci.loc[name, 0]) for name in var_names}
    conf_high = {name: float(ci.loc[name, 1]) for name in var_names}

    # FitResult has no dedicated slot for random-effect summaries, so they
    # ride along in the warnings list.
    warnings_list: list[str] = []
    warnings_list.append(f"Group variable: {group_var}")
    n_groups = result.model.n_groups if hasattr(result.model, 'n_groups') else "?"
    warnings_list.append(f"Number of groups: {n_groups}")

    # Random effects variance and ICC
    re_cov = result.cov_re
    if re_cov is not None and re_cov.size > 0:
        if hasattr(re_cov, 'iloc'):
            re_var = float(re_cov.iloc[0, 0])
        else:
            re_var = float(re_cov[0, 0]) if re_cov.ndim > 1 else float(re_cov[0])
        warnings_list.append(f"Random intercept variance: {re_var:.4f}")
        resid_var = float(result.scale)
        icc = re_var / (re_var + resid_var)
        warnings_list.append(f"ICC: {icc:.4f}")

    re_desc = "Random intercept" if not re_vars else f"Random intercept + slopes: {', '.join(re_vars)}"
    warnings_list.append(re_desc)

    return FitResult(
        model_type="Mixed LM",
        formula=f"{dep} ~ {' + '.join(fixed)} || {group_var}: {' + '.join(re_vars) if re_vars else '(intercept)'}",
        dep_var=dep,
        indep_vars=var_names,
        n_obs=int(result.nobs),
        params=params,
        std_errors=std_errors,
        t_values=t_vals,
        p_values=p_vals,
        conf_int_low=conf_low,
        conf_int_high=conf_high,
        log_likelihood=float(result.llf),
        aic=float(result.aic),
        bic=float(result.bic),
        warnings=warnings_list,
    )


def fit_mixed(
    df: pl.DataFrame,
    dep: str,
    fixed: list[str],
    group_var: str,
    re_vars: list[str] | None = None,
) -> tuple[FitResult, object]:
    """Fit a mixed/hierarchical linear model.

    Args:
        dep: Dependent variable name.
        fixed: Fixed effect variable names.
        group_var: Grouping variable name for random effects.
        re_vars: Variables with random slopes. Empty = random intercept only.
    """
    all_cols = list(dict.fromkeys([dep] + fixed + [group_var] + (re_vars or [])))
    pdf = df.select(all_cols).to_pandas().dropna()

    endog = pdf[dep]
    exog = sm.add_constant(pdf[fixed])
    groups = pdf[group_var]

    if re_vars:
        # NOTE: exog_re replaces the default random-effects design, so this
        # fits random slopes *without* a random intercept.
        exog_re = pdf[re_vars]
    else:
        exog_re = None  # random intercept only

    model = MixedLM(endog, exog, groups, exog_re=exog_re)
    result = model.fit(reml=True)

    fit = _mixed_to_fit_result(result, dep, ["const"] + fixed, group_var, re_vars or [])
    return fit, result

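A usage sketch for fit_mixed on simulated grouped data (not part of the wheel; assumes FitResult exposes its constructor fields, e.g. params and warnings, as attributes):

import numpy as np
import polars as pl
from openstat.stats.mixed import fit_mixed

rng = np.random.default_rng(0)
g = np.repeat(np.arange(10), 20)                 # 10 groups x 20 obs
x = rng.normal(size=g.size)
y = 2.0 + 0.5 * x + rng.normal(0, 1.0, 10)[g] + rng.normal(0, 0.5, g.size)

fit, result = fit_mixed(pl.DataFrame({"y": y, "x": x, "g": g}), "y", ["x"], "g")
print(fit.params)     # fixed effects: const near 2.0, x near 0.5
print(fit.warnings)   # group count, random-intercept variance, ICC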
def compute_icc(result) -> float:
    """Compute Intraclass Correlation Coefficient from mixed model result."""
    re_cov = result.cov_re
    if hasattr(re_cov, 'iloc'):
        re_var = float(re_cov.iloc[0, 0])
    else:
        re_var = float(re_cov[0, 0]) if re_cov.ndim > 1 else float(re_cov[0])
    resid_var = float(result.scale)
    return re_var / (re_var + resid_var)


def lr_test(result_restricted, result_full) -> dict:
    """Likelihood ratio test between nested mixed models.

    Returns dict with statistic, df, p_value.
    """
    ll_r = result_restricted.llf
    ll_f = result_full.llf
    lr_stat = 2 * (ll_f - ll_r)
    df_r = result_restricted.df_modelwc
    df_f = result_full.df_modelwc
    df_diff = df_f - df_r
    if df_diff <= 0:
        df_diff = 1
    p_value = float(1 - sp_stats.chi2.cdf(lr_stat, df_diff))
    return {"lr_stat": float(lr_stat), "df": int(df_diff), "p_value": p_value}
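And a sketch of compute_icc and lr_test on the same kind of simulated data. One caveat worth flagging: because fit_mixed passes re_vars as the whole exog_re, the random-slope fit drops the random intercept, so the two fits below are not strictly nested; the comparison is shown only to exercise the function (the fixed effects are identical, as REML likelihood comparisons require).

import numpy as np
import polars as pl
from openstat.stats.mixed import compute_icc, fit_mixed, lr_test

rng = np.random.default_rng(1)
g = np.repeat(np.arange(10), 20)
x = rng.normal(size=g.size)
y = 1.0 + 0.5 * x + rng.normal(0, 1.0, 10)[g] + rng.normal(0, 0.5, g.size)
df = pl.DataFrame({"y": y, "x": x, "g": g})

_, res_int = fit_mixed(df, "y", ["x"], "g")                   # random intercept
_, res_slope = fit_mixed(df, "y", ["x"], "g", re_vars=["x"])  # random slope only
print(compute_icc(res_int))                # share of variance between groups
print(lr_test(res_int, res_slope))         # df_modelwc ties here, so df clamps to 1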
openstat/stats/ml.py
ADDED
@@ -0,0 +1,275 @@
"""Machine learning / penalized regression and decision trees."""

from __future__ import annotations

import numpy as np
import polars as pl

try:
    from sklearn.linear_model import (  # type: ignore[import]
        Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV,
    )
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor  # type: ignore[import]
    from sklearn.model_selection import cross_val_score, KFold  # type: ignore[import]
    from sklearn.preprocessing import StandardScaler  # type: ignore[import]
    from sklearn.metrics import r2_score, mean_squared_error  # type: ignore[import]
    _HAS_SKLEARN = True
except ImportError:
    _HAS_SKLEARN = False


def _require_sklearn():
    if not _HAS_SKLEARN:
        raise ImportError(
            "scikit-learn is required for ML commands.\n"
            "Install: pip install scikit-learn"
        )


def _prep(df: pl.DataFrame, dep: str, indeps: list[str]):
    """Drop nulls, split y/X, and standardize X (zero mean, unit variance)."""
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X = sub.select(indeps).to_numpy().astype(float)
    scaler = StandardScaler()
    X_s = scaler.fit_transform(X)
    return y, X_s, X, scaler


# ── Lasso ─────────────────────────────────────────────────────────────────

def fit_lasso(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    cv: int = 5,
) -> dict:
    """Lasso regression with optional cross-validated alpha selection."""
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        model = LassoCV(cv=cv, max_iter=10000)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
    else:
        model = Lasso(alpha=alpha, max_iter=10000)
        model.fit(X_s, y)

    coef = model.coef_
    y_pred = model.predict(X_s)
    r2 = float(r2_score(y, y_pred))
    mse = float(mean_squared_error(y, y_pred))
    n_nonzero = int(np.sum(coef != 0))

    return {
        "method": "Lasso",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        # NOTE: coefficients are on the standardized-X scale (see _prep).
        "coefficients": dict(zip(indeps, coef.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": r2,
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "n_nonzero": n_nonzero,
        "n_zeroed": len(indeps) - n_nonzero,
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }

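A quick sketch of fit_lasso on synthetic data with three irrelevant predictors (not part of the wheel); note the reported coefficients are on the standardized-X scale:

import numpy as np
import polars as pl
from openstat.stats.ml import fit_lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(0, 0.5, 200)
df = pl.DataFrame({f"x{i}": X[:, i] for i in range(5)}).with_columns(pl.Series("y", y))

res = fit_lasso(df, "y", [f"x{i}" for i in range(5)])  # alpha chosen by 5-fold CV
print(res["alpha"], res["n_zeroed"])   # expect most noise coefficients shrunk to 0
print(res["coefficients"])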
# ── Ridge ─────────────────────────────────────────────────────────────────

def fit_ridge(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    cv: int = 5,
) -> dict:
    """Ridge regression with optional cross-validated alpha selection."""
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        alphas = np.logspace(-3, 5, 50)
        model = RidgeCV(alphas=alphas, cv=cv)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
    else:
        model = Ridge(alpha=alpha)
        model.fit(X_s, y)

    coef = model.coef_
    y_pred = model.predict(X_s)
    r2 = float(r2_score(y, y_pred))
    mse = float(mean_squared_error(y, y_pred))

    return {
        "method": "Ridge",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        "coefficients": dict(zip(indeps, coef.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": r2,
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }


# ── Elastic Net ────────────────────────────────────────────────────────────

def fit_elasticnet(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    l1_ratio: float = 0.5,
    cv: int = 5,
) -> dict:
    """Elastic Net regression."""
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        # CV searches both alpha and the L1/L2 mix.
        model = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1.0], cv=cv, max_iter=10000)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
        l1_ratio = float(model.l1_ratio_)
    else:
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000)
        model.fit(X_s, y)

    coef = model.coef_
    y_pred = model.predict(X_s)
    r2 = float(r2_score(y, y_pred))
    mse = float(mean_squared_error(y, y_pred))

    return {
        "method": "ElasticNet",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        "l1_ratio": l1_ratio,
        "coefficients": dict(zip(indeps, coef.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": r2,
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }


# ── Decision Tree ──────────────────────────────────────────────────────────

def fit_cart(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    task: str = "regression",
    max_depth: int | None = 5,
    min_samples_leaf: int = 5,
) -> dict:
    """CART: decision tree for regression or classification."""
    _require_sklearn()
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)

    if task == "classification":
        y = y.astype(str)
        model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        model.fit(X, y)
        score = float(model.score(X, y))
        metric_name = "accuracy"
    else:
        y = y.astype(float)
        model = DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        model.fit(X, y)
        y_pred = model.predict(X)
        score = float(r2_score(y, y_pred))
        metric_name = "r_squared"

    importances = dict(zip(indeps, model.feature_importances_.tolist()))

    return {
        "method": "CART",
        "task": task,
        "dep": dep,
        "indeps": indeps,
        "max_depth": max_depth,
        "n_leaves": int(model.get_n_leaves()),
        "n_obs": len(y),
        # NOTE: score is an in-sample (training) fit, not out-of-sample.
        metric_name: score,
        "feature_importances": importances,
        "_model": model,
        "_indeps": indeps,
    }

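A sketch of fit_cart on synthetic data with a threshold effect (not part of the wheel); the returned r_squared is in-sample, so pair it with cross_validate_model below for an honest estimate:

import numpy as np
import polars as pl
from openstat.stats.ml import fit_cart

rng = np.random.default_rng(0)
x0, x1 = rng.normal(size=300), rng.normal(size=300)
y = np.where(x0 > 0, 2.0, -1.0) + 0.5 * x1 + rng.normal(0, 0.3, 300)
df = pl.DataFrame({"x0": x0, "x1": x1, "y": y})

res = fit_cart(df, "y", ["x0", "x1"], task="regression", max_depth=3)
print(res["r_squared"], res["n_leaves"])
print(res["feature_importances"])   # x0 should dominate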
# ── Cross-validation ───────────────────────────────────────────────────────

def cross_validate_model(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    method: str = "ols",
    k: int = 5,
    alpha: float = 1.0,
    scoring: str = "r2",
) -> dict:
    """K-fold cross-validation for various models."""
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    method_lower = method.lower()
    if method_lower == "lasso":
        model = Lasso(alpha=alpha, max_iter=10000)
        X_fit = X_s
    elif method_lower == "ridge":
        model = Ridge(alpha=alpha)
        X_fit = X_s
    elif method_lower == "elasticnet":
        model = ElasticNet(alpha=alpha, max_iter=10000)
        X_fit = X_s
    elif method_lower == "cart":
        # Trees are scale-invariant, so use the raw (unstandardized) X.
        model = DecisionTreeRegressor(max_depth=5)
        X_fit = X_raw
    else:
        # Default: OLS via scikit-learn's LinearRegression.
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        X_fit = X_s

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_fit, y, cv=kf, scoring=scoring)

    return {
        "method": method,
        "dep": dep,
        "indeps": indeps,
        "k_folds": k,
        "scoring": scoring,
        "scores": scores.tolist(),
        "mean_score": float(scores.mean()),
        "std_score": float(scores.std()),
        "min_score": float(scores.min()),
        "max_score": float(scores.max()),
        "n_obs": len(y),
    }
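A sketch comparing methods with cross_validate_model (not part of the wheel); unlike the in-sample scores returned by the fit_* functions, these are out-of-fold r2 values:

import numpy as np
import polars as pl
from openstat.stats.ml import cross_validate_model

rng = np.random.default_rng(1)
X = rng.normal(size=(150, 3))
y = X[:, 0] - X[:, 1] + rng.normal(0, 1.0, 150)
df = pl.DataFrame({"x0": X[:, 0], "x1": X[:, 1], "x2": X[:, 2], "y": y})

for m in ("ols", "ridge", "cart"):
    out = cross_validate_model(df, "y", ["x0", "x1", "x2"], method=m, k=5)
    print(m, round(out["mean_score"], 3), "+/-", round(out["std_score"], 3))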
openstat/stats/ml_advanced.py
ADDED
@@ -0,0 +1,117 @@
"""Advanced ML: RandomForest, GradientBoosting, SVM, t-SNE."""

from __future__ import annotations

import polars as pl


def fit_random_forest(df: pl.DataFrame, dep: str, indeps: list[str],
                      n_estimators: int = 100, max_depth: int | None = None,
                      task: str = "regression", seed: int = 42) -> dict:
    """Random Forest regressor or classifier."""
    # Lazy import keeps scikit-learn optional at module-import time.
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=seed)
        clf.fit(X, y)
        score = float(clf.score(X, y))
        metric = "accuracy"
    else:
        clf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=seed)
        clf.fit(X, y.astype(float))
        score = float(clf.score(X, y.astype(float)))
        metric = "r_squared"
    feat_imp = {col: float(imp) for col, imp in zip(indeps, clf.feature_importances_)}
    return {
        "method": f"Random Forest ({task})",
        "dep": dep, "indeps": indeps,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        metric: score,
        "feature_importances": feat_imp,
        "n_obs": len(y),
        "_model": clf,
    }

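A usage sketch for fit_random_forest (not part of the wheel); keep in mind the returned accuracy is measured on the training data:

import numpy as np
import polars as pl
from openstat.stats.ml_advanced import fit_random_forest

rng = np.random.default_rng(0)
x0, x1 = rng.normal(size=200), rng.normal(size=200)
label = np.where(x0 + x1 > 0, "pos", "neg")
df = pl.DataFrame({"x0": x0, "x1": x1, "label": label})

res = fit_random_forest(df, "label", ["x0", "x1"], task="classification")
print(res["accuracy"])              # in-sample, typically near 1.0
print(res["feature_importances"])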
def fit_gradient_boosting(df: pl.DataFrame, dep: str, indeps: list[str],
                          n_estimators: int = 100, learning_rate: float = 0.1,
                          max_depth: int = 3, task: str = "regression", seed: int = 42) -> dict:
    """Gradient Boosting regressor or classifier."""
    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                         max_depth=max_depth, random_state=seed)
        clf.fit(X, y)
        score = float(clf.score(X, y))
        metric = "accuracy"
    else:
        clf = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                        max_depth=max_depth, random_state=seed)
        clf.fit(X, y.astype(float))
        score = float(clf.score(X, y.astype(float)))
        metric = "r_squared"
    feat_imp = {col: float(imp) for col, imp in zip(indeps, clf.feature_importances_)}
    return {
        "method": f"Gradient Boosting ({task})",
        "dep": dep, "indeps": indeps,
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        metric: score,
        "feature_importances": feat_imp,
        "n_obs": len(y),
        "_model": clf,
    }


def fit_svm(df: pl.DataFrame, dep: str, indeps: list[str],
            kernel: str = "rbf", C: float = 1.0,
            task: str = "regression", seed: int = 42) -> dict:
    """Support Vector Machine regressor or classifier."""
    from sklearn.svm import SVC, SVR
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = SVC(kernel=kernel, C=C, random_state=seed)
        clf.fit(X, y)
        score = float(clf.score(X, y))
        metric = "accuracy"
    else:
        # SVR is deterministic; the seed is unused for regression.
        clf = SVR(kernel=kernel, C=C)
        clf.fit(X, y.astype(float))
        score = float(clf.score(X, y.astype(float)))
        metric = "r_squared"
    return {
        "method": f"SVM ({task}, kernel={kernel})",
        "dep": dep, "indeps": indeps,
        "kernel": kernel, "C": C,
        metric: score,
        "n_obs": len(y),
        "_model": clf,
    }


def fit_tsne(df: pl.DataFrame, cols: list[str], n_components: int = 2,
             perplexity: float = 30.0, seed: int = 42) -> dict:
    """t-SNE dimensionality reduction."""
    from sklearn.manifold import TSNE
    sub = df.select(cols).drop_nulls()
    X = sub.to_numpy().astype(float)
    # Perplexity must be < n_samples; clamp for small datasets.
    tsne = TSNE(n_components=n_components, perplexity=min(perplexity, len(X) - 1),
                random_state=seed)
    embedding = tsne.fit_transform(X)
    return {
        "method": "t-SNE",
        "cols": cols,
        "n_components": n_components,
        "perplexity": perplexity,
        "embedding": embedding.tolist(),
        "n_obs": len(X),
    }
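Finally, a sketch of fit_tsne on two well-separated Gaussian clusters (not part of the wheel):

import numpy as np
import polars as pl
from openstat.stats.ml_advanced import fit_tsne

rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0, 1, (50, 4)), rng.normal(5, 1, (50, 4))])
df = pl.DataFrame(pts, schema=["c1", "c2", "c3", "c4"])

res = fit_tsne(df, ["c1", "c2", "c3", "c4"], n_components=2, perplexity=15.0)
emb = np.asarray(res["embedding"])   # shape (100, 2); clusters stay separated
print(emb.shape)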