openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,124 @@
1
+ """MANOVA and two-way ANOVA."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ from scipy import stats as sp_stats
8
+
9
+
10
+ # ── Two-way ANOVA ─────────────────────────────────────────────────────────
11
+
12
def twoway_anova(
    df: pl.DataFrame,
    dep: str,
    factor1: str,
    factor2: str,
    *,
    interaction: bool = True,
) -> dict:
    """Two-way ANOVA with optional interaction term.

    Uses OLS approach (type III sums of squares via statsmodels).

    Args:
        df: Input data frame.
        dep: Dependent (response) variable name.
        factor1: First categorical factor.
        factor2: Second categorical factor.
        interaction: Include the factor1 x factor2 interaction term.

    Returns:
        dict with the ANOVA table rows plus model metadata.
    """
    import statsmodels.formula.api as smf
    from statsmodels.stats.anova import anova_lm

    pdf = df.select([dep, factor1, factor2]).drop_nulls().to_pandas()
    # Rename columns to safe identifiers so arbitrary user column names
    # cannot break the patsy formula.
    pdf.columns = ["dep", "f1", "f2"]
    formula = "dep ~ C(f1) + C(f2)"
    if interaction:
        formula += " + C(f1):C(f2)"

    model = smf.ols(formula, data=pdf).fit()
    anova_table = anova_lm(model, typ=3)

    rows = []
    for source, row in anova_table.iterrows():
        rows.append({
            # Map the safe names back to the user's column names for display.
            "source": str(source).replace("C(f1)", factor1).replace("C(f2)", factor2),
            "df": int(row.get("df", 0)),
            "SS": float(row.get("sum_sq", float("nan"))),
            "MS": float(row.get("mean_sq", float("nan"))),
            "F": float(row.get("F", float("nan"))),
            "p_value": float(row.get("PR(>F)", float("nan"))),
        })

    return {
        "test": "Two-way ANOVA",
        "dep": dep,
        "factor1": factor1,
        "factor2": factor2,
        "interaction": interaction,
        "n_obs": int(pdf.shape[0]),
        "r_squared": float(model.rsquared),
        "table": rows,
    }
63
+
64
+
65
+ # ── MANOVA ─────────────────────────────────────────────────────────────────
66
+
67
def fit_manova(
    df: pl.DataFrame,
    dep_vars: list[str],
    group: str,
) -> dict:
    """
    One-way MANOVA via statsmodels.

    Tests whether group means differ on a set of dependent variables.

    Args:
        df: Input data frame.
        dep_vars: Dependent variable names.
        group: Grouping (factor) variable name.

    Returns:
        dict with one entry per (effect, test statistic) pair — Wilks'
        lambda, Pillai's trace, etc. — plus sample metadata.

    Raises:
        RuntimeError: if statsmodels is unavailable or the fit fails.
    """
    try:
        from statsmodels.multivariate.manova import MANOVA

        pdf = df.select(dep_vars + [group]).drop_nulls().to_pandas()
        # Rename to safe identifiers so arbitrary column names cannot
        # break the formula.
        safe_deps = [f"y{i}" for i in range(len(dep_vars))]
        safe_group = "group_var"
        mapping = dict(zip(dep_vars + [group], safe_deps + [safe_group]))
        pdf.rename(columns=mapping, inplace=True)
        dep_formula = " + ".join(safe_deps)
        formula = f"{dep_formula} ~ C({safe_group})"

        mv = MANOVA.from_formula(formula, data=pdf)
        res = mv.mv_test()

        # Flatten every effect's statistic table into plain dict rows.
        effects = []
        for effect_name, effect_res in res.results.items():
            stat_df = effect_res["stat"]
            for test_name in stat_df.index:
                effects.append({
                    "effect": str(effect_name),
                    "test": str(test_name),
                    "statistic": float(stat_df.loc[test_name, "Value"]),
                    "F": float(stat_df.loc[test_name, "F Value"]),
                    "num_df": float(stat_df.loc[test_name, "Num DF"]),
                    "den_df": float(stat_df.loc[test_name, "Den DF"]),
                    "p_value": float(stat_df.loc[test_name, "Pr > F"]),
                })

        return {
            "test": "MANOVA",
            "dep_vars": dep_vars,
            "group": group,
            "n_obs": len(pdf),
            "n_groups": int(pdf[safe_group].nunique()),
            "effects": effects,
        }

    except Exception as exc:
        raise RuntimeError(f"MANOVA failed: {exc}") from exc
@@ -0,0 +1,128 @@
1
+ """Mixed / hierarchical linear models: random intercepts, random slopes, ICC."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import statsmodels.api as sm
8
+ from statsmodels.regression.mixed_linear_model import MixedLM
9
+ from scipy import stats as sp_stats
10
+
11
+ from openstat.stats.models import FitResult
12
+
13
+
14
def _mixed_to_fit_result(result, dep: str, fixed: list[str], group_var: str, re_vars: list[str]) -> FitResult:
    """Convert statsmodels MixedLMResults to FitResult."""
    fe = result.fe_params
    se = result.bse_fe
    tv = result.tvalues
    pv = result.pvalues
    ci = result.conf_int()

    names = list(fe.index)
    params: dict[str, float] = {}
    std_errors: dict[str, float] = {}
    t_vals: dict[str, float] = {}
    p_vals: dict[str, float] = {}
    conf_low: dict[str, float] = {}
    conf_high: dict[str, float] = {}
    for name in names:
        params[name] = float(fe[name])
        std_errors[name] = float(se[name])
        t_vals[name] = float(tv[name])
        p_vals[name] = float(pv[name])
        conf_low[name] = float(ci.loc[name, 0])
        conf_high[name] = float(ci.loc[name, 1])

    # Extra model info is surfaced through the warnings list of FitResult.
    notes: list[str] = [f"Group variable: {group_var}"]
    n_groups = result.model.n_groups if hasattr(result.model, 'n_groups') else "?"
    notes.append(f"Number of groups: {n_groups}")

    # Random-effects variance and the intraclass correlation, when available.
    re_cov = result.cov_re
    if re_cov is not None and re_cov.size > 0:
        if hasattr(re_cov, 'iloc'):
            re_var = float(re_cov.iloc[0, 0])
        else:
            re_var = float(re_cov[0, 0]) if re_cov.ndim > 1 else float(re_cov[0])
        notes.append(f"Random intercept variance: {re_var:.4f}")
        resid_var = float(result.scale)
        icc = re_var / (re_var + resid_var)
        notes.append(f"ICC: {icc:.4f}")

    if re_vars:
        notes.append(f"Random intercept + slopes: {', '.join(re_vars)}")
    else:
        notes.append("Random intercept")

    return FitResult(
        model_type="Mixed LM",
        formula=f"{dep} ~ {' + '.join(fixed)} || {group_var}: {' + '.join(re_vars) if re_vars else '(intercept)'}",
        dep_var=dep,
        indep_vars=names,
        n_obs=int(result.nobs),
        params=params,
        std_errors=std_errors,
        t_values=t_vals,
        p_values=p_vals,
        conf_int_low=conf_low,
        conf_int_high=conf_high,
        log_likelihood=float(result.llf),
        aic=float(result.aic),
        bic=float(result.bic),
        warnings=notes,
    )
67
+
68
+
69
def fit_mixed(
    df: pl.DataFrame,
    dep: str,
    fixed: list[str],
    group_var: str,
    re_vars: list[str] | None = None,
) -> tuple[FitResult, object]:
    """Fit a mixed/hierarchical linear model.

    Args:
        df: Input data frame.
        dep: Dependent variable name.
        fixed: Fixed effect variable names.
        group_var: Grouping variable name for random effects.
        re_vars: Variables with random slopes. Empty = random intercept only.

    Returns:
        Tuple of (FitResult summary, raw statsmodels MixedLMResults).
    """
    all_cols = list(dict.fromkeys([dep] + fixed + [group_var] + (re_vars or [])))
    pdf = df.select(all_cols).to_pandas().dropna()

    endog = pdf[dep]
    exog = sm.add_constant(pdf[fixed])
    groups = pdf[group_var]

    if re_vars:
        # Include a constant so the model keeps the random intercept in
        # addition to the requested random slopes (matching the
        # "Random intercept + slopes" description in the fit summary).
        exog_re = sm.add_constant(pdf[re_vars])
    else:
        exog_re = None  # statsmodels default: random intercept only

    model = MixedLM(endog, exog, groups, exog_re=exog_re)
    result = model.fit(reml=True)

    fit = _mixed_to_fit_result(result, dep, ["const"] + fixed, group_var, re_vars or [])
    return fit, result
101
+
102
+
103
def compute_icc(result) -> float:
    """Compute Intraclass Correlation Coefficient from mixed model result."""
    cov = result.cov_re
    # cov_re may be a pandas DataFrame or a plain ndarray (1-D or 2-D);
    # either way we want the [0, 0] entry (between-group variance).
    if hasattr(cov, 'iloc'):
        between = float(cov.iloc[0, 0])
    elif cov.ndim > 1:
        between = float(cov[0, 0])
    else:
        between = float(cov[0])
    within = float(result.scale)  # residual variance
    return between / (between + within)
112
+
113
+
114
def lr_test(result_restricted, result_full) -> dict:
    """Likelihood ratio test between nested mixed models.

    Args:
        result_restricted: Fitted result of the smaller (nested) model.
        result_full: Fitted result of the larger model.

    Returns dict with statistic, df, p_value.
    """
    ll_r = result_restricted.llf
    ll_f = result_full.llf
    lr_stat = 2 * (ll_f - ll_r)
    df_r = result_restricted.df_modelwc
    df_f = result_full.df_modelwc
    df_diff = df_f - df_r
    if df_diff <= 0:
        # Degenerate nesting: fall back to 1 df rather than failing.
        df_diff = 1
    # chi2.sf (survival function) is more accurate than 1 - cdf in the tail.
    p_value = float(sp_stats.chi2.sf(lr_stat, df_diff))
    return {"lr_stat": float(lr_stat), "df": int(df_diff), "p_value": p_value}
openstat/stats/ml.py ADDED
@@ -0,0 +1,275 @@
1
+ """Machine learning / penalized regression and decision trees."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+ try:
9
+ from sklearn.linear_model import ( # type: ignore[import]
10
+ Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV,
11
+ )
12
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor # type: ignore[import]
13
+ from sklearn.model_selection import cross_val_score, KFold # type: ignore[import]
14
+ from sklearn.preprocessing import StandardScaler # type: ignore[import]
15
+ from sklearn.metrics import r2_score, mean_squared_error # type: ignore[import]
16
+ _HAS_SKLEARN = True
17
+ except ImportError:
18
+ _HAS_SKLEARN = False
19
+
20
+
21
def _require_sklearn():
    """Raise a helpful ImportError when scikit-learn is not installed."""
    if _HAS_SKLEARN:
        return
    raise ImportError(
        "scikit-learn is required for ML commands.\n"
        "Install: pip install scikit-learn"
    )
27
+
28
+
29
def _prep(df: pl.DataFrame, dep: str, indeps: list[str]):
    """Drop nulls and return (y, standardized X, raw X, fitted scaler)."""
    clean = df.select([dep] + indeps).drop_nulls()
    y = clean[dep].to_numpy().astype(float)
    X_raw = clean.select(indeps).to_numpy().astype(float)
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X_raw)
    return y, X_std, X_raw, scaler
36
+
37
+
38
+ # ── Lasso ─────────────────────────────────────────────────────────────────
39
+
40
def fit_lasso(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    cv: int = 5,
) -> dict:
    """Lasso regression with optional cross-validated alpha selection.

    When ``alpha`` is None, the penalty strength is picked by ``cv``-fold
    cross-validation. Predictors are standardized before fitting, so the
    reported coefficients are on the standardized scale.
    """
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        model = LassoCV(cv=cv, max_iter=10000)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
    else:
        model = Lasso(alpha=alpha, max_iter=10000)
        model.fit(X_s, y)

    fitted = model.predict(X_s)
    mse = float(mean_squared_error(y, fitted))
    kept = int(np.sum(model.coef_ != 0))  # lasso zeroes out weak predictors

    return {
        "method": "Lasso",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        "coefficients": dict(zip(indeps, model.coef_.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": float(r2_score(y, fitted)),
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "n_nonzero": kept,
        "n_zeroed": len(indeps) - kept,
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }
83
+
84
+
85
+ # ── Ridge ─────────────────────────────────────────────────────────────────
86
+
87
def fit_ridge(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    cv: int = 5,
) -> dict:
    """Ridge regression with optional cross-validated alpha selection.

    When ``alpha`` is None, it is chosen from a log-spaced grid via
    ``cv``-fold cross-validation. Predictors are standardized first.
    """
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        # Search a wide log-spaced grid of penalties.
        grid = np.logspace(-3, 5, 50)
        model = RidgeCV(alphas=grid, cv=cv)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
    else:
        model = Ridge(alpha=alpha)
        model.fit(X_s, y)

    fitted = model.predict(X_s)
    mse = float(mean_squared_error(y, fitted))

    return {
        "method": "Ridge",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        "coefficients": dict(zip(indeps, model.coef_.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": float(r2_score(y, fitted)),
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }
128
+
129
+
130
+ # ── Elastic Net ────────────────────────────────────────────────────────────
131
+
132
def fit_elasticnet(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    alpha: float | None = None,
    l1_ratio: float = 0.5,
    cv: int = 5,
) -> dict:
    """Elastic Net regression.

    When ``alpha`` is None, both the penalty and the L1/L2 mix are chosen
    by ``cv``-fold cross-validation; otherwise the supplied ``alpha`` and
    ``l1_ratio`` are used as-is. Predictors are standardized first.
    """
    _require_sklearn()
    y, X_s, X_raw, scaler = _prep(df, dep, indeps)

    if alpha is None:
        model = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1.0], cv=cv, max_iter=10000)
        model.fit(X_s, y)
        alpha = float(model.alpha_)
        l1_ratio = float(model.l1_ratio_)
    else:
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000)
        model.fit(X_s, y)

    fitted = model.predict(X_s)
    mse = float(mean_squared_error(y, fitted))

    return {
        "method": "ElasticNet",
        "dep": dep,
        "indeps": indeps,
        "alpha": alpha,
        "l1_ratio": l1_ratio,
        "coefficients": dict(zip(indeps, model.coef_.tolist())),
        "intercept": float(model.intercept_),
        "r_squared": float(r2_score(y, fitted)),
        "mse": mse,
        "rmse": float(np.sqrt(mse)),
        "n_obs": len(y),
        "_model": model,
        "_scaler": scaler,
        "_indeps": indeps,
    }
175
+
176
+
177
+ # ── Decision Tree ──────────────────────────────────────────────────────────
178
+
179
def fit_cart(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    task: str = "regression",
    max_depth: int | None = 5,
    min_samples_leaf: int = 5,
) -> dict:
    """CART: decision tree for regression or classification.

    ``task`` selects the estimator; classification coerces the target to
    strings, regression to floats. The reported score (accuracy or R^2)
    is computed on the training data.
    """
    _require_sklearn()
    clean = df.select([dep] + indeps).drop_nulls()
    target = clean[dep].to_numpy()
    features = clean.select(indeps).to_numpy().astype(float)

    if task == "classification":
        target = target.astype(str)
        tree = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        tree.fit(features, target)
        metric_name = "accuracy"
        score = float(tree.score(features, target))
    else:
        target = target.astype(float)
        tree = DecisionTreeRegressor(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        tree.fit(features, target)
        metric_name = "r_squared"
        score = float(r2_score(target, tree.predict(features)))

    return {
        "method": "CART",
        "task": task,
        "dep": dep,
        "indeps": indeps,
        "max_depth": max_depth,
        "n_leaves": int(tree.get_n_leaves()),
        "n_obs": len(target),
        metric_name: score,
        "feature_importances": dict(zip(indeps, tree.feature_importances_.tolist())),
        "_model": tree,
        "_indeps": indeps,
    }
223
+
224
+
225
+ # ── Cross-validation ───────────────────────────────────────────────────────
226
+
227
def cross_validate_model(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    method: str = "ols",
    k: int = 5,
    alpha: float = 1.0,
    scoring: str = "r2",
) -> dict:
    """K-fold cross-validation for various models.

    ``method`` is one of ``lasso``, ``ridge``, ``elasticnet``, ``cart``;
    anything else falls back to plain OLS. Linear models are fit on
    standardized predictors, the tree on raw values.
    """
    _require_sklearn()
    y, X_std, X_raw, _scaler = _prep(df, dep, indeps)

    key = method.lower()
    if key == "lasso":
        estimator, X_fit = Lasso(alpha=alpha, max_iter=10000), X_std
    elif key == "ridge":
        estimator, X_fit = Ridge(alpha=alpha), X_std
    elif key == "elasticnet":
        estimator, X_fit = ElasticNet(alpha=alpha, max_iter=10000), X_std
    elif key == "cart":
        estimator, X_fit = DecisionTreeRegressor(max_depth=5), X_raw
    else:
        # OLS via sklearn so it plugs into cross_val_score uniformly.
        from sklearn.linear_model import LinearRegression
        estimator, X_fit = LinearRegression(), X_std

    # Fixed seed keeps fold assignment reproducible across runs.
    folds = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(estimator, X_fit, y, cv=folds, scoring=scoring)

    return {
        "method": method,
        "dep": dep,
        "indeps": indeps,
        "k_folds": k,
        "scoring": scoring,
        "scores": scores.tolist(),
        "mean_score": float(scores.mean()),
        "std_score": float(scores.std()),
        "min_score": float(scores.min()),
        "max_score": float(scores.max()),
        "n_obs": len(y),
    }
@@ -0,0 +1,117 @@
1
+ """Advanced ML: RandomForest, GradientBoosting, SVM, t-SNE."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+
9
def fit_random_forest(df: pl.DataFrame, dep: str, indeps: list[str],
                      n_estimators: int = 100, max_depth: int | None = None,
                      task: str = "regression", seed: int = 42) -> dict:
    """Random Forest regressor or classifier.

    Args:
        df: Input data frame.
        dep: Target variable name.
        indeps: Predictor variable names.
        n_estimators: Number of trees.
        max_depth: Maximum tree depth (None = unlimited).
        task: "regression" or "classification".
        seed: Random seed for reproducibility.

    Returns:
        dict with the in-sample score, feature importances, and the
        fitted model under "_model".
    """
    # Local import keeps scikit-learn an optional dependency.
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=seed)
        metric = "accuracy"
    else:
        y = y.astype(float)
        clf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=seed)
        metric = "r_squared"
    clf.fit(X, y)
    # NOTE: score is computed on the training data (in-sample).
    score = float(clf.score(X, y))
    feat_imp = {col: float(imp) for col, imp in zip(indeps, clf.feature_importances_)}
    return {
        "method": f"Random Forest ({task})",
        "dep": dep, "indeps": indeps,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        metric: score,
        "feature_importances": feat_imp,
        "n_obs": len(y),
        "_model": clf,
    }
38
+
39
+
40
def fit_gradient_boosting(df: pl.DataFrame, dep: str, indeps: list[str],
                          n_estimators: int = 100, learning_rate: float = 0.1,
                          max_depth: int = 3, task: str = "regression", seed: int = 42) -> dict:
    """Gradient Boosting regressor or classifier.

    Args:
        df: Input data frame.
        dep: Target variable name.
        indeps: Predictor variable names.
        n_estimators: Number of boosting stages.
        learning_rate: Shrinkage applied to each stage.
        max_depth: Maximum depth of each tree.
        task: "regression" or "classification".
        seed: Random seed for reproducibility.

    Returns:
        dict with the in-sample score, feature importances, and the
        fitted model under "_model".
    """
    # Local import keeps scikit-learn an optional dependency.
    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                         max_depth=max_depth, random_state=seed)
        metric = "accuracy"
    else:
        y = y.astype(float)
        clf = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                        max_depth=max_depth, random_state=seed)
        metric = "r_squared"
    clf.fit(X, y)
    # NOTE: score is computed on the training data (in-sample).
    score = float(clf.score(X, y))
    feat_imp = {col: float(imp) for col, imp in zip(indeps, clf.feature_importances_)}
    return {
        "method": f"Gradient Boosting ({task})",
        "dep": dep, "indeps": indeps,
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        # Included for consistency with fit_random_forest's result dict.
        "max_depth": max_depth,
        metric: score,
        "feature_importances": feat_imp,
        "n_obs": len(y),
        "_model": clf,
    }
71
+
72
+
73
def fit_svm(df: pl.DataFrame, dep: str, indeps: list[str],
            kernel: str = "rbf", C: float = 1.0,
            task: str = "regression", seed: int = 42) -> dict:
    """Support Vector Machine regressor or classifier.

    Args:
        df: Input data frame.
        dep: Target variable name.
        indeps: Predictor variable names.
        kernel: SVM kernel ("rbf", "linear", "poly", ...).
        C: Regularization strength (larger = less regularization).
        task: "regression" or "classification".
        seed: Random seed (used by SVC only; SVR is deterministic).

    Returns:
        dict with the in-sample score and the fitted model under "_model".
    """
    # Local import keeps scikit-learn an optional dependency.
    from sklearn.svm import SVC, SVR

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)
    if task == "classification":
        clf = SVC(kernel=kernel, C=C, random_state=seed)
        metric = "accuracy"
    else:
        y = y.astype(float)
        clf = SVR(kernel=kernel, C=C)
        metric = "r_squared"
    clf.fit(X, y)
    # NOTE: score is computed on the training data (in-sample).
    score = float(clf.score(X, y))
    return {
        "method": f"SVM ({task}, kernel={kernel})",
        "dep": dep, "indeps": indeps,
        "kernel": kernel, "C": C,
        metric: score,
        "n_obs": len(y),
        "_model": clf,
    }
99
+
100
+
101
def fit_tsne(df: pl.DataFrame, cols: list[str], n_components: int = 2,
             perplexity: float = 30.0, seed: int = 42) -> dict:
    """t-SNE dimensionality reduction."""
    from sklearn.manifold import TSNE

    matrix = df.select(cols).drop_nulls().to_numpy().astype(float)
    # sklearn requires perplexity < n_samples, so clamp the requested value;
    # the dict still reports the perplexity the caller asked for.
    effective_perplexity = min(perplexity, len(matrix) - 1)
    reducer = TSNE(n_components=n_components, perplexity=effective_perplexity,
                   random_state=seed)
    coords = reducer.fit_transform(matrix)
    return {
        "method": "t-SNE",
        "cols": cols,
        "n_components": n_components,
        "perplexity": perplexity,
        "embedding": coords.tolist(),
        "n_obs": len(matrix),
    }