openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,269 @@
1
+ """Advanced regression: NLS, Beta, ZIP/ZINB, Hurdle, SUR."""
2
+
3
+ from __future__ import annotations
4
+ import numpy as np
5
+ import polars as pl
6
+ from scipy import stats as sp_stats
7
+ from scipy.optimize import least_squares
8
+ import statsmodels.api as sm
9
+ import statsmodels.formula.api as smf
10
+
11
+
12
+ # ── NLS ───────────────────────────────────────────────────────────────────
13
+
14
def fit_nls(df: pl.DataFrame, dep: str, indeps: list[str],
            formula_fn, p0: list[float], *, robust: bool = False) -> dict:
    """Nonlinear Least Squares via scipy.optimize.least_squares.

    Parameters
    ----------
    df: input data; rows with nulls in dep/indeps are dropped.
    dep: dependent variable column name.
    indeps: independent variable column names.
    formula_fn: callable(X, *params) -> y_pred where X is ndarray (n, k).
    p0: initial parameter guesses.
    robust: when True, report heteroskedasticity-robust (HC0 sandwich)
        standard errors; when False (default) classical errors s2*(J'J)^-1.
        (Previously this flag was accepted but ignored.)

    Returns a dict with params named p0..p{m-1}, std errors, R-squared,
    n_obs, convergence flag and final cost.
    """
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X = sub.select(indeps).to_numpy().astype(float)

    def residuals(params):
        return formula_fn(X, *params) - y

    # Levenberg-Marquardt: standard choice for unconstrained least squares.
    result = least_squares(residuals, p0, method='lm')
    y_pred = formula_fn(X, *result.x)
    resid = y - y_pred
    ss_res = np.sum(resid**2)
    ss_tot = np.sum((y - y.mean())**2)
    r2 = float(1 - ss_res/ss_tot) if ss_tot > 0 else float('nan')

    # Approximate std errors from the Jacobian at the solution.
    try:
        J = result.jac
        bread = np.linalg.inv(J.T @ J)
        if robust:
            # HC0 sandwich: (J'J)^-1 J' diag(e^2) J (J'J)^-1
            meat = J.T @ (resid[:, None] ** 2 * J)
            cov = bread @ meat @ bread
        else:
            # Classical: s2 * (J'J)^-1 with s2 = SSR / (n - m)
            cov = bread * (ss_res / max(len(y) - len(p0), 1))
        se = np.sqrt(np.diag(cov))
    except Exception:
        se = np.full(len(p0), float('nan'))

    params_dict = {f"p{i}": float(v) for i, v in enumerate(result.x)}
    se_dict = {f"p{i}": float(v) for i, v in enumerate(se)}

    return {
        "method": "NLS",
        "dep": dep,
        "indeps": indeps,
        "params": params_dict,
        "std_errors": se_dict,
        "r_squared": r2,
        "n_obs": len(y),
        "converged": result.success,
        "cost": float(result.cost),
    }
56
+
57
+
58
+ # ── Beta regression ───────────────────────────────────────────────────────
59
+
60
def fit_betareg(df: pl.DataFrame, dep: str, indeps: list[str],
                *, link: str = 'logit') -> dict:
    """Beta regression for (0,1) bounded outcomes via statsmodels GLM.

    Approximates beta regression with a Binomial-family GLM on the
    boundary-clamped proportion response.  `link` selects the mean link:
    'logit' (default), 'probit', 'cloglog' or 'log'.

    Raises ValueError for an unknown link and RuntimeError if the GLM
    fit itself fails.
    """
    sub = df.select([dep] + indeps).drop_nulls()

    # Fit via the formula API on generic column names so arbitrary
    # variable names (spaces, operators) cannot break the patsy formula.
    pdf = sub.to_pandas()
    pdf.columns = ["dep"] + [f"x{i}" for i in range(len(indeps))]

    # Clamp the response away from the {0, 1} boundary; exact zeros/ones
    # produce infinite link values.  (The old code computed this clamp
    # on a throwaway array and fit on the raw data.)
    eps = 1e-6
    pdf["dep"] = np.clip(pdf["dep"].astype(float), eps, 1 - eps)

    # Honor the requested link — previously hard-coded to logit.
    link_map = {
        'logit': sm.families.links.Logit,
        'probit': sm.families.links.Probit,
        'cloglog': sm.families.links.CLogLog,
        'log': sm.families.links.Log,
    }
    try:
        link_obj = link_map[link.lower()]()
    except KeyError:
        raise ValueError(
            f"Unsupported link {link!r}; choose one of {sorted(link_map)}"
        ) from None

    formula = "dep ~ " + " + ".join(f"x{i}" for i in range(len(indeps)))

    try:
        model = smf.glm(formula, data=pdf,
                        family=sm.families.Binomial(link=link_obj)).fit()
        params = dict(zip(["_cons"] + indeps, model.params.tolist()))
        se = dict(zip(["_cons"] + indeps, model.bse.tolist()))
        pvals = dict(zip(["_cons"] + indeps, model.pvalues.tolist()))
        return {
            "method": "Beta Regression (GLM-Binomial)",
            "dep": dep, "indeps": indeps,
            "params": params, "std_errors": se, "p_values": pvals,
            "aic": float(model.aic), "bic": float(model.bic),
            "n_obs": int(model.nobs), "pseudo_r2": float(1 - model.llf/model.llnull),
            "_result": model,
        }
    except Exception as exc:
        raise RuntimeError(f"Beta regression failed: {exc}") from exc
92
+
93
+
94
+ # ── Zero-inflated Poisson ─────────────────────────────────────────────────
95
+
96
def fit_zip(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Zero-Inflated Poisson regression (scipy L-BFGS-B).

    Mixture model: with probability pi(x) the count is a structural zero,
    otherwise it is Poisson(lambda(x)).  Both pi (logit link) and lambda
    (log link) get an intercept plus the same covariates.  Standard
    errors / p-values are not derived (reported as NaN).
    """
    from scipy.optimize import minimize
    from scipy.special import expit, gammaln

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape
    X = np.column_stack([np.ones(n), X_raw])
    kp = k + 1

    # log(y!) is constant across optimizer iterations — compute it once
    # via gammaln(y+1) (== log(y!) for integer counts) instead of the old
    # per-evaluation Python loop over every observation.
    log_fact = gammaln(y + 1)

    def neg_ll(params):
        gamma = params[:kp]  # inflation (logit) coefficients
        beta = params[kp:]   # Poisson mean (log-link) coefficients
        pi = expit(X @ gamma)
        lam = np.exp(X @ beta)
        lam = np.clip(lam, 1e-10, 1e10)
        # A zero can be structural (pi) or a genuine Poisson zero.
        ll_zero = np.log(pi + (1 - pi) * np.exp(-lam) + 1e-300)
        ll_pos = np.log(1 - pi + 1e-300) + y * np.log(lam + 1e-300) - lam - log_fact
        ll = np.where(y == 0, ll_zero, ll_pos)
        return -ll.sum()

    try:
        x0 = np.zeros(2 * kp)
        res = minimize(neg_ll, x0, method="L-BFGS-B", options={"maxiter": 500})
        params_hat = res.x
        llf = -res.fun
        aic = 2 * len(params_hat) - 2 * llf
        bic = len(params_hat) * np.log(n) - 2 * llf
        names_inflate = [f"inflate_{p}" for p in ["_cons"] + indeps]
        names_count = [f"count_{p}" for p in ["_cons"] + indeps]
        all_names = names_inflate + names_count
        params_dict = {nm: float(v) for nm, v in zip(all_names, params_hat)}
        return {
            "method": "Zero-Inflated Poisson",
            "dep": dep, "indeps": indeps,
            "params": params_dict, "std_errors": {k: float("nan") for k in params_dict},
            "p_values": {k: float("nan") for k in params_dict},
            "aic": float(aic), "bic": float(bic),
            "log_likelihood": float(llf), "n_obs": n,
        }
    except Exception as exc:
        raise RuntimeError(f"ZIP failed: {exc}") from exc
143
+
144
+
145
+ # ── Zero-inflated Negative Binomial ───────────────────────────────────────
146
+
147
def fit_zinb(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Zero-Inflated Negative Binomial regression (scipy L-BFGS-B).

    Parameterisation: inflation probability pi = expit(X @ gamma), NB mean
    mu = exp(X @ beta), dispersion r = exp(log_r).  Standard errors and
    p-values are not derived (reported as NaN).
    """
    from scipy.optimize import minimize
    from scipy.special import expit, gammaln

    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    covars = sub.select(indeps).to_numpy().astype(float)
    n, k = covars.shape
    X = np.column_stack([np.ones(n), covars])
    kp = k + 1

    def neg_ll(theta):
        gamma = theta[:kp]           # inflation coefficients
        beta = theta[kp:2 * kp]      # count-model coefficients
        r = np.exp(theta[2 * kp])    # dispersion (log-parameterised)
        pi = expit(X @ gamma)
        mu = np.exp(X @ beta)
        mu = np.clip(mu, 1e-10, 1e10)
        p_nb = r / (r + mu)
        # NB probability of an observed zero, on the log scale.
        ll_zero_nb = r * np.log(p_nb + 1e-300)
        # Zeros: structural inflation OR a genuine NB zero.
        ll_zero = np.log(pi + (1 - pi) * np.exp(ll_zero_nb) + 1e-300)
        # Positives: not inflated, full NB log-pmf.
        ll_pos = (np.log(1 - pi + 1e-300)
                  + gammaln(y + r) - gammaln(r) - gammaln(y + 1)
                  + r * np.log(p_nb + 1e-300) + y * np.log(1 - p_nb + 1e-300))
        return -np.where(y == 0, ll_zero, ll_pos).sum()

    try:
        start = np.zeros(2 * kp + 1)
        res = minimize(neg_ll, start, method="L-BFGS-B", options={"maxiter": 500})
        estimates = res.x
        llf = -res.fun
        n_params = len(estimates)
        aic = 2 * n_params - 2 * llf
        bic = n_params * np.log(n) - 2 * llf
        names = ([f"inflate_{p}" for p in ["_cons"] + indeps]
                 + [f"count_{p}" for p in ["_cons"] + indeps]
                 + ["log_dispersion"])
        params_dict = {nm: float(v) for nm, v in zip(names, estimates)}
        return {
            "method": "Zero-Inflated Negative Binomial",
            "dep": dep, "indeps": indeps,
            "params": params_dict,
            "std_errors": {name: float("nan") for name in params_dict},
            "p_values": {name: float("nan") for name in params_dict},
            "aic": float(aic), "bic": float(bic),
            "log_likelihood": float(llf), "n_obs": n,
        }
    except Exception as exc:
        raise RuntimeError(f"ZINB failed: {exc}") from exc
197
+
198
+
199
+ # ── Hurdle model ──────────────────────────────────────────────────────────
200
+
201
def fit_hurdle(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Two-part hurdle model: Logit for zero vs. positive, then a Poisson
    fit on the positive outcomes only.

    NOTE: the count part is a plain (untruncated) Poisson on the positive
    subsample — not a zero-truncated Poisson as hurdle theory prescribes —
    so its likelihood still assigns mass to zero and coefficients are
    approximate.

    Raises ValueError when no positive outcomes exist (the count part
    would otherwise fail with an opaque statsmodels error).
    """
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    X = sm.add_constant(X_raw)

    pos_mask = y > 0
    if not pos_mask.any():
        raise ValueError("Hurdle model requires at least one positive outcome")

    # Part 1: Logit (zero vs. nonzero)
    y_bin = (y > 0).astype(float)
    logit_model = sm.Logit(y_bin, X).fit(disp=0)

    # Part 2: Poisson on positive outcomes only
    y_pos = y[pos_mask]
    X_pos = X[pos_mask]
    poisson_model = sm.Poisson(y_pos, X_pos).fit(disp=0)

    param_names = ["_cons"] + indeps
    return {
        "method": "Hurdle (Logit + Poisson)",
        "dep": dep, "indeps": indeps,
        "n_obs": len(y), "n_zeros": int((y == 0).sum()), "n_positive": int(pos_mask.sum()),
        "logit_params": dict(zip(param_names, logit_model.params.tolist())),
        "logit_pvalues": dict(zip(param_names, logit_model.pvalues.tolist())),
        "count_params": dict(zip(param_names, poisson_model.params.tolist())),
        "count_pvalues": dict(zip(param_names, poisson_model.pvalues.tolist())),
        "aic_logit": float(logit_model.aic),
        "aic_count": float(poisson_model.aic),
        "_logit": logit_model, "_count": poisson_model,
    }
231
+
232
+
233
+ # ── SUR ───────────────────────────────────────────────────────────────────
234
+
235
def fit_sur(df: pl.DataFrame, equations: list[tuple[str, list[str]]]) -> dict:
    """Seemingly Unrelated Regression, estimated equation-by-equation OLS.

    Each equation is fit independently by OLS and the cross-equation
    residual covariance/correlation is reported.  No FGLS re-weighting
    step is performed, so coefficients equal single-equation OLS (the
    old docstring's "GLS iteration" never happened).

    NOTE: each equation drops its own null rows, so residual vectors may
    refer to different observations; they are truncated to the shortest
    length before cross-equation statistics are computed — verify row
    alignment if the equations have different missingness patterns.
    """
    results = []
    residuals = []

    for dep, indeps in equations:
        sub = df.select([dep] + indeps).drop_nulls()
        y = sub[dep].to_numpy().astype(float)
        X = sm.add_constant(sub.select(indeps).to_numpy().astype(float))
        model = sm.OLS(y, X).fit()
        results.append(model)
        residuals.append(model.resid)

    # Cross-equation covariance (Sigma) and correlation.
    min_n = min(len(r) for r in residuals)
    resid_mat = np.column_stack([r[:min_n] for r in residuals])
    Sigma = (resid_mat.T @ resid_mat) / min_n

    equations_out = []
    for i, ((dep, indeps), res) in enumerate(zip(equations, results)):
        equations_out.append({
            "equation": i + 1,
            "dep": dep, "indeps": indeps,
            "params": dict(zip(["_cons"] + indeps, res.params.tolist())),
            "std_errors": dict(zip(["_cons"] + indeps, res.bse.tolist())),
            "r_squared": float(res.rsquared),
            "n_obs": int(res.nobs),
        })

    return {
        "method": "SUR (OLS-based)",
        "n_equations": len(equations),
        "equations": equations_out,
        "cross_equation_corr": np.corrcoef(resid_mat.T).tolist(),
        # Sigma was previously computed but never reported — expose it.
        "cross_equation_cov": Sigma.tolist(),
    }
@@ -0,0 +1,84 @@
1
+ """ARCH/GARCH volatility models (requires 'arch' package)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+
9
+ def _require_arch():
10
+ try:
11
+ import arch # noqa: F401
12
+ return arch
13
+ except ImportError:
14
+ raise ImportError(
15
+ "'arch' package is required for ARCH/GARCH models.\n"
16
+ "Install: pip install arch"
17
+ )
18
+
19
+
20
def fit_arch(
    df: pl.DataFrame,
    var: str,
    *,
    p: int = 1,
    mean: str = "Constant",
    dist: str = "normal",
) -> dict:
    """ARCH(p) model for volatility clustering.

    Arguments are forwarded to arch.arch_model.  The series is multiplied
    by 100 because the 'arch' optimizer is calibrated for percent-scale
    returns; fitted parameters are therefore on that scale.
    """
    _require_arch()  # raises a helpful ImportError when 'arch' is missing
    from arch import arch_model  # type: ignore[import]

    y = df[var].drop_nulls().to_numpy().astype(float) * 100  # scale returns

    am = arch_model(y, mean=mean, vol="ARCH", p=p, dist=dist)
    res = am.fit(disp="off")

    params = {k: float(v) for k, v in res.params.items()}
    return {
        "model": f"ARCH({p})",
        "var": var,
        "n_obs": len(y),
        "params": params,
        "aic": float(res.aic),
        "bic": float(res.bic),
        "log_likelihood": float(res.loglikelihood),
        "_result": res,
    }
48
+
49
+
50
def fit_garch(
    df: pl.DataFrame,
    var: str,
    *,
    p: int = 1,
    q: int = 1,
    mean: str = "Constant",
    dist: str = "normal",
    model: str = "GARCH",
) -> dict:
    """GARCH(p,q), EGARCH, or GJR-GARCH volatility model.

    The series is scaled by 100 (percent returns) for optimizer stability.
    `model` accepts "GARCH", "EGARCH" or "GJR-GARCH" (also "GJR"/"GJRGARCH").
    """
    _require_arch()
    from arch import arch_model  # type: ignore[import]

    y = df[var].drop_nulls().to_numpy().astype(float) * 100

    # Map the requested model onto arch_model's vol/o arguments.
    # "GJR-GARCH" is NOT a valid vol string for arch_model — the 'arch'
    # package expresses GJR as vol="GARCH" with asymmetry order o > 0.
    # (The old code passed model.upper() straight through and GJR failed.)
    vol = model.upper()
    o = 0
    if vol in ("GJR", "GJRGARCH", "GJR-GARCH"):
        vol, o = "GARCH", 1
    am = arch_model(y, mean=mean, vol=vol, p=p, o=o, q=q, dist=dist)
    res = am.fit(disp="off")

    params = {k: float(v) for k, v in res.params.items()}
    cond_vol = res.conditional_volatility.tolist()

    return {
        "model": f"{model}({p},{q})",
        "var": var,
        "n_obs": len(y),
        "params": params,
        "aic": float(res.aic),
        "bic": float(res.bic),
        "log_likelihood": float(res.loglikelihood),
        "cond_volatility_last5": cond_vol[-5:],
        "_result": res,
    }
@@ -0,0 +1,103 @@
1
+ """Bayesian linear regression via scipy (conjugate prior, no PyMC required)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ from scipy import stats as sp_stats
8
+
9
+
10
def bayes_ols(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    prior_scale: float = 10.0,
    n_samples: int = 4000,
    credible_interval: float = 0.95,
    seed: int = 42,
) -> dict:
    """
    Bayesian linear regression using conjugate Normal-Inverse-Gamma prior.

    Analytically exact posterior — no MCMC required.

    prior_scale: scale of the diffuse Normal(0, prior_scale²) prior on coefficients.
    """
    # Seeded generator: the posterior summary is reproducible for a given
    # seed because every draw below goes through this single rng in a
    # fixed order.  Do not reorder the sampling statements.
    rng = np.random.default_rng(seed)

    # Listwise deletion across dep + indeps before extracting arrays.
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape

    # Add intercept
    X = np.column_stack([np.ones(n), X_raw])
    param_names = ["_cons"] + indeps
    kp = k + 1

    # ── Conjugate prior: β | σ² ~ N(0, prior_scale² I), σ² ~ IG(a0, b0)
    # a0 = b0 = 0.001 is a near-noninformative Inverse-Gamma prior on σ².
    a0 = 0.001
    b0 = 0.001
    V0_inv = np.eye(kp) / prior_scale**2

    # ── Posterior parameters (Normal-Inverse-Gamma)
    XtX = X.T @ X
    Xty = X.T @ y
    Vn_inv = XtX + V0_inv
    Vn = np.linalg.inv(Vn_inv)
    beta_n = Vn @ Xty  # posterior mean of β

    # Standard NIG update with prior mean 0, so the m0 term vanishes:
    # bn = b0 + ½(y'y − βn' Vn⁻¹ βn)
    an = a0 + n / 2
    bn = b0 + 0.5 * (y @ y - beta_n @ Vn_inv @ beta_n)

    # ── Draw from posterior
    # σ² ~ IG(an, bn) sampled as the reciprocal of Gamma(an, scale=1/bn);
    # bn is floored at 1e-10 to guard against a degenerate zero scale.
    sigma2_draws = 1.0 / rng.gamma(an, 1.0 / max(bn, 1e-10), size=n_samples)
    # β | σ² ~ N(βn, σ²·Vn); one multivariate-normal draw per σ² draw.
    # NOTE(review): this loop is O(n_samples) Python calls — correct but
    # slow for large n_samples; vectorizing would change the draw stream.
    beta_draws = np.array([
        rng.multivariate_normal(beta_n, s2 * Vn)
        for s2 in sigma2_draws
    ])

    # ── Summary
    # Equal-tailed credible interval: e.g. 0.95 → quantiles 0.025/0.975.
    alpha = 1 - credible_interval
    lo, hi = alpha / 2, 1 - alpha / 2

    post_mean = beta_draws.mean(axis=0)
    post_std = beta_draws.std(axis=0)
    post_lo = np.quantile(beta_draws, lo, axis=0)
    post_hi = np.quantile(beta_draws, hi, axis=0)

    # P(β > 0)
    prob_positive = (beta_draws > 0).mean(axis=0)

    # Posterior predictive R²
    # (plug-in R² at the posterior-mean coefficients, not a full
    # posterior-predictive distribution)
    y_pred = X @ post_mean
    ss_res = ((y - y_pred) ** 2).sum()
    ss_tot = ((y - y.mean()) ** 2).sum()
    r2 = float(1 - ss_res / ss_tot) if ss_tot > 0 else float("nan")

    # Per-coefficient summaries; the CI keys embed the interval level,
    # e.g. "ci_95_lo"/"ci_95_hi" for the default 0.95.
    coefficients = {}
    for i, name in enumerate(param_names):
        coefficients[name] = {
            "mean": float(post_mean[i]),
            "std": float(post_std[i]),
            f"ci_{int(credible_interval*100)}_lo": float(post_lo[i]),
            f"ci_{int(credible_interval*100)}_hi": float(post_hi[i]),
            "prob_positive": float(prob_positive[i]),
        }

    # Raw draws are returned under underscore keys for downstream
    # plotting/diagnostics; callers serializing the dict should drop them.
    return {
        "model": "Bayesian OLS (conjugate Normal-IG prior)",
        "dep": dep,
        "indeps": indeps,
        "n_obs": n,
        "n_samples": n_samples,
        "prior_scale": prior_scale,
        "credible_interval": credible_interval,
        "r_squared": r2,
        "sigma_mean": float(np.sqrt(sigma2_draws.mean())),
        "sigma_std": float(np.sqrt(sigma2_draws).std()),
        "coefficients": coefficients,
        "_beta_draws": beta_draws,
        "_sigma2_draws": sigma2_draws,
    }